#### Importing all the required **Python** libraries

In [None]:
import os
import warnings

warnings.filterwarnings("ignore")
import pathlib
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
from scipy.stats import median_abs_deviation


import scranPY
%load_ext autoreload
%autoreload 2
#%load_ext lab_black

#### Scanpy settings

In [None]:
# Print the versions of scanpy and its dependencies for reproducibility.
sc.logging.print_header()

# Figure defaults: white background, 8x8 inch panels, 600 dpi when saving.
sc.set_figure_params(
    facecolor="white",
    figsize=(8, 8),
    dpi_save=600,
)

# Verbosity level 1 for scanpy's logging.
sc.settings.verbosity = 1

## Reading data

In [None]:
# Load the input AnnData object (per the file name: scVI-integrated, LARRY-barcoded
# mouse data). NOTE: absolute, machine-specific path.
adata = sc.read_h5ad('/mnt/storage/Daniele/atlases/mouse/02_mouse_larry_barcoded_integrated_scvi.h5ad')

## QC

In [None]:
# Flag gene families by symbol prefix; these boolean columns are the `qc_vars`
# passed to sc.pp.calculate_qc_metrics.
adata.var["mt"] = adata.var_names.str.startswith("mt-")  # mitochondrial genes
adata.var["ribo"] = adata.var_names.str.startswith(("Rps", "Rpl"))  # ribosomal proteins
# Hemoglobin genes: anchor the pattern at the start of the symbol and skip the
# "Hbp*" symbols, instead of matching "Hb" anywhere in the name.
adata.var["hb"] = adata.var_names.str.contains("^Hb[^(p)]")

In [None]:
# Compute QC metrics in place (written to adata.obs / adata.var), including the
# percentage of counts from the "mt", "ribo" and "hb" gene sets, the share of
# counts in the top 20 genes, and log1p-transformed versions of the totals.
sc.pp.calculate_qc_metrics(
    adata, qc_vars=["mt", "ribo", "hb"], inplace=True, percent_top=[20], log1p=True
)

In [None]:
def is_outlier(adata, metric: str, nmads: int):
    """Flag observations whose `metric` lies far from the median.

    A value is an outlier when it is more than `nmads` median absolute
    deviations (MADs) below or above the median of ``adata.obs[metric]``.

    Parameters
    ----------
    adata
        Object exposing an ``obs`` table that contains the column `metric`.
    metric
        Name of the column in ``adata.obs`` to test.
    nmads
        Number of MADs tolerated on either side of the median.

    Returns
    -------
    Boolean mask aligned with ``adata.obs[metric]``; True marks an outlier.
    """
    values = adata.obs[metric]
    center = np.median(values)
    tolerance = nmads * median_abs_deviation(values)

    too_low = values < center - tolerance
    too_high = values > center + tolerance
    return too_low | too_high

In [None]:
# A cell is an outlier if it exceeds the hard cap on total counts or deviates by
# more than 5 MADs from the median on any of the metrics below.
# (An optional hard floor, `adata.obs['n_genes_by_counts'] < 500`, is not applied.)
outlier_mask = adata.obs['total_counts'] > 100000
for mad_metric in (
    "log1p_total_counts",
    "log1p_n_genes_by_counts",
    "pct_counts_mt",
    "pct_counts_ribo",
    "pct_counts_in_top_20_genes",
):
    outlier_mask = outlier_mask | is_outlier(adata, mad_metric, 5)

adata.obs["outlier"] = outlier_mask
adata.obs["outlier"].value_counts()

In [None]:
# Total counts vs. number of detected genes per cell, colored by the outlier flag.
sc.pl.scatter(adata, x="total_counts", y="n_genes_by_counts", color="outlier")

In [None]:
# Outlier flag and QC metrics on the UMAP, one panel per row.
# NOTE(review): no UMAP is computed in this notebook; presumably it is stored in the loaded file.
sc.pl.umap(adata, color=["outlier", "n_genes_by_counts", "total_counts", 'pct_counts_mt', 'pct_counts_ribo', 'pct_counts_hb'], ncols=1)

In [None]:
# Keep only the cells not flagged as outliers; .copy() turns the subset into an
# independent object.
adata = adata[~adata.obs["outlier"]].copy()

## Normalization

In [None]:
# Snapshot the current matrix as the "counts" layer before it is transformed.
# NOTE(review): assumes adata.X holds raw counts at this point; confirm for the input file.
adata.layers["counts"] = adata.X.copy()
# Normalize per-cell totals (scanpy defaults) and log1p-transform, both in place on adata.X.
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
# Keep the log-normalized values as a layer; plots and marker tests below read it
# through layer='log_norm'.
adata.layers["log_norm"] = adata.X.copy()
# Put the saved counts back into adata.X.
adata.X = adata.layers["counts"].copy()

## Annotation

#### Level 0

In [None]:
# Leiden clustering at resolution 0.5, stored in adata.obs['leiden_0.5'].
# NOTE(review): no neighbor graph is built in this notebook; presumably it comes with the loaded file.
sc.tl.leiden(adata, resolution=0.5, key_added="leiden_0.5")

In [None]:
# Number of clusters holding more than 1% of all cells (True) vs. at most 1% (False).
(adata.obs['leiden_0.5'].value_counts() > adata.shape[0] * 0.01).value_counts()

In [None]:
# Cell counts of the 25 largest clusters.
adata.obs['leiden_0.5'].value_counts().head(25)

In [None]:
# Show the subset restricted to clusters '0'-'20' next to the full object to compare sizes.
adata[adata.obs['leiden_0.5'].isin([str(i) for i in range(21)])], adata

In [None]:
# Keep only clusters '0'-'20' (hard-coded); all other clusters are dropped.
# NOTE(review): presumably chosen from the cluster-size inspection in the cells before; confirm if the clustering changes.
adata = adata[adata.obs['leiden_0.5'].isin([str(i) for i in range(21)])].copy()

In [None]:
# UMAP colored by the retained Leiden clusters, labels drawn on the plot.
sc.pl.umap(adata, color='leiden_0.5', ncols=1, legend_loc='on data')

In [None]:
# Marker genes per broad lineage, used for the level-0 dot plot.
# Each entry is (lineage name, space-separated gene symbols); order is preserved.
markers_level_0 = {
    lineage: genes.split()
    for lineage, genes in (
        ("Adipocyte", "Pparg Cebpa Adipoq Fabp4 Lpl"),
        ("Fibroblasts", "Col1a1 Col1a2 Pdgfra Dcn Fap"),
        ("Endocrine pancreas", "Neurog3 Ins2 Gcg Sst Ppy"),
        ("Endothelial", "Pecam1 Cdh5 Vwf Kdr Nos3"),
        ("Smooth muscle cells", "Acta2 Myh11 Tagln Cnn1 Des"),
        ("Pericyte", "Pdgfrb Rgs5 Anpep Abcc9 Cspg4"),
        ("Schwann", "Mpz Mbp Plp1 S100b Gfap"),
        ("Epithelial cells", "Epcam Krt18 Krt8 Krt19 Cldn3 Cdh2 Avil"),
        ("Lymphoid cells", "Ptprc Cd79a Cd3e Cd4 Cd8a Nkg7 Il7r"),
        ("Myeloid cells", "Ptprc Cd68 Itgam Lyz2 Csf1r Adgre1 S100a8 S100a9"),
    )
}


In [None]:
# Differential expression per Leiden cluster on the log-normalized layer.
sc.tl.rank_genes_groups(
    adata,
    groupby='leiden_0.5',
    layer='log_norm',
)

# Cluster dendrogram computed on the 'X_scVI' representation; used to order the dot plot.
sc.tl.dendrogram(
    adata,
    groupby='leiden_0.5',
    use_rep='X_scVI',
)

# Top 10 ranked genes per cluster, shown as log fold changes clipped to [-5, 5].
sc.pl.rank_genes_groups_dotplot(
    adata,
    n_genes=10,
    values_to_plot='logfoldchanges',
    vmin=-5,
    vmax=5,
    min_logfoldchange=2,
    cmap='coolwarm',
    dendrogram=True,
)

In [None]:
# Expression of the level-0 lineage markers per cluster (log-normalized layer),
# with clusters ordered by the dendrogram.
sc.pl.dotplot(adata, groupby='leiden_0.5', var_names=markers_level_0, layer='log_norm', dendrogram=True)

In [None]:
# Level-0 annotation: Leiden cluster label (string) -> broad lineage.
# Written as lineage -> cluster ids, then flattened and ordered by cluster id.
anno = {
    str(cluster): lineage
    for cluster, lineage in sorted(
        (cluster, lineage)
        for lineage, clusters in (
            ("Myeloid Cell", (0, 4, 5, 6, 9, 10, 16, 17)),
            ("Epithelial Cell", (1, 3, 11, 12, 13, 20)),
            ("Lymphoid Cell", (2, 7, 14, 15, 19)),
            ("Fibroblast", (8,)),
            ("Endothelial Cell", (18,)),
        )
        for cluster in clusters
    )
}

In [None]:
# Translate cluster labels into level-0 lineages.
# Cast to categorical explicitly: 'Level_0' is used as the `restrict_to` key for
# the sub-clustering, and a many-to-one mapping is not guaranteed to come back
# as a categorical column on its own.
adata.obs['Level_0'] = adata.obs['leiden_0.5'].map(anno).astype('category')

In [None]:
# UMAP colored by the level-0 lineage annotation.
sc.pl.umap(adata, color = 'Level_0')

## Level 1

#### Myeloid cells

In [None]:
# Sub-cluster only the cells labelled 'Myeloid Cell' in Level_0 (resolution 0.25);
# the result is stored in adata.obs['leiden_myeloid_level_1'].
sc.tl.leiden(adata, resolution=0.25, restrict_to=('Level_0', ['Myeloid Cell']), key_added="leiden_myeloid_level_1")

In [None]:
# Level-0 lineages next to the myeloid sub-clustering, labels drawn on the plot.
sc.pl.umap(adata, color = ['Level_0','leiden_myeloid_level_1'], legend_loc='on data')

In [None]:
# Marker genes per myeloid subtype, used for the myeloid dot plot.
# Each entry is (subtype name, space-separated gene symbols); order is preserved.
myeloid_subtype_markers = {
    subtype: genes.split()
    for subtype, genes in (
        ("Neutrophil", "S100a8 S100a9 Elane Mpo Ly6g"),
        ("Macrophage", "Cx3cr1 Adgre1 Cd68 Mrc1 C1qa Aif1"),
        ("Monocyte", "Ly6c2 Ccr2 Cd14 Fcgr1 Trem1"),
        ("Mast", "Kit Mcpt4 Cma1 Tpsb2 Hdc"),
        ("Dendritic", "Itgax Zbtb46 Flt3 Xcr1 Cd209a"),
    )
}


In [None]:
# Myeloid subtype markers per group of 'leiden_myeloid_level_1' (log-normalized layer).
sc.pl.dotplot(adata, groupby='leiden_myeloid_level_1', var_names=myeloid_subtype_markers, layer='log_norm', dendrogram=False)

In [None]:
# Rank marker genes for the myeloid sub-clusters only.
# The restriction must be passed as `groups`; the previous spelling `gropus`
# was not the group filter, so the restriction was never applied.
# The sub-cluster labels are read from the data instead of assuming exactly nine.
myeloid_groups = [
    group
    for group in adata.obs['leiden_myeloid_level_1'].cat.categories
    if group.startswith('Myeloid Cell,')
]
sc.tl.rank_genes_groups(
    adata,
    groupby='leiden_myeloid_level_1',
    layer='log_norm',
    groups=myeloid_groups,
)
# Top 10 ranked genes per tested group, shown as log fold changes clipped to [-5, 5].
sc.pl.rank_genes_groups_dotplot(adata, n_genes=10, values_to_plot='logfoldchanges', vmin=-5, vmax=5, min_logfoldchange=2, cmap='coolwarm', dendrogram=False)

In [None]:
# Level-1 annotation of the myeloid sub-clusters ('Myeloid Cell,<n>' -> cell type).
anno = {
    "Myeloid Cell,0": "Neutrophil",
    "Myeloid Cell,1": "Macrophage",
    "Myeloid Cell,2": "Monocyte",  # was misspelled "Monoctye"
    "Myeloid Cell,3": "Dendritic Cell",
    "Myeloid Cell,4": "Macrophage",
    "Myeloid Cell,5": "Macrophage",
    "Myeloid Cell,6": "Macrophage",
    "Myeloid Cell,7": "Macrophage",
    "Myeloid Cell,8": "Mast Cell",
}

In [None]:
# Start Level_1 from the myeloid sub-clustering and rename the sub-clusters.
# Relabel on plain strings and cast back afterwards: `replace` on a categorical
# column that changes its categories (several clusters -> one label) is
# deprecated in recent pandas releases.
adata.obs['Level_1'] = (
    adata.obs['leiden_myeloid_level_1']
    .astype(str)
    .replace(anno)
    .astype('category')
)

In [None]:
# Display the Level_1 column to check the relabelling.
adata.obs['Level_1']

#### Lymphoid cells

In [None]:
# Sub-cluster only the cells labelled 'Lymphoid Cell' in Level_1 (resolution 0.4);
# the result is stored in adata.obs['leiden_lymphoid_level_1'].
sc.tl.leiden(adata, resolution=0.4, restrict_to=('Level_1', ['Lymphoid Cell']), key_added="leiden_lymphoid_level_1")

In [None]:
# Current Level_1 labels next to the lymphoid sub-clustering.
sc.pl.umap(adata, color = ['Level_1','leiden_lymphoid_level_1'], wspace=.4)

In [None]:
# Marker genes per lymphoid subtype, used for the lymphoid dot plot.
# Each entry is (subtype name, space-separated gene symbols); order is preserved.
lymphoid_subtype_markers = {
    subtype: genes.split()
    for subtype, genes in (
        ("B cell", "Cd19 Cd79a Ms4a1 Cd22 Ighm"),
        ("Plasma cell", "Ighg1 Igha Sdc1 Xbp1 Prdm1"),
        ("CD4 T cell", "Cd3e Cd4 Il7r Tcf7 Lef1"),
        ("CD8 T cell", "Cd3e Cd8a Gzma Gzmb Ccl5"),
        ("Treg", "Foxp3 Il2ra Ctla4 Ikzf2 Tnfrsf18"),
        ("NK cell", "Ncr1 Nkg7 Klrb1c Gzmb Prf1"),
    )
}


In [None]:
# Lymphoid subtype markers per group of 'leiden_lymphoid_level_1' (log-normalized layer).
sc.pl.dotplot(adata, groupby='leiden_lymphoid_level_1', var_names=lymphoid_subtype_markers, layer='log_norm', dendrogram=False)

In [None]:
# Rank marker genes for the lymphoid sub-clusters only.
# The restriction must be passed as `groups`; the previous spelling `gropus`
# was not the group filter, so the restriction was never applied.
# The sub-cluster labels are read from the data instead of assuming exactly nine.
lymphoid_groups = [
    group
    for group in adata.obs['leiden_lymphoid_level_1'].cat.categories
    if group.startswith('Lymphoid Cell,')
]
sc.tl.rank_genes_groups(
    adata,
    groupby='leiden_lymphoid_level_1',
    layer='log_norm',
    groups=lymphoid_groups,
)
# Top 10 ranked genes per tested group, shown as log fold changes clipped to [-5, 5].
sc.pl.rank_genes_groups_dotplot(adata, n_genes=10, values_to_plot='logfoldchanges', vmin=-5, vmax=5, min_logfoldchange=2, cmap='coolwarm', dendrogram=False)

In [None]:
# Level-1 annotation of the lymphoid sub-clusters ('Lymphoid Cell,<n>' -> cell type).
# Written as cell type -> sub-cluster ids, then flattened and ordered by id.
anno = {
    f"Lymphoid Cell,{cluster}": label
    for cluster, label in sorted(
        (cluster, label)
        for label, clusters in (
            ("Double Positive T Cell", (0, 7)),
            ("CD4 T Cell", (1, 2)),
            ("B Cell", (3, 6)),
            ("NK Cell", (4,)),
            ("Treg Cell", (5,)),
            ("Plasma Cell", (8,)),
        )
        for cluster in clusters
    )
}

In [None]:
# Update Level_1 from the lymphoid sub-clustering and rename the sub-clusters.
# Relabel on plain strings and cast to categorical afterwards: `replace` on a
# categorical column that changes its categories is deprecated in recent pandas
# releases, and Level_1 must be categorical to serve as a `restrict_to` key.
adata.obs['Level_1'] = (
    adata.obs['leiden_lymphoid_level_1']
    .astype(str)
    .replace(anno)
    .astype('category')
)


In [None]:
# Display the Level_1 column to check the relabelling.
adata.obs['Level_1']

#### Fibroblasts

In [None]:
# Sub-cluster only the cells labelled 'Fibroblast' in Level_1 (resolution 0.25);
# the result is stored in adata.obs['leiden_fibroblast_level_1'].
sc.tl.leiden(adata, resolution=0.25, restrict_to=('Level_1', ['Fibroblast']), key_added="leiden_fibroblast_level_1")

In [None]:
# Current Level_1 labels next to the fibroblast sub-clustering.
sc.pl.umap(adata, color = ['Level_1','leiden_fibroblast_level_1'], wspace=.4)

In [None]:
# Marker genes per fibroblast-like lineage, used for the fibroblast dot plot.
# Each entry is (lineage name, space-separated gene symbols); order is preserved.
fibroblast_lineages = {
    lineage: genes.split()
    for lineage, genes in (
        ("fibroblasts", "Col1a1 Col1a2 Pdgfra Dcn Lum"),
        (
            "CAF",
            "Acta2 Tagln Myh11 Cnn1 Myl9 Il6 Cxcl12 Ccl2 Lif Pdgfra "
            "Cd74 H2-Aa H2-Ab1 Spp1 Ccl7",
        ),
        ("Pancreatic stellate cells (PSC)", "Rgs5 Des Acta2 Lrat Pdgfrb"),
    )
}


In [None]:
# Fibroblast lineage markers per group of 'leiden_fibroblast_level_1' (log-normalized layer).
sc.pl.dotplot(adata, groupby='leiden_fibroblast_level_1', var_names=fibroblast_lineages, layer='log_norm', dendrogram=False)

In [None]:
# Rank marker genes for the fibroblast sub-clusters.
# This cell previously re-ran the lymphoid ranking (copied groupby and group
# names) and misspelled the `groups` keyword as `gropus`; it now targets the
# fibroblast sub-clustering and reads the sub-cluster labels from the data.
fibroblast_groups = [
    group
    for group in adata.obs['leiden_fibroblast_level_1'].cat.categories
    if group.startswith('Fibroblast,')
]
sc.tl.rank_genes_groups(
    adata,
    groupby='leiden_fibroblast_level_1',
    layer='log_norm',
    groups=fibroblast_groups,
)
# Top 10 ranked genes per tested group, shown as log fold changes clipped to [-5, 5].
sc.pl.rank_genes_groups_dotplot(adata, n_genes=10, values_to_plot='logfoldchanges', vmin=-5, vmax=5, min_logfoldchange=2, cmap='coolwarm', dendrogram=False)

In [None]:
# All fibroblasts get a single Level_1 label (the sub-clustering is not used here).
# Relabel on plain strings and cast to categorical afterwards: `replace` on a
# categorical column that introduces a new category is deprecated in recent
# pandas releases, and Level_1 must be categorical to serve as a `restrict_to` key.
adata.obs['Level_1'] = (
    adata.obs['Level_1']
    .astype(str)
    .replace("Fibroblast", "Cancer Associated Fibroblast")
    .astype('category')
)


#### Endothelial

In [None]:
# Sub-cluster only the cells labelled 'Endothelial Cell' in Level_1 (resolution 0.25);
# the result is stored in adata.obs['leiden_endothelial_level_1'].
sc.tl.leiden(adata, resolution=0.25, restrict_to=('Level_1', ['Endothelial Cell']), key_added="leiden_endothelial_level_1")

In [None]:
# Current Level_1 labels next to the endothelial sub-clustering.
sc.pl.umap(adata, color = ['Level_1','leiden_endothelial_level_1'], wspace=.4)

In [None]:
# Marker genes per endothelial subtype, used for the endothelial dot plot.
# Each entry is (subtype name, space-separated gene symbols); order is preserved.
endothelial_subtypes = {
    subtype: genes.split()
    for subtype, genes in (
        ("Vascular endothelial cells", "Pecam1 Cdh5 Kdr Eng Nos3"),
        ("Lymphatic endothelial cells", "Prox1 Lyve1 Pdpn Flt4 Mmrn1"),
    )
}


In [None]:
# Endothelial subtype markers per group of 'leiden_endothelial_level_1' (log-normalized layer).
sc.pl.dotplot(adata, groupby='leiden_endothelial_level_1', var_names=endothelial_subtypes, layer='log_norm', dendrogram=False)

In [None]:
# Level-1 annotation of the endothelial sub-clusters ('Endothelial Cell,<n>' -> cell type).
# Written as cell type -> sub-cluster ids, then flattened and ordered by id.
anno = {
    f"Endothelial Cell,{cluster}": label
    for cluster, label in sorted(
        (cluster, label)
        for label, clusters in (
            ("Endothelial Vascular Cell", (0, 1, 3)),
            ("Endothelial Lymphatic Cell", (2,)),
        )
        for cluster in clusters
    )
}

In [None]:
# Update Level_1 from the endothelial sub-clustering and rename the sub-clusters.
# Relabel on plain strings and cast to categorical afterwards: `replace` on a
# categorical column that changes its categories is deprecated in recent pandas
# releases, and Level_1 must be categorical to serve as a `restrict_to` key.
adata.obs['Level_1'] = (
    adata.obs['leiden_endothelial_level_1']
    .astype(str)
    .replace(anno)
    .astype('category')
)


#### Epithelial

In [None]:
# Sub-cluster only the cells labelled 'Epithelial Cell' in Level_1 (resolution 0.25);
# the result is stored in adata.obs['leiden_epithelial_level_1'].
sc.tl.leiden(adata, resolution=0.25, restrict_to=('Level_1', ['Epithelial Cell']), key_added="leiden_epithelial_level_1")

In [None]:
# Current Level_1 labels next to the epithelial sub-clustering.
sc.pl.umap(adata, color = ['Level_1','leiden_epithelial_level_1'], wspace=.4)

In [None]:
# Per-cell table of epithelial sub-cluster and LARRY barcode status.
# NOTE(review): 'larry_positive' is not created in this notebook; presumably it comes with the loaded file.
barcodes_leiden = adata.obs[['leiden_epithelial_level_1', 'larry_positive']].copy()

In [None]:
# Percentage of each 'larry_positive' value within every group of
# 'leiden_epithelial_level_1', drawn as a bar chart.
larry_by_cluster = barcodes_leiden.groupby('leiden_epithelial_level_1')['larry_positive']
larry_percent = larry_by_cluster.value_counts() / larry_by_cluster.count() * 100
larry_percent.plot(kind='bar', figsize=(20, 5))

In [None]:
# Level-1 annotation of the epithelial sub-clusters ('Epithelial Cell,<n>' -> label).
# Written as label -> sub-cluster ids, then flattened and ordered by id.
anno = {
    f"Epithelial Cell,{cluster}": label
    for cluster, label in sorted(
        (cluster, label)
        for label, clusters in (
            ("Malignant Epithelial Cell", (0, 1, 2, 3, 5, 6)),
            ("Non-Malignant Epithelial Cell", (4, 7)),
        )
        for cluster in clusters
    )
}

In [None]:
# Update Level_1 from the epithelial sub-clustering and rename the sub-clusters.
# Relabel on plain strings and cast to categorical afterwards: `replace` on a
# categorical column that changes its categories is deprecated in recent pandas
# releases; the explicit cast keeps Level_1 categorical like the other levels.
adata.obs['Level_1'] = (
    adata.obs['leiden_epithelial_level_1']
    .astype(str)
    .replace(anno)
    .astype('category')
)


In [None]:
# UMAP colored by the final Level_1 annotation.
sc.pl.umap(adata, color = 'Level_1')

In [None]:
# Save the annotated object. NOTE: absolute, machine-specific path.
adata.write_h5ad('/mnt/storage/Daniele/atlases/mouse/03_mouse_larry_barcoded_annotated.h5ad')