In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn_ann.kneighbors.annoy import AnnoyTransformer

In [None]:
# Load the AnnData object stored in 12_mouse_all_integration.h5ad.
# NOTE(review): absolute, machine-specific path.
adata = sc.read_h5ad('/mnt/storage/Daniele/atlases/mouse/12_mouse_all_integration.h5ad')

In [None]:
# Display the AnnData summary.
adata

In [None]:
# Build the neighbor graph on the 'scANVI_emb' representation, with AnnoyTransformer
# as the nearest-neighbor backend (15 is presumably n_neighbors — confirm).
sc.pp.neighbors(adata, transformer=AnnoyTransformer(15), use_rep='scANVI_emb')

In [None]:
# Compute the UMAP embedding from the neighbor graph.
sc.tl.umap(adata)

## annotation

In [None]:
# Minimum winning marker score: cells whose best subtype score is below this value
# are labelled "Missclassified" in every scoring cell of this notebook.
threshold = 0

In [None]:
# Keep a copy of X before normalisation (assumes X holds raw counts here — TODO confirm).
# NOTE(review): not idempotent — re-running this cell stores already-normalised
# values in layers['counts'].
adata.layers['counts'] = adata.X.copy()
# Normalise per-cell totals, then log1p-transform X in place.
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
# Snapshot of the log-normalised matrix.
adata.layers['log_norm'] = adata.X.copy()

In [None]:
# UMAP coloured by the coarse annotation the subtyping below starts from.
sc.pl.umap(adata, color = 'Level_1_refined')

In [None]:
# List the distinct Level_1_refined labels (used to pick the subsets below).
adata.obs['Level_1_refined'].unique().tolist()

In [None]:
# Accumulates one {obs_name: subtype label} dict per lineage; the dicts are merged
# and written to adata.obs['Level_3'] later in the notebook.
dictionary_maps = []

## T cells

In [None]:
# Independent copy of all cells carrying one of the three T-cell Level_1_refined labels.
T = adata[adata.obs['Level_1_refined'].isin(["CD8 T Cell", "Ambiguous T Cell", "Treg Cell"])].copy()

In [None]:
# Marker genes per T-cell subtype. The keys are reused as score column names in
# T.obs and as the assigned subtype labels.
t_cell_markers = {
    "CD4+ T Cell": ["Cd4", "Il7r", "Tcf7", "Lck"],
    "CD8+ T Cells": ["Cd8a", "Cd8b1", "Gzma", "Gzmb"],
    "T-reg": ["Foxp3", "Il2ra", "Ctla4"],
}

In [None]:
# Score every T-cell subtype signature; each score is stored in T.obs under the subtype name.
for cell_type, markers in t_cell_markers.items():
    sc.tl.score_genes(T, gene_list=markers, score_name=cell_type)

# (n_cells, n_subtypes) matrix of signature scores, columns in dict order.
scores = T.obs[list(t_cell_markers.keys())].values

# Each cell gets the subtype with the highest score.
# dtype=object is required: the default fixed-width unicode array (<U12 for these
# keys) would silently truncate the longer "Missclassified" label assigned below.
max_indices = np.argmax(scores, axis=1)
celltypes = np.array(list(t_cell_markers.keys()), dtype=object)[max_indices]

# Cells whose best score is still below `threshold` are flagged for later inspection.
max_scores = scores[np.arange(scores.shape[0]), max_indices]
celltypes[max_scores < threshold] = "Missclassified"

T.obs['celltype'] = celltypes


In [None]:
# Number of cells assigned to each T-cell subtype (including flagged cells).
T.obs['celltype'].value_counts()

In [None]:
# Restore the full label in case it was truncated to 'Missclassifi' by a fixed-width
# string array. Assign the result back: an in-place replace on a column selected
# from the DataFrame is chained assignment and is a no-op under pandas Copy-on-Write.
T.obs['celltype'] = T.obs['celltype'].replace('Missclassifi', 'Missclassified')
# First plot: signature scores (obs columns) per assigned subtype; second: the marker genes.
sc.pl.dotplot(T, groupby = 'celltype', var_names = list(t_cell_markers.keys()), vmax=1, vmin=0)
sc.pl.dotplot(T, groupby = 'celltype', var_names = t_cell_markers, vmax=1, vmin=0)

In [None]:
# obs name -> T-cell subtype lookup, queued for the final merge into adata.obs.
t_cells_map = dict(zip(T.obs_names, T.obs['celltype']))
dictionary_maps.append(t_cells_map)

## Neutrophils

In [None]:
# Independent copy of the cells labelled "Neutrophil" at Level_1_refined.
N = adata[adata.obs['Level_1_refined'].isin(["Neutrophil"])].copy()

In [None]:
# Marker genes per neutrophil state; keys double as score column names and labels.
# Note that "S100a8" is part of all three signatures.
neutrophil_markers = {
    "Neutrophil - N0": [
        "S100a8", "Cxcr2", "Sell",
    ],
    "Neutrophil - N1": [
        "S100a8", "Tnf", "Stat1"
    ],
    "Neutrophil - N2": ["S100a8", "Vegfa", "Tgfb1", "Il10"],
}

In [None]:
# Score every neutrophil-state signature; each score is stored in N.obs under the state name.
for cell_type, markers in neutrophil_markers.items():
    sc.tl.score_genes(N, gene_list=markers, score_name=cell_type)

# (n_cells, n_states) matrix of signature scores, columns in dict order.
scores = N.obs[list(neutrophil_markers.keys())].values

# Each cell gets the state with the highest score.
# dtype=object keeps the label array from being fixed-width, so the
# "Missclassified" label can never be truncated, whatever the key lengths are.
max_indices = np.argmax(scores, axis=1)
celltypes = np.array(list(neutrophil_markers.keys()), dtype=object)[max_indices]

# Cells whose best score is still below `threshold` are flagged for later inspection.
max_scores = scores[np.arange(scores.shape[0]), max_indices]
celltypes[max_scores < threshold] = "Missclassified"

N.obs['celltype'] = celltypes


In [None]:
# Two views per assigned state: the signature scores (obs columns), then the marker genes.
for shown in (list(neutrophil_markers.keys()), neutrophil_markers):
    sc.pl.dotplot(N, groupby='celltype', var_names=shown, vmax=1, vmin=0)

In [None]:
# Distinct labels assigned within the neutrophil subset.
N.obs['celltype'].unique().tolist()

In [None]:
# obs name -> neutrophil state lookup, queued for the final merge into adata.obs.
neutro_cells_map = dict(zip(N.obs_names, N.obs['celltype']))
dictionary_maps.append(neutro_cells_map)

## Dendritic

In [None]:
# Independent copy of the cells labelled "Dendritic Cell" at Level_1_refined.
D = adata[adata.obs['Level_1_refined'].isin(["Dendritic Cell"])].copy()

In [None]:
# Marker genes per dendritic-cell subtype; keys double as score column names and labels.
dendritic_markers = {
    "Dendritic Cell - cDC1": ["Xcr1", "Clec9a", "Batf3", "Irf8"],
    "Dendritic Cell - cDC2": ["Itgax", "Sirpa", "Irf4"],
    "Dendritic Cell - pDC": ["Siglech", "Bst2", "Irf7", "Tcf4"],
}


In [None]:
# Score every dendritic subtype signature; each score is stored in D.obs under the subtype name.
for cell_type, markers in dendritic_markers.items():
    sc.tl.score_genes(D, gene_list=markers, score_name=cell_type)

# (n_cells, n_subtypes) matrix of signature scores, columns in dict order.
scores = D.obs[list(dendritic_markers.keys())].values

# Each cell gets the subtype with the highest score.
# dtype=object keeps the label array from being fixed-width, so the
# "Missclassified" label can never be truncated, whatever the key lengths are.
max_indices = np.argmax(scores, axis=1)
celltypes = np.array(list(dendritic_markers.keys()), dtype=object)[max_indices]

# Cells whose best score is still below `threshold` are flagged for later inspection.
max_scores = scores[np.arange(scores.shape[0]), max_indices]
celltypes[max_scores < threshold] = "Missclassified"

D.obs['celltype'] = celltypes


In [None]:
# Display the dendritic marker dictionary for reference.
dendritic_markers

In [None]:
# Map a truncated 'Miss' label (if present) back to the full label. Assign the result
# back: an in-place replace on a column selected from the DataFrame is chained
# assignment and is a no-op under pandas Copy-on-Write.
D.obs['celltype'] = D.obs['celltype'].replace({'Miss': 'Missclassified'})
# First plot: signature scores (obs columns) per assigned subtype; second: the marker genes.
sc.pl.dotplot(D,  groupby = 'celltype', var_names = list(dendritic_markers.keys()), vmax=1, vmin=0)
sc.pl.dotplot(D,  groupby = 'celltype', var_names = dendritic_markers, vmax=1, vmin=0)

In [None]:
# Distinct labels assigned within the dendritic subset.
D.obs['celltype'].unique().tolist()

In [None]:
# obs name -> dendritic subtype lookup, queued for the final merge into adata.obs.
dendr_cells_map = dict(zip(D.obs_names, D.obs['celltype']))
dictionary_maps.append(dendr_cells_map)

## Macrophages

In [None]:
# Independent copy of the cells labelled "Macrophage" at Level_1_refined.
M = adata[adata.obs['Level_1_refined'].isin(["Macrophage"])].copy()

In [None]:
# Marker genes per macrophage (TAM) subtype; keys double as score column names and labels.
macrophage_markers = {
    "Macrophage - M1 TAM": [
        "Il1b", "Cd86",
    ],
    "Macrophage - M2 TAM": [
        "Icam1", "Stat1",
    ],
    "Macrophage - angiogenic TAM": [
        "Vcan", "Vegfa"
    ],
    "Macrophage - lipid processing TAM": [
        "Gpnmb"
    ],

}

In [None]:
# Score every macrophage subtype signature; each score is stored in M.obs under the subtype name.
for cell_type, markers in macrophage_markers.items():
    sc.tl.score_genes(M, gene_list=markers, score_name=cell_type)

# (n_cells, n_subtypes) matrix of signature scores, columns in dict order.
scores = M.obs[list(macrophage_markers.keys())].values

# Each cell gets the subtype with the highest score.
# dtype=object keeps the label array from being fixed-width, so the
# "Missclassified" label can never be truncated, whatever the key lengths are.
max_indices = np.argmax(scores, axis=1)
celltypes = np.array(list(macrophage_markers.keys()), dtype=object)[max_indices]

# Cells whose best score is still below `threshold` are flagged for later inspection.
max_scores = scores[np.arange(scores.shape[0]), max_indices]
celltypes[max_scores < threshold] = "Missclassified"

M.obs['celltype'] = celltypes


In [None]:
# Two views per assigned subtype: the signature scores (obs columns), then the marker genes.
for shown in (list(macrophage_markers.keys()), macrophage_markers):
    sc.pl.dotplot(M, groupby='celltype', var_names=shown, vmax=1, vmin=0)

In [None]:
# Distinct labels assigned within the macrophage subset.
M.obs['celltype'].unique().tolist()

In [None]:
# obs name -> macrophage subtype lookup, queued for the final merge into adata.obs.
machrophage_cells_map = dict(zip(M.obs_names, M.obs['celltype']))
dictionary_maps.append(machrophage_cells_map)

## CAFs

In [None]:
# Independent copy of the cells labelled "Cancer Associated Fibroblast" at Level_1_refined.
CAF = adata[adata.obs['Level_1_refined'].isin(["Cancer Associated Fibroblast"])].copy()

In [None]:
# Marker genes per fibroblast/CAF subtype; keys double as score column names and labels.
caf_markers = {
    "Fibroblast": ["Pdgfra", "Pdgfrb", "S100a4", "Vim", "Pdpn", "Des"],
    "myCAF": ["Acta2", "Tagln", "Myl9", "Tgfb1", "Col1a1", "Col3a1"],
    "iCAF": ["Il6", "Lif", "Cxcl12", "Ccl2", "Pdgfa", "Il1a", "Il1b"],
    "apCAF": ["H2-Ab1", "Cd74", "H2-Aa"],
}

In [None]:
# Score every CAF subtype signature; each score is stored in CAF.obs under the subtype name.
for cell_type, markers in caf_markers.items():
    sc.tl.score_genes(CAF, gene_list=markers, score_name=cell_type)

# (n_cells, n_subtypes) matrix of signature scores, columns in dict order.
scores = CAF.obs[list(caf_markers.keys())].values

# Each cell gets the subtype with the highest score.
# dtype=object is required: the default fixed-width unicode array (<U10 for these
# keys) would silently truncate the longer "Missclassified" label assigned below.
max_indices = np.argmax(scores, axis=1)
celltypes = np.array(list(caf_markers.keys()), dtype=object)[max_indices]

# Cells whose best score is still below `threshold` are flagged for later inspection.
max_scores = scores[np.arange(scores.shape[0]), max_indices]
celltypes[max_scores < threshold] = "Missclassified"

CAF.obs['celltype'] = celltypes


In [None]:
# Restore the full label in case it was truncated to 'Missclassi' by a fixed-width
# string array. Assign the result back: an in-place replace on a column selected
# from the DataFrame is chained assignment and is a no-op under pandas Copy-on-Write.
CAF.obs['celltype'] = CAF.obs['celltype'].replace({'Missclassi': 'Missclassified'})
# First plot: signature scores (obs columns) per assigned subtype; second: the marker genes.
sc.pl.dotplot(CAF,  groupby = 'celltype', var_names = list(caf_markers.keys()), vmax=1, vmin=0)
sc.pl.dotplot(CAF,  groupby = 'celltype', var_names = caf_markers, vmax=1, vmin=0)

In [None]:
# Distinct labels assigned within the CAF subset.
CAF.obs['celltype'].unique().tolist()

In [None]:
# obs name -> CAF subtype lookup, queued for the final merge into adata.obs.
caf_cells_map = dict(zip(CAF.obs_names, CAF.obs['celltype']))
dictionary_maps.append(caf_cells_map)

## Non malignant epithelial

In [None]:
# Independent copy of the cells labelled "Non Malignant Epithelial Cell" at Level_1_refined.
NME = adata[adata.obs['Level_1_refined'].isin(["Non Malignant Epithelial Cell"])].copy()

In [None]:
# Marker genes per non-malignant epithelial cell type; keys double as score column
# names and labels. "Hnf1b" appears in both the Ductal and ADM signatures.
nme_markers = {
    "Alpha Cell": ["Gcg", "Arx", "Mafb"],
    "Beta Cell": ["Ins1", "Ins2", "Pdx1", "Nkx6-1"],
    "Delta Cell": ["Sst", "Hhex", "Rbp4"],
    "Epsilon Cell": ["Ghrl", "Npy"],
    "Gamma Cell": ["Ppy", "Pax6"],
    "Ductal Cell": ["Krt19", "Hnf1b", "Cftr"],
    "ADM Cell": ["Sox9", "Hnf1b", "Muc6", "Tff2"],
    "Acinar Cell": ["Ptf1a", "Cela1", "Cpa1", "Prss1"],
    "Acinar (REG+) Cell": ["Reg3a", "Reg3b", "Reg1", "Reg2"],
    "Acinar Idling Cell": ["Nr5a2", "Rbpjl"]
}

In [None]:
# Score every epithelial cell-type signature; each score is stored in NME.obs under the type name.
for cell_type, markers in nme_markers.items():
    sc.tl.score_genes(NME, gene_list=markers, score_name=cell_type)

# (n_cells, n_types) matrix of signature scores, columns in dict order.
scores = NME.obs[list(nme_markers.keys())].values

# Each cell gets the type with the highest score.
# dtype=object keeps the label array from being fixed-width, so the
# "Missclassified" label can never be truncated, whatever the key lengths are.
max_indices = np.argmax(scores, axis=1)
celltypes = np.array(list(nme_markers.keys()), dtype=object)[max_indices]

# Cells whose best score is still below `threshold` are flagged for later inspection.
max_scores = scores[np.arange(scores.shape[0]), max_indices]
celltypes[max_scores < threshold] = "Missclassified"

NME.obs['celltype'] = celltypes


In [None]:
# Distinct labels assigned within the non-malignant epithelial subset.
NME.obs['celltype'].unique().tolist()

In [None]:
# Two views per assigned type: the signature scores (obs columns), then the marker genes.
for shown in (list(nme_markers.keys()), nme_markers):
    sc.pl.dotplot(NME, groupby='celltype', var_names=shown, vmax=1, vmin=0)

In [None]:
# obs name -> epithelial cell-type lookup, queued for the final merge into adata.obs.
nme_cells_map = dict(zip(NME.obs_names, NME.obs['celltype']))
dictionary_maps.append(nme_cells_map)

## Malignant epithelial

In [None]:
# Independent copy of the cells labelled "Malignant Epithelial Cell" at Level_1_refined.
ME = adata[adata.obs['Level_1_refined'].isin(["Malignant Epithelial Cell"])].copy()

In [None]:
# Marker genes for the two malignant-cell programs; keys double as score column
# names and labels. Also reused by the plots at the end of the notebook.
me_markers = {
    "Malignant Cell - Epithelial": [
        "Epcam",
        "Cdh1",
        "Krt18",
        "Krt8",
        "Krt19",
        "Muc1",
        "Ocln",
    ],
    "Malignant Cell - Mesenchymal": [
        "Vim",
        "Cdh2",
        "Fn1",
        "Zeb1",
        "Zeb2",
        "Snai1",
        "Snai2",
        "Twist1",
    ],
}

In [None]:
# Score both malignant programs; each score is stored in ME.obs under the program name.
for cell_type, markers in me_markers.items():
    sc.tl.score_genes(ME, gene_list=markers, score_name=cell_type)

# (n_cells, n_programs) matrix of signature scores, columns in dict order.
scores = ME.obs[list(me_markers.keys())].values

# Each cell gets the program with the highest score.
# dtype=object keeps the label array from being fixed-width, so the
# "Missclassified" label can never be truncated, whatever the key lengths are.
max_indices = np.argmax(scores, axis=1)
celltypes = np.array(list(me_markers.keys()), dtype=object)[max_indices]

# Cells whose best score is still below `threshold` are flagged for later inspection.
max_scores = scores[np.arange(scores.shape[0]), max_indices]
celltypes[max_scores < threshold] = "Missclassified"

ME.obs['celltype'] = celltypes


In [None]:
# Distinct labels assigned within the malignant epithelial subset.
ME.obs['celltype'].unique().tolist()

In [None]:
# Map a truncated 'Missclass' label (if present) back to the full label. Assign the
# result back: an in-place replace on a column selected from the DataFrame is
# chained assignment and is a no-op under pandas Copy-on-Write.
ME.obs['celltype'] = ME.obs['celltype'].replace({'Missclass': 'Missclassified'})
# First plot: signature scores (obs columns) per assigned program; second: the marker genes.
sc.pl.dotplot(ME,  groupby = 'celltype', var_names = list(me_markers.keys()), vmax=1, vmin=0)
sc.pl.dotplot(ME,  groupby = 'celltype', var_names = me_markers, vmax=1, vmin=0)

In [None]:
# obs name -> malignant program lookup, queued for the final merge into adata.obs.
me_cells_map = dict(zip(ME.obs_names, ME.obs['celltype']))
dictionary_maps.append(me_cells_map)

## add to anndata

In [None]:
# Flatten the per-lineage lookups into a single obs name -> label dict.
# On a repeated key the later dict wins.
dictionary_maps_ = {}
for lineage_map in dictionary_maps:
    dictionary_maps_.update(lineage_map)


In [None]:
# Series of subtype labels indexed by obs name, so it aligns with adata.obs on assignment.
anno_map = pd.Series(dictionary_maps_)

In [None]:
# Harmonise an alternative spelling of the REG+ acinar label, if it occurs.
# anno_map is a standalone Series, so the in-place replace applies to it directly.
anno_map.replace({'Acinar (Reg +) Cell': 'Acinar (REG+) Cell'}, inplace=True)


In [None]:
# Attach the subtype labels (aligned on obs names); cells that were not part of any
# subtyped subset get NaN here ...
adata.obs['Level_3'] = anno_map
# ... and fall back to their Level_1_refined label.
adata.obs['Level_3'] = adata.obs['Level_3'].fillna(adata.obs['Level_1_refined'])

## Inspect misclassified cells (label `Missclassified`)

In [None]:
# Rename the plural CD8 label to the singular form used by the other T-cell labels.
# The label produced by the T-cell scoring is 'CD8+ T Cells'; the original key
# 'Cd8+ T Cells' never matched it, so both spellings are mapped. The result is
# assigned back because an in-place replace on a selected column is chained
# assignment (no-op under pandas Copy-on-Write).
adata.obs['Level_3'] = adata.obs['Level_3'].replace(
    {'Cd8+ T Cells': 'CD8+ T Cell', 'CD8+ T Cells': 'CD8+ T Cell'}
)

In [None]:
# One UMAP per Level_3 label, highlighting only that label's cells.
for cell in adata.obs['Level_3'].unique():
    sc.pl.umap(adata, color = 'Level_3', groups = [cell],  )

In [None]:
# Coarse annotation on the same embedding, for comparison with the per-label plots.
sc.pl.umap(adata, color = 'Level_1_refined',)

In [None]:
# Independent copy of all cells flagged 'Missclassified' across the lineages.
missclassified = adata[adata.obs['Level_3'] == 'Missclassified'].copy()
# Recompute the neighbor graph and UMAP on this subset only, using the same
# representation and transformer settings as for the full object.
sc.pp.neighbors(missclassified, transformer=AnnoyTransformer(15), use_rep='scANVI_emb')
sc.tl.umap(missclassified)

In [None]:
# Show which coarse lineages the flagged cells come from.
sc.pl.umap(missclassified, color = 'Level_1_refined',)

In [None]:
# Leiden clustering of the flagged cells (igraph flavor, resolution 0.25).
# The resulting cluster ids are the keys of the manual `anno` mapping used later.
sc.tl.leiden(missclassified, flavor='igraph', resolution=0.25)

In [None]:
# Clusters and coarse labels stacked in one column for visual comparison.
sc.pl.umap(missclassified, color = ['leiden','Level_1_refined'], ncols=1)

In [None]:
# Dendrogram over the Leiden clusters, computed on the 'scANVI_emb' representation.
sc.tl.dendrogram(missclassified, groupby='leiden', use_rep='scANVI_emb')

In [None]:
# All fine-grained immune signatures merged into one panel (keys are unique across lineages).
immune_markers = {**t_cell_markers, **neutrophil_markers, **dendritic_markers, **macrophage_markers}
# One canonical gene per coarse immune lineage.
immune_markers_coarse = {
    "T Cell": 'Cd3e',
    "Neutrophil": 'S100a9',
    'Macrophage': 'Cd68',
}

# Take the cluster order from the clustering result instead of assuming exactly
# twelve clusters ("0".."11"); scanpy stores the leiden categories in natural order,
# so the order is unchanged whenever there are twelve clusters.
leiden_order = list(missclassified.obs['leiden'].cat.categories)

print('='*50)
# Same plot for each marker panel: immune (fine), immune (coarse), malignant, CAF.
for marker_panel in (immune_markers, immune_markers_coarse, me_markers, caf_markers):
    sc.pl.dotplot(missclassified, groupby = 'leiden', var_names = marker_panel, dendrogram=False, categories_order=leiden_order)

print('='*50)

In [None]:
# Manual Leiden cluster id -> label assignment for the flagged cells.
# NOTE(review): keys assume clusters "0".."11" from the clustering above; the
# mapping must be re-checked whenever the clustering result changes.
anno = {
    "0":'myCAF',
    "1":'Macrophage - CD3+ TAM',
    "2":'Macrophage - M1 TAM',
    "3":'Macrophage - M1 TAM',
    "4":'Macrophage - M1 TAM',
    "5":'Macrophage - M1 TAM',
    "6":'CD4+ T Cell',
    "7":'Neutrophil - N0',
    "8":'CD4+ T Cell',
    "9":'Macrophage - M1 TAM',
    "10":'CD4+ T Cell',
    "11":'CD4+ T Cell',
}

In [None]:
# Translate Leiden cluster ids into labels. Cast to plain strings first: a
# many-to-one replace on a categorical column relies on deprecated pandas behaviour.
# Clusters that are missing from `anno` keep their cluster id as label.
# Bracket assignment is used because attribute assignment only works while the
# column already exists.
missclassified.obs['Level_3'] = missclassified.obs['leiden'].astype(str).replace(anno)
anno_map = missclassified.obs['Level_3']


In [None]:
# Work on plain strings so new labels can be written without category constraints.
adata.obs['Level_3'] = adata.obs['Level_3'].astype(str)
# Overwrite the flagged cells with their rescued labels. A single .loc on the
# DataFrame is used: `adata.obs['Level_3'].loc[...] = ...` is chained assignment and
# does not modify adata.obs under pandas Copy-on-Write.
adata.obs.loc[missclassified.obs.index, 'Level_3'] = anno_map.astype(str)
adata.obs['Level_3'] = adata.obs['Level_3'].astype('category')


In [None]:
# Final number of cells per Level_3 label.
adata.obs['Level_3'].value_counts()

In [None]:
# Level_3 is the final annotation and must stay in adata.obs: the object is written
# to disk next, and later cells reload that file and read adata.obs['Level_3'].
# Deleting the column here would drop the annotation from the saved file.
adata.obs['Level_3'] = adata.obs['Level_3'].copy()

In [None]:
# Persist the annotated object. NOTE(review): absolute, machine-specific path.
adata.write_h5ad('/mnt/storage/Daniele/atlases/mouse/13_mouse_all_annotated.h5ad')

In [None]:
# Reload the annotated object from disk.
# NOTE(review): the plotting cells further down still use `me_markers`, which is
# defined earlier in the notebook, so this is not a standalone restart point.
import scanpy as sc
adata = sc.read_h5ad('/mnt/storage/Daniele/atlases/mouse/13_mouse_all_annotated.h5ad')

In [None]:
# Distinct Level_3 labels present in the reloaded file.
adata.obs['Level_3'].unique().tolist()


In [None]:
# Rename map from the Classical/Basal malignant labels to the Epithelial/Mesenchymal
# labels used as keys of `me_markers`.
repl = {
    "Malignant Cell - Classical": "Malignant Cell - Epithelial",
    "Malignant Cell - Basal": "Malignant Cell - Mesenchymal",
}

In [None]:
# Apply the rename map; labels that are not keys of `repl` are left unchanged.
# NOTE(review): Level_3 is presumably categorical after the reload, and replace on
# categoricals that changes the categories is deprecated in recent pandas — confirm.
adata.obs['Level_3'] = adata.obs['Level_3'].replace(repl)

In [None]:
# Distinct Level_3 labels after the rename.
adata.obs['Level_3'].unique().tolist()


In [None]:
# Marker dotplot restricted to cells whose Level_3 label contains 'Malignant Cell'.
sc.pl.dotplot(adata[adata.obs['Level_3'].str.contains('Malignant Cell')], groupby = 'Level_3', var_names = me_markers, )

In [None]:
# Same malignant subset and marker panel, shown as a per-cell heatmap.
sc.pl.heatmap(adata[adata.obs['Level_3'].str.contains('Malignant Cell')], groupby = 'Level_3', var_names = me_markers, )

In [None]:
# Overwrite the annotated file (same path as the earlier write) with the renamed labels.
adata.write_h5ad('/mnt/storage/Daniele/atlases/mouse/13_mouse_all_annotated.h5ad')