#### Importing all the required **Python** and **R** libraries 

In [None]:
import pandas as pd
import scanpy as sc
import warnings
import scarches as sca
warnings.filterwarnings("ignore")

import decoupler as dc

import sys
sys.path.append('../scripts')
%load_ext autoreload
%autoreload 2
#%load_ext 
from sklearn_ann.kneighbors.annoy import AnnoyTransformer

In [None]:
# Global scanpy plotting configuration: draw every figure without an axis frame.
sc.set_figure_params(frameon=False)
# Directory scanpy uses when a plotting call saves a figure.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
sc.settings.figdir = '/home/daniele/Code/scmouse_atlas/reports/figures/'

#### Read

In [None]:
# Load the integrated atlas (file name suggests the scANVI output of step 10).
adata = sc.read_h5ad('/mnt/storage/Daniele/atlases/mouse/10_mouse_all_integrated_scanvi.h5ad')

In [None]:
# Make cell names unique; later cells use obs_names as an index to write
# sub-cluster labels back into `adata.obs`.
adata.obs_names_make_unique()

In [None]:
# Neighbour graph computed on the 'X_scANVI' representation using the Annoy
# transformer (presumably 15 = number of neighbours — confirm against
# sklearn_ann's AnnoyTransformer signature).
sc.pp.neighbors(adata, use_rep = 'X_scANVI', transformer=AnnoyTransformer(15))

In [None]:
# Leiden clustering (igraph backend); writes string cluster ids to
# adata.obs['leiden']. The hard-coded cluster ids used further down refer to
# the result of this call.
sc.tl.leiden(adata, flavor = 'igraph', resolution = .8)

In [None]:
# Display the AnnData summary.
adata

In [None]:
# UMAP embedding from the neighbour graph computed above.
sc.tl.umap(adata, min_dist = .1)

In [None]:
# UMAP coloured by the transferred Level-1 labels.
# NOTE(review): `layer` presumably only matters when colouring by gene
# expression; it is passed here although the colour key is an obs column.
sc.pl.umap(adata, color = 'Level_1_label_transfer', layer = 'log_norm')

In [None]:
# Preview of the embedding without leiden clusters 27, 29, 30 and 31
# (the same set that is removed in the next cell).
sc.pl.umap(adata[~adata.obs.leiden.isin(['27','29', '30', '31'])], color = 'leiden', layer = 'log_norm', legend_loc = 'on data')

In [None]:
# Drop leiden clusters 27, 29, 30 and 31 for good. These ids are only
# meaningful for the clustering computed above; re-check them if the
# clustering is re-run with different inputs or parameters.
adata = adata[~adata.obs.leiden.isin(['27','29', '30', '31'])].copy()

In [None]:
# UMAP of the filtered object, coloured by data provenance.
sc.pl.umap(adata, color = 'provenance')

In [None]:
# UMAP coloured by the 'larry_positive' obs column.
sc.pl.umap(adata, color = 'larry_positive', legend_loc = 'on data')

In [None]:
# UMAP coloured by the remaining leiden clusters, labels drawn on the plot.
sc.pl.umap(adata, color = 'leiden', legend_loc = 'on data')

In [None]:
# Marker genes per Level-1 cell type. The mapping is handed to the dot plots
# below as `var_names`, so insertion order is kept exactly as is.
level_1_markers = {
    'Adipocyte': ['Plin1', 'Lpl'],
    'Pancreatic stellate cells': ['Des'],
    'Cancer associated fibroblasts': [
        'Col6a3', 'Col1a1', 'Thbs2', 'Fn1', 'Pdpn', 'Dcn', 'Vim', 'Fap',
    ],
    'Endothelial': ['Pecam1', 'Vwf'],
    'Intra-pancreatic neurons': ['Eno2', 'Chat', 'Th'],
    'Pericyte/Vascular Smooth Muscle': [
        'Pdgfrb', 'Dlk1', 'Rgs5', 'Cspg4', 'Mcam',
    ],
    'Schwann': ['Sox10', 'S100b'],
    'Endocrine': ['Chga', 'Syp'],
    'Malignant epithelial cells': ['Krt7', 'Krt17', 'Krt19', 'Epcam'],
    'CD4+ T cells': ['Cd4', 'Cd3d', 'Themis'],
    'CD8+ T cells': ['Cd8a'],
    'Treg cells': ['Foxp3', 'Il2ra'],
    'NK cells': ['Klrd1', 'Il18r1'],
    'Plasma cells': ['Sdc1', 'Iglc2'],
    'B cells': ['Bank1', 'Cd19', 'Cd74', 'Ms4a1', 'Cd79a'],
    'Neutrophil': ['Csf3r', 'S100a8'],
    'Macrophage': [
        'Cd68', 'Cd163', 'Mrc1', 'Cd80', 'Cd86', 'Tgfb1', 'Csf1',
    ],
    'Monocyte': [
        'Tlr2', 'Itgb2', 'Itgam', 'Ctsd', 'Ctsa',
        'Nlrp3', 'Bst1', 'Stab1', 'Irak3',
    ],
    'Mast': ['Cpa3', 'Kit'],
    'Dendritic (activated)': ['Fscn1', 'Lamp3', 'Ccl19', 'Ccr7'],
    'Dendritic (conventional type 1)': ['Cst3', 'Clec9a', 'Lgals2', 'Xcr1'],
    'Dendritic (conventional type 2)': ['Cd207', 'Ndrg2', 'Fcer1a'],
    'Dendritic (plasmacytoid)': ['Irf7', 'Tcf4', 'Irf4', 'Gzmb', 'Cxcr3'],
    'Non-malignant ductal epithelial cells': [
        'Krt7', 'Krt8', 'Cftr', 'Muc1', 'Muc6',
        'Prom1', 'Prss1', 'Prss2', 'Cpa1', 'Cel',
    ],
}


## clean up label transfer

In [None]:
# Hierarchical ordering of the leiden clusters, computed on the 'X_scANVI'
# representation; used by the dendrogram-enabled plots below.
sc.tl.dendrogram(adata, groupby = 'leiden', use_rep='X_scANVI')

In [None]:
# Curated Level-1 markers per leiden cluster, expression read from the
# 'log_norm' layer.
sc.pl.dotplot(adata, groupby = 'leiden', var_names = level_1_markers, layer = 'log_norm', dendrogram = True)

In [None]:
# Data-driven markers: rank genes per leiden cluster on the 'log_norm' layer.
sc.tl.rank_genes_groups(adata, groupby = 'leiden', layer = 'log_norm')

In [None]:
# Dot plot of the ranked genes from the previous cell.
sc.pl.rank_genes_groups_dotplot(adata, dendrogram = True)

In [None]:
# Reference UMAPs used while assigning the cluster annotations below.
sc.pl.umap(adata, color = 'leiden', legend_loc = 'on data')

In [None]:
sc.pl.umap(adata, color = 'larry_positive', legend_loc = 'on data')

In [None]:
# Manual Level-1 annotation: leiden cluster id (as string) -> cell-type label.
# Clusters '27' and '29'-'31' have no entry because they were removed from
# `adata` earlier in the notebook.
anno = {
    '0':'Malignant Epithelial Cell',
    '1':'Malignant Epithelial Cell',
    '2':'Non Malignant Epithelial Cell',
    '3':'Malignant Epithelial Cell',
    '4':'Non Malignant Epithelial Cell',
    '5':'Malignant Epithelial Cell',
    '6':'Non Malignant Epithelial Cell',
    '7':'Malignant Epithelial Cell',
    '8':'Malignant Epithelial Cell',
    '9':'Myeloid',
    '10':'Cancer Associated Fibroblast',
    '11':'Cancer Associated Fibroblast',
    '12':'Cancer Associated Fibroblast',
    '13':'T Cell',
    '14':'T Cell',
    '15':'NK Cell',
    '16':'T Cell',
    '17':'T Cell',
    '18':'Myeloid',
    '19':'Plasma Cell',
    '20':'B Cell',
    '21':'Myeloid',
    '22':'B Cell',
    '23':'Myeloid',
    '24':'Endothelial Cell',
    '25':'Neutrophil',
    '26':'Myeloid',
    # Spelling fixed (was 'Adypocyte'); update any downstream code that
    # matches on the old label.
    '28':'Adipocyte',
}

In [None]:
# Translate each cell's leiden cluster id into its manual Level-1 label.
adata.obs['Level_1_refined'] = adata.obs['leiden'].map(anno)

## T cells

In [None]:
t = adata[adata.obs.Level_1_refined == 'T Cell'].copy()

In [None]:
sc.tl.leiden(t, flavor = 'igraph', resolution = .3)

In [None]:
sc.tl.rank_genes_groups(t, groupby = 'leiden', layer = 'log_norm')

In [None]:
sc.pl.rank_genes_groups_dotplot(t, dendrogram = False)

In [None]:
sc.pl.dotplot(t, groupby = 'leiden', var_names = level_1_markers, dendrogram = False)

In [None]:
sc.pl.umap(t, color = 'leiden')

In [None]:
# T-cell sub-cluster id (as string) -> refined label. Reuses the name `anno`,
# replacing the Level-1 mapping defined earlier.
anno = {
    '0': 'CD8 T Cell',
    '1': 'Treg Cell',
    '2': 'Ambiguous T Cell',
    '3': 'NK Cell',
    '4': 'Ambiguous T Cell',
    '5': 'Ambiguous T Cell',
    '6': 'B Cell',
}

In [None]:
t.obs['Level_1_refined'] = t.obs.leiden.replace(anno)
adata.obs['Level_1_refined'] = adata.obs['Level_1_refined'].astype(str)
adata.obs.loc[t.obs_names, 'Level_1_refined'] = t.obs['Level_1_refined']
adata.obs['Level_1_refined'] = adata.obs['Level_1_refined'].astype('category')


In [None]:
sc.pl.umap(adata, color = 'Level_1_refined')

## Myeloid

In [None]:
# Work on an independent copy of the cells annotated as 'Myeloid'.
myelo = adata[adata.obs.Level_1_refined == 'Myeloid'].copy()

In [None]:
# Re-cluster within the myeloid subset; overwrites myelo.obs['leiden']
# (the copy only, `adata` is untouched).
sc.tl.leiden(myelo, flavor = 'igraph', resolution = .3)

In [None]:
# Rank genes per myeloid sub-cluster on the 'log_norm' layer.
sc.tl.rank_genes_groups(myelo, groupby = 'leiden', layer = 'log_norm')

In [None]:
sc.pl.rank_genes_groups_dotplot(myelo, dendrogram = False)

In [None]:
# Curated Level-1 markers per myeloid sub-cluster.
sc.pl.dotplot(myelo, groupby = 'leiden', var_names = level_1_markers, layer='log_norm', dendrogram = False)

In [None]:
# Focused look at Cd14 and Cd68 across the sub-clusters.
sc.pl.dotplot(myelo, groupby = 'leiden', var_names = ['Cd14', 'Cd68'], layer='log_norm',dendrogram = False)

In [None]:
# Cd68 expression (from 'log_norm') on the UMAP of the myeloid subset.
sc.pl.umap(myelo, color = 'Cd68', layer = 'log_norm')

In [None]:
# Myeloid sub-clusters on the UMAP.
sc.pl.umap(myelo, color = 'leiden', layer = 'log_norm')

In [None]:
# Myeloid sub-cluster id (as string) -> refined label. Reuses the name `anno`,
# replacing the mapping defined earlier.
anno = {
    '0': 'Macrophage',
    '1': 'Monocyte',
    '2': 'Macrophage',
    '3': 'Macrophage',
    '4': 'Macrophage',
    '5': 'Macrophage',
    '6': 'Dendritic Cell',
    '7': 'Macrophage',
    '8': 'Dendritic Cell',
    '9': 'Monocyte',
    '10': 'Monocyte',
}

In [None]:
myelo.obs['Level_1_refined'] = myelo.obs.leiden.replace(anno)
adata.obs['Level_1_refined'] = adata.obs['Level_1_refined'].astype(str)
adata.obs.loc[myelo.obs_names, 'Level_1_refined'] = myelo.obs['Level_1_refined']
adata.obs['Level_1_refined'] = adata.obs['Level_1_refined'].astype('category')


In [None]:
sc.pl.umap(adata, color = 'Level_1_refined')

In [None]:
# Sub-cluster only the myeloid compartment of the full object.
# Once the myeloid refinement earlier in the notebook has run, the coarse
# 'Myeloid' label no longer exists in 'Level_1_refined' (its cells carry the
# refined labels instead), and scanpy rejects `restrict_to` categories that
# are not present. Restrict to whichever myeloid-lineage labels actually
# exist, so the cell works both before and after the refinement.
myeloid_candidates = ('Myeloid', 'Macrophage', 'Monocyte', 'Dendritic Cell')
labels_present = set(adata.obs['Level_1_refined'].astype(str))
myeloid_labels = [label for label in myeloid_candidates if label in labels_present]
sc.tl.leiden(adata, flavor = 'igraph', restrict_to=('Level_1_refined', myeloid_labels), key_added='leiden_myeloid', resolution = .3)

In [None]:
# UMAP coloured by the restricted clustering stored in 'leiden_myeloid'.
sc.pl.umap(adata, color = 'leiden_myeloid')

In [None]:
# Curated Level-1 markers per 'leiden_myeloid' group, from the 'log_norm' layer.
sc.pl.dotplot(adata, groupby = 'leiden_myeloid', var_names = level_1_markers, layer = 'log_norm')

## prepare for integration

In [None]:
# Label cells from the 'Chen_2024' dataset as 'snRNA-seq' and every other
# cell as 'scRNA-seq'.
is_chen_2024 = adata.obs['Dataset'] == 'Chen_2024'
adata.obs['technology'] = ['snRNA-seq' if flag else 'scRNA-seq' for flag in is_chen_2024]

In [None]:
import decoupler as dc

In [None]:
# Manually curated human gene list (path is relative to the notebook's
# working directory).
manual_genes_human = pd.read_csv('../../../supplementary_data/human/human_manual_genes.csv')

In [None]:
# Rename the two columns to the names used below.
# assumes the CSV has exactly two columns (gene symbol, flag) — TODO confirm
manual_genes_human.columns = ['genesymbol','manual']
manual_genes_human['pathway'] = '_' #dummy for decoupler
# Keep only the rows whose 'manual' flag is set (used as a boolean mask).
manual_genes_human = manual_genes_human[manual_genes_human['manual']]

In [None]:
# Translate the gene symbols to mouse with decoupler.
mouse_manual_genes = dc.translate_net(manual_genes_human, target_organism='mouse')

In [None]:
# Plain Python list of the translated gene symbols.
man_genes = list(mouse_manual_genes['genesymbol'].values)

In [None]:
import numpy as np

In [None]:
# Inspect the obs table before harmonising its columns.
adata.obs

## harmonize obs

In [None]:
# Split the atlas into in-house and public cells (independent copies) so
# their obs columns can be harmonised separately.
is_in_house = adata.obs['provenance'] == 'in_house_datasets'
in_house = adata[is_in_house].copy()
public = adata[~is_in_house].copy()

In [None]:
in_house.obs['Sample'] = in_house.obs['donor_id']
in_house.obs['Dataset'] = 'in_house'
in_house.obs['Sample_unique'] = (in_house.obs.Sample.astype(str) + "_" + in_house.obs.Dataset.astype(str)).astype('category')
in_house.obs['donor_id'] = in_house.obs['Sample_unique']

In [None]:
public.obs['donor_id'] = public.obs['Sample_unique']
public.obs['sample_ID'] = public.obs['Sample']

In [None]:
import anndata as ad
# Re-assemble the two harmonised subsets; `label`/`keys` (re)write the
# 'provenance' column with 'in_house_datasets' / 'public_datasets'.
# NOTE(review): ad.concat is called with its default join/merge settings, so
# var-level columns, `uns` and pairwise graphs are presumably not carried
# over — confirm this is intended for the integration input.
_adata = ad.concat([in_house, public], label = 'provenance', keys = ['in_house_datasets', 'public_datasets'])

In [None]:
# Boolean flag per gene: True when the gene is in the translated manual list.
# Index.isin does a hashed, vectorised lookup instead of scanning the Python
# list `man_genes` once for every gene in var_names.
_adata.var['Manual_genes'] = _adata.var_names.isin(man_genes)

In [None]:
# Persist the harmonised object as the input of the integration step.
_adata.write_h5ad('/mnt/storage/Daniele/atlases/mouse/11_mouse_all_integration_input.h5ad')

In [None]:
import scanpy as sc
# Read the file back (rebinding `adata`) to check what was actually written.
adata = sc.read_h5ad('/mnt/storage/Daniele/atlases/mouse/11_mouse_all_integration_input.h5ad')

In [None]:
# Display the AnnData summary of the reloaded object.
adata