#### Importing all the required **Python** libraries

In [None]:
import os
import warnings

warnings.filterwarnings("ignore")
import pathlib
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
from scipy.stats import median_abs_deviation


import scranPY
%load_ext autoreload
%autoreload 2
#%load_ext lab_black

#### Scanpy settings

In [None]:
# Print the scanpy header so the run's environment is recorded in the output.
sc.logging.print_header()
# Figure defaults: white background, 8x8 figures, 600 dpi for saved figures.
sc.set_figure_params(facecolor="white", figsize=(8, 8), dpi_save=600)
# Set scanpy's logging verbosity to level 1.
sc.settings.verbosity = 1

## Reading data

In [None]:
# Load the input AnnData object from an .h5ad file (absolute, machine-specific path).
adata = sc.read_h5ad('/mnt/storage/Daniele/atlases/mouse/01_mouse_no_larry_raw.h5ad')

## QC

In [None]:
# Flag gene families in adata.var by gene-name pattern; the three flags are
# passed as `qc_vars` to the QC-metric computation.
# Names starting with "mt-".
adata.var["mt"] = adata.var_names.str.startswith("mt-")
# Names starting with "Rps" or "Rpl".
adata.var["ribo"] = adata.var_names.str.startswith(("Rps", "Rpl"))
# Names starting with "Hb" but not continuing with "p" (e.g. Hbp1 is excluded).
# The pattern is anchored with "^": an unanchored "Hb" would flag every gene
# whose name merely contains that substring.
# NOTE(review): a few non-globin names (e.g. Hbegf) may still match — tighten
# the pattern if pct_counts_hb is ever used for filtering.
adata.var["hb"] = adata.var_names.str.contains("^Hb[^(p)]")

In [None]:
# Compute QC metrics in place (inplace=True) for the "mt", "ribo" and "hb"
# gene flags, with the share of counts in the top 20 genes (percent_top=[20])
# and log1p-transformed variants (log1p=True).
# Later cells read the resulting adata.obs columns, e.g. "log1p_total_counts",
# "pct_counts_mt" and "pct_counts_in_top_20_genes".
sc.pp.calculate_qc_metrics(
    adata, qc_vars=["mt", "ribo", "hb"], inplace=True, percent_top=[20], log1p=True
)

In [None]:
def is_outlier(adata, metric: str, nmads: int):
    """Flag observations whose ``metric`` value lies far from the median.

    A value is an outlier when it is more than ``nmads`` median absolute
    deviations (MADs) below or above the median of ``adata.obs[metric]``.

    Parameters
    ----------
    adata
        Object exposing an ``obs`` table that contains the column ``metric``.
    metric
        Name of the ``adata.obs`` column to screen.
    nmads
        Number of MADs tolerated on either side of the median.

    Returns
    -------
    Boolean mask with one entry per value of ``adata.obs[metric]``; ``True``
    marks an outlier.
    """
    values = adata.obs[metric]
    center = np.median(values)
    # Half-width of the accepted band around the median.
    spread = nmads * median_abs_deviation(values)
    too_low = values < center - spread
    too_high = values > center + spread
    return too_low | too_high

In [None]:
# A cell is flagged as an outlier when any of the MAD-screened metrics is more
# than 5 MADs from its median, or when its total counts exceed 100000.
adata.obs["outlier"] = (
    is_outlier(adata, metric="log1p_total_counts", nmads=5)
    | is_outlier(adata, metric="log1p_n_genes_by_counts", nmads=5)
    | is_outlier(adata, metric="pct_counts_mt", nmads=5)
    | is_outlier(adata, metric="pct_counts_ribo", nmads=5)
    | is_outlier(adata, metric="pct_counts_in_top_20_genes", nmads=5)
    | (adata.obs["total_counts"] > 100_000)
    # Disabled filter, kept for reference:
    # | (adata.obs['n_genes_by_counts'] < 500)
)
# Show how many cells are flagged vs. kept.
adata.obs["outlier"].value_counts()

In [None]:
# Scatter total_counts against n_genes_by_counts, colored by the outlier flag.
sc.pl.scatter(adata, x="total_counts", y="n_genes_by_counts", color="outlier")

In [None]:
# Color the UMAP by the outlier flag and the main QC metrics, one panel per
# row (ncols=1).
# NOTE(review): no UMAP is computed in this notebook; this presumably relies
# on an embedding stored in the loaded file — confirm.
sc.pl.umap(
    adata,
    color=[
        "outlier",
        "n_genes_by_counts",
        "total_counts",
        "pct_counts_mt",
        "pct_counts_ribo",
        "pct_counts_hb",
    ],
    ncols=1,
)

In [None]:
# Keep only the cells whose "outlier" flag is False and materialize the subset as a copy.
adata = adata[~adata.obs["outlier"]].copy()

## Normalization

In [None]:
# Save the current contents of X in the "counts" layer before normalizing.
# NOTE(review): presumably X holds raw counts at this point (input file is
# named "..._raw") — confirm.
adata.layers["counts"] = adata.X.copy()
# Normalize per-cell totals, then log1p-transform; both calls update adata
# (their return values are not used).
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
# Keep the normalized, log-transformed matrix as its own layer ...
adata.layers["log_norm"] = adata.X.copy()
# ... and put the saved counts back into X, so X ends with the values it
# started with and the normalized values live only in "log_norm".
adata.X = adata.layers["counts"].copy()

## Annotation

#### Level 0

In [None]:
# Open the human atlas in backed mode ('r'); only its obs annotations are read below.
ad_human = sc.read_h5ad('/mnt/storage/Daniele/atlases/human/adata_all_genes_scpoli_final.h5ad', backed='r')

In [None]:
# Collect the distinct Level_0 labels of the human atlas as a plain list.
level_0_cats_human = ad_human.obs['Level_0'].unique().tolist()

In [None]:
# Drop the reference to the human atlas; it is not used again in this notebook.
del ad_human

In [None]:
# Leiden clustering at resolution 0.1; labels are written to adata.obs["leiden_0.1"].
# NOTE(review): no neighbors graph is computed in this notebook; this
# presumably relies on one stored in the loaded file — confirm.
sc.tl.leiden(adata, resolution=0.1, key_added="leiden_0.1")

In [None]:
# Keep only cells in clusters "0"–"10" of the resolution-0.1 Leiden clustering
# and materialize the subset as a copy. The filter uses the "leiden_0.1"
# column (the key written by the preceding sc.tl.leiden call), which is also
# the column the Level_0 annotation maps over with keys "0"–"10".
# NOTE(review): this previously read `adata.obs.leiden`, a column this
# notebook never creates — confirm no pre-existing "leiden" column was intended.
adata = adata[adata.obs["leiden_0.1"].isin([str(i) for i in range(11)])].copy()

In [None]:
# Marker genes per Level 0 compartment, used as `var_names` for the marker
# dot plot. A value is either a single gene name or a list of gene names.
markers_level_0 = {
    'Immune': 'Ptprc',
    'Stromal': 'Dcn',
    'Epithelial': 'Krt18',
    'Neuronal': ['Tubb3', 'Chat'],
    'Endocrine': 'Chga',
}

In [None]:
# Rank genes per leiden_0.1 cluster on the log-normalized layer, then show the
# top 10 genes per cluster as a dot plot colored by log fold change
# (color scale limited to [-5, 5], genes below a log fold change of 2 are
# left out, no dendrogram).
sc.tl.rank_genes_groups(adata, groupby="leiden_0.1", layer="log_norm")
sc.pl.rank_genes_groups_dotplot(
    adata,
    n_genes=10,
    values_to_plot="logfoldchanges",
    vmin=-5,
    vmax=5,
    min_logfoldchange=2,
    cmap="coolwarm",
    dendrogram=False,
)

In [None]:
# Dot plot of the Level 0 marker genes per leiden_0.1 cluster, using the "log_norm" layer.
sc.pl.dotplot(adata, groupby='leiden_0.1', var_names=markers_level_0, layer='log_norm')

In [None]:
# Display the human Level_0 labels (presumably as a reference for naming the clusters below).
level_0_cats_human

In [None]:
# Level 0 label for each leiden_0.1 cluster. The position in the list is the
# cluster id; keys of `anno` are the ids as strings ("0" ... "10").
anno = {
    str(cluster_id): label
    for cluster_id, label in enumerate(
        [
            "Immune Cell",  # 0
            "Epithelial/Malignant Cell",  # 1
            "Immune Cell",  # 2
            "Epithelial/Malignant Cell",  # 3
            "Immune Cell",  # 4
            "Stromal Cell",  # 5
            "Immune Cell",  # 6
            "Immune Cell",  # 7
            "Immune Cell",  # 8
            "Stromal Cell",  # 9
            "Immune Cell",  # 10
        ]
    )
}

In [None]:
# Translate each cell's leiden_0.1 cluster id into its Level_0 label via `anno`.
adata.obs['Level_0'] = adata.obs['leiden_0.1'].map(anno)

In [None]:
# UMAP colored by the Level_0 annotation.
sc.pl.umap(adata, color = 'Level_0')

#### Level 1

In [None]:
# Leiden clustering at resolution 0.5; labels are written to adata.obs["leiden_0.5"].
sc.tl.leiden(adata, resolution=0.5, key_added="leiden_0.5")

In [None]:
# UMAP colored by Level_0 and by the leiden_0.5 clusters, with labels drawn on the plot.
sc.pl.umap(adata, color = ['Level_0','leiden_0.5'], legend_loc='on data')

In [None]:
# Display the AnnData object as the cell output.
adata

In [None]:
# Copy the leiden_0.5 cluster and larry_positive columns into a standalone frame.
# NOTE(review): "larry_positive" is not created in this notebook; presumably
# it comes with the loaded file — confirm.
barcodes_leiden = adata.obs[['leiden_0.5', 'larry_positive']].copy()

In [None]:
# Within each leiden_0.5 cluster, the percentage of cells carrying each
# larry_positive value (count per value / count per cluster * 100), as a bar chart.
(
    barcodes_leiden.groupby('leiden_0.5')['larry_positive']
    .pipe(lambda per_cluster: per_cluster.value_counts() / per_cluster.count() * 100)
    .plot(kind='bar', figsize=(20, 5))
)

In [None]:
# Rank genes per leiden_0.5 cluster on the "log_norm" layer.
sc.tl.rank_genes_groups(adata, groupby='leiden_0.5', layer='log_norm')
# Compute the cluster dendrogram from the "X_scVI" representation; it is
# computed before the dot plot below, which is drawn with dendrogram=True.
# NOTE(review): "X_scVI" is not created in this notebook; presumably it comes
# with the loaded file — confirm.
sc.tl.dendrogram(adata, groupby='leiden_0.5', use_rep='X_scVI')
# Dot plot of the top 5 genes per cluster, colored by log fold change
# (color scale limited to [-5, 5], min_logfoldchange=2).
sc.pl.rank_genes_groups_dotplot(adata, n_genes=5, values_to_plot='logfoldchanges', vmin=-5, vmax=5, min_logfoldchange=2, cmap='coolwarm', dendrogram=True)

In [None]:
# Level 1 label for each leiden_0.5 cluster id ("0" ... "23"); mapped onto
# adata.obs["leiden_0.5"] to build the Level_1 annotation.
# Labels must be spelled identically across clusters: a misspelled label
# becomes its own category in the annotation.
anno = {
    "0": "Myeloid Cell",
    "1": "Malignant",
    "2": "T Cell",
    "3": "Malignant",
    "4": "Myeloid Cell",
    "5": "T Cell",
    "6": "Myeloid Cell",
    "7": "B Cell",
    "8": "Fibroblasts",
    "9": "Myeloid Cell",
    "10": "Myeloid Cell",
    "11": "Malignant",
    "12": "Myeloid Cell",
    "13": "Malignant",
    "14": "Ductal/Malignant",
    "15": "NK Cell",
    "16": "T Cell",
    "17": "Myeloid Cell",
    "18": "Endothelial Cell",
    "19": "Malignant",
    "20": "Endothelial Cell",
    "21": "Malignant",
    "22": "Myeloid Cell",
    "23": "Malignant",
}

In [None]:
# Translate each cell's leiden_0.5 cluster id into its Level_1 label via `anno`.
adata.obs['Level_1'] = adata.obs['leiden_0.5'].map(anno)

In [None]:
# UMAP colored by the Level_1 annotation.
sc.pl.umap(adata, color = 'Level_1')

In [None]:
# Write the filtered, normalized and annotated object to an .h5ad file (absolute, machine-specific path).
adata.write_h5ad('/mnt/storage/Daniele/atlases/mouse/02_mouse_no_larry_qced.h5ad')