#### Importing all the required **Python** libraries

In [None]:
import os
import warnings

warnings.filterwarnings("ignore")
import pathlib
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
from scipy.stats import median_abs_deviation


%load_ext autoreload
%autoreload 2
#%load_ext lab_black

#### Scanpy settings

In [None]:
# Emit scanpy's header so the run's environment is recorded with the output.
sc.logging.print_header()

# Figure defaults used by every plot in this notebook.
figure_options = dict(facecolor="white", figsize=(8, 8), dpi_save=600)
sc.set_figure_params(**figure_options)

# Logging verbosity level for scanpy.
sc.settings.verbosity = 1

## Reading data

In [None]:
# Input AnnData file for this QC step (absolute path on the analysis machine).
raw_h5ad_path = '/mnt/storage/Daniele/atlases/mouse/08_mouse_public_raw.h5ad'
adata = sc.read_h5ad(raw_h5ad_path)

## QC

In [None]:
# Boolean per-gene flags; these column names are passed as `qc_vars` to
# sc.pp.calculate_qc_metrics further down.
adata.var["mt"] = adata.var_names.str.startswith("mt-")
adata.var["ribo"] = adata.var_names.str.startswith(("Rps", "Rpl"))
# Anchor the hemoglobin pattern at the start of the symbol: a bare "Hb"
# substring match also tags unrelated genes such as Hbegf, Hbp1 or Hbs1l.
# NOTE(review): assumes mouse hemoglobin symbols begin with Hba, Hbb or Hbq
# (e.g. Hba-a1, Hbb-bs, Hbq1a) — confirm against the gene annotation in use.
adata.var["hb"] = adata.var_names.str.contains(r"^Hb[abq]", regex=True)
adata.var['malat'] = adata.var_names.str.contains('Malat')

In [None]:
# Compute QC statistics in place on `adata`, using the boolean gene flags
# defined in adata.var as the gene sets of interest.
qc_gene_sets = ["mt", "ribo", "hb", "malat"]
sc.pp.calculate_qc_metrics(
    adata,
    qc_vars=qc_gene_sets,
    percent_top=[20],
    log1p=True,
    inplace=True,
)

In [None]:
def is_outlier(adata, metric: str, nmads: float, batch_key: str = "Dataset"):
    """Flag cells whose ``metric`` is a per-batch MAD outlier.

    Within each batch (distinct value of ``adata.obs[batch_key]``) a cell is
    flagged when its value lies more than ``nmads`` median absolute
    deviations below or above that batch's median.

    Parameters
    ----------
    adata
        Object exposing ``obs`` (a DataFrame holding ``metric`` and
        ``batch_key`` columns) and ``obs_names``.
    metric
        Name of the numeric column in ``adata.obs`` to test.
    nmads
        Number of MADs from the batch median beyond which a cell is flagged.
    batch_key
        Column in ``adata.obs`` defining the batches. Defaults to
        ``"Dataset"``.

    Returns
    -------
    pandas.Series
        Boolean flags indexed by ``adata.obs_names``.

    Notes
    -----
    NaN metric values are ignored when computing the median and MAD, so a
    single NaN no longer disables outlier detection for its whole batch.
    Cells whose metric is NaN, or whose batch label is missing, are never
    flagged by this function.
    """
    values = adata.obs[metric].to_numpy(dtype=float)
    batches = adata.obs[batch_key]
    # Positional boolean array: avoids label alignment when writing results.
    flags = np.zeros(values.shape[0], dtype=bool)

    for batch in batches.dropna().unique():
        in_batch = (batches == batch).to_numpy()
        batch_values = values[in_batch]
        if np.isnan(batch_values).all():
            # No usable values: nothing can be called an outlier here.
            continue

        med = np.nanmedian(batch_values)
        mad = median_abs_deviation(batch_values, nan_policy="omit")

        # Comparisons against NaN are False, so NaN cells stay unflagged.
        flags[in_batch] = (batch_values < med - nmads * mad) | (
            batch_values > med + nmads * mad
        )

    return pd.Series(flags, index=adata.obs_names)

In [None]:
# Hard cut-offs first: very high total counts or too few detected genes.
flagged = (adata.obs['total_counts'] > 100000) | (
    adata.obs['n_genes_by_counts'] < 500
)

# Then add the per-batch MAD-based flags (5 MADs) for each QC metric.
for qc_metric in (
    "log1p_total_counts",
    "log1p_n_genes_by_counts",
    "pct_counts_mt",
    "pct_counts_ribo",
    "pct_counts_malat",
    "pct_counts_in_top_20_genes",
):
    flagged = flagged | is_outlier(adata, qc_metric, 5)

adata.obs["outlier"] = flagged
# Show how many cells are flagged versus kept.
adata.obs.outlier.value_counts()

In [None]:
# Total counts against detected genes, coloured by the outlier flag.
sc.pl.scatter(
    adata,
    x="total_counts",
    y="n_genes_by_counts",
    color="outlier",
)

In [None]:
# Keep only the cells that were not flagged, as an independent copy.
passing_qc = ~adata.obs["outlier"]
adata = adata[passing_qc].copy()

## Normalization

In [None]:
# Build a "counts" and a "log_norm" layer while leaving the counts in adata.X.
# Statement order matters: the scanpy calls below modify adata.X in place.

# Snapshot the current X before it is modified.
adata.layers["counts"] = adata.X.copy()
# Normalize per-cell totals (default arguments), then log1p-transform X.
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
# Store the normalized, log-transformed matrix as its own layer.
adata.layers["log_norm"] = adata.X.copy()
# Restore X from the snapshot so X again holds the pre-normalization values.
# NOTE(review): sc.pp.log1p may leave log-transform metadata in adata.uns even
# though X is reset here — confirm downstream steps do not rely on it.
adata.X = adata.layers["counts"].copy()

In [None]:
# Persist the filtered object, including the "counts" and "log_norm" layers.
qced_h5ad_path = '/mnt/storage/Daniele/atlases/mouse/09_mouse_public_qced.h5ad'
adata.write_h5ad(qced_h5ad_path)