In [None]:
%load_ext autoreload
%autoreload 2
import warnings
warnings.simplefilter(action='ignore')
import scanpy as sc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import anndata as ad
sc.set_figure_params(frameon=False)

In [None]:
adata

In [None]:
sc.__version__

In [None]:
adata = ad.read_zarr('../Finalized/scpoli_final_refined.zarr')

In [None]:
# UMAP of the full object, one panel per colouring key.
sc.pl.umap(
    adata,
    color=['Level_1_refined', 'Leiden_whole_object'],
    wspace=0.75,
)

In [None]:
# Display the summary of the loaded object (last expression of the cell is rendered).
adata

# QC

In [None]:
# Peek at the first rows of the per-cell metadata table.
adata.obs.head()

In [None]:
# Snapshot of the QC-related columns currently stored in `adata.obs`.
early_qc = adata.obs[[
    'n_genes',
    'n_counts',
    'log_counts',
    'mt_frac',
    'n_genes_by_counts',
    'log1p_n_genes_by_counts',
    'total_counts',
    'log1p_total_counts',
    'total_counts_mito',
    'log1p_total_counts_mito',
    'pct_counts_mito',
]]

In [None]:
# Inspect the first rows of the QC snapshot.
early_qc.head()

In [None]:
# Same column names as selected into `early_qc`. Despite the name, the loop in
# the following cell copies these columns back into `adata.obs` rather than
# dropping them.
cols_to_drop = [
    'n_genes', 'n_counts', 'log_counts', 'mt_frac',
    'n_genes_by_counts', 'log1p_n_genes_by_counts',
    'total_counts', 'log1p_total_counts',
    'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito',
]

In [None]:
# Write every listed QC column from the `early_qc` snapshot onto `adata.obs`.
for column in cols_to_drop:
    adata.obs[column] = early_qc[column].copy()

In [None]:
# Check the metadata table after the QC columns were written back.
adata.obs.head()

In [None]:
# QC again manually

In [None]:
# Square figures for the QC scatter plots below.
plt.rcParams['figure.figsize'] = (6, 6)

# Counts vs. detected genes, once on the log-scale columns and once on the raw ones.
for x_key, y_key in (
    ("log_counts", "log1p_n_genes_by_counts"),
    ("n_counts", "n_genes_by_counts"),
):
    sc.pl.scatter(adata, x_key, y_key, color="Leiden_whole_object", size=1)

# Fix Level 0

In [None]:
# Cell counts per (Level_0, Level_1_refined) pair; after the transpose the rows
# are `Level_1_refined` values and the columns are `Level_0` values.
adata.obs.groupby(['Level_0', 'Level_1_refined']).size().unstack().T

In [None]:
# For every `Level_1_refined` label, find the `Level_0` label it most often
# co-occurs with, then list the cells whose `Level_0` disagrees with it.
# `.copy()` makes `df` an independent frame, so adding a column below does not
# go through pandas' chained-assignment path on a slice of `adata.obs`.
df = adata.obs[['Level_0', 'Level_1_refined']].copy()
dominant_mapping = (
    df.groupby(['Level_1_refined', 'Level_0'])
    .size()
    .unstack(fill_value=0)  # rows: Level_1_refined, columns: Level_0, values: cell counts
    .idxmax(axis=1)         # most frequent Level_0 per Level_1_refined
)

df['Corrected_Level_0'] = df['Level_1_refined'].map(dominant_mapping)
# Cells whose current Level_0 differs from the dominant one for their Level_1_refined.
misclassified = df[df['Level_0'] != df['Corrected_Level_0']]


In [None]:
# Overwrite `Level_0` of the mismatching cells with the dominant label.
corrected_labels = misclassified['Corrected_Level_0']
adata.obs.loc[corrected_labels.index, 'Level_0'] = corrected_labels

In [None]:
# Same cross-tabulation as before the correction, to verify the `Level_0` fix.
adata.obs.groupby(['Level_0', 'Level_1_refined']).size().unstack().T

# Detect Doublets

In [None]:
import scrublet as scr

In [None]:
# Show the working directory; the relative '../Finalized/...' paths used in
# this notebook are resolved against it.
pwd

In [None]:
# Run Scrublet separately on each dataset and collect one result frame per dataset.
doublet_results = []
for dataset in adata.obs.Dataset.unique().tolist():
    adata_temp = adata[adata.obs.Dataset == dataset]
    print(f'Running scrublet for {dataset} with shape {adata_temp.shape}')
    # Hand the 'raw' layer straight to Scrublet. Assigning it to `.X` of a view
    # (as was done before) makes AnnData materialise a copy of the whole subset
    # only to read the same matrix back.
    scrub = scr.Scrublet(adata_temp.layers['raw'], expected_doublet_rate=0.06)
    doublet_scores, predicted_doublets = scrub.scrub_doublets(verbose=False)
    doublet_results.append(
        pd.DataFrame({
            'obs_names': adata_temp.obs_names,
            'Dataset': dataset,
            'doublet_scores': doublet_scores,
            'predicted_doublets': predicted_doublets,
        })
    )

# Rows are ordered dataset by dataset, which need not match the row order of `adata`.
doublet_df = pd.concat(doublet_results, ignore_index=True)


In [None]:
# Index the Scrublet results by cell name. The guard keeps the cell re-runnable:
# the former in-place `set_index` raised KeyError on a second execution because
# the 'obs_names' column had already been consumed.
if 'obs_names' in doublet_df.columns:
    doublet_df = doublet_df.set_index('obs_names')

In [None]:
# Store the doublet calls as strings; the UMAP cell further down selects the
# group by the string 'True'.
doublet_df['predicted_doublets'] = doublet_df['predicted_doublets'].astype(str)

In [None]:
# Align the Scrublet results to the row order of `adata` before attaching them.
# The earlier `doublet_df.reindex(...)` call threw its result away, so the
# list-based assignment of `predicted_doublets` was purely positional and only
# correct when `doublet_df` happened to share `adata`'s cell order.
doublet_aligned = doublet_df.reindex(adata.obs_names)
adata.obs['doublet_scores'] = doublet_aligned['doublet_scores'].to_numpy()
adata.obs['predicted_doublets'] = doublet_aligned['predicted_doublets'].to_numpy()

In [None]:
# Confirm the two doublet columns were added to the metadata table.
adata.obs.head()

In [None]:
# Annotation panel for reference.
sc.pl.umap(
    adata,
    color=['Level_1_refined'],
    ncols=1,
)
# Doublet calls, restricted to the 'True' group.
sc.pl.umap(
    adata,
    color=['predicted_doublets'],
    groups='True',
    size=5,
    ncols=1,
)

In [None]:
# Cell counts per dataset, split by doublet call (one column per call value).
adata.obs.groupby(['Dataset', 'predicted_doublets']).size().unstack()

In [None]:
# Cell counts per `Level_1_refined` label, split by doublet call.
adata.obs.groupby(['Level_1_refined', 'predicted_doublets']).size().unstack()

In [None]:
# Persist the object, now carrying the corrected `Level_0` and the doublet columns.
# NOTE(review): the output is named 'malpoli_...' while the input read at the top
# was 'scpoli_...' — confirm the different name is intended.
adata.write_zarr('../Finalized/malpoli_final_refined.zarr')

# Leiden for T Cells

In [None]:
# Independent copy of the cells labelled 'T Cell'.
# NOTE(review): this filters on `Level_1`, whereas the other cells in this
# notebook use `Level_1_refined` — confirm which column is intended.
t_cells = adata[adata.obs.Level_1 == 'T Cell'].copy()

In [None]:
# Summary of the T-cell subset.
t_cells

In [None]:
# Neighbour graph on the `X_scpoli` representation, then Leiden clustering
# (results are stored on `t_cells`).
sc.pp.neighbors(t_cells, use_rep='X_scpoli')
sc.tl.leiden(t_cells, resolution=0.5)

In [None]:
# Keep the UMAP coordinates inherited from the full object before a subset-specific
# UMAP replaces `X_umap`. The guard makes the cell safe to re-run: without it, a
# second execution after `sc.tl.umap(t_cells)` would overwrite the saved global
# embedding with the subset one.
if 'X_umap_global' not in t_cells.obsm:
    t_cells.obsm['X_umap_global'] = t_cells.obsm['X_umap'].copy()

In [None]:
# Recompute the UMAP for the T-cell subset; this replaces `t_cells.obsm['X_umap']`.
sc.tl.umap(t_cells)

In [None]:
# Subset UMAP coloured by annotation and by the new Leiden clusters.
sc.pl.umap(t_cells, color=['Level_1_refined', 'leiden'])

# Leiden for Epithelial Cells

In [None]:
# Independent copy of the cells carrying either of the two malignant labels.
malignant_labels = ['Malignant', 'Ductal Cell/Malignant']
is_malignant = adata.obs.Level_1_refined.isin(malignant_labels)
mal = adata[is_malignant].copy()

In [None]:
# Number of cells in the malignant subset per dataset.
mal.obs.groupby(['Dataset']).size()

In [None]:
# Neighbour graph on the `X_scpoli` representation, then Leiden clustering.
# The call used to end in a dangling `n_neighbors=` (a SyntaxError that stops
# the notebook). The argument is dropped so scanpy's default neighbourhood size
# applies, as in the T-cell section; the n_neighbors=100 variant runs in a later cell.
sc.pp.neighbors(mal, use_rep='X_scpoli')
sc.tl.leiden(mal, resolution=0.5)

In [None]:
# Keep the UMAP coordinates inherited from the full object before a subset-specific
# UMAP replaces `X_umap`. The guard makes the cell safe to re-run: without it, a
# second execution after `sc.tl.umap(mal)` would overwrite the saved global
# embedding with the subset one.
if 'X_umap_global' not in mal.obsm:
    mal.obsm['X_umap_global'] = mal.obsm['X_umap'].copy()

In [None]:
# Recompute the UMAP for the malignant subset; this replaces `mal.obsm['X_umap']`.
sc.tl.umap(mal)

In [None]:
# Subset UMAP coloured by annotation and by the Leiden clusters.
sc.pl.umap(mal, color=['Level_1_refined', 'leiden'])

In [None]:
# Rebuild the neighbour graph with 100 neighbours and re-run Leiden; this
# replaces the graph and the `leiden` labels computed earlier on `mal`.
sc.pp.neighbors(mal, use_rep='X_scpoli', n_neighbors=100)
sc.tl.leiden(mal, resolution=0.5)

In [None]:
# `sc.tl.umap(mal)` has already replaced `X_umap` with the subset embedding by
# this point, so an unconditional copy here would overwrite the saved global
# coordinates with subset ones. Only store them if they are still missing.
if 'X_umap_global' not in mal.obsm:
    mal.obsm['X_umap_global'] = mal.obsm['X_umap'].copy()

In [None]:
# Annotation, Leiden clusters and doublet calls on the subset UMAP.
# NOTE: no `sc.tl.umap` call follows the n_neighbors=100 graph in the cells above,
# so the new Leiden labels are drawn on the previously computed coordinates.
sc.pl.umap(mal, color=['Level_1_refined', 'leiden', 'predicted_doublets'], size=5)

In [None]:
#try to see if these clusters are outliers
from scipy.stats import median_abs_deviation
def is_outlier(data, nmads=5):
    """Flag values lying more than ``nmads`` median absolute deviations from the median.

    Parameters
    ----------
    data : numpy.ndarray or pandas.Series
        Numeric values to test (assumed one-dimensional).
    nmads : int or float, default 5
        Half-width of the accepted band around the median, in units of the MAD.

    Returns
    -------
    Boolean mask aligned with ``data``; ``True`` marks a value strictly outside
    ``[median - nmads * MAD, median + nmads * MAD]``.
    """
    center = np.median(data)
    tolerance = nmads * median_abs_deviation(data)
    lower_bound = center - tolerance
    upper_bound = center + tolerance
    too_low = data < lower_bound
    too_high = data > upper_bound
    return too_low | too_high

In [None]:
# Flag QC outliers within each (Dataset, Level_1_refined) group of `mal`.
# A cell is flagged when any of the three QC metrics is an outlier (3 MADs)
# relative to the other cells of its own group.
mal.obs['outlier_subset'] = False
for dataset in mal.obs['Dataset'].unique():
    for cell in mal.obs.loc[mal.obs['Dataset'] == dataset, 'Level_1_refined'].unique():
        print(f'Processing {cell} in {dataset}')
        cell_mask = (mal.obs['Dataset'] == dataset) & (mal.obs['Level_1_refined'] == cell)
        # Slice the group once. The unused full-length `total_outliers` Series
        # that used to be allocated on every iteration has been removed.
        group_obs = mal.obs.loc[cell_mask]
        combined_outlier = (
            is_outlier(group_obs['log1p_total_counts'], 3)
            | is_outlier(group_obs['log1p_n_genes_by_counts'], 3)
            | is_outlier(group_obs['pct_counts_mito'], 3)
        )
        mal.obs.loc[cell_mask, 'outlier_subset'] |= combined_outlier

In [None]:
# Store the flag as strings; later cells compare against and select 'True' / 'False'.
mal.obs['outlier_subset'] = mal.obs['outlier_subset'].astype(str)

In [None]:
# Number of flagged ('True') cells per dataset.
mal.obs.groupby(['Dataset', 'outlier_subset']).size().unstack()['True']

In [None]:
# Overall number of flagged vs. retained cells, from a single group-by.
outlier_counts = mal.obs.groupby(['Dataset', 'outlier_subset']).size().unstack()
print(outlier_counts['True'].sum())
print(outlier_counts['False'].sum())

In [None]:
# Dimensions of the malignant subset before outlier removal.
mal.shape

In [None]:
# Keep only the cells not flagged as QC outliers. `.copy()` yields an independent
# AnnData (as is done for `t_cells` and `mal`), so the neighbour graph and Leiden
# labels computed on it later are written to a real object instead of a view of `mal`.
mal_subset = mal[mal.obs.outlier_subset == 'False'].copy()

In [None]:
# Unfiltered malignant subset: annotation, Leiden clusters and condition.
sc.pl.umap(mal, color=['Level_1_refined', 'leiden', 'Condition'], size=5, ncols=3)
# Doublet and outlier flags, restricted to the 'True' group.
sc.pl.umap(mal, color=['predicted_doublets', 'outlier_subset'], groups='True', size=5, ncols=2)

In [None]:
# Neighbour graph (100 neighbours, `X_scpoli` representation) and Leiden
# clustering on the outlier-filtered subset.
sc.pp.neighbors(mal_subset, use_rep='X_scpoli', n_neighbors=100)
sc.tl.leiden(mal_subset, resolution=0.5)

In [None]:
# Show the working directory (relative paths in this notebook resolve against it).
pwd

In [None]:
# Filtered subset: annotation and the re-computed Leiden clusters.
# NOTE: no `sc.tl.umap` is run on `mal_subset` in the cells above, so these plots
# use the `X_umap` coordinates carried over from `mal`.
sc.pl.umap(mal_subset, color=['Level_1_refined', 'leiden'], size=5, ncols=2)
# Doublet and outlier flags, restricted to the 'True' group.
sc.pl.umap(mal_subset, color=['predicted_doublets', 'outlier_subset'], groups='True', size=5, ncols=2)

In [None]:
# Square figures for the QC scatter plots below.
plt.rcParams['figure.figsize'] = (6, 6)

# Counts vs. detected genes for the malignant subset, log-scale columns first.
for x_key, y_key in (
    ("log_counts", "log1p_n_genes_by_counts"),
    ("n_counts", "n_genes_by_counts"),
):
    sc.pl.scatter(mal, x_key, y_key, color="Leiden_whole_object", size=1)

In [None]:
# Mean QC metrics per Leiden cluster.
# `epi` is not defined in any earlier cell (NameError on Run All); the subset
# built in this section is `mal`, so it is used here.
# NOTE(review): switch to `mal_subset` if the outlier-filtered clustering is the
# one these metrics should describe.
qc_metrics = mal.obs.groupby('leiden')[['n_counts', 'n_genes', 'mt_frac']].mean()