In [None]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and data-modification warnings raised by the
# libraries used below — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
from pyclustree import clustree
import matplotlib.pyplot as plt

In [None]:
# Notebook-wide display defaults: short DataFrame previews and 100-dpi figures.
pd.options.display.max_rows = 10
sc.set_figure_params(dpi=100)

In [None]:
# Load the object that this notebook annotates. The same path is written again in the
# "Save" section, so the file on disk is updated in place by a full run.
adata = sc.read_h5ad('../PDAC_Final/Downstream/final_scanVI/final_object_all_genes.h5ad')

In [None]:
# Summary of the loaded object.
adata

In [None]:
# Preview of the per-cell metadata table.
adata.obs.head()

In [None]:
# Level_3 labels present in the data; the subsets below are selected by these names.
adata.obs.Level_3.unique().tolist()

In [None]:
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import scanpy as sc

def classify_celltypes_by_score(adata, markers_dict, embedding_key="scanvi_emb", layer="log_norm", score_threshold=0.0, knn_k=25, score_prefix="", output_obs_col="celltype_knn", plot=True):
    """Label cells by their best marker-gene score, then relabel low-scoring cells with kNN.

    Steps
    -----
    1. Score every marker set with ``sc.tl.score_genes``; each score is stored in
       ``adata.obs[f"{score_prefix}{cell_type}"]``.
    2. Give each cell the type with the highest score and store it in
       ``adata.obs["celltype"]``. Cells whose best score is below ``score_threshold``
       get the placeholder label ``"Missclassified"``.
    3. Fit a ``KNeighborsClassifier`` on ``adata.obsm[embedding_key]`` of the cells that
       received a real label and predict a label for the placeholder cells.
    4. Store the final labels (as ``str``) in ``adata.obs[output_obs_col]`` and the top
       class probability of each kNN prediction in ``adata.obs["knn_confidence"]``
       (NaN for cells that kept their score-based label).
    5. Optionally draw two dotplots grouped by the final labels: the marker genes, and
       the per-type score columns.

    Parameters
    ----------
    adata
        Object exposing ``.obs`` and ``.obsm``; it is modified by this function.
        NOTE(review): the notebook passes subsets of a larger object; whether writes made
        here reach the parent is not visible from this function, so use the return value.
    markers_dict : dict
        Maps cell-type name -> list of marker genes. Processed in sorted key order.
    embedding_key : str
        Key in ``adata.obsm`` holding the features used by the kNN classifier.
    layer : str
        Layer passed to ``sc.tl.score_genes`` and to the marker-gene dotplot.
    score_threshold : float
        Minimum best score for a cell to keep its score-based label.
    knn_k : int
        Requested number of neighbours; capped at the number of confidently labelled cells.
    score_prefix : str
        Prefix for the score column names written to ``adata.obs``.
    output_obs_col : str
        Name of the ``adata.obs`` column receiving the final labels.
    plot : bool
        Whether to draw the dotplots.

    Returns
    -------
    The ``adata`` object that was passed in.

    Raises
    ------
    ValueError
        If some cells need a kNN label but no cell reached ``score_threshold``.
    """
    # Sort by name so score columns, argmax indices and plots all share one stable order.
    sorted_markers = dict(sorted(markers_dict.items()))
    celltypes = list(sorted_markers)
    score_cols = [f"{score_prefix}{ct}" for ct in celltypes]

    # 1. Score genes
    print('Score genes')
    for (cell_type, markers), score_col in zip(sorted_markers.items(), score_cols):
        print(f'Scoring {cell_type}')
        sc.tl.score_genes(
            adata,
            gene_list=markers,
            score_name=score_col,
            layer=layer
        )

    # 2. Assign highest scoring cell type
    print('Assign highest scoring cell type')
    scores = adata.obs[score_cols].values
    max_indices = np.argmax(scores, axis=1)
    max_scores = scores[np.arange(scores.shape[0]), max_indices]

    # dtype=object is required: a fixed-width unicode array would silently truncate the
    # placeholder whenever every cell-type name is shorter than it, and the comparison
    # against the full placeholder below would then never match.
    initial_labels = np.array(celltypes, dtype=object)[max_indices]
    initial_labels[max_scores < score_threshold] = "Missclassified"
    adata.obs["celltype"] = initial_labels

    # 3. Train kNN on confident cells
    print('Train kNN on confident cells')
    confident_mask = (adata.obs["celltype"] != "Missclassified").to_numpy()
    rescue_mask = ~confident_mask
    n_confident = int(confident_mask.sum())

    predicted_labels = None
    confidences = None
    if rescue_mask.any():
        if n_confident == 0:
            raise ValueError(
                "No cell reached score_threshold, so there is nothing to train the "
                "kNN classifier on; lower score_threshold or revise the marker sets."
            )
        embedding = adata.obsm[embedding_key]
        X_train = embedding[confident_mask]
        y_train = adata.obs["celltype"].to_numpy()[confident_mask]
        X_test = embedding[rescue_mask]

        # Never ask for more neighbours than there are training cells.
        knn = KNeighborsClassifier(n_neighbors=min(knn_k, n_confident))
        knn.fit(X_train, y_train)

        predicted_labels = knn.predict(X_test)
        confidences = knn.predict_proba(X_test).max(axis=1)

    # 4. Store results
    print('Store results')
    adata.obs[output_obs_col] = adata.obs["celltype"].copy()
    # Reset the column so a re-run never keeps confidences from a previous call.
    adata.obs["knn_confidence"] = np.nan
    if predicted_labels is not None:
        adata.obs.loc[rescue_mask, output_obs_col] = predicted_labels
        adata.obs.loc[rescue_mask, "knn_confidence"] = confidences
    adata.obs[output_obs_col] = adata.obs[output_obs_col].astype(str)

    # 5. Dotplot
    if plot:
        print('Plotting')
        group_order = sorted(adata.obs[output_obs_col].unique())

        # Marker genes, grouped per cell type, against the final labels.
        sc.pl.dotplot(
            adata,
            groupby=output_obs_col,
            var_names=sorted_markers,
            dendrogram=False,
            layer=layer,
            standard_scale="var",
            categories_order=group_order)

        # Per-type score columns against the final labels. The names must include
        # score_prefix, otherwise they do not match the columns written in step 1.
        # NOTE(review): the layer is hard-coded to 'normcounts' here — confirm that it
        # exists and whether it matters when only obs columns are plotted.
        sc.pl.dotplot(
            adata,
            groupby=output_obs_col,
            var_names=score_cols,
            layer='normcounts',
            standard_scale='var',
            categories_order=group_order)

    return adata

In [None]:
# Malignant

In [None]:
# Marker genes for each malignant cell state, given as (label, genes) pairs.
# The labels are the names that classify_celltypes_by_score assigns to cells.
mal_markers = dict([
    ("Malignant Cell - Epithelial", ["EPCAM", "CLDN4", "CLDN7"]),
    ("Malignant Cell - Pit Like", ["GKN1", "GKN2", "CLDN18"]),
    ("Malignant Cell - Hypoxia", ["HIF1A", "VEGFA", "CA9"]),
    ("Malignant Cell - Highly Proliferative", ["MKI67", "CENPF", "TOP2A"]),
    ("Malignant Cell - EMT", ["ZEB1", "TWIST1", "CDH2"]),
    ("Malignant Cell - Acinar-like", ["REG3A", "REG3G", "CPA1"]),
    ("Malignant Cell - Invasive", ["MMP9", "MMP2", "MMP14"]),
    ("Malignant Cell - Senescence", ["CDKN1A", "CDKN2A", "LMNA"]),
    ("Malignant Cell - Apoptotic", ["BAX", "BCL2", "FAS"]),
    ("Malignant Cell - Mesenchymal", ["THY1", "COL3A1", "FN1"]),
])

In [None]:
# Select the three malignant Level_3 populations. No explicit .copy() is taken here.
# NOTE(review): classify_celltypes_by_score writes new .obs columns on this subset;
# presumably AnnData turns the subset into an independent copy on first write — confirm,
# or restore the explicit .copy().
malignant = adata[adata.obs['Level_3'].isin(['Malignant Cell - Epithelial', 'Malignant Cell - Mesenchymal', 'Malignant Cell - EMT'])] #.copy()

In [None]:
# Score the malignant marker sets and label every cell; the final label is stored in
# malignant.obs['celltype_knn'] (the function's default output column).
malignant = classify_celltypes_by_score(
    adata=malignant,
    markers_dict=mal_markers,
    embedding_key="scanvi_emb",
    layer="log_norm",
    score_threshold=0.0,
    knn_k=25
)

In [None]:
# Preview the metadata including the newly added columns.
malignant.obs.head()

In [None]:
# Number of cells per assigned malignant state.
malignant.obs.celltype_knn.value_counts()

In [None]:
# add to adata
mask = malignant.obs_names
# Cast Level_4 to plain strings before assigning.
# NOTE(review): presumably needed so that labels not already among Level_4's values can
# be written — confirm the column's dtype.
adata.obs.Level_4 = adata.obs.Level_4.astype(str)
# Assign the new label
adata.obs.loc[mask, 'Level_4'] = malignant.obs['celltype_knn'].reindex(mask)

In [None]:
# Check that Level_4 now carries the new labels.
adata.obs.head()

# T Cells

In [None]:
# All Level_3 labels, used to look up the T-cell populations by name.
list_t = adata.obs.Level_3.unique().tolist()

In [None]:
# Substring test on a capital "T": this keeps every label containing "T", which is not
# necessarily limited to T-cell labels.
[i for i in list_t if 'T' in i]

In [None]:
# Marker genes per CD8+ T-cell state, added one state at a time.
# Note that the Memory and Naive sets share CCR7 and SELL, and the Effector and
# Terminal Effector sets share GZMB and IFNG.
cd8_subsets = {}
cd8_subsets["CD8+ Effector T Cell"] = ["GZMB", "GZMK", "PRF1", "IFNG"]
cd8_subsets["CD8+ Exhausted T Cell"] = ["PDCD1", "HAVCR2", "LAG3", "TOX"]
cd8_subsets["CD8+ Memory T Cell"] = ["CCR7", "CD27", "SELL"]
cd8_subsets["CD8+ Naive T Cell"] = ["SELL", "CCR7", "LEF1"]
cd8_subsets["CD8+ Terminal Effector T Cell"] = ["ZEB2", "GZMB", "IFNG", "TBX21"]
cd8_subsets["CD8+ Tissue-Resident Memory T Cell"] = ["ITGAE", "CXCR6", "ZNF683"]

In [None]:
# Marker genes per CD4+ / other T-cell state, given as (label, genes) pairs.
# Besides the CD4+ states this table also holds γδ, regulatory and double-positive T cells.
cd4_subsets = dict([
    ("CD4+ Th1 Cell", ["STAT4", "CXCR3", "IFNG"]),
    ("CD4+ Th2 Cell", ["GATA3", "CCR4", "PTGDR2"]),
    ("CD4+ Th17 Cell", ["IL17A", "IL17F", "RORC", "KLRB1", "CCR6"]),
    ("CD4+ Th22 Cell", ["IL22", "CCR10", "FOXO4"]),
    ("CD4+ Naive Cell", ["CCR7", "SELL", "LEF1", "TCF7"]),
    # Alternative markers previously considered for central memory: "IL7R", "TCF7"
    ("CD4+ Central Memory T Cell", ["GPR183", "TCF7", "SELL"]),
    ("γδ T Cell (Vδ1)", ["TRDC"]),
    # "IKZF2" was previously considered as an additional T-reg marker
    ("T-reg", ["FOXP3", "IL2RA", "CTLA4", "TNFRSF18"]),
    ("Double Positive CD4+CD8+ T Cell", ["CD4", "CD8A", "CD8B"]),
])

In [None]:
# Select the 'Mixed T Cell' and 'CD4+ T Cell' populations; both are classified with the
# cd4_subsets marker table. No explicit .copy() is taken here.
# NOTE(review): presumably AnnData copies the subset on the first .obs write — confirm.
cd4_t = adata[adata.obs.Level_3.isin(['Mixed T Cell', 'CD4+ T Cell'])] #.copy()

In [None]:
# Summary of the subset.
cd4_t

In [None]:
# Score the marker sets and label every cell; the final label is stored in
# cd4_t.obs['celltype_knn'].
cd4_t = classify_celltypes_by_score(
    adata=cd4_t,
    markers_dict=cd4_subsets,
    embedding_key="scanvi_emb",
    layer="log_norm",
    score_threshold=0.0,
    knn_k=25
)

In [None]:
# Number of cells per assigned state.
cd4_t.obs.celltype_knn.value_counts()

In [None]:
# add to adata
mask = cd4_t.obs_names
# Cast Level_4 to plain strings before assigning (same pattern as the other subsets).
adata.obs.Level_4 = adata.obs.Level_4.astype(str)
# Assign the new label
adata.obs.loc[mask, 'Level_4'] = cd4_t.obs['celltype_knn'].reindex(mask)

In [None]:
# Select the 'CD8+ T Cell' population. No explicit .copy() is taken here.
# NOTE(review): presumably AnnData copies the subset on the first .obs write — confirm.
cd8_t = adata[adata.obs.Level_3.isin(['CD8+ T Cell'])] #.copy()

In [None]:
# Score the marker sets and label every cell; the final label is stored in
# cd8_t.obs['celltype_knn'].
cd8_t = classify_celltypes_by_score(
    adata=cd8_t,
    markers_dict=cd8_subsets,
    embedding_key="scanvi_emb",
    layer="log_norm",
    score_threshold=0.0,
    knn_k=25
)

In [None]:
# Number of cells per assigned state.
cd8_t.obs.celltype_knn.value_counts()

In [None]:
# add to adata
mask = cd8_t.obs_names
# Cast Level_4 to plain strings before assigning (same pattern as the other subsets).
adata.obs.Level_4 = adata.obs.Level_4.astype(str)
# Assign the new label
adata.obs.loc[mask, 'Level_4'] = cd8_t.obs['celltype_knn'].reindex(mask)

# Endothelial Cells

In [None]:
# Marker genes per endothelial subtype: labels and gene lists are paired by position.
endothelial_markers = dict(zip(
    (
        "Tumor-Associated Endothelial Cell",
        "Vascular Endothelial Cell",
        "Lymphatic Endothelial Cell",
    ),
    (
        ["DDIT4", "TIE1", "SEMA6B", "PLCB1", "LYZ"],
        ["PECAM1", "CDH5", "PLVAP", "EHD4", "CLEC14A"],
        ["PROX1", "PDPN", "LYVE1", "FLT4"],
    ),
))

In [None]:
# Select every population whose Level_3 label contains the literal text "Endothelial".
# na=False makes cells with a missing Level_3 count as non-matching (the same outcome the
# isin()-based subsets give) instead of producing a non-boolean mask; regex=False keeps
# the pattern a plain substring.
endothelial = adata[adata.obs.Level_3.str.contains('Endothelial', na=False, regex=False)]

In [None]:
# Summary of the endothelial subset.
endothelial

In [None]:
# Score the marker sets and label every cell; the final label is stored in
# endothelial.obs['celltype_knn'].
endothelial = classify_celltypes_by_score(
    adata=endothelial,
    markers_dict=endothelial_markers,
    embedding_key="scanvi_emb",
    layer="log_norm",
    score_threshold=0.0,
    knn_k=25
)

In [None]:
# add to adata
mask = endothelial.obs_names
# Cast Level_4 to plain strings before assigning (same pattern as the other subsets).
adata.obs.Level_4 = adata.obs.Level_4.astype(str)
# Assign the new label
adata.obs.loc[mask, 'Level_4'] = endothelial.obs['celltype_knn'].reindex(mask)

# B Cells

In [None]:
# Select the 'B Cell' and 'Plasma Cell' populations; both are classified with b_markers.
# No explicit .copy() is taken here.
# NOTE(review): presumably AnnData copies the subset on the first .obs write — confirm.
b_cells = adata[adata.obs.Level_3.isin(['B Cell', 'Plasma Cell'])]

In [None]:
# Summary of the subset.
b_cells

In [None]:
# Marker genes per B-lineage state, added one state at a time.
# Several genes appear in more than one set (IGHM, CD27, PRDM1).
b_markers = {}
b_markers["B Cell - Naive"] = ["IL7R", "IGHM", "TCL1A", "CD19"]
b_markers["B Cell - Activated"] = ["IGHM", "CD69", "CD86"]
b_markers["B Cell - Memory"] = ["CD27", "IGHE", "IGHA1"]
b_markers["B-reg"] = ["TFRC", "CD44", "TGFB1"]
b_markers["Plasma Cell"] = ["MZB1", "XBP1", "PRDM1", "SDC1"]
b_markers["Plasmablast"] = ["CD27", "CD38", "PRDM1", "IGHG1", "MKI67"]
b_markers["B Cell - Germinal Center"] = ["AICDA", "BCL6", "RGS13", "S1PR2"]

In [None]:
# Score the marker sets and label every cell; the final label is stored in
# b_cells.obs['celltype_knn'].
b_cells = classify_celltypes_by_score(
    adata=b_cells,
    markers_dict=b_markers,
    embedding_key="scanvi_emb",
    layer="log_norm",
    score_threshold=0.0,
    knn_k=25
)

In [None]:
# Number of cells per assigned state.
b_cells.obs.celltype_knn.value_counts()

In [None]:
# add to adata
mask = b_cells.obs_names
# Cast Level_4 to plain strings before assigning (same pattern as the other subsets).
adata.obs.Level_4 = adata.obs.Level_4.astype(str)
# Assign the new label
adata.obs.loc[mask, 'Level_4'] = b_cells.obs['celltype_knn'].reindex(mask)

In [None]:
# Plot the UMAP of the full object coloured by the updated Level_4 labels.
# The figure size set here changes the matplotlib default for later plots as well.
plt.rcParams['figure.figsize'] = (8,8)
sc.pl.umap(adata, color='Level_4', size=5)

# Final Back Track: re-derive Level 1–3 from Level 4

In [None]:
# Semicolon-separated lookup table. The cells below read its Level_4, Level_1, Level_2
# and Level_3 columns to translate each Level_4 label into the coarser levels.
df_map = pd.read_csv('../PDAC_Final/Downstream/Level_4_to_Level_1.csv', index_col=None, sep=';')

In [None]:
# Inspect the lookup table.
df_map

In [None]:
# Work on a copy of the metadata table; adata.obs itself is only replaced further down.
obs = adata.obs.copy()

In [None]:
# Remove the existing coarse levels; they are re-derived from Level_4 in the next cell.
# inplace=True mutates `obs`, so running this cell twice in a row raises KeyError
# because the columns are already gone.
obs.drop(['Level_1', 'Level_2', 'Level_3'], axis=1, inplace=True)

In [None]:
# Re-derive Level_1..Level_3 from Level_4 using the lookup table (one pass per level).
for level in ('Level_1', 'Level_2', 'Level_3'):
    obs[level] = obs['Level_4'].map(dict(zip(df_map['Level_4'], df_map[level])))

# Level_4 labels that are absent from the lookup table map to NaN in all three levels
# and would be saved that way. Report them here; print is used because warnings are
# globally silenced in this notebook.
unmapped = sorted(set(obs['Level_4'].dropna()) - set(df_map['Level_4']), key=str)
if unmapped:
    print(f'Level_4 labels without a Level_1-3 mapping ({len(unmapped)}): {unmapped}')

In [None]:
# Keep only the columns listed here, in this order; any other obs column is dropped and
# a listed column that does not exist raises KeyError. The four Level_* columns are
# placed together near the end, just before the two '_scvi_*' columns.
obs = obs[['Barcode', 'Dataset', 'ID_batch_covariate', 'Unique_ID', 'Technology',
       'n_genes', 'n_counts', 'log_counts', 'mt_frac', 'n_genes_by_counts',
       'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts',
       'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito',
       'leiden', 'batch', 'leiden_0.2', 'leiden_0.2_annotation',
       'leiden_subcluster', 'level0_leiden_subcluster', 'leiden_0.5',
       'is_outlier_total_counts', 'outlier', 'infercnv_score_malignant',
       'infercnv_score_malignant_refined', 'cnv_score_abs', 'treatment_status',
       'Level_0', 'MALAT1_lognorm', 'empty_droplet', 'ID_harmonised',
       'Dataset_unique', 'Tissue', 'Age', 'Sex', 'Diabetes', 'Treatment',
       'Global_Leiden', 'Treatment_Harmonized', 'Treatment_Category',
       'Myeloid_leiden_0.75', 'Fibroblast_leiden_0.75', 'Lymphoid_leiden_0.75',
       'Endothelial_Cell_leiden_0.75', 'Malignant_leiden_0.75',
       'Ductal_Cell_leiden_0.75', 'Schwann_Cell_leiden_0.75',
       'Adipocyte_leiden_0.75', 'Endocrine_Cell_leiden_0.75',
       'Acinar_Cell_leiden_0.75', 'Pericyte_leiden_0.75',
       'Smooth_Muscle_Cell_leiden_0.75', 'NK_Cell_leiden_0.75', 'Condition',
       'combo', 'EMT category', 'EMT score', 'EMT_score_DL',
       'Suspicious_Normal',  'Level_1', 'Level_2', 'Level_3', 'Level_4', '_scvi_batch', '_scvi_labels',
      ]]

In [None]:
# Lift the row limit so that every Level_4 label is listed. This replaces the limit of
# 10 rows set at the top of the notebook for the rest of the session.
pd.set_option('display.max_rows', None)
# Number of cells per Level_4 label.
obs.groupby('Level_4').size()

In [None]:
# Preview the rebuilt metadata table.
obs.head()

In [None]:
# Replace the object's metadata with the rebuilt table.
adata.obs = obs.copy()

# Save

In [None]:
# Write the annotated object back to the same path it was loaded from at the top of the
# notebook, i.e. the input file is overwritten.
adata.write('../PDAC_Final/Downstream/final_scanVI/final_object_all_genes.h5ad')

In [None]:
# Preview the gene subset selected through the var column `Manual_Genes`.
# NOTE(review): presumably a boolean mask over genes — confirm.
adata[:, adata.var.Manual_Genes]

In [None]:
# Materialise that gene subset as an independent object.
adata_mg = adata[:, adata.var.Manual_Genes].copy()

In [None]:
# Save the gene-subset object next to the full one.
adata_mg.write('../PDAC_Final/Downstream/final_scanVI/final_object_mg.h5ad')

In [None]:
pwd