In [None]:
pwd

In [None]:
import anndata as ad
import numpy as np
import scanpy as sc
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")


In [None]:
import pandas as pd
pd.set_option('display.max_columns', None)
sc.set_figure_params(frameon=False, dpi=120)

In [None]:
adata = sc.read_h5ad('../Binned_Data/adata_scanvi_all_genes_subclustered.h5ad')
final_obs = pd.read_csv('pdac_atlas_annotation.csv', index_col='Dataset_Barcode')
biological_metadata = pd.read_csv('biological_metadata', index_col='Dataset_Barcode')

In [None]:
adata.obs['Condition'] = np.where(adata.obs.Dataset.str.contains('Norm'), 'Normal', 'PDAC')
adata.obs.groupby(['Dataset', 'Condition']).size().unstack()

In [None]:
adata.obs.head()

In [None]:
adata.obs.columns[adata.obs.columns.str.contains('Level')]

In [None]:
# Remove the stale annotation columns before the curated annotation is merged in.
# `errors='ignore'` keeps the cell re-runnable: executing it a second time no
# longer raises a KeyError for columns that have already been dropped.
adata.obs = adata.obs.drop(
    columns=['Level_1_x', 'Level_1_refined', 'Level_1_y', 'Level_2', 'Level_3'],
    errors='ignore',
)

In [None]:
# Attach the curated annotation to the per-cell metadata.
# - how='left' keeps every cell of `adata` (the default inner join would drop
#   cells that are missing from `final_obs`; those now get NaN annotations).
# - validate='many_to_one' fails loudly if `final_obs` lists a barcode twice.
# - the original obs_names are restored afterwards, because a merge may return
#   a fresh RangeIndex, which would otherwise replace the cell barcodes.
obs_names = adata.obs_names.copy()
merged_obs = pd.merge(
    adata.obs,
    final_obs,
    on='Dataset_Barcode',
    how='left',
    validate='many_to_one',
)
assert len(merged_obs) == adata.n_obs, "merge changed the number of cells"
merged_obs.index = obs_names
adata.obs = merged_obs

In [None]:
cluster_keys = ['Level_1', 'Level_2', 'Level_3', 'Level_4']

In [None]:
sc.pl.umap(adata, color=cluster_keys, ncols=4,
          legend_loc='on data', legend_fontsize=4, legend_fontoutline=3)

In [None]:
sc.pl.umap(adata, color='Level_4', ncols=4,  groups=['Mixed', 'Classical', 'Basal'],
          legend_loc='on data', legend_fontsize=4, legend_fontoutline=3)

In [None]:
sc.pl.umap(adata, color=['EMT category', 'EMT score', 'combo'], ncols=2, wspace=0.5)

In [None]:
adata.obs.Level_1 = adata.obs.Level_1.replace('Acinar', 'Acinar Cell')

In [None]:
from pyclustree import clustree

In [None]:
fig = clustree(
    adata,
    cluster_keys,
    title="Clustree",
    edge_weight_threshold=0.00,  # show all transitions
    show_fraction=False, 
    node_size_range=(5000, 10000), x_spacing = 4
    # edge_width_range=(5,10),
)

# Adjust figure size and resolution
fig.set_size_inches(50, 15)
fig.set_dpi(100)
fig.tight_layout()
fig.show()

In [None]:
adata.obs.groupby(['Level_1', 'Level_4']).size().unstack()

In [None]:
plt.rcParams['figure.figsize'] = (8,8)
sc.pl.umap(adata, color='Level_4', legend_loc='on data', legend_fontsize=4, legend_fontoutline=3, size=2)

In [None]:
plt.rcParams['figure.figsize'] = (4,4)
sc.pl.umap(adata[adata.obs.Level_1 == 'Malignant'], color=['Technology', 'Level_4'])

In [None]:
plt.rcParams['figure.figsize'] = (4,4)
sc.pl.umap(adata[adata.obs.Level_1 == 'Fibroblast'], color=['Technology', 'Level_4'])

# Resolve Ambiguous Myeloid

In [None]:
myeloid = adata[adata.obs.Level_1.str.contains('Myeloid')].copy()

In [None]:
myeloid

In [None]:
sc.pp.neighbors(myeloid, use_rep='scanvi_emb')
sc.tl.leiden(myeloid, resolution=0.5)
sc.tl.umap(myeloid)

In [None]:
myeloid.obs.Level_4 = myeloid.obs.Level_4.astype(str)

In [None]:
myeloid.obs.groupby('Level_4').size()

In [None]:
# Flag gene families by name prefix; the flags are stored as boolean columns in `myeloid.var`.
# mitochondrial genes, "MT-" for human, "Mt-" for mouse
myeloid.var["mt"] = myeloid.var_names.str.startswith("MT-")
# ribosomal genes
myeloid.var["ribo"] = myeloid.var_names.str.startswith(("RPS", "RPL"))
# hemoglobin genes
# NOTE(review): `[^(P)]` is a character class that only excludes '(', 'P' and ')'
# as the third character, so any other gene name starting with "HB" (e.g. HBEGF,
# HBS1L) is flagged as well — confirm that this is intended.
myeloid.var["hb"] = myeloid.var_names.str.contains("^HB[^(P)]")

In [None]:
genes_to_remove = myeloid.var["mt"] | myeloid.var["ribo"] | myeloid.var["hb"]

In [None]:
myeloid = myeloid[:, ~genes_to_remove].copy()

In [None]:
sc.pl.umap(myeloid, color=['Level_4', 'leiden'], layer='log_norm', legend_loc='on data', legend_fontoutline=2, legend_fontsize=5)

In [None]:
# Level_4 was cast to `str` earlier, which turns missing values into the literal
# string 'nan'; count both forms so that missing annotations are not hidden.
(myeloid.obs.Level_4.isna() | myeloid.obs.Level_4.eq('nan')).sum()

In [None]:
myeloid.obs.groupby(['Level_4', 'leiden']).size().unstack().T.sort_values(by='Ambiguous_Myeloid', ascending=False)

In [None]:
myeloid.obs['leiden']= myeloid.obs['leiden'].astype(str)
myeloid[myeloid.obs.Level_4 == 'Ambiguous_Myeloid'].obs.groupby('leiden').size()

In [None]:
sc.tl.rank_genes_groups(myeloid, groupby='leiden', layer='log_norm')

In [None]:
df_myeloid = pd.DataFrame(myeloid.uns['rank_genes_groups']['names'])

### Leiden clusters containing the ambiguous myeloid cells: 10, 12, 14, 15, 16

In [None]:
df_myeloid[['10', '12', '14', '15', '16']].head(30)

In [None]:
df_myeloid[['10', '12', '14', '15', '16']].head(5) # redone

In [None]:
amb_map = {'10': 'Malignant', '12': 'myCAF', '14': 'Malignant','15': 'T Cell', '16': 'Endothelial'}
myeloid.obs.Level_4 = myeloid.obs.Level_4.astype(str)
amb_mask = myeloid.obs['Level_4'] == 'Ambiguous_Myeloid'
myeloid.obs.loc[amb_mask, 'Level_4'] = (
    myeloid.obs.loc[amb_mask, 'leiden'].map(amb_map).fillna('Ambiguous_Myeloid')
)

In [None]:
myeloid.obs.Level_4 = myeloid.obs.Level_4.astype(str)
myeloid.obs.groupby('Level_4').size()

In [None]:
sc.pl.umap(myeloid, color=['Level_4', 'leiden'], layer='log_norm', size=3,
          legend_loc='on data', legend_fontoutline=2, legend_fontsize=6)

In [None]:
myeloid.obs['leiden']= myeloid.obs['leiden'].astype(str)

In [None]:
myeloid.obs.groupby(['Level_4', 'leiden']).size().unstack().T.sort_values(by='Ambiguous_Myeloid', ascending=False)

In [None]:
# Most frequent label of a group, ignoring 'Ambiguous_Myeloid'
def get_majority_label(group):
    """Return the modal value of `group` after dropping 'Ambiguous_Myeloid' entries.

    `group` is a pandas Series of labels. When nothing but 'Ambiguous_Myeloid'
    is present (or the group is empty) the function returns 'Ambiguous_Myeloid'.
    Ties are resolved by taking the first value returned by `Series.mode()`.
    """
    resolved = group[group != 'Ambiguous_Myeloid']
    if resolved.empty:
        # nothing but ambiguous labels in this group
        return 'Ambiguous_Myeloid'
    return resolved.mode().iloc[0]

# Build mapping from leiden cluster to majority label.
# The grouping key must be the same 'leiden' column that is later used to look
# the labels up for the ambiguous cells; cluster ids coming from a different
# clustering ('Myeloid_leiden_0.75') are not comparable with them.
majority_map = (
    myeloid.obs
    .groupby('leiden')['Level_4']
    .apply(get_majority_label)
)

In [None]:
# Replace ambiguous entries
amb_mask = myeloid.obs['Level_4'] == 'Ambiguous_Myeloid'
myeloid.obs.loc[amb_mask, 'Level_4'] = (
    myeloid.obs.loc[amb_mask, 'leiden'].map(majority_map).fillna('Ambiguous_Myeloid')
)

In [None]:
myeloid.obs.Level_4 = myeloid.obs.Level_4.astype(str)
myeloid.obs.groupby('Level_4').size()

In [None]:
sc.pl.umap(myeloid, color=['Level_4'], layer='log_norm', size=3,
          legend_loc='on data', legend_fontoutline=2, legend_fontsize=6)

In [None]:
myeloid.obs.groupby(['Level_4', 'leiden']).size().unstack().T #.sort_values(by='Ambiguous_Myeloid', ascending=False)

In [None]:
# Add the refined col
adata.obs['Level_4_refined'] = adata.obs.Level_4.copy()

In [None]:
myeloid.obs.Level_4 = myeloid.obs.Level_4.astype(str)
myeloid.obs.groupby('Level_4').size()

In [None]:
# Add to main adata
adata.obs.Level_4_refined = adata.obs.Level_4_refined.astype(str)
myeloid.obs.Level_4 = myeloid.obs.Level_4.astype(str)
adata.obs.loc[myeloid.obs_names, 'Level_4_refined'] = myeloid.obs['Level_4']

In [None]:
plt.rcParams['figure.figsize'] = (6,6)
sc.pl.umap(adata, color=['Level_4', 'Level_4_refined'], ncols=4,
          legend_loc='on data', legend_fontsize=4, legend_fontoutline=3)

In [None]:
adata.obs.groupby('Level_4_refined').size()

In [None]:
cluster_keys = ['Level_1', 'Level_2', 'Level_3', 'Level_4', 'Level_4_refined']

In [None]:
fig = clustree(
    adata,
    cluster_keys,
    title="Clustree",
    edge_weight_threshold=0.00,  # show all transitions
    show_fraction=False, 
    node_size_range=(5000, 10000), x_spacing = 4
    # edge_width_range=(5,10),
)

# Adjust figure size and resolution
fig.set_size_inches(50, 15)
fig.set_dpi(100)
fig.tight_layout()
fig.show()

# Resolve Malignant into Epithelial and Mesenchymal

In [None]:
mal_markers_human = {
 "Malignant_Cell_Epithelial_conc": [
        "EPCAM",    # cell surface
        "CDH1",     # E-cadherin
        "KRT8",     # cytokeratin
        "KRT18",    # cytokeratin
        "MUC1",
    ],
    "Malignant_Cell_Mesenchymal_conc": [
        "VIM",      # vimentin
        "CDH2",     # N-cadherin
        "FN1",      # fibronectin
        "ZEB1",     # EMT TF, more consistently detected than ZEB2
        "TWIST1",   # EMT TF   
    ],
    "Malignant_Cell_Epithelial": [
        "EPCAM",
        "CDH1",
        "KRT18",
        "KRT8",
        "KRT19",
        "MUC1",
        "OCLN",
    ],
    "Malignant_Cell_Mesenchymal": [
        "VIM",
        "CDH2",
        "FN1",
        "ZEB1",
        "ZEB2",
        "SNAI1",
        "SNAI2",
        "TWIST1",],
    "Malignant_Cell_Epithelial_Extended": [
        "EPCAM",    # epithelial cell adhesion
        "CDH1",     # E-cadherin
        "KRT18",    # cytokeratin
        "KRT8",     # cytokeratin
        "KRT19",    # cytokeratin
        "MUC1",     # secretory epithelial
        "OCLN",     # tight junction
        "CLDN3",    # claudin
        "CLDN4",    # claudin
        "TJP1",     # ZO-1, tight junction
        "GRHL2",    # anti-EMT transcription factor
        "OVOL2"     # anti-EMT transcription factor
        ],
    "Malignant_Cell_Mesenchymal_Extended": [
        "VIM",      # vimentin
        "CDH2",     # N-cadherin
        "FN1",      # fibronectin
        "ZEB1",     # EMT TF
        "ZEB2",     # EMT TF
        "SNAI1",    # EMT TF
        "SNAI2",    # EMT TF
        "TWIST1",   # EMT TF
        "MMP2",     # ECM degradation
        "MMP9",     # ECM degradation
        "ITGA5",    # integrin, adhesion
        "ITGB1"     # integrin, adhesion
    ]
}

In [None]:
mal_markers_human.keys()

In [None]:
malignant = adata[adata.obs.Level_4_refined.isin(['Basal', 'Classical', 'Mixed', 'Malignant']) & (adata.obs.Condition == 'PDAC')].copy()

In [None]:
malignant #.obs.groupby(['Dataset', 'Condition']).size().unstack()

In [None]:
malignant.obs.groupby(['Dataset', 'Level_4_refined']).size().unstack()

In [None]:
sc.pl.dotplot(malignant, groupby='Malignant_leiden_0.75', var_names=mal_markers_human, layer='log_norm', standard_scale='var', title='')

In [None]:
sc.pl.dotplot(malignant, groupby='Dataset', var_names=mal_markers_human, layer='log_norm', standard_scale='var', title='')
# sc.pl.dotplot(malignant, groupby='Dataset', var_names=classical_markers, layer='log_norm', standard_scale='var', title='Basal Markers')

In [None]:
# One '<marker set>_score' obs column per entry of `mal_markers_human`,
# computed on the 'log_norm' layer.
score_col_list = [f'{set_name}_score' for set_name in mal_markers_human]
for score_name, list_markers in zip(score_col_list, mal_markers_human.values()):
    print(f'Computing {score_name}')
    sc.tl.score_genes(
        malignant,
        gene_list=list_markers,
        score_name=score_name,
        layer='log_norm',
    )
    print('-'*50)

In [None]:
malignant.obs["EMT_score_conc"] = malignant.obs["Malignant_Cell_Epithelial_conc_score"] - malignant.obs["Malignant_Cell_Mesenchymal_conc_score"]
malignant.obs["EMT_score_DL"] = malignant.obs["Malignant_Cell_Epithelial_score"] - malignant.obs["Malignant_Cell_Mesenchymal_score"]
malignant.obs["EMT_score_DL_extended"] = malignant.obs["Malignant_Cell_Epithelial_Extended_score"] - malignant.obs["Malignant_Cell_Mesenchymal_Extended_score"]

In [None]:
malignant.obsm['X_umap'] = malignant.obsm['Malignant_umap']

In [None]:
plt.rcParams['figure.figsize'] = (8,6)
sc.pl.umap(malignant, color=['Malignant_leiden_0.75', 'Technology', 'Level_3', 'Treatment'], 
           wspace=0.25, size=3, ncols=4, vcenter=0)
sc.pl.umap(malignant, color=score_col_list, 
           wspace=0.25, size=3, ncols=4, vcenter=0)


In [None]:
for score_col in score_col_list:
    print(f'{score_col}')
    print(malignant.obs[score_col].agg(['min', 'max']))
    print('-'*50)

In [None]:
malignant.obs.columns[malignant.obs.columns.str.contains('EMT_')]

In [None]:
import seaborn as sns
sns.kdeplot(malignant.obs["EMT_score_conc"], label="Concise")
sns.kdeplot(malignant.obs["EMT_score_DL"], label="Standard")
sns.kdeplot(malignant.obs["EMT_score_DL_extended"], label="Extended")
plt.axvline(0, color='black', linestyle='--')
plt.legend()
plt.title("EMT score distribution comparison")

In [None]:
malignant.obs["EMT_score_DL"].median()

In [None]:
# One figure per score: drawing all three on the implicit current axes would
# overlay the histograms under a single title that does not name the score.
for emt_score in ['EMT_score_conc', 'EMT_score_DL', 'EMT_score_DL_extended']:
    fig, ax = plt.subplots()
    sns.histplot(malignant.obs[emt_score], bins=100, kde=True, ax=ax)
    ax.axvline(0, color='red', linestyle='--')
    # ax.axvline(-0.5, color='blue', linestyle='--')
    ax.set_title(f"Distribution of {emt_score}")
    plt.show()

In [None]:
sc.pl.umap(malignant, color=['EMT_score_conc', 'EMT_score_DL', 'EMT_score_DL_extended'], cmap='bwr', vmin=-3, vmax=3, size=5)

In [None]:
malignant.obs['EMT_subtype'] = np.where(malignant.obs.EMT_score_DL > -0.5, 'Malignant - Epithelial', 'Malignant - Mesenchymal')

In [None]:
malignant.obs.groupby('EMT_subtype').size()

In [None]:
from itertools import islice

dict(islice(mal_markers_human.items(), 2,4))

In [None]:
# sc.pl.heatmap(malignant, groupby='EMT_subtype', var_names=mal_markers_human, layer='log_norm', standard_scale='var')
# Generate heatmap and capture Axes
axes_dict = sc.pl.heatmap(
    malignant,
    groupby='EMT_subtype',
    var_names=dict(islice(mal_markers_human.items(), 2,4)),
    layer='log_norm',
    standard_scale='var',
    cmap='viridis', 
    show=False,  # So we can customize
    figsize=(8,8)
)

# Extract the heatmap axis
ax = axes_dict['heatmap_ax']

# Compute where to draw separator lines
group_order = malignant.obs['EMT_subtype'].cat.categories
group_sizes = malignant.obs['EMT_subtype'].value_counts().loc[group_order]
group_boundaries = group_sizes.cumsum()[:-1]

for boundary in group_boundaries:
    # Draw a thicker white line to overwrite the black one
    ax.axhline(boundary, color='white', linewidth=5, zorder=10)

plt.tight_layout()
plt.show()

In [None]:
sc.pl.dotplot(malignant, groupby='EMT_subtype', var_names=mal_markers_human, layer='log_norm',)

In [None]:
colors = ['#b2182b', '#ef8a62', '#f7f7f7', '#67a9cf', '#2166ac']

# Explicit colour per subtype: the hue has exactly two levels, so passing the
# five-colour list would make seaborn warn and use only its first two entries.
emt_subtype_palette = {
    'Malignant - Epithelial': colors[-1],
    'Malignant - Mesenchymal': colors[0],
}

# Stacked histogram of EMT_score_DL, coloured by the binary EMT subtype
plt.figure(figsize=(10, 5))
sns.histplot(
    data=malignant.obs,
    x='EMT_score_DL',
    hue='EMT_subtype',
    bins=100,
    palette=emt_subtype_palette,
    multiple='stack'
)
# threshold used to split epithelial from mesenchymal cells
plt.axvline(-0.5, color='black', linestyle='--')
plt.title("Distribution of EMT_score_DL Colored by EMT Subtype")
# the x axis shows EMT_score_DL (it was previously labelled EMT_score_conc)
plt.xlabel("EMT_score_DL")
plt.ylabel("Cell Count")
plt.tight_layout()
plt.show()

In [None]:
# Add to main adata
malignant.obs.EMT_subtype = malignant.obs.EMT_subtype.astype(str)
adata.obs.Level_4_refined = adata.obs.Level_4_refined.astype(str)
# Ensure the index alignment is preserved
adata.obs.loc[malignant.obs_names, 'Level_4_refined'] = malignant.obs['EMT_subtype'].reindex(malignant.obs_names)
adata.obs['EMT_score_DL'] = np.nan
adata.obs.loc[malignant.obs_names, 'EMT_score_DL'] = malignant.obs['EMT_score_DL'].reindex(malignant.obs_names)

In [None]:
adata.obs.Level_4_refined = adata.obs.Level_4_refined.astype('category')

In [None]:
# Relabel on plain strings and convert back to a categorical afterwards.
# `.replace` on a categorical column is deprecated in pandas and is not
# guaranteed to merge 'Malignant' into the existing 'Malignant - Epithelial'
# category.
adata.obs['Level_4_refined'] = (
    adata.obs['Level_4_refined']
    .astype(str)
    .replace('Malignant', 'Malignant - Epithelial')
    .astype('category')
)

In [None]:
adata.obs.groupby('Level_4_refined').size() #redone

In [None]:
adata.obs.groupby('Level_4_refined').size()

# Sub Categorize

In [None]:

# bins = [-np.inf, -2.0, -0.5, 0.1, 0.75, np.inf]
x = malignant.obs["EMT_score_DL"]
mu, sigma = x.mean(), x.std()

# bins = [-np.inf, mu - 1.75*sigma, mu - 0.5*sigma, mu + 0.25*sigma , mu + 1.25*sigma, np.inf]
bins = [-np.inf, -1.25, -0.5, 0, 1.0, np.inf]
labels = ['Strong Mesenchymal','Mesenchymal', 'Hybrid', 'Epithelial', 'Strong Epithelial']

malignant.obs["EMT_subtype_finer"] = pd.cut(x, bins=bins, labels=labels)

In [None]:
ax = sns.histplot(malignant.obs['EMT_score_DL'], bins=100, kde=True)
# Mark only the finite cut points; the outer -inf/+inf edges of `bins` are not
# plottable positions. The loop variable is named `cutoff` so that it does not
# shadow the built-in `bin`.
for cutoff in bins:
    if np.isfinite(cutoff):
        ax.axvline(cutoff, color='red', linestyle='--')
# plt.axvline(-0.5, color='blue', linestyle='--')
# EMT_score_DL is the epithelial minus mesenchymal gene-set score
ax.set_title("Distribution of EMT_score_DL (Epi - Mes score difference)")

In [None]:
# One colour per EMT_subtype_finer level (strong mesenchymal -> strong epithelial)
colors = ['#b2182b', '#ef8a62', '#f7f7f7', '#67a9cf', '#2166ac']

# Stacked histogram of the score that was actually binned: EMT_subtype_finer
# and `bins` are both derived from EMT_score_DL, so plotting EMT_score_conc
# would show cut lines that do not separate the colours.
plt.figure(figsize=(10, 5))
sns.histplot(
    data=malignant.obs,
    x='EMT_score_DL',
    hue='EMT_subtype_finer',
    bins=100,
    palette=colors,
    multiple='stack'
)
# draw the finite cut points only; -inf/+inf are not plottable positions
for cutoff in bins:
    if np.isfinite(cutoff):
        plt.axvline(cutoff, color='red', linestyle='--')
plt.title("Distribution of EMT_score_DL Colored by EMT Subtype (finer)")
plt.xlabel("EMT_score_DL")
plt.ylabel("Cell Count")
plt.tight_layout()
plt.show()

In [None]:
malignant.obs.groupby('EMT_subtype_finer').size()

In [None]:
# marker_dict= {'Basal_Markers': basal_markers, 'Classical_Markers': classical_markers}

In [None]:
sc.pl.dotplot(malignant, groupby='EMT_subtype_finer', var_names=mal_markers_human, layer='log_norm', standard_scale='var')
# sc.pl.dotplot(malignant, groupby='EMT_subtype_finer', var_names=mal_markers_human, layer='log_norm', standard_scale='var')

In [None]:
# sc.pl.heatmap(malignant, groupby='EMT_subtype', var_names=mal_markers_human, layer='log_norm', standard_scale='var')
# Generate heatmap and capture Axes
axes_dict = sc.pl.heatmap(
    malignant,
    groupby='EMT_subtype_finer',
    var_names=dict(islice(mal_markers_human.items(), 2,4)),
    layer='log_norm',
    standard_scale='var',
    cmap='viridis', 
    show=False,  # So we can customize
    figsize=(8,8)
)

# Extract the heatmap axis
ax = axes_dict['heatmap_ax']

# Compute where to draw separator lines
group_order = malignant.obs['EMT_subtype_finer'].cat.categories
group_sizes = malignant.obs['EMT_subtype_finer'].value_counts().loc[group_order]
group_boundaries = group_sizes.cumsum()[:-1]

for boundary in group_boundaries:
    # Draw a thicker white line to overwrite the black one
    ax.axhline(boundary, color='white', linewidth=5, zorder=10)

plt.tight_layout()
plt.show()

In [None]:
# Rank the malignant cells by EMT_score_DL and plot the score against the rank,
# coloured by the finer EMT subtype.
df = malignant.obs[['EMT_score_DL', 'EMT_subtype_finer']].copy()
df = df.sort_values(by='EMT_score_DL').reset_index(drop=True)
df['Cell Index'] = df.index
# Plot
plt.figure(figsize=(10, 5))
sns.scatterplot(
    x='Cell Index',
    y='EMT_score_DL',
    hue='EMT_subtype_finer',
    palette='plasma',
    data=df,
    s=100,
    linewidth=0
)

plt.axhline(0, color='gray', linestyle='--', lw=1)
# Labels describe the plotted quantity (EMT_score_DL = epithelial minus
# mesenchymal score); the previous Classical/Basal wording did not match it.
plt.title("Transition from Mesenchymal to Epithelial across Cells")
plt.ylabel("EMT_score_DL (Epithelial - Mesenchymal)")
plt.xlabel("Cells sorted by EMT_score_DL")
plt.legend(title='Subtype', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(visible=True)
plt.tight_layout()
plt.show()

In [None]:
# from scipy.special import softmax
# z_scores = np.vstack([
#     malignant.obs['Basal_score_z'],
#     malignant.obs['Classical_score_z']
# ]).T

# probs = softmax(z_scores, axis=1)
# malignant.obs['Basal_prob'] = probs[:, 0]
# malignant.obs['Classical_prob'] = probs[:, 1]

# labels = np.full(len(probs), 'Unclassified', dtype=object)
# labels[probs[:, 0] > 0.6] = 'Basal'
# labels[probs[:, 1] > 0.6] = 'Classical'
# malignant.obs['PDAC_Subtype'] = labels

# malignant.obs.groupby('PDAC_Subtype').size()

# from sklearn.preprocessing import minmax_scale

# malignant.obs['Basal_score_scaled'] = minmax_scale(malignant.obs['Basal_score_z'])
# malignant.obs['Classical_score_scaled'] = minmax_scale(malignant.obs['Classical_score_z'])
# z_scores = np.vstack([
#     malignant.obs['Basal_score_scaled'],
#     malignant.obs['Classical_score_scaled']
# ]).T

# probs = softmax(z_scores, axis=1)
# malignant.obs['Basal_prob'] = probs[:, 0]
# malignant.obs['Classical_prob'] = probs[:, 1]
# labels = np.full(len(probs), 'Unclassified', dtype=object)
# labels[probs[:, 0] > 0.6] = 'Basal'
# labels[probs[:, 1] > 0.6] = 'Classical'
# malignant.obs['PDAC_Subtype'] = labels
# malignant.obs.groupby('PDAC_Subtype').size()

# z_scores

# malignant.obs.groupby('Level_4').size()

# malignant.obs['Subtype_score_diff'] = malignant.obs['Classical_score_z'] - malignant.obs['Basal_score_z']
# malignant.obs['PDAC_Subtype'] = 'Unclassified'
# malignant.obs.loc[malignant.obs['Subtype_score_diff'] > 0.5, 'PDAC_Subtype'] = 'Classical'
# malignant.obs.loc[malignant.obs['Subtype_score_diff'] < -0.5, 'PDAC_Subtype'] = 'Basal'

# malignant.obs['Subtype_score_diff'] = malignant.obs['Classical_score_z'] - malignant.obs['Basal_score_z']
# malignant.obs['PDAC_Subtype'] = 'Unclassified'
# malignant.obs.loc[malignant.obs['Subtype_score_diff'] > 0.3, 'PDAC_Subtype'] = 'Classical'
# malignant.obs.loc[malignant.obs['Subtype_score_diff'] < -0.3, 'PDAC_Subtype'] = 'Basal'

# malignant.obs.groupby('PDAC_Subtype').size()

# unclassified = malignant[malignant.obs['PDAC_Subtype'] == 'Unclassified']

# unclassified

# unclassified.obs['Malignant_leiden_0.75'] = unclassified.obs['Malignant_leiden_0.75'].astype(str)
# malignant.obs['Malignant_leiden_0.75'] = malignant.obs['Malignant_leiden_0.75'].astype(str)

# sc.pl.heatmap(unclassified, markers_dict, groupby='Malignant_leiden_0.75', layer='log_norm')
# sc.pl.heatmap(malignant, markers_dict, groupby='Malignant_leiden_0.75', layer='log_norm')

# sc.pl.umap(malignant, color=['Malignant_leiden_0.75', 'Classical_score_z', 'Basal_score_z', 'PDAC_Subtype'], size=3)


# malignant.obs.head()

# markers_dict = {'basal_markers': basal_markers_plot, 'classical_markers': classical_markers_plot}

# malignant.obs['Malignant_leiden_0.75'] = malignant.obs['Malignant_leiden_0.75'].astype(str)

# sc.pl.heatmap(malignant, markers_dict, groupby='Malignant_leiden_0.75', layer='log_norm')
# sc.pl.heatmap(malignant, markers_dict, groupby='PDAC_Subtype', layer='log_norm')

# adata.obs['Level_4_refined'] = adata.obs.Level_4.copy()

# adata.obs.Level_4_refined = adata.obs.Level_4_refined.astype(str)
# malignant.obs.Level_4 = malignant.obs.Level_4.astype(str)
# adata.obs.loc[malignant.obs_names, 'Level_4_refined'] = malignant.obs['Level_4']

In [None]:
# Add to main adata
malignant.obs.EMT_subtype = malignant.obs.EMT_subtype.astype(str)
adata.obs.Level_4_refined = adata.obs.Level_4_refined.astype(str)
# Ensure the index alignment is preserved
adata.obs.loc[malignant.obs_names, 'Level_4_refined'] = malignant.obs['EMT_subtype'].reindex(malignant.obs_names)
adata.obs['EMT_score_DL'] = np.nan
adata.obs.loc[malignant.obs_names, 'EMT_score_DL'] = malignant.obs['EMT_score_DL'].reindex(malignant.obs_names)

In [None]:
# Align the per-cell categories with the full atlas in one step: cells that are
# not part of `malignant` become NaN and the categorical dtype is preserved,
# instead of writing string labels into a float column pre-filled with NaN.
adata.obs['EMT_subtype_finer'] = malignant.obs['EMT_subtype_finer'].reindex(adata.obs_names)

In [None]:
adata.obs.Level_4_refined = adata.obs.Level_4_refined.astype('category')

In [None]:
# Relabel on plain strings and convert back to a categorical afterwards.
# `.replace` on a categorical column is deprecated in pandas and is not
# guaranteed to merge 'Malignant' into the existing 'Malignant - Epithelial'
# category.
adata.obs['Level_4_refined'] = (
    adata.obs['Level_4_refined']
    .astype(str)
    .replace('Malignant', 'Malignant - Epithelial')
    .astype('category')
)

In [None]:
adata.obs.groupby('Level_4_refined').size()

# Resolve remaining Basal, Classical and Mixed

In [None]:
adata[adata.obs.Level_4_refined.isin(['Mixed', 'Classical', 'Basal'])] #.obs.head()

In [None]:
normal_mask = (
    adata.obs['Condition'].isin(['Normal']) &
    adata.obs['Level_4_refined'].isin(['Basal', 'Classical', 'Mixed'])
)
adata.obs['Suspicious_Normal'] = 'Other'
adata.obs.loc[normal_mask, 'Suspicious_Normal'] = 'PDAC-like in Normal'

In [None]:
adata.obs.groupby('Suspicious_Normal').size()

In [None]:
sc.pl.dotplot(adata, var_names=['CFTR', 'MUC1', 'KRT19', 'SOX9', 'HNF1B'], groupby='Suspicious_Normal', layer='log_norm')

In [None]:
adata.obs.Level_4_refined = adata.obs.Level_4_refined.astype(str)
adata.obs.loc[normal_mask, 'Level_4_refined'] = 'Ductal Cell'

In [None]:
adata.obs['Level_4_refined'] = adata.obs.Level_4_refined.replace('T cell', 'T Cell')

In [None]:
adata.obs.Level_4_refined = adata.obs.Level_4_refined.replace('Ductal', 'Ductal Cell')

In [None]:
adata.obs.groupby('Level_4_refined').size()

In [None]:
sc.pl.umap(adata, color=['Level_4_refined'], groups=['Malignant - Epithelial', 'Malignant - Mesenchymal'], size=5)

In [None]:
len(adata.obs.Level_4_refined.unique())

# Macrophages which express CD3

In [None]:
# Subset to the T-cell and macrophage populations (by Level_2 label).
# `.copy()` makes this an independent AnnData: neighbors/leiden/umap are run on
# it afterwards, and those should write into a real object rather than a view
# of `adata`.
t_cell_macro = adata[adata.obs.Level_2.isin([
    'T Cell',
    'Macrophage',
    'T-reg',
    'CD8+ T cell',
    'CD4+ T cell',
    'T cell',
])].copy()

In [None]:
t_cell_macro.obs.Level_2.unique()

In [None]:
# from sklearn_ann.kneighbors.annoy import AnnoyTransformer
# sc.pp.neighbors(adata, transformer=AnnoyTransformer(15), use_rep='scANVI_emb')

In [None]:
sc.pp.neighbors(t_cell_macro, use_rep='scanvi_emb', )
sc.tl.leiden(t_cell_macro, resolution=0.75)
sc.tl.umap(t_cell_macro)

In [None]:
sc.tl.leiden(t_cell_macro, resolution=2, flavor='igraph')

In [None]:
sc.pl.umap(t_cell_macro, color=['Technology', 'leiden', 'Level_2'], ncols=3, size=3,
          legend_loc='on data',
    legend_fontoutline=2, legend_fontsize=5)

In [None]:
sc.pl.umap(t_cell_macro, color=['leiden'], groups=['25'], ncols=3, size=3,
          legend_loc='on data',
    legend_fontoutline=2, legend_fontsize=5)

In [None]:
sc.pl.umap(t_cell_macro, color=['CD3D', 'CD68'], ncols=2, size=3, layer='log_norm')

In [None]:
mask = adata.obs_names.isin(t_cell_macro[ t_cell_macro.obs['leiden'] == '25' ].obs_names)

# Convert to str to avoid Categorical errors
adata.obs['Level_4_refined'] = adata.obs['Level_4_refined'].astype(str)

# Assign the new label
adata.obs.loc[mask, 'Level_4_refined'] = 'Macrophage - CD3+ TAM'

In [None]:
adata.write('2025_05_20_refined_annotation.h5ad')

# UMAP

In [None]:
plt.rcParams['figure.figsize'] = (10,10)
sc.pl.umap(adata, color='Level_4_refined', legend_loc='on data', legend_fontoutline=2, legend_fontsize=6)

# Reload

In [None]:
pwd

In [None]:
# Reload the checkpoint that this notebook writes ('2025_05_20_...'). The
# previous '2025_05_09_...' path is not written anywhere in the notebook as
# shown, so a fresh Restart & Run All would load a stale or missing file.
adata = sc.read_h5ad('2025_05_20_refined_annotation.h5ad')

In [None]:
adata

In [None]:
sc.pl.umap(adata, color=['Level_4', 'Level_4_refined'], ncols=1, legend_fontsize=6)

# Malignant epithelial

In [None]:
# Independent copy (not a view) of the malignant cells: the following cells add
# score columns, neighbors, UMAP and leiden results to this object.
malignant = adata[
    adata.obs['Level_4_refined'].isin(['Malignant - Epithelial', 'Malignant - Mesenchymal'])
].copy()

In [None]:
malignant

In [None]:
me_markers = {
    "Malignant Cell - Epithelial": ["EPCAM", "CLDN4", "CLDN7"],
    "Malignant Cell - Pit Like": ["GKN1", "GKN2", "CLDN18"],
    "Malignant Cell - Hypoxia": ["HIF1A", "VEGFA", "CA9"],
    "Malignant Cell - Highly Proliferative": ["MKI67", "CENPF", "TOP2A"],
    "Malignant Cell - EMT": ["ZEB1", "TWIST1", "CDH2"],
    "Malignant Cell - Acinar-like": ["REG3A", "REG3G", "CPA1"],
    "Malignant Cell - Invasive": ["MMP9", "MMP2", "MMP14"],
    "Malignant Cell - Senescence": ["CDKN1A", "CDKN2A", "LMNA"],
    "Malignant Cell - Apoptotic": ["BAX", "BCL2", "FAS"],
    "Malignant Cell - Mesenchymal": ["THY1", "COL3A1", 'FN1'],

}

In [None]:
# Score every malignant program; each score is stored in an obs column named
# after the program.
# NOTE(review): no `layer` is passed, so the scores are computed on
# `malignant.X`, while the EMT scoring earlier in this notebook uses
# layer='log_norm' — confirm that this is intended.
for cell_type, markers in me_markers.items():
    sc.tl.score_genes(malignant, gene_list=markers, score_name=cell_type)

program_names = list(me_markers.keys())
scores = malignant.obs[program_names].values

# Label each cell with its highest-scoring program.
# dtype=object matters: a fixed-width numpy string array would silently
# truncate "Missclassified" whenever every program name is shorter than it.
max_indices = np.argmax(scores, axis=1)
celltypes = np.array(program_names, dtype=object)[max_indices]

# Cells whose best score is below the threshold are left unresolved
threshold = 0.0
#np.mean(scores) * np.std(scores) * 50
max_scores = scores[np.arange(scores.shape[0]), max_indices]
celltypes[max_scores < threshold] = "Missclassified"

malignant.obs['celltype'] = celltypes


In [None]:
malignant.obs['celltype'].value_counts()

In [None]:
# malignant.obs['celltype'].replace({'Malignant Cell - Proliferative': 'Malignant Cell - Highly Proliferative'}, inplace=True)
sc.pl.dotplot(malignant,  groupby = 'celltype', var_names = list(me_markers.keys()), vmax=1, vmin=0, layer='log_norm', standard_scale='var')
sc.pl.dotplot(malignant,  groupby = 'celltype', var_names = me_markers, vmax=1, vmin=0, layer='log_norm', standard_scale='var')

In [None]:
malignant[malignant.obs.celltype == 'Missclassified'].obs.head()

# Resolve misclassified malignant cells with a kNN classifier

In [None]:
sc.pl.umap(malignant, color='celltype')

In [None]:
sc.pp.neighbors(malignant, use_rep='scanvi_emb')
sc.tl.umap(malignant)
sc.tl.leiden(malignant)

In [None]:
malignant.n_obs

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Use cells with confident annotations
confident_mask = malignant.obs['celltype'] != "Missclassified"
X_train = malignant.obsm["scanvi_emb"][confident_mask]
y_train = malignant.obs['celltype'][confident_mask].values

# Use misclassified cells for prediction
X_test_mask = ~confident_mask
X_test = malignant.obsm["scanvi_emb"][X_test_mask]

# Start from the marker-based labels; the confidence column is created up front
# so that it exists (as NaN) even when no cell needs to be re-labelled.
malignant.obs['celltype_knn'] = malignant.obs['celltype'].copy()
malignant.obs['knn_confidence'] = np.nan

# Only fit/predict when there is something to train on and something to
# predict; the classifier raises on empty input.
if confident_mask.any() and X_test_mask.any():
    # n_neighbors may not exceed the number of training cells
    knn = KNeighborsClassifier(n_neighbors=min(25, len(X_train)))
    knn.fit(X_train, y_train)

    # Predict labels and probabilities
    predicted_labels = knn.predict(X_test)
    probs = knn.predict_proba(X_test)
    max_confidences = probs.max(axis=1)  # Confidence = highest predicted class probability

    # Store results
    malignant.obs.loc[X_test_mask, 'celltype_knn'] = predicted_labels
    malignant.obs.loc[X_test_mask, 'knn_confidence'] = max_confidences

In [None]:
malignant[malignant.obs['celltype'] == "Missclassified"].obs.sort_values(by='knn_confidence', ascending=False).head()

In [None]:
malignant.obs.celltype_knn.value_counts()

In [None]:
malignant[malignant.obs['celltype'] == "Missclassified"].obs.celltype_knn.value_counts()

In [None]:
sc.pl.umap(malignant, color=['celltype', 'celltype_knn'], ncols=1, size=2)

In [None]:
malignant[malignant.obs['celltype'] == "Missclassified"].obs.head()

In [None]:
# import seaborn as sns

# confidence_threshold = 0.3

# malignant.obs['celltype_knn_final'] = malignant.obs['celltype_knn']
# malignant.obs['celltype_knn_final'] = malignant.obs['celltype_knn_final'].astype(str)
# malignant.obs.loc[(malignant.obs['knn_confidence'] < confidence_threshold) & (malignant.obs['celltype'] == "Missclassified"), 'celltype_knn_final'] = "LowConfidence"

# malignant.obs.celltype_knn_final.value_counts()

# malignant.obs.celltype_knn = malignant.obs.celltype_knn.astype(str)

In [None]:
sorted_me_markers = dict(sorted(me_markers.items()))

In [None]:
malignant.obs.celltype_knn = malignant.obs.celltype_knn.astype(str)

In [None]:
sc.pl.dotplot(malignant, groupby='celltype_knn', var_names = sorted_me_markers, dendrogram=False, layer='log_norm', standard_scale='var',
             categories_order=sorted(malignant.obs.celltype_knn.unique()))

In [None]:
malignant.obs.groupby(['celltype', 'celltype_knn']).size().unstack()

In [None]:
adata.obs['Level_5'] = adata.obs['Level_4_refined'].copy()

In [None]:
mask = malignant.obs_names

# Convert to str to avoid Categorical errors
adata.obs['Level_5'] = adata.obs['Level_5'].astype(str)

# Assign the new label
adata.obs.loc[mask, 'Level_5'] = malignant.obs['celltype_knn'].reindex(mask)

In [None]:
plt.rcParams['figure.figsize'] = (10,10)
sc.pl.umap(adata, color='Level_5', legend_loc='on data', legend_fontoutline=2, legend_fontsize=6)

In [None]:
adata.write('2025_05_20_refined_annotation.h5ad')

In [None]:
sc.pl.umap(malignant, color='celltype_knn')

In [None]:
# see if Malignant_classes are resolved by leiden clusters
malignant.obs.groupby(['leiden', 'celltype_knn']).size().unstack()

# Reload for T cells

In [None]:
# Reload the checkpoint written by this notebook ('2025_05_20_...'), which
# contains the Level_5 column used by the T-cell selection further down. The
# previous '2025_05_09_...' path is not written anywhere in the notebook as shown.
adata = sc.read_h5ad('2025_05_20_refined_annotation.h5ad')

In [None]:
adata

# Resolve T Cells

In [None]:
# cd8_subsets = {
#    "CD8+ Effector T Cell": ["GZMB", "GZMK", "PRF1", "IFNG"],
#    "CD8+ Exhausted T Cell": ["PDCD1", "HAVCR2", "LAG3", "TOX"],
#    "CD8+ Memory T Cell": ["CCR7", "CD27", "SELL"],
#    "CD8+ Naive T Cell": ["SELL", "CCR7", "LEF1"],
#    "CD8+ Terminal Effector T Cell": ["ZEB2", "GZMB", "IFNG", "TBX21"],
#    "CD8+ Tissue-Resident Memory T Cell": ["CD69", "ITGAE", "RUNX3", "CXCR6"]}

# cd4_subsets = {
#    "CD4+ Th1 Cell": ["STAT4", "CXCR3", "IFNG"],
#    "CD4+ Th2 Cell": ["GATA3", "CCR4", "PTGDR2"],
#    "CD4+ Th17 Cell": ["IL17A", "IL17F", "RORC", "KLRB1", "CCR6"],
#    "CD4+ Th22 Cell": ["IL22", "CCR10", "FOXO4"],
#    "CD4+ Naive Cell": ["CCR7", "SELL", "LEF1", "TCF7"],
#    "CD4+ Central Memory T Cell": ["IL7R", "GPR183", "CD69",],  # "IL7R", "TCF7"
#    "γδ T Cell (Vδ1)": ["TRDC"],
#    "T-reg": ["FOXP3", "IL2RA", "CTLA4", "TNFRSF18"], #"IKZF2", 
#    "Double Positive CD4+CD8+ T Cell": ["CD4", "CD8A", "CD8B"]}

In [None]:
adata.obs.Level_2.unique().tolist()

In [None]:
t_all = adata[(adata.obs.Level_2.isin(['T-reg', 'CD8+ T cell', 'CD4+ T cell', 'T cell'])) | (adata.obs.Level_5 == 'T Cell')]

In [None]:
t_all

In [None]:
# The label was misspelled as 'Ambiguous_Myelodi', which made the replacement
# a silent no-op. Working on plain strings avoids `.replace` on a categorical.
# NOTE(review): assumes the Level_2 label to rewrite is spelled
# 'Ambiguous_Myeloid' (as in Level_4) — confirm against the Level_2 values
# listed earlier.
t_all.obs['Level_2'] = (
    t_all.obs['Level_2'].astype(str).replace('Ambiguous_Myeloid', 'T cell')
)

In [None]:
sc.pl.dotplot(t_all, groupby='Level_2', var_names=['CD4', 'CD8A'], layer='log_norm', standard_scale='var')

In [None]:
# cd4_t = adata[adata.obs.Level_2.str.contains('CD4')]

In [None]:
cd4_t = adata[adata.obs.Level_2.isin(['CD4+ T cell', 'T-reg'])].copy()

In [None]:
cd4_t

In [None]:
cd4_t.obs.Level_2.unique()

In [None]:
cd8 = adata[adata.obs.Level_2.isin(['CD8+ T cell', 'T cell'])  | (adata.obs.Level_5 == 'T Cell')].copy()

In [None]:
cd8

In [None]:
cd8.obs.Level_2.unique()

# Change the markers and redo the scoring
- Remove the universally expressed markers: CD69, IL7R and RUNX3

In [None]:
cd8_subsets = {
   "CD8+ Effector T Cell": ["GZMB", "GZMK", "PRF1", "IFNG"],
   "CD8+ Exhausted T Cell": ["PDCD1", "HAVCR2", "LAG3", "TOX"],
   "CD8+ Memory T Cell": ["CCR7", "CD27", "SELL"],
   "CD8+ Naive T Cell": ["SELL", "CCR7", "LEF1"],
   "CD8+ Terminal Effector T Cell": ["ZEB2", "GZMB", "IFNG", "TBX21"],
   "CD8+ Tissue-Resident Memory T Cell": ["ITGAE", "CXCR6", "ZNF683"]}

In [None]:
cd4_subsets = {
   "CD4+ Th1 Cell": ["STAT4", "CXCR3", "IFNG"],
   "CD4+ Th2 Cell": ["GATA3", "CCR4", "PTGDR2"],
   "CD4+ Th17 Cell": ["IL17A", "IL17F", "RORC", "KLRB1", "CCR6"],
   "CD4+ Th22 Cell": ["IL22", "CCR10", "FOXO4"],
   "CD4+ Naive Cell": ["CCR7", "SELL", "LEF1", "TCF7"],
   "CD4+ Central Memory T Cell": ["GPR183", "TCF7", "SELL",],  # "IL7R", "TCF7"
   "γδ T Cell (Vδ1)": ["TRDC"],
   "T-reg": ["FOXP3", "IL2RA", "CTLA4", "TNFRSF18"], #"IKZF2", 
   "Double Positive CD4+CD8+ T Cell": ["CD4", "CD8A", "CD8B"]}

In [None]:
# Score every CD4 subset signature; each score is stored in an obs column named
# after the subset.
for cell_type, markers in cd4_subsets.items():
    sc.tl.score_genes(cd4_t, gene_list=markers, score_name=cell_type)

subset_names = list(cd4_subsets.keys())
scores = cd4_t.obs[subset_names].values

# Label each cell with its highest-scoring subset.
# dtype=object matters: a fixed-width numpy string array would silently
# truncate "Missclassified" whenever every subset name is shorter than it.
max_indices = np.argmax(scores, axis=1)
celltypes = np.array(subset_names, dtype=object)[max_indices]

# Cells whose best score is below the threshold are left unresolved
threshold = 0.0
max_scores = scores[np.arange(scores.shape[0]), max_indices]
celltypes[max_scores < threshold] = "Missclassified"

cd4_t.obs['celltype'] = celltypes

In [None]:
# Score every CD8 subset signature; each score is stored in an obs column named
# after the subset.
for cell_type, markers in cd8_subsets.items():
    sc.tl.score_genes(cd8, gene_list=markers, score_name=cell_type)

subset_names = list(cd8_subsets.keys())
scores = cd8.obs[subset_names].values

# Label each cell with its highest-scoring subset.
# dtype=object matters: a fixed-width numpy string array would silently
# truncate "Missclassified" whenever every subset name is shorter than it.
max_indices = np.argmax(scores, axis=1)
celltypes = np.array(subset_names, dtype=object)[max_indices]

# Cells whose best score is below the threshold are left unresolved
threshold = 0.0
max_scores = scores[np.arange(scores.shape[0]), max_indices]
celltypes[max_scores < threshold] = "Missclassified"

cd8.obs['celltype'] = celltypes

In [None]:
# cd4_t.obs['celltype'].replace('Missc', 'Missclassified', inplace=True)
sc.pl.dotplot(cd4_t, groupby = 'celltype', var_names = cd4_subsets, layer='log_norm', standard_scale='var')
sc.pl.dotplot(cd4_t, groupby = 'celltype', var_names = ['CD3E', 'CD4', 'CD8A', 'CD68'], layer='log_norm', standard_scale='var')
sc.pl.dotplot(cd4_t, groupby = 'celltype', var_names = list(cd4_subsets.keys()), layer='log_norm', standard_scale='var')

In [None]:
# cd4_t.obs['celltype'].replace('Missc', 'Missclassified', inplace=True)
sc.pl.dotplot(cd8, groupby = 'celltype', var_names = cd8_subsets, layer='log_norm', standard_scale='var')
sc.pl.dotplot(cd8, groupby = 'celltype', var_names = ['CD4', 'CD8A'], layer='log_norm', standard_scale='var')
sc.pl.dotplot(cd8, groupby = 'celltype', var_names = list(cd8_subsets.keys()), layer='log_norm', standard_scale='var')

In [None]:
print(cd8.obs['celltype'].value_counts())
cd4_t.obs['celltype'].value_counts()

# Resolve misclassified with KNN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Re-label cells flagged "Missclassified" with a kNN classifier trained on the
# confidently labelled cells of the same subset, using obsm["scanvi_emb"].
for t_subset in [cd4_t, cd8]:
    # Cells with a marker-based label form the training set; the rest are predicted.
    confident_mask = t_subset.obs['celltype'] != "Missclassified"
    X_test_mask = ~confident_mask

    # Start from the marker-based labels. The confidence column is reset so that
    # re-running the cell never leaves stale values; it stays NaN for cells
    # that were not re-labelled.
    t_subset.obs['celltype_knn'] = t_subset.obs['celltype'].copy()
    t_subset.obs['knn_confidence'] = np.nan

    # Nothing to resolve: scikit-learn rejects an empty prediction matrix,
    # so skip the classifier instead of crashing.
    if not X_test_mask.any():
        continue

    X_train = t_subset.obsm["scanvi_emb"][confident_mask]
    y_train = t_subset.obs['celltype'][confident_mask].values
    X_test = t_subset.obsm["scanvi_emb"][X_test_mask]

    # Fit kNN classifier
    knn = KNeighborsClassifier(n_neighbors=25)
    knn.fit(X_train, y_train)

    # Predict labels and probabilities
    predicted_labels = knn.predict(X_test)
    probs = knn.predict_proba(X_test)
    max_confidences = probs.max(axis=1)  # Confidence = highest predicted class probability

    # Store results for the re-labelled cells only
    t_subset.obs.loc[X_test_mask, 'celltype_knn'] = predicted_labels
    t_subset.obs.loc[X_test_mask, 'knn_confidence'] = max_confidences

In [None]:
# For each T-cell subset: final label counts, then the labels that the
# previously "Missclassified" cells ended up with.
for t_subset in [cd4_t, cd8]:
    print(t_subset.obs['celltype_knn'].value_counts())
    print('Misclassified Classified as ....')
    previously_unresolved = t_subset[t_subset.obs['celltype'] == "Missclassified"]
    print(previously_unresolved.obs['celltype_knn'].value_counts())

In [None]:
# Cast the kNN-resolved labels to plain strings in both subsets.
for t_subset in (cd4_t, cd8):
    t_subset.obs['celltype_knn'] = t_subset.obs['celltype_knn'].astype(str)

In [None]:
# Same three dot plots as for the marker-based labels, now grouped by the kNN-resolved label.
cd4_knn_dotplot_panels = (
    cd4_subsets,
    ['CD3E', 'CD4', 'CD8A', 'CD68'],
    list(cd4_subsets.keys()),
)
for panel in cd4_knn_dotplot_panels:
    sc.pl.dotplot(cd4_t, groupby='celltype_knn', var_names=panel,
                  layer='log_norm', standard_scale='var')

In [None]:
# Same three dot plots as for the marker-based labels, now grouped by the kNN-resolved label.
cd8_knn_dotplot_panels = (
    cd8_subsets,
    ['CD4', 'CD8A'],
    list(cd8_subsets.keys()),
)
for panel in cd8_knn_dotplot_panels:
    sc.pl.dotplot(cd8, groupby='celltype_knn', var_names=panel,
                  layer='log_norm', standard_scale='var')

In [None]:
# Level_5 is cast to plain strings first so that new label values can be
# assigned without Categorical errors.
adata.obs['Level_5'] = adata.obs['Level_5'].astype(str)

# Copy the kNN-resolved CD8 labels onto the matching cells of the full object.
mask = cd8.obs_names
cd8_labels = cd8.obs['celltype_knn'].reindex(mask)
adata.obs.loc[mask, 'Level_5'] = cd8_labels

In [None]:
# Level_5 is cast to plain strings first so that new label values can be
# assigned without Categorical errors.
adata.obs['Level_5'] = adata.obs['Level_5'].astype(str)

# Copy the kNN-resolved CD4 labels onto the matching cells of the full object.
mask = cd4_t.obs_names
cd4_labels = cd4_t.obs['celltype_knn'].reindex(mask)
adata.obs.loc[mask, 'Level_5'] = cd4_labels

In [None]:
# Cells per Level_5 label after the T-cell labels were written back.
adata.obs['Level_5'].value_counts()

In [None]:
# Checkpoint the annotated object to disk. The same file name is written again
# by the final "Save" cell, which overwrites this intermediate copy.
adata.write('2025_05_20_refined_annotation.h5ad')

# Endothelial

In [None]:
# Cell counts per (Level_1, Level_5) pair, pivoted so Level_5 forms the columns.
# The index is kept sticky while scrolling the rendered table.
level_counts = (
    adata.obs
    .groupby(['Level_1', 'Level_5'])
    .size()
    .unstack()
)
level_counts.style.set_sticky('index')

In [None]:
# NOTE(review): hand-typed scratch sum; presumably cell counts read off the
# table above. TODO confirm and compute from adata.obs instead of hard-coding.
281 + 13572 + 21709 + 2676

In [None]:
# Select every cell whose Level_5 label contains "Endothelial".
# Slicing an AnnData returns a view; later cells add obs columns and a
# neighbour graph to this subset, so make it an independent object up front
# instead of relying on anndata's implicit copy-on-write of views.
endothelial = adata[adata.obs.Level_5.str.contains('Endothelial')].copy()

In [None]:
# Display the AnnData summary of the endothelial subset.
endothelial

In [None]:
# Distinct Level_5 labels present in the endothelial subset.
endothelial.obs['Level_5'].unique().tolist()

In [None]:
# Marker gene panels per endothelial subset. Keys double as the obs column
# names of the scores and as the candidate labels; insertion order is kept.
endothelial_markers = {
    "Tumor-Associated Endothelial Cell": [
        "DDIT4", "TIE1", "SEMA6B", "PLCB1", "LYZ",
    ],
    "Vascular Endothelial Cell": [
        "PECAM1", "CDH5", "PLVAP", "EHD4", "CLEC14A",
    ],
    "Lymphatic Endothelial Cell": [
        "PROX1", "PDPN", "LYVE1", "FLT4",
    ],
}

In [None]:
# Score every endothelial cell against each marker panel. sc.tl.score_genes
# writes one obs column per panel, named after the subset.
for cell_type, markers in endothelial_markers.items():
    sc.tl.score_genes(endothelial, gene_list=markers, score_name=cell_type)
scores = endothelial.obs[list(endothelial_markers.keys())].values

# Label each cell with its highest-scoring subset.
max_indices = np.argmax(scores, axis=1)
# dtype=object keeps full-length Python strings. A fixed-width NumPy string
# array silently truncates assigned values that are longer than its widest
# element, which would corrupt "Missclassified" if the panel names were short.
celltypes = np.array(list(endothelial_markers.keys()), dtype=object)[max_indices]

# Cells whose best score is below the threshold are flagged for later resolution.
threshold = 0.0
max_scores = scores[np.arange(scores.shape[0]), max_indices]
celltypes[max_scores < threshold] = "Missclassified"

endothelial.obs['celltype'] = celltypes

In [None]:
# Cells per marker-based endothelial label (including "Missclassified").
endothelial.obs.celltype.value_counts()

In [None]:
# Dot plots grouped by the marker-based label: first the marker genes, then
# the per-subset score columns written by sc.tl.score_genes.
for panel in (endothelial_markers, list(endothelial_markers.keys())):
    sc.pl.dotplot(endothelial, groupby='celltype', var_names=panel,
                  layer='log_norm', standard_scale='var')

In [None]:
# Re-label endothelial cells flagged "Missclassified" with a kNN classifier
# trained on the confidently labelled cells, using obsm["scanvi_emb"].

# Cells with a marker-based label form the training set; the rest are predicted.
confident_mask = endothelial.obs['celltype'] != "Missclassified"
X_test_mask = ~confident_mask

# Start from the marker-based labels. The confidence column is reset so that
# re-running the cell never leaves stale values; it stays NaN for cells that
# were not re-labelled.
endothelial.obs['celltype_knn'] = endothelial.obs['celltype'].copy()
endothelial.obs['knn_confidence'] = np.nan

# scikit-learn rejects an empty prediction matrix, so only run the classifier
# when there is at least one cell to resolve.
if X_test_mask.any():
    X_train = endothelial.obsm["scanvi_emb"][confident_mask]
    y_train = endothelial.obs['celltype'][confident_mask].values
    X_test = endothelial.obsm["scanvi_emb"][X_test_mask]

    # Fit kNN classifier
    knn = KNeighborsClassifier(n_neighbors=25)
    knn.fit(X_train, y_train)

    # Predict labels and probabilities
    predicted_labels = knn.predict(X_test)
    probs = knn.predict_proba(X_test)
    max_confidences = probs.max(axis=1)  # Confidence = highest predicted class probability

    # Store results for the re-labelled cells only
    endothelial.obs.loc[X_test_mask, 'celltype_knn'] = predicted_labels
    endothelial.obs.loc[X_test_mask, 'knn_confidence'] = max_confidences

In [None]:
# Final label counts, then the labels given to the previously "Missclassified" cells.
print(endothelial.obs['celltype_knn'].value_counts())
print('Misclassified Classified as ....')
previously_unresolved = endothelial[endothelial.obs['celltype'] == "Missclassified"]
print(previously_unresolved.obs['celltype_knn'].value_counts())

In [None]:
# Cast the kNN-resolved labels to plain strings.
endothelial.obs['celltype_knn'] = endothelial.obs['celltype_knn'].astype(str)

In [None]:
# Same two dot plots as before, now grouped by the kNN-resolved label.
for panel in (endothelial_markers, list(endothelial_markers.keys())):
    sc.pl.dotplot(endothelial, groupby='celltype_knn', var_names=panel,
                  layer='log_norm', standard_scale='var')

In [None]:
# Build a neighbour graph on the endothelial subset from obsm['scanvi_emb'],
# then compute a UMAP layout and Leiden clusters from that graph.
# Order matters: umap and leiden both need the graph from sc.pp.neighbors.
sc.pp.neighbors(endothelial, use_rep='scanvi_emb')
sc.tl.umap(endothelial)
sc.tl.leiden(endothelial)

In [None]:
# Small panels: endothelial UMAP coloured by kNN-resolved label and by Leiden cluster.
plt.rcParams.update({'figure.figsize': (4, 4)})
sc.pl.umap(
    endothelial,
    color=['celltype_knn', 'leiden'],
    size=4,
    wspace=0.6,
)

In [None]:
# Level_5 is cast to plain strings first so that new label values can be
# assigned without Categorical errors.
adata.obs['Level_5'] = adata.obs['Level_5'].astype(str)

# Copy the kNN-resolved endothelial labels onto the matching cells of the full object.
mask = endothelial.obs_names
endothelial_labels = endothelial.obs['celltype_knn'].reindex(mask)
adata.obs.loc[mask, 'Level_5'] = endothelial_labels

# Replace names for M1-like and M2-like

In [None]:
# Rename the two TAM labels to their "-like" form in a single mapping-based replace.
tam_renames = {
    'M1 TAM': 'M1-like TAM',
    'M2 TAM': 'M2-like TAM',
}
adata.obs['Level_5'] = adata.obs['Level_5'].replace(tam_renames)

# B cell

In [None]:
# Select the cells annotated as 'B cell' or 'Plasma cell' at Level_2.
# Slicing an AnnData returns a view; later cells add obs columns to this
# subset, so make it an independent object up front instead of relying on
# anndata's implicit copy-on-write of views.
b_cells = adata[adata.obs.Level_2.isin(['B cell', 'Plasma cell'])].copy()

In [None]:
# Display the AnnData summary of the B / plasma cell subset.
b_cells

In [None]:
# Cells per Level_2 label within the B / plasma cell subset.
b_cells.obs['Level_2'].value_counts()

In [None]:
# Marker gene panels per B-lineage subset. Keys double as the obs column names
# of the scores and as the candidate labels; insertion order is kept.
# Some genes (IGHM, CD27, PRDM1) appear in more than one panel.
b_markers = {
    "B Cell - Naive": [
        "IL7R", "IGHM", "TCL1A", "CD19",
    ],
    "B Cell - Activated": [
        "IGHM", "CD69", "CD86",
    ],
    "B Cell - Memory": [
        "CD27", "IGHE", "IGHA1",
    ],
    "B-reg": [
        "TFRC", "CD44", "TGFB1",
    ],
    "Plasma Cell": [
        "MZB1", "XBP1", "PRDM1", "SDC1",
    ],
    "Plasmablast": [
        "CD27", "CD38", "PRDM1", "IGHG1", "MKI67",
    ],
    "B Cell - Germinal Center": [
        "AICDA", "BCL6", "RGS13", "S1PR2",
    ],
}

In [None]:
# Score every cell in b_cells against each marker panel. sc.tl.score_genes
# writes one obs column per panel, named after the subset.
for cell_type, markers in b_markers.items():
    sc.tl.score_genes(b_cells, gene_list=markers, score_name=cell_type)
scores = b_cells.obs[list(b_markers.keys())].values

# Label each cell with its highest-scoring subset.
max_indices = np.argmax(scores, axis=1)
# dtype=object keeps full-length Python strings. A fixed-width NumPy string
# array silently truncates assigned values that are longer than its widest
# element, which would corrupt "Missclassified" if the panel names were short.
celltypes = np.array(list(b_markers.keys()), dtype=object)[max_indices]

# Cells whose best score is below the threshold are flagged for later resolution.
threshold = 0.0
max_scores = scores[np.arange(scores.shape[0]), max_indices]
celltypes[max_scores < threshold] = "Missclassified"

b_cells.obs['celltype'] = celltypes

In [None]:
# Cells per marker-based B-lineage label (including "Missclassified").
b_cells.obs.celltype.value_counts()

In [None]:
# Dot plots grouped by the marker-based label: first the marker genes, then
# the per-subset score columns written by sc.tl.score_genes.
for panel in (b_markers, list(b_markers.keys())):
    sc.pl.dotplot(b_cells, groupby='celltype', var_names=panel,
                  layer='log_norm', standard_scale='var')

In [None]:
# Re-label B-lineage cells flagged "Missclassified" with a kNN classifier
# trained on the confidently labelled cells, using obsm["scanvi_emb"].

# Cells with a marker-based label form the training set; the rest are predicted.
confident_mask = b_cells.obs['celltype'] != "Missclassified"
X_test_mask = ~confident_mask

# Start from the marker-based labels. The confidence column is reset so that
# re-running the cell never leaves stale values; it stays NaN for cells that
# were not re-labelled.
b_cells.obs['celltype_knn'] = b_cells.obs['celltype'].copy()
b_cells.obs['knn_confidence'] = np.nan

# scikit-learn rejects an empty prediction matrix, so only run the classifier
# when there is at least one cell to resolve.
if X_test_mask.any():
    X_train = b_cells.obsm["scanvi_emb"][confident_mask]
    y_train = b_cells.obs['celltype'][confident_mask].values
    X_test = b_cells.obsm["scanvi_emb"][X_test_mask]

    # Fit kNN classifier
    knn = KNeighborsClassifier(n_neighbors=25)
    knn.fit(X_train, y_train)

    # Predict labels and probabilities
    predicted_labels = knn.predict(X_test)
    probs = knn.predict_proba(X_test)
    max_confidences = probs.max(axis=1)  # Confidence = highest predicted class probability

    # Store results for the re-labelled cells only
    b_cells.obs.loc[X_test_mask, 'celltype_knn'] = predicted_labels
    b_cells.obs.loc[X_test_mask, 'knn_confidence'] = max_confidences

In [None]:
# Final label counts, then the labels given to the previously "Missclassified" cells.
print(b_cells.obs['celltype_knn'].value_counts())
print('Misclassified Classified as ....')
previously_unresolved = b_cells[b_cells.obs['celltype'] == "Missclassified"]
print(previously_unresolved.obs['celltype_knn'].value_counts())

In [None]:
# Cast the kNN-resolved labels to plain strings.
b_cells.obs['celltype_knn'] = b_cells.obs['celltype_knn'].astype(str)

In [None]:
# Same two dot plots as before, now grouped by the kNN-resolved label.
for panel in (b_markers, list(b_markers.keys())):
    sc.pl.dotplot(b_cells, groupby='celltype_knn', var_names=panel,
                  layer='log_norm', standard_scale='var')

In [None]:
# Level_5 is cast to plain strings first so that new label values can be
# assigned without Categorical errors.
adata.obs['Level_5'] = adata.obs['Level_5'].astype(str)

# Copy the kNN-resolved B-lineage labels onto the matching cells of the full object.
mask = b_cells.obs_names
b_cell_labels = b_cells.obs['celltype_knn'].reindex(mask)
adata.obs.loc[mask, 'Level_5'] = b_cell_labels

# Final UMAP

In [None]:
# Full-atlas UMAP coloured by the refined Level_5 labels, with the default legend placement.
plt.rcParams.update({'figure.figsize': (8, 8)})
sc.pl.umap(
    adata,
    color='Level_5',
    size=5,
)

In [None]:
# Larger version of the Level_5 UMAP with the labels drawn on top of the clusters.
plt.rcParams.update({'figure.figsize': (12, 12)})
sc.pl.umap(
    adata,
    color='Level_5',
    size=2,
    legend_loc='on data',
    legend_fontsize=5,
    legend_fontoutline=3,
)

In [None]:
# Number of entries in the Level_5 value-count table.
len(adata.obs['Level_5'].value_counts())

In [None]:
# Cells per final Level_5 label.
adata.obs['Level_5'].value_counts()

# Save

In [None]:
# Save the fully annotated object. This overwrites the checkpoint that was
# written to the same file name earlier in the notebook.
adata.write('2025_05_20_refined_annotation.h5ad')

In [None]:
# Display the AnnData summary of the saved object.
adata

In [None]:
# Show the current working directory (relies on IPython automagic for %pwd).
pwd

In [None]:
# NOTE(review): scratch arithmetic on a hard-coded number; presumably the total
# cell count divided by 10,000. TODO confirm and derive it from adata.n_obs.
round(916312/10000)