# Markers

In [None]:
from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats

In [None]:
# Keep only cells from primary-tumour or healthy samples whose Level_2
# annotation is malignant or exocrine; both criteria are applied as one mask.
adata_subset = adata[
    adata.obs.Condition.isin(['Primary Tumour', 'Healthy'])
    & adata.obs.Level_2.isin(['Malignant Cell', 'Exocrine Cell'])
]

In [None]:
# Cells entering the differential-expression comparison:
#   * healthy samples   -> Level_3 == 'Ductal Cell'
#   * primary tumours   -> any Level_3 label containing 'Mal'
# na=False keeps the mask strictly boolean when Level_3 has missing labels.
_healthy_ductal = (
    (adata_subset.obs.Condition == 'Healthy')
    & (adata_subset.obs.Level_3 == 'Ductal Cell')
)
_tumour_malignant = (
    (adata_subset.obs.Condition == 'Primary Tumour')
    & adata_subset.obs.Level_3.str.contains('Mal', na=False)
)
# .copy() turns the view into an independent AnnData: the object is filtered
# in place further down, and modifying a view forces an implicit copy with a
# warning.
adata_deseq = adata_subset[_healthy_ductal | _tumour_malignant].copy()

In [None]:
# Cell counts per Condition x Level_3 in the DESeq subset (rows: Condition, columns: Level_3).
adata_deseq.obs.groupby(['Condition', 'Level_3']).size().unstack()

In [None]:
# UMAPs coloured by Condition and Level_3, drawn for three nested selections.
# The calls are kept as separate statements in this order: the ductal-only
# view on the second line is created after the first plot has run.
# 1) every cell of the tumour/healthy subset
sc.pl.umap(adata_subset, color=['Condition', 'Level_3'])
# 2) only the cells labelled 'Ductal Cell' at Level_3
sc.pl.umap(adata_subset[adata_subset.obs.Level_3 == 'Ductal Cell'], color=['Condition', 'Level_3'])
# 3) the cells selected for the DESeq comparison
sc.pl.umap(adata_deseq, color=['Condition', 'Level_3'])

In [None]:
# Drop genes detected in fewer than 100 cells of adata_deseq (scanpy applies this filter in place by default).
sc.pp.filter_genes(adata_deseq, min_cells=100)

In [None]:
# Cell counts per Dataset x Level_3 (rows: Dataset, columns: Level_3).
adata_deseq.obs.groupby(['Dataset', 'Level_3']).size().unstack()

In [None]:
# Cell counts per Level_2 label in the DESeq subset.
adata_deseq.obs.Level_2.value_counts()

In [None]:
# Number of distinct Dataset_ID values contributing cells to each Condition.
adata_deseq.obs.groupby('Condition')['Dataset_ID'].nunique()

In [None]:
# Largest value stored in X.
# NOTE(review): presumably a sanity check that X holds raw counts rather than
# normalised values before pseudobulking — confirm.
adata_deseq.X.max()

In [None]:
# Cells x genes DataFrame built from adata_deseq.X (to_df densifies a sparse X, so this can be memory-heavy).
adata_deseq_df = adata_deseq.to_df()

In [None]:
# Look up each cell's Dataset_ID by its obs name and store it as an extra
# column next to the gene columns, then preview the first rows.
_cell_to_dataset = dict(zip(adata_subset.obs_names, adata_subset.obs.Dataset_ID))
adata_deseq_df['Dataset_ID'] = adata_deseq_df.index.map(_cell_to_dataset)
adata_deseq_df.head()

In [None]:
# Pseudobulk: one row per Dataset_ID holding the column-wise SUM over its
# cells (the variable is called pb_mean, but the aggregation is a sum).
gene_cols = adata_deseq_df.columns.drop("Dataset_ID")
pb_mean = (
    adata_deseq_df
    .groupby("Dataset_ID")[gene_cols]
    .agg("sum")
)

In [None]:
import warnings

# Sample-level labels are taken from adata_deseq, i.e. from the cells that
# were actually summed into each pseudobulk row, rather than from the broader
# adata_subset, which also contains cells excluded from the comparison.
_agg_obs = adata_deseq.obs

# A pseudobulk row is only meaningful for the Condition contrast if all of its
# cells share one Condition. Flag violations instead of silently keeping
# whichever label happens to come last in the lookup.
_n_conditions = _agg_obs.groupby('Dataset_ID', observed=True)['Condition'].nunique()
_mixed = _n_conditions[_n_conditions > 1].index.tolist()
if _mixed:
    warnings.warn(
        "Dataset_ID values pooling cells from more than one Condition: "
        f"{_mixed}. Their pseudobulk rows mix conditions; aggregate by "
        "Dataset_ID and Condition (or by sample) instead."
    )

# The index already holds the Dataset_ID; mirror it as a regular column.
pb_mean['Dataset_ID'] = pb_mean.index
pb_mean['Condition'] = pb_mean.index.map(dict(zip(_agg_obs.Dataset_ID, _agg_obs.Condition)))

In [None]:
# Preview the first rows of the pseudobulk table.
pb_mean.head()

In [None]:
# del adata_subset_df
# import gc
# gc.collect()

In [None]:
# Split the pseudobulk table into the sample annotation frame and the
# gene-count matrix handed to DESeq2.
_annotation_cols = ['Dataset_ID', 'Condition']
# .copy() makes metadata an independent frame, so adding columns to it later
# does not raise pandas' SettingWithCopyWarning.
metadata = pb_mean[_annotation_cols].copy()
# Select the count columns by name instead of by position, so the split does
# not depend on the annotation columns being the last two.
counts = pb_mean.drop(columns=_annotation_cols)

In [None]:
# Annotate each pseudobulk sample with the Technology recorded for its Dataset_ID.
_dataset_to_technology = dict(zip(adata_subset.obs.Dataset_ID, adata_subset.obs.Technology))
metadata['Technology'] = metadata.index.map(_dataset_to_technology)

In [None]:
# Display the sample annotation table that is passed to DeseqDataSet as `metadata`.
metadata

In [None]:
# Step 1: assemble the DESeq2 dataset from the pseudobulk counts and the
# sample annotations; Condition and Technology are the design factors.
_dds_options = {
    "design_factors": ["Condition", "Technology"],
    "refit_cooks": True,
    "n_cpus": -1,
}
dds = DeseqDataSet(counts=counts, metadata=metadata, **_dds_options)

# Step 2: run the DESeq2 fitting pipeline on the dataset.
dds.deseq2()

In [None]:
# Step 3: statistics for the Condition contrast, Primary Tumour (tested
# level) against Healthy (reference level); summary() computes and prints
# the results table.
stat_res = DeseqStats(
    dds,
    contrast=["Condition", "Primary Tumour", "Healthy"],
)
stat_res.summary()

In [None]:
# 4. Keep the per-gene results table (one row per gene) for downstream ranking and plotting.
res_df = stat_res.results_df #.sort_values(by='log2FoldChange', ascending=False)

In [None]:
# Display the full DESeq2 results table.
res_df #.columns

In [None]:
import numpy as np

# Smallest positive normal float; used as a floor so -log10(padj) stays finite
# when padj underflows to 0.
eps = np.finfo(float).tiny

# Keep genes with complete statistics that are significant (padj < 0.05) and
# reasonably expressed (baseMean >= 10). The copy is taken AFTER filtering so
# the column assignments below act on an independent frame rather than on the
# output of .query(), which avoids pandas' chained-assignment warning.
res = res_df.dropna().query("padj < 0.05 and baseMean >= 10").copy()

res['padj_safe'] = res['padj'].clip(lower=eps)

# Ranking score = |log2FC| * -log10(padj) * log10(1 + baseMean).
# The sign of the fold change is ignored, so up- and down-regulated genes are
# ranked together.
res['score'] = res['log2FoldChange'].abs() * (-np.log10(res['padj_safe'])) * np.log10(1 + res['baseMean'])
top = res.sort_values('score', ascending=False)

In [None]:
import os

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing the ranked DGE table.
os.makedirs('Biomarkers', exist_ok=True)
# NOTE: the file name spells "psedobulk"; it is kept unchanged so that anything
# already reading this path keeps working.
top.to_csv('Biomarkers/pydeseq2_dge_psedobulk.csv')

In [None]:
# Twenty highest-scoring genes.
top.head(20)

In [None]:
# Gene names of the 200 / 500 / 1000 best-scoring genes, in ranked order.
top_200_deseq, top_500_deseq, top_1000_deseq = (
    top.head(n_genes).index.tolist() for n_genes in (200, 500, 1000)
)

In [None]:
# Ranked entries whose gene name contains "CLDN18" (at most 50 rows).
top.loc[top.index.str.contains("CLDN18")].head(50)

In [None]:
# Dot plots (grouped by Level_2) for ranked genes 1-100, 50 genes per figure.
for start in (0, 50):
    sc.pl.dotplot(
        adata,
        var_names=top_200_deseq[start:start + 50],
        groupby='Level_2',
        layer='log_norm',
        standard_scale='var',
        swap_axes=False,
    )

In [None]:
# Dot plots (grouped by Level_2) for ranked genes 101-200, 50 genes per figure.
for start in (100, 150):
    sc.pl.dotplot(
        adata,
        var_names=top_200_deseq[start:start + 50],
        groupby='Level_2',
        layer='log_norm',
        standard_scale='var',
        swap_axes=False,
    )

In [None]:
# Dot plots (grouped by Level_2) for ranked genes 201-300, 50 genes per figure.
for start in (200, 250):
    sc.pl.dotplot(
        adata,
        var_names=top_500_deseq[start:start + 50],
        groupby='Level_2',
        layer='log_norm',
        standard_scale='var',
        swap_axes=False,
    )

In [None]:
# Dot plots (grouped by Level_2) for ranked genes 301-400, 50 genes per figure.
for start in (300, 350):
    sc.pl.dotplot(
        adata,
        var_names=top_500_deseq[start:start + 50],
        groupby='Level_2',
        layer='log_norm',
        standard_scale='var',
        swap_axes=False,
    )

In [None]:
# Dot plots (grouped by Level_2) for ranked genes 401-500, 50 genes per figure.
for start in (400, 450):
    sc.pl.dotplot(
        adata,
        var_names=top_500_deseq[start:start + 50],
        groupby='Level_2',
        layer='log_norm',
        standard_scale='var',
        swap_axes=False,
    )

In [None]:
# Dot plots (grouped by Level_2) for ranked genes 501-600, 50 genes per figure.
for start in (500, 550):
    sc.pl.dotplot(
        adata,
        var_names=top_1000_deseq[start:start + 50],
        groupby='Level_2',
        layer='log_norm',
        standard_scale='var',
        swap_axes=False,
    )

In [None]:
# Dot plots (grouped by Level_2) for ranked genes 601-700, 50 genes per figure.
for start in (600, 650):
    sc.pl.dotplot(
        adata,
        var_names=top_1000_deseq[start:start + 50],
        groupby='Level_2',
        layer='log_norm',
        standard_scale='var',
        swap_axes=False,
    )

In [None]:
# Dot plots (grouped by Level_2) for ranked genes 701-800, 50 genes per figure.
for start in (700, 750):
    sc.pl.dotplot(
        adata,
        var_names=top_1000_deseq[start:start + 50],
        groupby='Level_2',
        layer='log_norm',
        standard_scale='var',
        swap_axes=False,
    )

In [None]:
# Dot plots (grouped by Level_2) for ranked genes 801-900, 50 genes per figure.
for start in (800, 850):
    sc.pl.dotplot(
        adata,
        var_names=top_1000_deseq[start:start + 50],
        groupby='Level_2',
        layer='log_norm',
        standard_scale='var',
        swap_axes=False,
    )

In [None]:
# Dot plots (grouped by Level_2) for ranked genes 901-1000, 50 genes per figure.
# top_1000_deseq holds at most 1000 names, so the slice 950:1000 selects the
# same genes as an open-ended 950: slice.
for start in (900, 950):
    sc.pl.dotplot(
        adata,
        var_names=top_1000_deseq[start:start + 50],
        groupby='Level_2',
        layer='log_norm',
        standard_scale='var',
        swap_axes=False,
    )

In [None]:
# Ranked entries whose gene name contains "MGST1" (at most 50 rows).
top.loc[top.index.str.contains("MGST1")].head(50)

In [None]:
# Ranked entries whose gene name contains the substring "SH3" (at most 50 rows).
top.loc[top.index.str.contains("SH3")].head(50)

In [None]:
# All ranked entries whose gene name contains "AMOTL2".
top.loc[top.index.str.contains("AMOTL2")]

In [None]:
# Hand-curated panel of candidate marker genes (order preserved).
interesting_genes = [
    "CEACAM6", "S100P", "GPRC5A", "FXYD3", "ATP1A1", "TFF3", "TFF1", "C19orf33",
    "IFI27", "CLU", "PLCG2", "PSCA", "CEACAM5", "CD55", "C15orf48", "FXYD5",
    "TFF2", "AREG", "LAMB3", "AHNAK2", "TMPRSS4", "SLC16A3", "NQO1", "LAMC2",
    "MUC3A", "TRIM29", "MIA", "PLAT", "TSPAN1", "SH3YL1", "PKM", "KCNK1",
    "TMC5", "SFN", "GABRP", "GCNT3", "MALL", "MGST1", "KLF5", "SLPI",
    "GPX2", "KLK10", "MET", "GDF15", "LEMD1", "HK2", "MMP28", "MISP",
    "TCN1", "SLC2A1", "EGLN3", "MSLN", "CDCP1", "SDR16C5", "PLAC8", "MTMR11",
    "MBOAT2", "AOC1", "PLEC", "SYT8", "TRIM31", "GALNT5", "PITX1", "CTSE",
    "AMOTL2", "LY6D", "TMEM45B", "CCND1", "ITGA2", "CORO2A", "PI3", "LAMA3",
]

In [None]:
# interesting_genes = ['RFLNA', 'TNFRSF6B', 'MUCL3', 'CEACAM5', x, 'PADI1', 'HOXC8', 'IQANK1', 'CRYBG2', 'PHGR1', 'PSCA', 'TFF3', 
#                      'MIA', 'FXYD3', 'NECTIN4', 'CLN3', 'ZG16B', 'C15orf48', 'TRIM29', 'GPRC5A', 'GABRP', 'KLK10', 'TMPRSS4', 'LEMD1', 'GCNT3', 'DUOX2', 
#                      'KLK8', 'TRIM31', 'IFI27', 'LAMB3', 'AOC1', 'TCN1', 'AHNAK2', 'FXYD5', 'MAIP1']

In [None]:
# Number of genes in the candidate panel.
len(interesting_genes)

In [None]:
# Candidate panel across the Level_4 annotation; genes on rows (swap_axes),
# each gene scaled to its own range (standard_scale='var').
sc.pl.dotplot(
    adata,
    var_names=interesting_genes,
    groupby='Level_4',
    layer='log_norm',
    standard_scale='var',
    swap_axes=True,
)

In [None]:
# Shortlist taken forward from the candidate panel.
interesting_genes_subset = [
    'GCNT3', 'MGST1', 'KCNK1', 'C19orf33', 'MALL', 'TMC5',
    'SFN', 'PKM', 'PLCG2', 'SH3YL1', 'ATP1A1', 'C15orf48',
    'SDCBP2', 'ABHD17C', 'AMOTL2', 'CORO2A', 'AOC1', 'PI3',
]

# Second shortlist, kept under the name interesting_genes_no_reported.
interesting_genes_no_reported = [
    'PLCG2', 'SH3YL1', 'ATP1A1', 'C15orf48', 'SDCBP2',
    'ABHD17C', 'AMOTL2', 'CORO2A', 'AOC1', 'PI3',
]

In [None]:
# Shortlisted genes at each annotation depth, one dot plot per level.
for annotation_level in ('Level_2', 'Level_3', 'Level_4'):
    sc.pl.dotplot(
        adata,
        var_names=interesting_genes_subset,
        groupby=annotation_level,
        layer='log_norm',
        standard_scale='var',
        swap_axes=True,
    )

In [None]:
# Final selection after reviewing the plots.
#   dropped for negative log2 fold change: 'AMOTL2'
#   dropped for no or weak signal: 'PLCG2', 'SH3YL1', 'ATP1A1', 'C15orf48'
interesting_genes_no_reported = [
    'SDCBP2',
    'ABHD17C',
    'CORO2A',
    'AOC1',
    'PI3',
]

In [None]:
# Save scanpy figures at 300 dpi from here on.
sc.set_figure_params(dpi_save=300)

In [None]:
# Matrix plots of the final selection, one per annotation level, each written
# to its own file through scanpy's `save` argument.
for annotation_level, out_name in (
    ('Level_2', 'matrix_L2.png'),
    ('Level_3', 'matrix_L3.png'),
    ('Level_4', 'matrix_L4.png'),
):
    sc.pl.matrixplot(
        adata,
        var_names=interesting_genes_no_reported,
        groupby=annotation_level,
        layer='log_norm',
        swap_axes=True,
        save=out_name,
    )

In [None]:
# Dot plots of the final selection, one per annotation level, each written to
# its own file through scanpy's `save` argument.
for annotation_level, out_name in (
    ('Level_2', 'dotplot_L2.png'),
    ('Level_3', 'dotplot_L3.png'),
    ('Level_4', 'dotplot_L4.png'),
):
    sc.pl.dotplot(
        adata,
        var_names=interesting_genes_no_reported,
        groupby=annotation_level,
        layer='log_norm',
        standard_scale='var',
        swap_axes=True,
        save=out_name,
    )

In [None]:
# Volcano plot of the DESeq2 results: -log10(p value) against log2 fold change,
# with significant genes and a hand-picked gene set highlighted and labelled.

# --- config ---
genes_of_interest = ['SDCBP2','ABHD17C','CORO2A','AOC1','PI3']
p_thr   = 0.05
lfc_thr = 1.0

# If you have shrunken LFC available, list its column name first:
LFC_CANDIDATES = ["lfc_shrunk", "log2FoldChange_shrunk", "log2FoldChange"]

df = res_df.copy()

# Ensure gene names are the index if needed:
# df = df.set_index("gene")

# Pick the first available LFC column. Fail loudly when none is present rather
# than hitting a NameError (or silently reusing a stale LFC_COL left over from
# an earlier run of this cell).
LFC_COL = next((col for col in LFC_CANDIDATES if col in df.columns), None)
if LFC_COL is None:
    raise KeyError(
        f"None of the expected log2 fold change columns {LFC_CANDIDATES} "
        "is present in res_df"
    )

# basic cleaning
df = df.dropna(subset=[LFC_COL, "pvalue"]).copy()

# floor tiny p-values to avoid infinite -log10
P_FLOOR = 1e-50
df["p_safe"] = df["pvalue"].clip(lower=P_FLOOR)
df["neglog10p"] = -np.log10(df["p_safe"])

# optional: filter ultra-low abundance (helps visuals)
if "baseMean" in df.columns:
    df = df[df["baseMean"] >= 10].copy()

# optional: clip extreme LFCs if using unshrunk LFC
# (skipped for an empty table, where percentiles are undefined)
if LFC_COL == "log2FoldChange" and not df.empty:
    lo, hi = np.percentile(df[LFC_COL].values, [0.5, 99.5])
    df[LFC_COL] = df[LFC_COL].clip(lo, hi)

# flags (significance uses the raw p value, matching the y axis)
df["is_sig"]      = (df["pvalue"] < p_thr) & (df[LFC_COL].abs() >= lfc_thr)
df["is_interest"] = df.index.isin(genes_of_interest)

# plot
plt.figure(figsize=(7.2,6))
# background
plt.scatter(df[LFC_COL], df["neglog10p"], s=8, alpha=0.25, linewidths=0, label="All genes")
# significant
sig = df[df["is_sig"] & ~df["is_interest"]]
plt.scatter(sig[LFC_COL], sig["neglog10p"], s=10, alpha=0.7, linewidths=0, label="Significant")
# genes of interest
goi = df[df["is_interest"]]
plt.scatter(goi[LFC_COL], goi["neglog10p"], s=42, edgecolor="black", linewidths=0.6, label="Selected")

# guides
plt.axvline(+lfc_thr, linestyle="--", linewidth=0.8, color="k")
plt.axvline(-lfc_thr, linestyle="--", linewidth=0.8, color="k")
plt.axhline(-np.log10(p_thr), linestyle="--", linewidth=0.8, color="k")

# labels for your genes
for g, r in goi.iterrows():
    plt.text(r[LFC_COL], r["neglog10p"]+0.08, g, ha="center", va="bottom", fontsize=9)
plt.grid(visible=False)
plt.xlabel(f"{LFC_COL} (log2 fold change)")
plt.ylabel("-log10(p value)")
plt.title("Volcano: −log10(p) vs log2FC (highlighting selected genes)")
plt.legend(frameon=False, loc="upper right")
plt.tight_layout()
# Save BEFORE showing: once plt.show() has run, the current figure may already
# be closed, and savefig would then write a blank canvas.
plt.savefig('Biomarkers/volcano_plot.png', dpi=300)
plt.show()

# remove grid, and put legend outside, remove names, dpi 300
# send csv of DGE

In [None]:
# Publication version of the volcano plot: larger canvas, no per-gene text
# labels, no grid, legend outside the axes, saved at 300 dpi before display.
# Reuses df, LFC_COL, lfc_thr and p_thr as they stand when this cell runs.
plt.figure(figsize=(10, 8))

# Point layers, drawn back to front so the selected genes end up on top.
sig = df[df["is_sig"] & ~df["is_interest"]]
goi = df[df["is_interest"]]
_layers = (
    (df, {"s": 8, "alpha": 0.25, "linewidths": 0, "label": "All genes"}),
    (sig, {"s": 10, "alpha": 0.7, "linewidths": 0, "label": "Significant"}),
    (goi, {"s": 42, "edgecolor": "black", "linewidths": 0.6, "label": "Selected"}),
)
for _points, _style in _layers:
    plt.scatter(_points[LFC_COL], _points["neglog10p"], **_style)

# Dashed threshold guides: +/- fold-change cut-off and the p-value cut-off.
_guide = {"linestyle": "--", "linewidth": 0.8, "color": "k"}
for _x in (+lfc_thr, -lfc_thr):
    plt.axvline(_x, **_guide)
plt.axhline(-np.log10(p_thr), **_guide)

# Gene names are intentionally not drawn in this version.
plt.grid(False)

plt.xlabel(f"{LFC_COL} (log2 fold change)")
plt.ylabel("-log10(p value)")
plt.title("Volcano: −log10(p) vs log2FC (highlighting selected genes)")

# Anchor the legend just right of the axes instead of inside them.
plt.legend(
    frameon=False,
    loc="upper left",
    bbox_to_anchor=(1.02, 1.0),
    borderaxespad=0,
)

plt.tight_layout()
# bbox_inches='tight' grows the saved canvas so the outside legend is included.
plt.savefig('Biomarkers/volcano_plot.png', dpi=300, bbox_inches='tight')
plt.show()