In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import sys
# Extend the import search path with the project directory (presumably where the
# local `atlas_plots` module imported in the next cell lives — TODO confirm).
# NOTE(review): hardcoded absolute path; only resolves on a machine with this mount.
sys.path.append('/mnt/storage/Shrey/PDAC_Downstream/')

In [None]:
from atlas_plots import AtlasPlotting

In [None]:
# Plotting helper used later in this notebook for the UMAP score and highlight figures.
# NOTE(review): both paths are relative, and the working directory is changed further
# down via os.chdir — confirm whether AtlasPlotting resolves them at construction time.
figgen = AtlasPlotting("../config.yml",output_dir="Suppl_1/")

In [None]:
import scanpy as sc
import anndata as ad
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the atlas AnnData object from its Zarr store (absolute, machine-specific path).
adata = ad.read_zarr('/mnt/ssd/atlases/Human_Atlas_Harmonised.zarr/')

In [None]:
adata

# Define Markers and Score

In [None]:
# Marker gene sets keyed by cell type / state label.
# Each value is the gene list that gets scored for that label.
all_markers = {
    # --- CD8+ T cells ---
    "CD8+ Effector T Cell": ["GZMB", "GZMK", "PRF1", "IFNG"],
    "CD8+ Exhausted T Cell": ["PDCD1", "HAVCR2", "LAG3", "TOX"],
    "CD8+ Memory T Cell": ["CCR7", "CD27", "SELL"],
    "CD8+ Naive T Cell": ["SELL", "CCR7", "LEF1"],
    "CD8+ Terminal Effector T Cell": ["ZEB2", "GZMB", "IFNG", "TBX21"],
    "CD8+ Tissue-Resident Memory T Cell": ["ITGAE", "CXCR6", "ZNF683"],
    # --- CD4+ T cells ---
    "CD4+ Th1 Cell": ["STAT4", "CXCR3", "IFNG"],
    "CD4+ Th2 Cell": ["GATA3", "CCR4", "PTGDR2"],
    "CD4+ Th17 Cell": ["IL17A", "IL17F", "RORC", "KLRB1", "CCR6"],
    "CD4+ Th22 Cell": ["IL22", "CCR10", "FOXO4"],
    "CD4+ Naive Cell": ["CCR7", "SELL", "LEF1", "TCF7"],
    "CD4+ Central Memory T Cell": ["GPR183", "TCF7", "SELL"],  # "IL7R", "TCF7"
    # --- Other T cells ---
    "γδ T Cell (Vδ1)": ["TRDC"],
    "T-reg": ["FOXP3", "IL2RA", "CTLA4", "TNFRSF18"],  # "IKZF2",
    "Double Positive CD4+CD8+ T Cell": ["CD4", "CD8A", "CD8B"],
    # --- Endothelial cells ---
    "Tumor-Associated Endothelial Cell": ["DDIT4", "TIE1", "SEMA6B", "PLCB1", "LYZ"],
    "Vascular Endothelial Cell": ["PECAM1", "CDH5", "PLVAP", "EHD4", "CLEC14A"],
    "Lymphatic Endothelial Cell": ["PROX1", "PDPN", "LYVE1", "FLT4"],
    # --- B / plasma cells ---
    "B Cell - Naive": ["IL7R", "IGHM", "TCL1A", "CD19"],
    "B Cell - Activated": ["IGHM", "CD69", "CD86"],
    "B Cell - Memory": ["CD27", "IGHE", "IGHA1"],
    "B-reg": ["TFRC", "CD44", "TGFB1"],
    "Plasma Cell": ["MZB1", "XBP1", "PRDM1", "SDC1"],
    "Plasmablast": ["CD27", "CD38", "PRDM1", "IGHG1", "MKI67"],
    "B Cell - Germinal Center": ["AICDA", "BCL6", "RGS13", "S1PR2"],
    # --- Malignant cell states ---
    "Malignant Cell - Epithelial": ["EPCAM", "CLDN4", "CLDN7"],
    "Malignant Cell - Pit Like": ["GKN1", "GKN2", "CLDN18"],
    "Malignant Cell - Hypoxia": ["HIF1A", "VEGFA", "CA9"],
    "Malignant Cell - Highly Proliferative": ["MKI67", "CENPF", "TOP2A"],
    "Malignant Cell - EMT": ["ZEB1", "TWIST1", "CDH2"],
    "Malignant Cell - Acinar-like": ["REG3A", "REG3G", "CPA1"],
    "Malignant Cell - Invasive": ["MMP9", "MMP2", "MMP14"],
    "Malignant Cell - Senescence": ["CDKN1A", "CDKN2A", "LMNA"],
    "Malignant Cell - Apoptotic": ["BAX", "BCL2", "FAS"],
    "Malignant Cell - Mesenchymal": ["THY1", "COL3A1", "FN1"],
    # --- Fibroblasts ---
    "iCAF": ["CCL2", "CXCL12", "PDGFRA"],
    "myCAF": ["ACTA2", "TAGLN", "S100A4"],
    # --- Monocytes / macrophages ---
    "Monocyte": ["DPYD", "ELMO1"],
    "Macrophage - C1Q+ TAM": ["C1QA", "CD163"],
    "Macrophage - CD74+ antigen processing TAM": ["CD74", "IL1B"],
    "Macrophage - angiogenic TAM": ["VCAN", "SLC2A3", "SPP1"],
    "Macrophage - lipid processing TAM": ["APOE", "TREM2"],
    "Macrophage - CD3+ TAM": ["CD3E", "CD2"],
    # --- Dendritic cells ---
    "Dendritic Cell - cDC1": ["ITGAE", "BTLA", "CADM1"],
    "Dendritic Cell - cDC2": ["CD1C", "CLEC10A", "SIRPA"],
    "Dendritic Cell - Activated": ["CD83", "CD86", "CD80"],
    "Dendritic Cell - pDC": ["IL3RA", "CLEC4C", "TCF4", "IFNB1"],
    # --- Granulocytes ---
    "Mast Cell": ["TPSAB1", "KIT", "CPA3"],
    "Neutrophil - N0": ["S100A8", "S100A9"],
    "Neutrophil - N1": ["TNF", "LCN2"],
    "Neutrophil - N2": ["THBS1", "VEGFA"],
}

In [None]:
# Score every marker set on the 'log_norm' layer. The obs column name is the
# cell type label with spaces and hyphens turned into underscores, plus '_score'.
for celltype in all_markers:
    print(f'Calculating Score for {celltype}')
    column_stem = celltype.replace(' ', '_')
    column_stem = column_stem.replace('-', '_')
    # ' - ' has become '___' at this point; collapse it to a single underscore
    column_stem = column_stem.replace('___', '_')
    sc.tl.score_genes(
        adata,
        gene_list=all_markers[celltype],
        score_name=column_stem + '_score',
        layer='log_norm',
    )
    print('-' * 100)

In [None]:
# Re-score two macrophage states, overwriting the obs columns of the same name.
# The CD3+ TAM list is longer than the two-gene entry in all_markers; the
# angiogenic TAM list is identical to the all_markers entry.
macrophage_rescoring = {
    'Macrophage_CD3+_TAM_score': ["CD3E", "CD2", "CD68", "CD14", "ITGAM", "CSF1R"],
    'Macrophage_angiogenic_TAM_score': ["VCAN", "SLC2A3", "SPP1"],
}
for column, genes in macrophage_rescoring.items():
    sc.tl.score_genes(adata, gene_list=genes, score_name=column, layer='log_norm')

In [None]:
# Re-score two dendritic cell states, overwriting the obs columns of the same name.
# The pDC list is identical to the all_markers entry; the cDC1 list adds XCR1.
dendritic_rescoring = {
    'Dendritic_Cell_pDC_score': ["IL3RA", "CLEC4C", "TCF4", "IFNB1"],
    'Dendritic_Cell_cDC1_score': ["ITGAE", "BTLA", "CADM1", "XCR1"],
}
for column, genes in dendritic_rescoring.items():
    sc.tl.score_genes(adata, gene_list=genes, score_name=column, layer='log_norm')

In [None]:
# Write the full obs table (including the score columns added above) to CSV.
# Relative path: the file lands in whatever the working directory is at this point.
adata.obs.to_csv('obs_scored.csv')

In [None]:
# Every obs column whose name contains 'score', in obs column order.
# Note this is a plain substring match, so it also picks up any other
# column that happens to contain 'score' in its name.
score_cols = list(filter(lambda column_name: 'score' in column_name, adata.obs.columns))

In [None]:
# Level_4 labels to show in the dot plot, in display order.
subset = [
    # CD8+ T cells
    'CD8+ Effector T Cell',
    'CD8+ Exhausted T Cell',
    'CD8+ Memory T Cell',
    'CD8+ Naive T Cell',
    'CD8+ Terminal Effector T Cell',
    'CD8+ Tissue-Resident Memory T Cell',
    # CD4+ and other T cells
    'CD4+ Th1 Cell',
    'CD4+ Th2 Cell',
    'CD4+ Th17 Cell',
    'CD4+ Th22 Cell',
    'CD4+ Naive T Cell',
    'CD4+ Memory T Cell',
    'γδ T Cell (Vδ1)',
    'T-reg',
    'Double Positive CD4+CD8+ T Cell',
    # Endothelial cells
    'Tumor-Associated Endothelial Cell',
    'Vascular Endothelial Cell',
    'Lymphatic Endothelial Cell',
    # B / plasma cells
    'B Cell - Naive',
    'B Cell - Activated',
    'B Cell - Memory',
    'B-reg',
    'Plasma Cell',
    'Plasmablast',
    'B Cell - Germinal Center',
    # Malignant cell states
    'Malignant Cell - Epithelial',
    'Malignant Cell - Pit Like',
    'Malignant Cell - Hypoxia',
    'Malignant Cell - Highly Proliferative',
    'Malignant Cell - EMT',
    'Malignant Cell - Acinar-like',
    'Malignant Cell - Highly Invasive',
    'Malignant Cell - Senescence',
    'Malignant Cell - Apoptotic',
    'Malignant Cell - Mesenchymal',
    # Fibroblasts
    'iCAF',
    'myCAF',
    # Monocytes / macrophages
    'Monocyte',
    'Macrophage - C1Q+ TAM',
    'Macrophage - CD74+ antigen processing TAM',
    'Macrophage - angiogenic TAM',
    'Macrophage - lipid processing TAM',
    'Macrophage - CD3+ TAM',
    # Dendritic cells
    'Dendritic Cell - cDC1',
    'Dendritic Cell - cDC2',
    'Dendritic Cell - Activated',
    'Dendritic Cell - pDC',
    # Granulocytes
    'Mast Cell',
    'Neutrophil - N0',
    'Neutrophil - N1',
    'Neutrophil - N2',
]

In [None]:
# Save scanpy figures at 300 dpi.
sc.set_figure_params(dpi_save=300)

In [None]:
# Switch the process working directory: every relative path used from here on
# (e.g. the boxplot PNGs saved further down) resolves against this folder.
# NOTE(review): hardcoded absolute path and a global side effect — cells above this
# point ran with a different working directory.
os.chdir("/mnt/storage/Shrey/PDAC_Downstream/Supplementary")

In [None]:
# Dot plot of every score column across the selected Level_4 labels,
# restricted to cells whose Level_4 label is listed in `subset`.
in_subset = adata.obs.Level_4.isin(subset)
sc.pl.dotplot(
    adata[in_subset],
    var_names=score_cols,
    groupby='Level_4',
    categories_order=subset,
    standard_scale='var',
    vmin=0.7,
)

In [None]:
from tqdm import tqdm

In [None]:
# One UMAP score figure per score column; the column name doubles as the figure name.
separator = '_' * 100
for column in tqdm(score_cols):
    print(column)
    figgen.plot_umap_scores(adata, figure_name=column, scores=column)
    print(separator)

In [None]:
# macro = adata[adata.obs.Level_3.str.contains('Macro')]

# sc.tl.rank_genes_groups(macro, layer='log_norm', groupby='Level_4')

# sc.pl.rank_genes_groups(macro)

# sc.get.rank_genes_groups_df(macro, group='Macrophage - CD3+ TAM').head(20)['names'].tolist()

# sc.pl.dotplot(macro, groupby='Level_4', var_names=['IL32',
#  'CD3D',
#  'PTPRC',
#  'CD2',
#  'CCL5',
#  'CD3E',
#  'BTG1',
#  'IL7R',
#  'CD3G',
#  'ETS1',
#  'ZFP36L2',
#  'CXCR4',
#  'CD69',
#  'TRBC2',
#  'TRAC',
#  'RPL13',
#  'GZMA',
#  'RPS7',
#  'RPS15A',
#  'RPL10'], layer='log_norm', standard_scale='var')



In [None]:
# Number of obs rows (cells) per treatment type.
adata.obs.TreatmentType.value_counts()

In [None]:
# Number of distinct datasets contributing to each treatment type.
adata.obs.groupby('TreatmentType')['Dataset'].nunique()

In [None]:
# Combined dataset + sample key. An explicit separator keeps the key unambiguous:
# plain concatenation maps ("A1", "2") and ("A", "12") to the same string "A12".
adata.obs['Dataset_ID'] = adata.obs.Dataset.astype(str) + '_' + adata.obs.Sample_ID.astype(str)

In [None]:
# For each Level_2 group, highlight its Level_4 members on the UMAP.
# Missing (NaN) group labels are skipped: they can never match the equality mask below.
for celltype in tqdm(adata.obs.Level_2.dropna().unique().tolist()):
    # Read the Level_4 labels straight from obs rather than building an AnnData
    # view of the whole atlas per group just to look at one column.
    cell_to_plot = adata.obs.loc[adata.obs.Level_2 == celltype, 'Level_4'].unique().tolist()
    print(f'L2 {celltype}: {cell_to_plot}')
    figgen.create_masked_umap_highlight(adata, mask_column="Level_4", figure_name=celltype, mask_values=cell_to_plot, color_by="Level_4")

In [None]:
# For each Level_3 group, highlight its Level_4 members on the UMAP.
# Missing (NaN) labels are skipped: they can never match the equality mask and
# would break the string concatenation used for the figure name.
for celltype in tqdm(adata.obs.Level_3.dropna().unique().tolist()):
    # Read the Level_4 labels straight from obs rather than building an AnnData
    # view of the whole atlas per group just to look at one column.
    cell_to_plot = adata.obs.loc[adata.obs.Level_3 == celltype, 'Level_4'].unique().tolist()
    # This loop iterates Level_3, so label the log line accordingly (it said "L2").
    print(f'L3 {celltype}: {cell_to_plot}')
    figure_name = celltype + '_L3'
    figgen.create_masked_umap_highlight(adata, mask_column="Level_4", figure_name=figure_name, mask_values=cell_to_plot, color_by="Level_4")

In [None]:
pwd

In [None]:
# Level_3 labels that get their own highlight figures in the next cell.
missing_celltypes = [
    "Acinar (REG+) Cell", "Acinar Idling Cell", "Alpha Cell",
    "Acinar Cell", "Pericyte", "Smooth Muscle Cell",
]

In [None]:
# Highlight figures for the Level_3 labels listed in `missing_celltypes`.
for celltype in tqdm(missing_celltypes):
    # Read the Level_4 labels straight from obs rather than building an AnnData
    # view of the whole atlas per label just to look at one column.
    cell_to_plot = adata.obs.loc[adata.obs.Level_3 == celltype, 'Level_4'].unique().tolist()
    # The mask is on Level_3, so label the log line accordingly (it said "L2").
    print(f'L3 {celltype}: {cell_to_plot}')
    figure_name = celltype + '_L3'
    figgen.create_masked_umap_highlight(adata, mask_column="Level_4", figure_name=figure_name, mask_values=cell_to_plot, color_by="Level_4")

In [None]:
# figgen.plot_sankey(adata, levels=['Level_2','Level_3', 'Level_4'], save_name='Sankey')

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import product

def boxplot_cells_per_level(
    adata,
    level_col,                 # "Level_3" or "Level_4"
    dataset_col,               # e.g. "Dataset", "Study", or your per-dataset column
    normalize=False,           # True -> percentage per dataset; False -> counts
    include_zeros=True,        # include zeros for (dataset, celltype) pairs not present
    order=None,                # fixed order of categories on x-axis
    figsize=(21, 10),
    title=None,
    save_path=None
):
    """Box plot of per-dataset cell counts (or percentages) for each category of ``level_col``.

    Each box summarises one category of ``level_col``; each dataset contributes
    one data point per category.

    Parameters
    ----------
    adata
        Object exposing an ``obs`` DataFrame that contains ``level_col`` and ``dataset_col``.
    level_col
        Name of the obs column holding the category labels (x-axis).
    dataset_col
        Name of the obs column identifying the dataset each row belongs to.
    normalize
        If True, plot each count as a percentage of all labelled cells in its
        dataset; otherwise plot raw counts.
    include_zeros
        If True, every (dataset, category) pair that does not occur is added
        with a count of 0. If False, only pairs that occur are plotted.
    order
        Category order on the x-axis. When given together with
        ``include_zeros=True`` it also selects which categories are plotted.
        When None, categories are sorted by descending median value.
    figsize
        Figure size passed to matplotlib.
    title
        Plot title; a default is generated when None.
    save_path
        If truthy, the figure is saved there (300 dpi, white background).

    Returns
    -------
    The matplotlib Axes the box plot was drawn on.
    """
    # observed=True: count only pairs that actually occur. Without it, categorical
    # columns can yield zero rows for every combination, defeating include_zeros=False.
    counts = (
        adata.obs[[dataset_col, level_col]]
        .dropna()
        .groupby([dataset_col, level_col], observed=True)
        .size()
        .reset_index(name="n")
    )
    # Plain object keys so categorical obs columns merge and map without dtype surprises.
    for key in (dataset_col, level_col):
        counts[key] = counts[key].astype(object)

    # Per-dataset totals taken before any category filtering, so percentages stay
    # relative to the whole dataset even when `order` selects only some categories.
    totals = counts.groupby(dataset_col)["n"].sum()

    df = counts
    # ensure zeros for absent combos (so each dataset contributes to each cell type)
    if include_zeros:
        datasets = counts[dataset_col].unique()
        if order is not None:
            cats = list(order)
        else:
            cats = adata.obs[level_col].dropna().unique().tolist()
        full = pd.DataFrame(
            list(product(datasets, cats)), columns=[dataset_col, level_col], dtype=object
        )
        df = full.merge(counts, how="left", on=[dataset_col, level_col]).fillna({"n": 0})

    # counts -> percent per dataset (optional)
    if normalize:
        # replace(0, 1) guards against division by zero for an empty dataset
        denom = df[dataset_col].map(totals).astype(float).replace(0, 1)
        df["value"] = df["n"] / denom * 100.0
        ylab = "Cells (%)"
    else:
        df["value"] = df["n"]
        ylab = "Cells (count)"

    # ordering on x
    if order is None:
        # sort by global median to make the plot readable
        med = df.groupby(level_col)["value"].median().sort_values(ascending=False)
        order = med.index.tolist()
    else:
        order = list(order)

    # plot — `order` must be handed to seaborn, otherwise it has no effect
    fig, ax = plt.subplots(figsize=figsize)
    sns.boxplot(data=df, x=level_col, y="value", order=order, ax=ax)
    ax.set_xlabel(level_col.replace("_", " "))
    ax.set_ylabel(ylab)
    if title is None:
        t_norm = " (% of cells per dataset)" if normalize else " (cells per dataset)"
        ax.set_title(f"Distribution across datasets: {level_col}{t_norm}")
    else:
        ax.set_title(title)

    # aesthetics
    ax.tick_params(axis='x', rotation=90)
    fig.tight_layout()

    if save_path:
        fig.savefig(save_path, dpi=300, bbox_inches="tight", facecolor="white")
        print(f"Saved: {save_path}")

    return ax

# --- Usage examples ---
# Name of the adata.obs column that identifies the dataset of each cell
# (change if your column is named e.g. "Study" or "Sample_batch").
dataset_col = "Dataset"

# Arguments shared by every call below.
shared_kwargs = dict(dataset_col=dataset_col, include_zeros=True)

# Level_3: raw counts, then percentages
boxplot_cells_per_level(adata, level_col="Level_3", normalize=False,
                        save_path="boxplot_Level3_counts.png", **shared_kwargs)
boxplot_cells_per_level(adata, level_col="Level_3", normalize=True,
                        save_path="boxplot_Level3_percent.png", **shared_kwargs)

# Level_4: raw counts
boxplot_cells_per_level(adata, level_col="Level_4", normalize=False,
                        save_path="boxplot_Level4_counts.png", **shared_kwargs)

In [None]:

# 1) Cells per Level_4 label: missing labels are dropped and value_counts
#    returns the result sorted from most to least frequent.
counts = adata.obs['Level_4'].dropna().value_counts()

# 2) Vertical bar chart of those counts, drawn on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(21, 12))
ax.bar(counts.index.astype(str), counts.values)
ax.set_ylabel('Cells (count)')
ax.set_xlabel('Level_4')
ax.set_title('Cell counts per Level_4')
plt.setp(ax.get_xticklabels(), rotation=75, ha='right')
fig.tight_layout()
plt.show()