In [None]:
# Show the current working directory (bare `pwd` relies on IPython automagic).
pwd

In [None]:
import scanpy as sc
import anndata as ad
import pandas as pd
import numpy as np
import os

In [None]:
# Load the harmonised atlas and build a per-sample composition table of
# malignant cell states (Level_4), expressed as row percentages.
adata = ad.read_zarr('/mnt/ssd/atlases/Human_Atlas_Harmonised.zarr/')
# na=False makes cells with a missing Level_3 label count as non-malignant
# instead of putting NaN into the boolean mask.
is_malignant = adata.obs['Level_3'].str.startswith('Malignant', na=False)
# .copy() detaches the subset from `adata`; assigning an obs column on a view
# would otherwise trigger an implicit copy with a warning.
malignant = adata[is_malignant].copy()
# Sample key: "<Dataset>_<Sample_ID>".
malignant.obs['Dataset_ID'] = malignant.obs.Dataset.astype(str) + '_' + malignant.obs.Sample_ID.astype(str)
# Rows: Dataset_ID, columns: Level_4 labels, values: cell counts.
composition = pd.crosstab(malignant.obs['Dataset_ID'], malignant.obs['Level_4'])
# Normalise each row so it sums to 100.
composition_prop = composition.div(composition.sum(axis=1), axis=0) * 100

In [None]:
# Wrap the composition table in an AnnData: one row per Dataset_ID, one variable per Level_4 label.
ps_adata = ad.AnnData(X=composition_prop.values.copy(), obs=composition_prop.index.to_frame(index=False), var=pd.DataFrame(index=composition_prop.columns))
# One metadata row per Dataset_ID (the first row encountered for each ID is kept).
patient_meta = malignant.obs[['Dataset_ID', 'Dataset', 'Technology', 'Treatment', 'TreatmentType']].drop_duplicates(subset='Dataset_ID')
patient_meta = patient_meta.set_index('Dataset_ID')
# Attach the metadata by Dataset_ID; the left join keeps every composition row.
ps_adata.obs = ps_adata.obs.set_index('Dataset_ID').join(patient_meta, how='left')
ps_adata.obs.index.name = None  # unnamed index, so reset_index() below produces a column called 'index'
# Cast every obs column to str (missing values presumably become the string 'nan' - verify if that matters downstream).
ps_adata.obs = ps_adata.obs.astype(str)
# Move Dataset_ID from the index back into a regular column. Both calls modify obs in place,
# so the statement order in this cell matters.
ps_adata.obs.reset_index(inplace=True)
ps_adata.obs.rename(columns={'index':'Dataset_ID'}, inplace=True)

In [None]:
# Exclude the Carpenter_2023 and Schalck_2022 datasets.
# .copy() turns the filtered view into an independent AnnData, so the in-place
# scanpy steps that follow do not operate on (and implicitly copy) a view.
ps_adata = ps_adata[~ps_adata.obs.Dataset.isin(['Carpenter_2023', 'Schalck_2022'])].copy()

In [None]:
# Inspect the composition AnnData after the dataset exclusion.
ps_adata

In [None]:
# First pass on the uncorrected composition percentages:
# PCA -> cosine kNN graph -> UMAP -> Leiden clustering.
sc.pp.pca(ps_adata)
sc.pp.neighbors(
    ps_adata,
    n_neighbors=20,
    metric='cosine',
)
sc.tl.umap(ps_adata)
sc.tl.leiden(
    ps_adata,
    resolution=1,
)

In [None]:
# Look for a Technology effect in the uncorrected embedding.
umap_color_keys = ['leiden', 'Technology']
sc.pl.pca(ps_adata, color='Technology')
sc.pl.umap(ps_adata, color=umap_color_keys)

In [None]:
# ComBat batch correction across Technology.
# The uncorrected matrix is stashed once in layers['pre_combat']. On a re-run of this
# cell X is restored from that layer first, so ComBat is never applied to data that
# has already been corrected and the stash is never overwritten with corrected values.
if 'pre_combat' not in ps_adata.layers:
    ps_adata.layers['pre_combat'] = ps_adata.X.copy()
else:
    ps_adata.X = ps_adata.layers['pre_combat'].copy()
sc.pp.combat(ps_adata, key='Technology')

In [None]:
# Repeat the embedding and clustering on the ComBat-corrected matrix,
# this time with a lower Leiden resolution (0.5 instead of 1).
sc.pp.pca(ps_adata)
sc.pp.neighbors(
    ps_adata,
    n_neighbors=20,
    metric='cosine',
)
sc.tl.umap(ps_adata)
sc.tl.leiden(
    ps_adata,
    resolution=0.5,
)

In [None]:
# Same diagnostic plots as before the correction, for comparison.
umap_color_keys = ['leiden', 'Technology']
sc.pl.pca(ps_adata, color='Technology')
sc.pl.umap(ps_adata, color=umap_color_keys)

In [None]:
# Save the corrected composition object; at this point 'leiden' holds the resolution-0.5 clustering.
ps_adata.write('ps_adata.h5ad')

# Add Leiden clusters back to the full atlas

In [None]:
# Rebuild the "<Dataset>_<Sample_ID>" key on the full atlas ...
atlas_sample_ids = adata.obs['Dataset'].astype(str) + '_' + adata.obs['Sample_ID'].astype(str)
adata.obs['Dataset_ID'] = atlas_sample_ids
# ... and give every cell the Leiden label of its sample, prefixed with 'Patient_Cluster_'.
# Cells whose sample is not in ps_adata receive NaN from .map().
cluster_labels = 'Patient_Cluster_' + ps_adata.obs['leiden'].astype(str)
cluster_by_sample = dict(zip(ps_adata.obs['Dataset_ID'], cluster_labels))
adata.obs['Patient_Cluster'] = adata.obs['Dataset_ID'].map(cluster_by_sample)

# Create ps_bulk_adata

In [None]:
# Re-select the malignant cells so the subset carries the Dataset_ID and
# Patient_Cluster columns that were added to adata.obs.
# na=False: cells with a missing Level_3 label are treated as non-malignant.
# .copy(): the subset is filtered and annotated in place below, so it must not be a view.
malignant = adata[adata.obs['Level_3'].str.startswith('Malignant', na=False)].copy()

In [None]:
# Drop genes detected in fewer than 10 malignant cells (modifies `malignant` in place).
sc.pp.filter_genes(malignant, min_cells=10)

In [None]:
# Boolean gene flags derived from the gene names.
gene_names = malignant.var_names
# Names starting with "MT-".
malignant.var["mt"] = gene_names.str.startswith("MT-")
# Names starting with "RPS" or "RPL".
malignant.var["ribo"] = gene_names.str.startswith(("RPS", "RPL"))
# Names starting with "HB" followed by any character other than '(', 'P' or ')'.
malignant.var["hb"] = gene_names.str.contains("^HB[^(P)]")

In [None]:
# A gene is removed when any of the three flags is set.
flag_columns = ["mt", "ribo", "hb"]
genes_to_remove = malignant.var[flag_columns].any(axis=1)
# Column-wise subset keeping only the unflagged genes.
malignant_filtered = malignant[:, ~genes_to_remove]

In [None]:
# Pseudobulk: mean of the 'log_norm' layer per Dataset_ID.
# Means are taken one sample at a time on the stored matrix, so the full
# cells x genes layer is never converted to a dense array.
log_norm = malignant_filtered.layers['log_norm']
if hasattr(log_norm, 'tocsr'):
    # Row selection is cheapest on CSR.
    log_norm = log_norm.tocsr()
# sort=True orders the samples the way a groupby would; a missing ID gets code -1
# and therefore never matches any group below.
sample_codes, sample_labels = pd.factorize(malignant_filtered.obs['Dataset_ID'], sort=True)
sample_means = [
    np.asarray(log_norm[sample_codes == code].mean(axis=0)).ravel()
    for code in range(len(sample_labels))
]
if sample_means:
    mean_matrix = np.vstack(sample_means)
else:
    # No cells at all: keep an empty table with the right number of gene columns.
    mean_matrix = np.empty((0, malignant_filtered.n_vars))
# Rows: Dataset_ID (sorted), columns: genes.
pseudobulk = pd.DataFrame(
    mean_matrix,
    index=pd.Index(sample_labels, name='Dataset_ID'),
    columns=malignant_filtered.var_names,
)

In [None]:
# Wrap the pseudobulk table in an AnnData: rows are Dataset_IDs, columns are genes.
sample_frame = pd.DataFrame(index=pseudobulk.index)
gene_frame = pd.DataFrame(index=pseudobulk.columns)
ps_bulk_adata = ad.AnnData(X=pseudobulk.values, obs=sample_frame, var=gene_frame)

In [None]:
# Per-sample metadata: one row per Dataset_ID, indexed by it.
meta_columns = ['Dataset_ID', 'Dataset', 'Technology', 'Treatment', 'TreatmentType', 'Patient_Cluster']
patient_meta = (
    malignant_filtered.obs[meta_columns]
    .drop_duplicates(subset='Dataset_ID')
    .set_index('Dataset_ID')
)
# Left join on the obs index so every pseudobulk row is kept.
ps_bulk_adata.obs = ps_bulk_adata.obs.join(patient_meta, how='left')

In [None]:
# Keep only the samples that are also present in ps_adata.
# .copy() makes the result a standalone AnnData instead of a view onto the
# unfiltered object, so later modification or writing does not depend on it.
ps_bulk_adata = ps_bulk_adata[ps_bulk_adata.obs_names.isin(ps_adata.obs.Dataset_ID)].copy()

In [None]:
# Inspect the pseudobulk AnnData after restricting it to the ps_adata samples.
ps_bulk_adata

In [None]:
np.median(ps_bulk_adata.X)  # If values are around 0-10, likely log-normalized

In [None]:
# Composition object, shown next to the pseudobulk object below for comparison.
ps_adata

In [None]:
# Pseudobulk object, shown next to the composition object above for comparison.
ps_bulk_adata

In [None]:
# Recompute PCA on the current X and view two component pairs, coloured by
# Technology and by the Leiden clusters.
sc.pp.pca(ps_adata)
sc.pl.pca(
    ps_adata,
    color=['Technology', 'leiden'],
    components=['1,2', '2,3'],
    wspace=0.5,
)

In [None]:
# Rebuild the cosine kNN graph with 30 neighbours (the earlier passes used 20).
sc.pp.neighbors(ps_adata, metric='cosine', n_neighbors=30)

In [None]:
# Final clustering and layout on the 30-neighbour graph.
sc.tl.leiden(ps_adata, resolution=0.55)
sc.tl.umap(ps_adata)
# 'leiden' has just been overwritten, so refresh every Patient_Cluster column that was
# derived from the previous clustering. Without this, the labels stored in adata and
# ps_bulk_adata would no longer match the 'leiden' column saved with ps_adata.
cluster_by_sample = dict(zip(ps_adata.obs['Dataset_ID'], 'Patient_Cluster_' + ps_adata.obs['leiden'].astype(str)))
adata.obs['Patient_Cluster'] = adata.obs['Dataset_ID'].map(cluster_by_sample)
# ps_bulk_adata is indexed by Dataset_ID, so its obs_names are the lookup keys.
ps_bulk_adata.obs['Patient_Cluster'] = ps_bulk_adata.obs_names.to_series().map(cluster_by_sample).to_numpy()

In [None]:
# Discard the cached cluster palette so the next plot regenerates it for the new clustering.
# pop() with a default keeps this cell re-runnable: `del` raises KeyError once the key is gone.
ps_adata.uns.pop('leiden_colors', None)

In [None]:
# UMAP of the re-clustered samples, coloured by Technology and by Leiden cluster.
sc.pl.umap(ps_adata, color=['Technology', 'leiden'], wspace=0.5)

# Trajectories

In [None]:
# State of the composition object going into the trajectory analysis.
ps_adata

In [None]:
# State of the pseudobulk object going into the trajectory analysis.
ps_bulk_adata

In [None]:
# Diffusion map with 15 components, computed from the current neighbour graph.
sc.tl.diffmap(ps_adata, n_comps=15)

In [None]:
# sc.pl.diffmap(ps_adata, color=['leiden', 'Technology'], components=['2,4'])

In [None]:
# Overview of all diffusion-component pairs, coloured by Leiden cluster.
sc.pl.diffmap(ps_adata, color=['leiden'], components='all')

In [None]:
# del ps_adata.uns['leiden_colors']

In [None]:
# Components 2 and 4 only (the pair used for the remaining plots), by cluster and Technology.
sc.pl.diffmap(ps_adata, color=['leiden', 'Technology'], components=['2,4'])

In [None]:
# Colour the diffusion map by the values of five selected malignant-state variables.
malignant_states = [
    'Malignant Cell - Epithelial',
    'Malignant Cell - Hypoxia',
    'Malignant Cell - EMT',
    'Malignant Cell - Senescence',
    'Malignant Cell - Mesenchymal',
]
sc.pl.diffmap(
    ps_adata,
    color=malignant_states,
    components=['2,4'],
    size=400,
    ncols=5,
)

In [None]:
# Leiden clusters followed by one panel per variable in ps_adata.
panel_keys = ['leiden', *ps_adata.var_names]
sc.pl.diffmap(
    ps_adata,
    color=panel_keys,
    components=['2,4'],
    size=400,
)

In [None]:
# Mark one untreated sample as a candidate trajectory root.
# Position of the chosen sample among the untreated samples (79 or 87).
ROOT_CANDIDATE_POSITION = 79
untreated_labels = ps_adata.obs.index[ps_adata.obs['TreatmentType'] == 'Untreated']
# Fail with an explicit message if the cohort has too few untreated samples.
if ROOT_CANDIDATE_POSITION >= len(untreated_labels):
    raise IndexError(
        f"root position {ROOT_CANDIDATE_POSITION} is out of range: "
        f"only {len(untreated_labels)} untreated samples"
    )
root_index = untreated_labels[ROOT_CANDIDATE_POSITION]
# Boolean flag: True only for the selected sample.
ps_adata.obs['is_root'] = ps_adata.obs.index == root_index

In [None]:
# Show the obs row of the sample flagged as root.
ps_adata.obs[ps_adata.obs['is_root']]

In [None]:
# Recode the root flag from True/False to 1/0.
root_flag_codes = {True: 1, False: 0}
ps_adata.obs['is_root'] = ps_adata.obs['is_root'].map(root_flag_codes)

In [None]:
# Diffusion components to display; this variable is reused by the DPT plot further down.
components=['2,4']
# NOTE(review): is_root was recoded to integers 1/0 in the previous cell, so groups='True'
# probably has no effect here (there is no category named 'True') - confirm and drop or fix.
sc.pl.diffmap(ps_adata, color = 'is_root', groups='True', components=components)

# Diffusion pseudotime (DPT)

In [None]:
# Root the pseudotime at the observation with the largest value in column 2 of X_diffmap.
# NOTE(review): the manually flagged `is_root` sample is not used here - confirm that
# the argmax-based root is the intended one.
ps_adata.uns['iroot'] = ps_adata.obsm['X_diffmap'][:,2].argmax()
# Diffusion pseudotime using 5 diffusion components.
sc.tl.dpt(ps_adata, n_dcs=5)
# `components` comes from the is_root plotting cell, so that cell must run first.
sc.pl.diffmap(ps_adata, color = 'dpt_pseudotime', components=components)

In [None]:
# Save the composition object with its final clustering, diffusion map and pseudotime.
ps_adata.write('ps_comp_malignant.h5ad')

In [None]:
# Save the matching pseudobulk expression object.
ps_bulk_adata.write('ps_bulk_malignant.h5ad')