# PS AnnData for All Cells

In [None]:
import scanpy as sc
import anndata as ad
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Input locations: the harmonised atlas (zarr store) and the previously saved
# ps_bulk_adata object (h5ad), which carries a 'leiden' annotation.
ATLAS_ZARR = '/mnt/ssd/atlases/Human_Atlas_Harmonised.zarr'
PS_BULK_H5AD = '../ps_bulk_adata.h5ad'

adata = ad.read_zarr(ATLAS_ZARR)
ps_bulk_adata = sc.read_h5ad(PS_BULK_H5AD)

# UMAP of ps_bulk_adata, coloured by its 'leiden' labels.
sc.pl.umap(ps_bulk_adata, color='leiden')

In [None]:
# Rows per source dataset. NOTE: this is not the final expression of the
# cell, so the notebook does not render it.
ps_bulk_adata.obs['Dataset'].value_counts()

# Lookup table Dataset_ID -> leiden label, taken from ps_bulk_adata.obs.
# If a Dataset_ID occurs more than once, the last label wins.
leiden_map = {
    dataset_id: cluster
    for dataset_id, cluster in zip(
        ps_bulk_adata.obs['Dataset_ID'], ps_bulk_adata.obs['leiden']
    )
}

# Uncomment to inspect the mapping:
# leiden_map

In [None]:
# Per-cell sample identifier of the form "<Dataset>_<Sample_ID>".
adata.obs['Dataset_ID'] = (
    adata.obs['Dataset'].astype(str) + '_' + adata.obs['Sample_ID'].astype(str)
)

# Cell counts: one row per Dataset_ID, one column per Level_4 label.
composition = pd.crosstab(adata.obs['Dataset_ID'], adata.obs['Level_4'])

# Turn counts into row-wise percentages (each row sums to 100) ...
cells_per_sample = composition.sum(axis=1)
composition_prop = composition.div(cells_per_sample, axis=0).mul(100)

# ... and keep only the samples that also appear in ps_bulk_adata.
in_ps_bulk = composition_prop.index.isin(ps_bulk_adata.obs['Dataset_ID'])
composition_prop = composition_prop.loc[in_ps_bulk]
composition_prop

In [None]:
# Render every column when DataFrames are displayed.
pd.set_option('display.max_columns', None)

# Row sums of the percentage table. NOTE: not the final expression of the
# cell, so the notebook does not render it.
composition_prop.sum(axis=1)

# AnnData with one observation per Dataset_ID and one variable per Level_4
# label; X holds the percentages from composition_prop.
ps_adata_all_cells = ad.AnnData(
    X=composition_prop.values.copy(),
    obs=composition_prop.index.to_frame(index=False),
    var=pd.DataFrame(index=composition_prop.columns),
)

# Sample-level metadata: the first row encountered for each Dataset_ID.
patient_meta = adata.obs[
    ['Dataset_ID', 'Dataset', 'Technology', 'Treatment', 'TreatmentType']
].drop_duplicates(subset='Dataset_ID')
patient_meta = patient_meta.set_index('Dataset_ID')

# Re-index obs by Dataset_ID and attach the metadata columns.
ps_adata_all_cells.obs = ps_adata_all_cells.obs.set_index('Dataset_ID').join(
    patient_meta, how='left'
)
ps_adata_all_cells.obs.index.name = None  # optional: remove index name

# Annotate every Level_4 variable with its Level_1/2/3 labels.
# The lookup is built once from de-duplicated rows instead of zipping over
# every cell three times; keep='last' reproduces the "last occurrence wins"
# behaviour of dict(zip(...)).
level_lookup = (
    adata.obs[['Level_4', 'Level_1', 'Level_2', 'Level_3']]
    .drop_duplicates(subset='Level_4', keep='last')
    .set_index('Level_4')
)
for level in ('Level_1', 'Level_2', 'Level_3'):
    ps_adata_all_cells.var[level] = ps_adata_all_cells.var_names.map(
        level_lookup[level].to_dict()
    )

# Leiden label per sample, looked up by Dataset_ID (NaN when absent).
ps_adata_all_cells.obs['leiden'] = ps_adata_all_cells.obs_names.map(leiden_map)
ps_adata_all_cells

In [None]:
# Display the per-sample metadata table of the composition AnnData.
ps_adata_all_cells.obs

In [None]:
# Persist the composition AnnData to '../ps_adata_all_cells.h5ad'.
ps_adata_all_cells.write('../ps_adata_all_cells.h5ad')

# PS AnnData for All Cells - Mean Pseudobulk Expression per Gene

In [None]:
# Restrict the atlas to cells whose Dataset_ID is an observation of
# ps_adata_all_cells. Indexing with a boolean mask yields an AnnData view.
in_composition = adata.obs['Dataset_ID'].isin(ps_adata_all_cells.obs_names)
subset = adata[in_composition]

In [None]:
# Display a summary of the subset selected above.
subset

In [None]:
import scipy.sparse as sp
from tqdm import tqdm

# Mean pseudobulk: for every Dataset_ID, average the 'log_norm' layer over
# all of that sample's cells, giving one row per sample.
obs_df = subset.obs[['Dataset_ID']].copy()
X_sparse = subset.layers['log_norm']

# Row *positions* of each sample's cells. Positional indices are used on
# purpose: resolving rows through obs_names would silently pick the wrong
# cells whenever two cells share a barcode.
grouped = obs_df.groupby('Dataset_ID').indices

pseudobulk_data = []
bulk_index = []
for dataset_id, indices in tqdm(grouped.items(), desc="Aggregating pseudobulk"):
    sub_X = X_sparse[indices, :]
    # Column-wise mean over the sample's cells, stored as a 1 x n_vars row.
    mean_expr = sp.csr_matrix(sub_X.mean(axis=0))
    pseudobulk_data.append(mean_expr)
    bulk_index.append(dataset_id)

# Stack the per-sample rows; obs is indexed by Dataset_ID in group order and
# var is carried over from the single-cell object.
pseudobulk_matrix = sp.vstack(pseudobulk_data)
ps_adata_all_cells_bulk = ad.AnnData(
    X=pseudobulk_matrix,
    obs=pd.DataFrame(index=bulk_index),
    var=subset.var.copy())

In [None]:
# Display a summary of the freshly built pseudobulk AnnData.
ps_adata_all_cells_bulk

In [None]:
# Sample-level metadata: the first row encountered for each Dataset_ID.
patient_meta = subset.obs[
    ['Dataset_ID', 'Dataset', 'Technology', 'Treatment', 'TreatmentType']
].drop_duplicates(subset='Dataset_ID')
patient_meta = patient_meta.set_index('Dataset_ID')

# Attach the metadata to the pseudobulk observations (indexed by Dataset_ID).
ps_adata_all_cells_bulk.obs = ps_adata_all_cells_bulk.obs.join(patient_meta, how='left')

# Keep only samples that are also present in ps_adata_all_cells.
# Subsetting an AnnData returns a view; copy it so the column assignment
# below writes to a real object instead of triggering an implicit copy.
in_composition = ps_adata_all_cells_bulk.obs_names.isin(ps_adata_all_cells.obs_names)
ps_adata_all_cells_bulk = ps_adata_all_cells_bulk[in_composition].copy()

# Leiden label of each sample in ps_bulk_adata (NaN when the sample is absent).
ps_adata_all_cells_bulk.obs['malignant_comp_leiden'] = ps_adata_all_cells_bulk.obs_names.map(
    dict(zip(ps_bulk_adata.obs.Dataset_ID, ps_bulk_adata.obs.leiden))
)

In [None]:
# Expose the sample identifier as a regular 'Dataset_ID' column (first
# column of obs) so it can be used as a merge key.
# The column is added on a copy and assigned back through the AnnData
# setter: resetting the index of `.obs` in place would leave the object with
# integer obs_names, which AnnData does not expect. The guard makes the cell
# safe to re-run.
if 'Dataset_ID' not in ps_adata_all_cells_bulk.obs.columns:
    obs_with_id = ps_adata_all_cells_bulk.obs.copy()
    obs_with_id.insert(0, 'Dataset_ID', obs_with_id.index.to_numpy())
    ps_adata_all_cells_bulk.obs = obs_with_id

In [None]:
# Flag gene groups by name prefix:
#   mt   - names starting with "MT-"
#   ribo - names starting with "RPS" or "RPL"
#   hb   - names matching ^HB followed by any character other than
#          '(', 'P' or ')' (the character class is literal)
gene_names = ps_adata_all_cells_bulk.var_names
ps_adata_all_cells_bulk.var["mt"] = gene_names.str.startswith("MT-")
ps_adata_all_cells_bulk.var["ribo"] = gene_names.str.startswith(("RPS", "RPL"))
ps_adata_all_cells_bulk.var["hb"] = gene_names.str.contains("^HB[^(P)]")

genes_to_remove = (
    ps_adata_all_cells_bulk.var["mt"]
    | ps_adata_all_cells_bulk.var["ribo"]
    | ps_adata_all_cells_bulk.var["hb"]
)

# Drop the flagged genes from the object that is processed and saved below.
# Previously the filtered result was bound to a separate, unused name, so the
# flagged genes were never actually removed. `.copy()` materialises the view.
ps_adata_all_cells_bulk = ps_adata_all_cells_bulk[:, ~genes_to_remove.to_numpy()].copy()

# Keep genes expressed in at least 10 observations (one observation per
# Dataset_ID in this object).
sc.pp.filter_genes(ps_adata_all_cells_bulk, min_cells=10)

In [None]:
# Copy obsm from the composition AnnData onto the pseudobulk AnnData.
# Rows are selected by the 'Dataset_ID' column rather than by an isin() mask
# on obs_names: obs_names of the bulk object need not hold sample IDs at this
# point, and label-based selection also returns the rows in the same order
# as the bulk object, which a boolean mask does not guarantee.
bulk_sample_ids = ps_adata_all_cells_bulk.obs['Dataset_ID'].astype(str).to_numpy()
ps_adata_all_cells_bulk.obsm = ps_adata_all_cells[bulk_sample_ids].obsm.copy()

In [None]:
# Wide table of the values stored in ps_adata_all_cells.X: one row per
# observation, one column per variable, plus the observation name as an
# ordinary 'Dataset_ID' column (first column, default integer index).
obs_extension = (
    pd.DataFrame(
        ps_adata_all_cells.X,
        index=ps_adata_all_cells.obs_names,
        columns=ps_adata_all_cells.var_names,
    )
    .reset_index()
    .rename(columns={'index': 'Dataset_ID'})
)

In [None]:
# Display the wide table built above.
obs_extension

In [None]:
# Append the obs_extension columns to the pseudobulk obs, matching rows on
# 'Dataset_ID' (default inner join). The merged frame has a fresh integer
# index, which becomes the new obs index on assignment.
merged_obs = pd.merge(
    left=ps_adata_all_cells_bulk.obs,
    right=obs_extension,
    on='Dataset_ID',
)
ps_adata_all_cells_bulk.obs = merged_obs.copy()

In [None]:
# Preview the first rows of the merged obs table.
ps_adata_all_cells_bulk.obs.head()

In [None]:
# Display a summary of the final pseudobulk AnnData before saving.
ps_adata_all_cells_bulk

In [None]:
# Persist the pseudobulk AnnData to '../ps_adata_all_cells_bulk.h5ad'.
ps_adata_all_cells_bulk.write('../ps_adata_all_cells_bulk.h5ad')