In [None]:
import os
import scanpy as sc
import pandas as pd
import numpy as np
from matplotlib.pyplot import rcParams
import matplotlib.pyplot as plt
import seaborn as sb
import sys
import zarr

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Location of the binned AnnData file (absolute, site-specific path).
BINNED_ADATA_PATH = '/lustre/groups/ml01/workspace/shrey.parikh/PDAC_Work_Dir/PDAC_Final/Binned_Data/adata_mg_binned.h5ad'
adata = sc.read_h5ad(BINNED_ADATA_PATH)

In [None]:
# Show the summary of the freshly loaded AnnData object.
adata

In [None]:
# Remove the listed matrices from adata.obsm.
# Each deletion is guarded so the cell can be re-run: a bare `del` raises
# KeyError as soon as a key has already been removed (or was never present).
stale_obsm_keys = [
    'X_cnv', 'X_cnv_pca', 'X_cnv_umap', 'X_harmony', 'X_pca', 'X_scpoli',
    'X_umap', 'X_umap_int', 'bin_edges', 'scPoli_emb', 'scPoli_umap',
]
for key in stale_obsm_keys:
    if key in adata.obsm:
        del adata.obsm[key]

In [None]:
# Make the 'binned_data' layer the main matrix used by the steps below.
# NOTE(review): the layer object itself is assigned, not a copy — presumably
# in-place edits of X would also affect the layer; confirm if that matters.
adata.X = adata.layers['binned_data']

In [None]:
# Run PCA with scanpy's default settings on the current adata.X
# (scanpy stores the result under adata.obsm['X_pca']).
sc.tl.pca(adata)

In [None]:
# PCA scatter, one panel per annotation column.
pca_color_keys = ['Technology', 'Dataset', 'Level_1_refined']
sc.pl.pca(adata, color=pca_color_keys, wspace=0.65, frameon=False)

In [None]:
# Inspect the AnnData summary again after PCA.
adata

In [None]:
# Collect every entry whose name contains 'zarr' exactly one level below the
# current working directory (non-directory entries at the top level are skipped).
# os.listdir() returns entries in arbitrary order, so the order of
# `zarr_files` is filesystem-dependent.
zarr_files = [
    os.path.join(parent, child)
    for parent in os.listdir()
    if os.path.isdir(parent)
    for child in os.listdir(parent)
    if 'zarr' in child
]

In [None]:
# Show the discovered zarr paths (their order matters for any positional pairing).
zarr_files

In [None]:
GREEN_BOLD = "\033[1;32m"
BLUE_BOLD = "\033[1;34m"
RESET = "\033[0m"

# Method prefixes, paired POSITIONALLY with `zarr_files`.
# NOTE(review): `zarr_files` is built from os.listdir(), whose order is
# arbitrary — check the printed "file -> prefix" lines before trusting labels.
embedding_prefixes = ['scpoli', 'scanvi', 'expimap', 'sysvi', 'scvi', 'drvi']

# zip() silently drops unmatched entries, which would leave some methods
# without embeddings (or mislabel them); fail loudly on a count mismatch.
if len(zarr_files) != len(embedding_prefixes):
    raise ValueError(
        f"Expected {len(embedding_prefixes)} zarr stores (one per method), "
        f"found {len(zarr_files)}: {zarr_files}"
    )

for file, prefix in zip(zarr_files, embedding_prefixes):
    # Open each store once, read-only, and report what it contains.
    zarr_data = zarr.open(file, mode="r")
    print(f"{file} -> prefix '{prefix}'")
    if "obs" in zarr_data:
        obs_columns = list(zarr_data["obs"].keys())
        print(f"Columns in 'obs' for {file}: {obs_columns}")
    else:
        print(f"'obs' not found in {file}")

    for name in ['emb', 'umap']:
        attribute = 'X_' + name
        target_key = f'{prefix}_{name}'
        if "obsm" in zarr_data and attribute in zarr_data["obsm"]:
            # `[:]` reads the whole array into memory as a dense array.
            # NOTE(review): rows are assumed to be in the same order as
            # adata.obs_names — nothing here verifies that; confirm upstream.
            adata.obsm[target_key] = zarr_data["obsm"][attribute][:]
            colour = GREEN_BOLD if attribute == 'X_emb' else BLUE_BOLD
            print(f"{colour}Loaded '{attribute}' from {file} into adata.obsm['{target_key}']{RESET}")
        else:
            print(f"'{attribute}' not found in 'obsm' of {file}")

# UMAPs

In [None]:
# Method prefixes; '<name>_umap' is the adata.obsm key plotted for each method.
names =  ['scpoli', 'scanvi', 'expimap', 'sysvi', 'scvi', 'drvi']

In [None]:
# Show the working directory that relative output paths resolve against.
# Plain Python instead of the bare `pwd`, which only works through IPython automagic.
os.getcwd()

In [None]:
# Grid of UMAPs: one row per method in `names`, one column per annotation.
umap_keys = [i + '_umap' for i in names]
titles = [i + ' UMAP' for i in names]
color_params = ['Dataset', 'Technology', 'Level_1_refined']
ncols = len(color_params)
nrows = len(umap_keys)

# Shuffle the cell order so no group is systematically drawn on top of the
# others; the fixed seed keeps the figure identical across re-runs.
random_indices = np.random.default_rng(0).permutation(adata.n_obs)
adata_shuffled = adata[random_indices, :]  # built once, not once per panel

# squeeze=False always returns a 2-D axes array, even for a single row/column.
fig, axs = plt.subplots(nrows, ncols, figsize=(ncols * 6, nrows * 4), squeeze=False)

for row_idx, (umap_key, title) in enumerate(zip(umap_keys, titles)):
    for col_idx, color in enumerate(color_params):
        # Plot directly from the method-specific obsm key. This avoids
        # overwriting adata.obsm['X_umap'], which would otherwise be left
        # holding the last method's coordinates in the AnnData object.
        sc.pl.embedding(
            adata_shuffled,
            basis=umap_key,
            color=color,
            frameon=False,
            title=f"{title} - {color}",
            ax=axs[row_idx, col_idx],
            show=False,
        )

plt.tight_layout()
plt.savefig('manual_genes_extended_umap_all.png')
plt.show()

# Save

In [None]:
# Write the AnnData object (with everything currently stored in it) to an
# .h5ad file; the path is relative to the current working directory.
adata.write('adata_mg_binned_all_embeddings.h5ad')

# Benchmark

In [None]:
# Show the working directory that relative input paths resolve against.
# Plain Python instead of the bare `pwd`, which only works through IPython automagic.
os.getcwd()

In [None]:
# Every adata.obsm key whose name contains 'emb', in obsm order.
embeddings = [obsm_key for obsm_key in adata.obsm.keys() if 'emb' in obsm_key]

In [None]:
# Add the PCA representation to the list; guarded so that re-running this
# cell does not append a duplicate entry.
if 'X_pca' not in embeddings:
    embeddings.append('X_pca')

In [None]:
# Show the final list of representation keys.
embeddings

In [None]:
# subset = sc.pp.subsample(adata, fraction=0.1, copy=True)

In [None]:
# subset

In [None]:
# Inspect the AnnData summary, including the obsm keys added above.
adata

In [None]:
import pandas as pd

In [None]:
# Read the metrics table from CSV (path relative to the current working directory).
df_metrics = pd.read_csv('metrics/df_metrics_gpu.csv')

In [None]:
# Display the loaded metrics table.
df_metrics

# Subcluster

In [None]:
def subcluster_celltype(adata, cell_type, embed, resolution=0.5, min_dist=0.75):
    """Subset one cell type and re-cluster it on a given representation.

    Parameters
    ----------
    adata
        AnnData whose ``.obs`` has a ``Level_1_refined`` column.
    cell_type
        Pattern matched against ``Level_1_refined`` with ``str.contains``
        (interpreted as a regular expression, as before).
    embed
        Representation passed to ``sc.pp.neighbors`` as ``use_rep``.
    resolution
        Leiden resolution; defaults to the previously hard-coded 0.5.
    min_dist
        UMAP ``min_dist``; defaults to the previously hard-coded 0.75.

    Returns
    -------
    A copy of the matching cells with a cosine neighbour graph, ``leiden``
    labels and a UMAP computed. A UMAP coloured by Dataset, Technology and
    leiden is plotted as a side effect.

    Raises
    ------
    ValueError
        If no cell matches ``cell_type``.
    """
    # na=False: cells with a missing label never match, instead of yielding a
    # NaN mask that cannot be used for indexing.
    mask = adata.obs.Level_1_refined.str.contains(cell_type, na=False)
    if not mask.any():
        raise ValueError(f"No cells in 'Level_1_refined' match {cell_type!r}")
    adata_temp = adata[mask].copy()

    # Neighbourhood size is ~1% of the subset. The floor of 2 only matters for
    # subsets below 150 cells, where round(n_obs / 100) would be 0 or 1.
    n_neighbors = max(2, round(adata_temp.n_obs / 100))
    sc.pp.neighbors(adata_temp, use_rep=embed, metric='cosine', n_neighbors=n_neighbors)
    sc.tl.leiden(adata_temp, resolution=resolution)
    sc.tl.umap(adata_temp, min_dist=min_dist)
    sc.pl.umap(adata_temp, color=['Dataset', 'Technology', 'leiden'], frameon=False, wspace=0.5)
    return adata_temp

In [None]:
# Re-cluster the cells matching 'Acinar' once per representation; results are
# keyed by the representation name.
acinars = {}
for embed in embeddings:
    # Message fixed: the original read "for with embedding".
    print(f'Subclustering Acinar cells with embedding: {embed}')
    acinars[embed] = subcluster_celltype(adata, cell_type='Acinar', embed=embed)

In [None]:
# Show the dict of per-representation subclustered AnnData objects.
acinars