In [None]:
import warnings
# Suppress every warning for the rest of the session, including those raised by
# the libraries imported in the following cells.
warnings.filterwarnings("ignore")

In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
from pyclustree import clustree
import matplotlib.pyplot as plt

In [None]:
# Figure resolution for scanpy plots.
sc.set_figure_params(dpi=100)
# Truncate displayed pandas objects to 10 rows.
pd.set_option('display.max_rows', 10)

In [None]:
# Load the AnnData object; the path is relative to the current working directory.
adata = sc.read_h5ad('final_scanVI/final_object.h5ad')

In [None]:
# Show the summary representation of the loaded object.
adata

In [None]:
# adata.obs columns that hold the annotation levels drawn below.
cluster_keys = [
    'Level_1',
    'Level_2',
    'Level_3',
    'Level_4',
    'Level_4_refined',
    'Level_5',
]

In [None]:
# One UMAP panel per annotation column, three panels per row, labels drawn on the embedding.
umap_options = dict(
    ncols=3,
    size=2,
    wspace=0.5,
    legend_loc='on data',
    legend_fontsize=6,
    legend_fontoutline=3,
    frameon=False,
)
sc.pl.umap(adata, color=cluster_keys, **umap_options)

In [None]:
# Pick one Level_4 population to inspect by hand.
# `cell_type` is defined here explicitly: it was previously only set by the loop
# in a later cell, so this cell raised NameError on a fresh "Restart & Run All".
cell_type = adata.obs.Level_4.dropna().unique().tolist()[0]
adata_temp = adata[adata.obs.Level_4 == cell_type].copy()

In [None]:
# List the distinct Level_5 labels present in the current subset.
adata_temp.obs.Level_5.unique()

In [None]:
# Draw one clustree per Level_4 population that contains more than one Level_5 label.
# dropna(): a missing Level_4 value would select an empty subset.
for cell_type in adata.obs.Level_4.dropna().unique().tolist():
    # Subset is taken as a view (no .copy()); it is only passed to clustree.
    adata_temp = adata[adata.obs.Level_4 == cell_type]
    if adata_temp.obs.Level_5.nunique() == 1:
        print('-'*50)
        print(f'Skipping because only 1 unique Level 5 Cell Type in {cell_type}')
        print('-'*50)
        continue
    fig = clustree(
        adata_temp,
        cluster_keys,
        title=f'{cell_type}',
        show_fraction=True,
        node_size_range=(5000, 10000), x_spacing = 2,
    )

    # Adjust figure size and resolution
    fig.set_size_inches(30, 30)
    fig.set_dpi(100)
    fig.tight_layout()
    # plt.show() renders the figure right here, next to the messages printed for
    # this population; fig.show() does not render on a non-GUI notebook backend.
    plt.show()
    # Release the 30x30 in figure before the next iteration instead of keeping
    # every figure of the loop open at once.
    plt.close(fig)

In [None]:
# Clustree of the cells whose Level_4_refined label contains the literal text 'CD3+'.
# regex=False: read as a regular expression, 'CD3+' means "CD followed by one or
# more 3s" and therefore also matches labels that only contain 'CD3'.
# na=False: missing labels count as "no match", keeping the mask strictly boolean.
cd3_in_level_4_refined = adata.obs.Level_4_refined.str.contains('CD3+', regex=False, na=False)
adata_temp = adata[cd3_in_level_4_refined]
fig = clustree(
    adata_temp,
    cluster_keys,
    title="Clustree",
    show_fraction=True,
    node_size_range=(5000, 10000), x_spacing = 2,
)

# Adjust figure size and resolution
fig.set_size_inches(30, 30)
fig.set_dpi(100)
fig.tight_layout()
fig.show()

In [None]:
# Clustree of the cells whose Level_5 label contains the literal text 'CD3+'.
# regex=False: read as a regular expression, 'CD3+' means "CD followed by one or
# more 3s" and therefore also matches labels that only contain 'CD3'.
# na=False: missing labels count as "no match", keeping the mask strictly boolean.
cd3_in_level_5 = adata.obs.Level_5.str.contains('CD3+', regex=False, na=False)
adata_temp = adata[cd3_in_level_5]
fig = clustree(
    adata_temp,
    cluster_keys,
    title="Clustree",
    show_fraction=True,
    node_size_range=(5000, 10000), x_spacing = 2,
)

# Adjust figure size and resolution
fig.set_size_inches(30, 30)
fig.set_dpi(100)
fig.tight_layout()
fig.show()

# Fix CD3+ in Level_5

In [None]:
# Rows where Level_4_refined and Level_5 disagree on whether the label contains 'CD3+'.
# The mask is a plain boolean Series: wrapped in a list it becomes a one-element
# list, which .loc cannot use as a per-row mask.
# regex=False matches the literal '+'; na=False treats missing labels as "no match".
has_cd3_level_4_refined = adata.obs.Level_4_refined.str.contains('CD3+', regex=False, na=False)
has_cd3_level_5 = adata.obs.Level_5.str.contains('CD3+', regex=False, na=False)
# NOTE(review): `!=` also selects rows that carry 'CD3+' only in Level_5 -
# confirm that those should be relabelled as well.
mask = has_cd3_level_4_refined != has_cd3_level_5

In [None]:
# Overwrite Level_5 for the selected rows with the CD3+ TAM label.
cd3_tam_label = 'Macrophage - CD3+ TAM'
level_5 = adata.obs['Level_5']
# A categorical column rejects values that are not registered categories, so
# register the label first when it is missing. Other dtypes need no preparation.
if isinstance(level_5.dtype, pd.CategoricalDtype) and cd3_tam_label not in level_5.cat.categories:
    adata.obs['Level_5'] = level_5.cat.add_categories([cd3_tam_label])
adata.obs.loc[mask, 'Level_5'] = cd3_tam_label

In [None]:
# Same view as before the relabelling: cells whose Level_4_refined label contains
# the literal text 'CD3+'.
# regex=False: read as a regular expression, 'CD3+' means "CD followed by one or
# more 3s" and therefore also matches labels that only contain 'CD3'.
# na=False: missing labels count as "no match", keeping the mask strictly boolean.
cd3_in_level_4_refined = adata.obs.Level_4_refined.str.contains('CD3+', regex=False, na=False)
adata_temp = adata[cd3_in_level_4_refined]
fig = clustree(
    adata_temp,
    cluster_keys,
    title="Clustree",
    show_fraction=True,
    node_size_range=(5000, 10000), x_spacing = 2,
)

# Adjust figure size and resolution
fig.set_size_inches(30, 30)
fig.set_dpi(100)
fig.tight_layout()
fig.show()

In [None]:
# Same view as before the relabelling: cells whose Level_5 label contains the
# literal text 'CD3+'.
# regex=False: read as a regular expression, 'CD3+' means "CD followed by one or
# more 3s" and therefore also matches labels that only contain 'CD3'.
# na=False: missing labels count as "no match", keeping the mask strictly boolean.
cd3_in_level_5 = adata.obs.Level_5.str.contains('CD3+', regex=False, na=False)
adata_temp = adata[cd3_in_level_5]
fig = clustree(
    adata_temp,
    cluster_keys,
    title="Clustree",
    show_fraction=True,
    node_size_range=(5000, 10000), x_spacing = 2,
)

# Adjust figure size and resolution
fig.set_size_inches(30, 30)
fig.set_dpi(100)
fig.tight_layout()
fig.show()

# UMAP

In [None]:
# Annotation columns for the next UMAP; plain 'Level_4' is left out here.
cluster_keys = [
    'Level_1',
    'Level_2',
    'Level_3',
    'Level_4_refined',
    'Level_5',
]

In [None]:
# One UMAP panel per annotation column, three panels per row, with a small on-data legend font.
umap_options = dict(
    ncols=3,
    size=2,
    wspace=0.5,
    legend_loc='on data',
    legend_fontsize=2,
    legend_fontoutline=3,
    frameon=False,
)
sc.pl.umap(adata, color=cluster_keys, **umap_options)

# Load the Level_4 to Level_1-3 mapping table

In [None]:
# Semicolon-separated lookup table; no column is used as the index (pandas default).
df_map = pd.read_csv('Level_4_to_Level_1.csv', sep=';')

In [None]:
# Display the mapping table for a visual check.
df_map

In [None]:
# Work on a copy so adata.obs stays untouched until the rebuilt table is assigned back.
obs = adata.obs.copy()

In [None]:
# Remove the previous hierarchy columns; only Level_5 is kept as the starting point.
superseded_levels = ['Level_1', 'Level_2', 'Level_3', 'Level_4_refined', 'Level_4']
obs = obs.drop(columns=superseded_levels)

In [None]:
# Rebuild Level_1..Level_3 from Level_5, looking each label up in df_map's Level_4 column.
level_5_labels = obs['Level_5']
for level in ('Level_1', 'Level_2', 'Level_3'):
    lookup = dict(zip(df_map['Level_4'], df_map[level]))
    obs[level] = level_5_labels.map(lookup)

# A label that is absent from the table maps to NaN without raising, so report
# such labels. print() is used because warnings are silenced in this notebook.
unmapped = set(level_5_labels.dropna().unique()) - set(df_map['Level_4'])
if unmapped:
    print(f'WARNING: Level_5 labels missing from df_map.Level_4: {sorted(unmapped, key=str)}')

In [None]:
# From here on the former Level_5 column is the Level_4 column.
obs = obs.rename(columns={'Level_5': 'Level_4'})

In [None]:
# Columns to keep, in their final order: the rebuilt Level_1..Level_4 columns
# sit just before the two _scvi_* columns.
ordered_columns = [
    'Barcode', 'Dataset', 'ID_batch_covariate', 'Unique_ID', 'Technology',
    'n_genes', 'n_counts', 'log_counts', 'mt_frac',
    'n_genes_by_counts', 'log1p_n_genes_by_counts',
    'total_counts', 'log1p_total_counts',
    'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito',
    'leiden', 'batch',
    'leiden_0.2', 'leiden_0.2_annotation',
    'leiden_subcluster', 'level0_leiden_subcluster', 'leiden_0.5',
    'is_outlier_total_counts', 'outlier',
    'infercnv_score_malignant', 'infercnv_score_malignant_refined',
    'cnv_score_abs', 'treatment_status',
    'Level_0', 'MALAT1_lognorm', 'empty_droplet', 'ID_harmonised',
    'Dataset_unique', 'Tissue', 'Age', 'Sex', 'Diabetes', 'Treatment',
    'Global_Leiden', 'Treatment_Harmonized', 'Treatment_Category',
    'Myeloid_leiden_0.75', 'Fibroblast_leiden_0.75', 'Lymphoid_leiden_0.75',
    'Endothelial_Cell_leiden_0.75', 'Malignant_leiden_0.75',
    'Ductal_Cell_leiden_0.75', 'Schwann_Cell_leiden_0.75',
    'Adipocyte_leiden_0.75', 'Endocrine_Cell_leiden_0.75',
    'Acinar_Cell_leiden_0.75', 'Pericyte_leiden_0.75',
    'Smooth_Muscle_Cell_leiden_0.75', 'NK_Cell_leiden_0.75',
    'Condition', 'combo',
    'EMT category', 'EMT score', 'EMT_score_DL',
    'Suspicious_Normal',
    'Level_1', 'Level_2', 'Level_3', 'Level_4',
    '_scvi_batch', '_scvi_labels',
]
obs = obs[ordered_columns]

In [None]:
# Preview the first rows of the rebuilt table.
obs.head()

In [None]:
# Replace the object's obs table with a copy of the rebuilt one.
adata.obs = obs.copy()

In [None]:
# UMAP of the rebuilt hierarchy, two panels per row.
new_level_columns = ['Level_1', 'Level_2', 'Level_3', 'Level_4']
sc.pl.umap(
    adata,
    color=new_level_columns,
    ncols=2,
    frameon=False,
    legend_fontsize=5,
    wspace=0.75,
)

In [None]:
# Annotation columns of the rebuilt hierarchy.
cluster_keys = [
    'Level_1',
    'Level_2',
    'Level_3',
    'Level_4',
]

In [None]:
# Draw one clustree of the rebuilt hierarchy per Level_1 population.
# dropna(): a missing Level_1 value would select an empty subset.
for cell_type in adata.obs.Level_1.dropna().unique().tolist():
    # Subset is taken as a view (no .copy()); it is only passed to clustree.
    adata_temp = adata[adata.obs.Level_1 == cell_type]
    fig = clustree(
        adata_temp,
        cluster_keys,
        title=f'{cell_type}',
        show_fraction=True,
        node_size_range=(5000, 5000), x_spacing = 2)
    # Adjust figure size and resolution
    fig.set_size_inches(30, 30)
    fig.set_dpi(100)
    # Reduce font size of all text elements
    for ax in fig.axes:
        for text in ax.texts:
            text.set_fontsize(4)
    fig.tight_layout()
    # plt.show() renders the figure inside the loop; fig.show() does not render
    # on a non-GUI notebook backend.
    plt.show()
    # Release the 30x30 in figure before the next iteration instead of keeping
    # every figure of the loop open at once.
    plt.close(fig)

In [None]:
# Correct the misspelling 'Adypocyte' in every obs column.
# NOTE(review): with a plain string, replace() matches whole cell values only -
# labels that merely contain 'Adypocyte' would stay unchanged; confirm that is intended.
adata.obs = adata.obs.replace('Adypocyte', 'Adipocyte')

In [None]:
# Lift the row limit so the whole table is shown; the option stays in effect for later cells.
pd.set_option('display.max_rows', None)
# Cell counts per (Level_4, Level_2) pair, with Level_2 values spread over the columns.
adata.obs.groupby(['Level_4', 'Level_2']).size().unstack()

In [None]:
# Show the summary representation after the obs table was replaced.
adata

# Add the annotations to the AnnData object with all genes

In [None]:
# Load the second AnnData object; the path is relative to the current working directory.
adata_all_genes = sc.read_h5ad('2025_05_20_refined_annotation.h5ad')

In [None]:
# Show the summary representation of the all-genes object.
adata_all_genes

In [None]:
# Copy obs, obsm, obsp and uns from the annotated object onto the all-genes object.
# These assignments are positional (row i of adata becomes row i of
# adata_all_genes), so both objects must hold the same cells in the same order.
# Check that first rather than risk attaching labels to the wrong cells.
if not adata_all_genes.obs_names.equals(adata.obs_names):
    if set(adata_all_genes.obs_names) != set(adata.obs_names):
        raise ValueError(
            'adata and adata_all_genes do not contain the same cells; '
            'refusing to copy annotations positionally.'
        )
    # Same cells in a different order: reorder the all-genes object to match.
    adata_all_genes = adata_all_genes[adata.obs_names].copy()

adata_all_genes.obs = adata.obs.copy()
adata_all_genes.obsm = adata.obsm.copy()
adata_all_genes.obsp = adata.obsp.copy()
adata_all_genes.uns = adata.uns.copy()

In [None]:
# UMAP of the all-genes object coloured by the transferred hierarchy, two panels per row.
transferred_levels = ['Level_1', 'Level_2', 'Level_3', 'Level_4']
sc.pl.umap(
    adata_all_genes,
    color=transferred_levels,
    ncols=2,
    frameon=False,
    legend_fontsize=5,
    wspace=0.75,
)

In [None]:
# Write both objects to disk; paths are relative to the current working directory.
adata.write('final_scanVI/final_object_mg.h5ad')
adata_all_genes.write('final_scanVI/final_object_all_genes.h5ad')

In [None]:
# Take a copy of the final obs table for export.
obs = adata.obs.copy()

In [None]:
# Export the obs table as CSV into the current working directory.
obs.to_csv('final_object_obs.csv')

In [None]:
# IPython `pwd` magic: shows the working directory that the relative paths above resolve against.
# NOTE(review): the bare form relies on IPython automagic being enabled.
pwd