In [None]:
import sys
import pandas as pd
import numpy as np
import anndata
import scvi
import scanpy as sc
import matplotlib
import os

# Single random seed for the whole notebook: used for scVI here and passed as
# `random_state` to the scanpy neighbors / UMAP / Leiden calls further down.
seed = 10

# Assign from `seed` instead of repeating the literal so the two cannot drift apart.
scvi.settings.seed = seed

# Record package versions alongside the results for provenance.
sc.logging.print_versions()



In [None]:
# Integrated, cleaned atlas (file name indicates a 3000-HVG object); it supplies the
# UMAP embedding and the Leiden clusters used for the level-0 annotation below.
integrated_atlas_path = ".../Atlas/Atlas_human_extension_II_3000HVG_integrated_cleaned1.h5ad"
adata = sc.read(integrated_atlas_path)
adata

In [None]:
# Overview UMAPs: first coloured by `proj` with the default legend, then one plot
# per annotation / clustering column with the labels drawn on the embedding.
sc.pl.umap(adata, color=['proj'])
for obs_column in ('annotation_final_level1B', 'annotation_final_level1', 'leiden_scVI_0_5'):
    sc.pl.umap(adata, color=[obs_column], legend_loc="on data")



In [None]:
# Coarse (level-0) compartment label for every Leiden cluster of `leiden_scVI_0_5`.
label_mapping = {
    '0': 'TAL_MD',
    '1': 'PT',
    '2': 'DT',
    '3': 'EC_Stromal',
    '4': 'PT',
    '5': 'EC_Stromal',
    '6': 'DT',
    '7': 'TAL_MD',
    '8': 'IC',
    '9': 'TL',
    '10': 'Immune',
    '11': 'Immune',
    '12': 'Podo',
    '13': 'EC_Stromal',
    '14': 'IC',
    '15': 'PEC',
    '16': 'DT',
    '17': 'EC_Stromal',
    '18': 'EC_Stromal',
}

# Work on plain strings: several clusters collapse onto the same label, and
# `Series.replace` on a categorical column is deprecated in recent pandas.
leiden_cluster_ids = adata.obs["leiden_scVI_0_5"].astype(str)

# Fail loudly if the clustering contains an id that has no label; otherwise the
# raw cluster number would silently end up as an "annotation".
unmapped_clusters = sorted(set(leiden_cluster_ids.unique()) - set(label_mapping))
if unmapped_clusters:
    raise ValueError(
        f"leiden_scVI_0_5 clusters without an annotation_level0 label: {unmapped_clusters}"
    )

# Translate cluster ids into labels and store them in adata.obs["annotation_level0"].
adata.obs["annotation_level0"] = leiden_cluster_ids.map(label_mapping).astype("category")



In [None]:
# Distinct cell names present in the cleaned atlas; used to filter the raw atlas.
cells_keep = list(adata.obs_names.unique())

In [None]:
# Non-integrated atlas object; presumably still holds all genes and raw counts
# (its `.X` is copied into a "counts" layer later on) — TODO confirm.
raw_atlas_path = ".../Atlas/Atlas_human_extension_II.h5ad"
adata_raw = sc.read(raw_atlas_path)

adata_raw

In [None]:
# Keep only the cells that survived cleaning in the integrated atlas.
# `.copy()` turns the slice into a real AnnData object instead of a view, so the
# `.obs` assignment below does not trigger an implicit copy with a warning.
adata_raw = adata_raw[adata_raw.obs_names.isin(cells_keep)].copy()

# Transfer the level-0 labels; pandas aligns the values on the cell-name index.
adata_raw.obs["annotation_level0"] = adata.obs["annotation_level0"]

In [None]:
# Level-0 compartments to sub-cluster. Each one gets its own scVI model,
# embedding, Leiden clustering, QC violins and saved .h5ad file.
celltypes = ["Podo"]

# To process every level-0 compartment instead, use:
#celltypes = adata_raw.obs.annotation_level0.unique().tolist()

for celltype in celltypes:

    # Output locations for this compartment; the model directory and the
    # AnnData file both live under `directory`.
    directory = f'.../Atlas/human_extension/Cleaning_II/Sub_cleaning/{celltype}'
    path_to_save = f'{directory}/scVI'
    path_to_save_anndata = f'{directory}/{celltype}_subcluster.h5ad'

    # NOTE(review): this rebinds the notebook-level name `adata` (previously the
    # integrated atlas) to the per-compartment subset. Re-running earlier cells
    # after this loop will therefore see the subset; the name is kept because
    # cells outside this view may rely on it.
    adata = adata_raw[adata_raw.obs.annotation_level0==celltype].copy()

    # Nothing to model if no cell carries this label; skip instead of failing
    # somewhere inside normalisation / HVG selection.
    if adata.n_obs == 0:
        print(f"Skipping {celltype}: no cells with this annotation_level0 label")
        continue

    os.makedirs(directory, exist_ok=True)

    # Keep the untouched matrix as a layer before normalising `.X`
    # (assumes `adata_raw.X` holds raw counts — TODO confirm).
    adata.layers["counts"] = adata.X.copy()
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    adata.raw = adata  # freeze the log-normalised, all-gene state in `.raw`

    # HVG selection on the counts layer; `subset=True` drops all other genes.
    # `span` was increased to 0.5 after hitting the
    # "There are other near singularities as well" error.
    sc.pp.highly_variable_genes(
        adata,
        n_top_genes=3000,
        subset=True,
        layer="counts",
        flavor="seurat_v3",
        #batch_key="proj",
        span=0.5)

    # Register counts, batch and covariates with scVI.
    scvi.model.SCVI.setup_anndata(
        adata,
        batch_key="proj",
        layer="counts", categorical_covariate_keys=["orig_ident"],
        continuous_covariate_keys=["percent_mt"])

    # Only the configured model is built; the original also constructed (and
    # immediately discarded) a default-parameter SCVI model. Dropping it changes
    # the random-number stream, so results will not be bit-identical to earlier runs.
    vae = scvi.model.SCVI(adata, n_layers=3, n_latent=50, gene_likelihood="nb", dropout_rate=0.1)
    vae.train(max_epochs=600, plan_kwargs={"lr": 0.001}, early_stopping=True, early_stopping_patience=15)
    # `model` and `vae` refer to the same object; both names are kept in case
    # cells outside this view use either of them.
    model = vae

    model.save(path_to_save, overwrite=True)

    # Store the latent space and scVI-normalised expression on the AnnData object.
    latent = model.get_latent_representation()
    adata.obsm["X_scVI"] = latent
    # NOTE(review): 10e4 equals 1e5, whereas normalize_total above uses 1e4 —
    # confirm the intended library size.
    adata.layers["scvi_normalized"] = model.get_normalized_expression(
        library_size=10e4)

    # Neighbour graph and UMAP computed on the scVI latent space.
    sc.pp.neighbors(adata, n_pcs=50, use_rep="X_scVI", random_state=seed)

    sc.tl.umap(adata, min_dist=0.3, random_state=seed)

    sc.pl.umap(adata, color=['proj'])

    # High-resolution Leiden clustering of the compartment.
    sc.tl.leiden(adata, key_added="leiden_scVI_3_0", resolution=3.0, random_state=seed)

    sc.pl.umap(adata, color=['leiden_scVI_3_0'], legend_loc='on data')

    # Wide figures for the per-cluster QC violins, then restore the square default.
    sc.set_figure_params(figsize=(15,5))

    for qc_metric in ('nCount_RNA', 'nFeature_RNA', 'percent_mt'):
        sc.pl.violin(adata, keys=qc_metric, groupby='leiden_scVI_3_0', rotation=90)

    sc.set_figure_params(figsize=(5,5))

    adata.write(path_to_save_anndata)