In [None]:
import sys
import pandas as pd
import numpy as np
import anndata
import scvi
import scanpy as sc
import matplotlib
import os

# Single source of truth for randomness: reused for scvi-tools here and for
# the scanpy neighbours/UMAP calls further down.
seed = 10

scvi.settings.seed = seed

# Record the package versions of this environment in the notebook output.
sc.logging.print_versions()

# Directory the trained scVI model is saved to.
path_to_save = ".../Atlas/human_extension/scVI/A_Ext_II_No2"



In [None]:
# Load the human extension atlas from its .h5ad file.
adata = sc.read(filename=".../Atlas/Atlas_human_extension_II.h5ad")

# Show the AnnData summary as the cell output.
adata

In [None]:

# Step 1: Collect every barcode listed in the "Barcodes" column of the CSV
# files found in the removed-barcodes folder.
folder_path = '.../Atlas/human_extension/Cleaning_II/removed_barcodes'  # Replace with your folder path
all_barcodes = set()

csv_names = [entry for entry in os.listdir(folder_path) if entry.endswith('.csv')]
for csv_name in csv_names:
    removed = pd.read_csv(os.path.join(folder_path, csv_name), header=0)
    all_barcodes.update(removed["Barcodes"].tolist())

# Step 2: Keep only the cells whose barcode is NOT in the collected set.
is_removed = adata.obs_names.isin(all_barcodes)
adata = adata[~is_removed].copy()

# Step 3: Additionally drop cells labelled "Doublet" in obs["doubletfinder"].
is_doublet = adata.obs["doubletfinder"].isin(["Doublet"])
adata = adata[~is_doublet].copy()


In [None]:
# NOTE(review): this cell was originally headed "change batch key to species",
# but the batch key used below is "proj" - confirm which one is intended.

# Keep a copy of X (presumably raw counts at this point) in its own layer
# before X is normalised in place.
adata.layers["counts"] = adata.X.copy()

# Normalise each cell to a total of 1e4, then log1p-transform X.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# Freeze the normalised, log-transformed state (before gene subsetting) in `.raw`.
adata.raw = adata

# Pick the top 3000 highly variable genes from the "counts" layer, using
# "proj" as the batch key, and subset `adata` to those genes.
sc.pp.highly_variable_genes(
    adata,
    layer="counts",
    flavor="seurat_v3",
    batch_key="proj",
    n_top_genes=3000,
    subset=True,
)

In [6]:


# Checkpoint: cleaned, HVG-subset object, written before scVI integration.
adata.write(filename=".../Atlas/Atlas_human_extension_II_3000HVG_cleaned2.h5ad")



In [7]:

# Register with scVI which parts of `adata` to use: the "counts" layer as
# input, "proj" as batch, "orig_ident" as an extra categorical covariate and
# "percent_mt" as a continuous covariate.
scvi.model.SCVI.setup_anndata(
    adata,
    layer="counts",
    batch_key="proj",
    categorical_covariate_keys=["orig_ident"],
    continuous_covariate_keys=["percent_mt"],
)

# Build the model that is actually trained. Only this one model is
# instantiated; constructing an extra, never-trained default SCVI(adata)
# would just allocate a second network.
# NOTE(review): model construction may draw from the seeded RNG, so the exact
# training trajectory can depend on how many models were built beforehand.
vae = scvi.model.SCVI(
    adata,
    n_layers=3,
    n_latent=30,
    gene_likelihood="nb",
    dropout_rate=0.1,
)

# Train for at most 600 epochs at lr=1e-3; early stopping ends the run once
# the monitored metric has not improved for 15 checks.
vae.train(
    max_epochs=600,
    plan_kwargs={"lr": 0.001},
    early_stopping=True,
    early_stopping_patience=15,
)

# The following cells refer to the trained model as `model`.
model = vae

I0000 00:00:1701831921.485895  338380 tfrt_cpu_pjrt_client.cc:349] TfrtCpuClient created.
No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA A100 80GB PCIe') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 393/600:  66%|██████▌   | 393/600 [3:46:25<1:59:15, 34.57s/it, v_num=1, train_loss_step=913, train_loss_epoch=951]
Monitored metric elbo_validation did not improve in the last 15 records. Best score: 948.971. Signaling Trainer to stop.


In [8]:
# Persist the trained model to `path_to_save`, replacing any previous save there.
model.save(dir_path=path_to_save, overwrite=True)

In [9]:
# Store the model's latent representation of every cell in obsm["X_scVI"].
latent = model.get_latent_representation()
adata.obsm["X_scVI"] = latent

# Store scVI-normalised expression as a layer.
# NOTE: the library size here is 1e5 (originally spelled `10e4`, the same
# value), which is ten times the 1e4 target used for `normalize_total`.
scvi_normalized = model.get_normalized_expression(library_size=1e5)
adata.layers["scvi_normalized"] = scvi_normalized

In [10]:
# Build the neighbour graph on the scVI latent space (obsm["X_scVI"]) rather
# than on PCA; `n_pcs=30` matches the model's n_latent=30.
sc.pp.neighbors(
    adata,
    use_rep="X_scVI",
    n_pcs=30,
    random_state=seed,
)

# Compute the UMAP embedding from that graph with the same seed.
sc.tl.umap(
    adata,
    random_state=seed,
    min_dist=0.3,
)

In [12]:

# Final output: object with the scVI latent space, normalised layer,
# neighbour graph and UMAP attached.
adata.write(filename=".../Atlas/Atlas_human_extension_II_3000HVG_cleaned2_integrated.h5ad")