In [1]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import anndata
import scvi
import scanpy as sc
import matplotlib
import csv
import os
# Seed passed as `random_state` to the scanpy neighbors / UMAP / Leiden calls below.
# NOTE(review): the recorded output of this cell reports "Global seed set to 0",
# so this value does not appear to be the seed scvi itself uses - confirm whether
# the two are meant to match.
seed=10
# Working directory for the relative paths used later in the notebook.
# NOTE(review): absolute, user-specific path - will not exist on other machines.
os.chdir("/home/kloetzer/Atlas/objects")
# Print the versions of the loaded packages (recorded below for provenance).
sc.logging.print_versions()

Global seed set to 0


-----
anndata     0.8.0
scanpy      1.9.1
-----
PIL                 9.2.0
absl                NA
asttokens           NA
attr                22.1.0
backcall            0.2.0
cffi                1.15.1
chex                0.1.5
colorama            0.4.5
constants           NA
cycler              0.10.0
cython_runtime      NA
dateutil            2.8.2
debugpy             1.6.3
decorator           4.4.2
defusedxml          0.7.1
deprecate           0.3.2
docrep              0.3.2
entrypoints         0.4
etils               0.8.0
executing           1.0.0
flax                0.6.0
fsspec              2022.8.2
google              NA
h5py                3.7.0
highs_wrapper       NA
igraph              0.9.11
ipykernel           6.15.3
ipython_genutils    0.2.0
ipywidgets          8.0.2
jax                 0.3.17
jaxlib              0.3.15
jedi                0.18.0
joblib              1.2.0
kiwisolver          1.4.4
leidenalg           0.8.10
llvmlite            0.39.1
louvain             0.8

In [None]:
# Read the original AnnData file; the name is resolved against the current
# working directory.
atlas_h5ad = 'Atlas6.0_3000HVG.h5ad'
adata = scvi.data.read_h5ad(atlas_h5ad)


In [19]:
# Load the integration model (trained on Colab) and attach it to `adata`.
#
# The recorded output of this cell shows the weights being read from
# /home/kloetzer/Atlas/Models/Model_Atlas6.0_V1/, which is a sibling of the
# working directory set at the top of the notebook (/home/kloetzer/Atlas/objects).
# The previous literal started with a "..." placeholder, which is not a valid
# path component, so the cell could not be re-run as written.
model = scvi.model.SCVI.load("../Models/Model_Atlas6.0_V1", adata=adata, use_gpu=False)

[34mINFO    [0m File [35m/home/kloetzer/Atlas/Models/Model_Atlas6.0_V1/[0m[95mmodel.pt[0m already downloaded      


In [7]:
# Latent representation from the loaded model, stored on the AnnData object
# under obsm["X_scVI"] (used as `use_rep` by the neighbors/dendrogram steps).
latent = model.get_latent_representation()
adata.obsm["X_scVI"] = latent

# Model-normalized expression kept as a separate layer.
# Note: 10e4 evaluates to 100,000 (not 10,000).
adata.layers["scvi_normalized"] = model.get_normalized_expression(library_size=10e4)

In [8]:
# Neighbor graph on the scVI embedding, followed by the UMAP layout.
# Both steps receive the notebook-level seed.
# NOTE(review): `n_pcs` is given together with `use_rep`; confirm how scanpy
# combines the two for a non-PCA representation.
sc.pp.neighbors(
    adata,
    use_rep="X_scVI",
    n_pcs=30,
    random_state=seed,
)
sc.tl.umap(
    adata,
    min_dist=0.3,
    random_state=seed,
)

In [None]:
# One UMAP per metadata column, drawn at a 5x5 figure size.
sc.set_figure_params(figsize=(5,5))
for obs_key in ('proj', 'species'):
    sc.pl.umap(adata, color=[obs_key])

In [None]:
# Split the object by the `species` column.
adata_M = adata[adata.obs.species == 'mouse']
adata_R = adata[adata.obs.species == 'rat']
adata_H = adata[adata.obs.species == 'human']

# UMAP coloured by `proj`, one plot per species (mouse, rat, human).
for species_subset in (adata_M, adata_R, adata_H):
    sc.pl.umap(species_subset, color=['proj'])

In [None]:
# Mouse-only UMAP coloured by the original annotation, drawn twice: once with
# the labels placed on the embedding and once with the default legend.
# The two calls used to be byte-identical, so the second plot only repeated the
# first; the pair now follows the same pattern as the all-species cell below.
sc.pl.umap(adata_M, color=['original_annotation'], legend_loc='on data')
sc.pl.umap(adata_M, color=['original_annotation'])

In [None]:
# Original annotation on the full object: labels on the embedding first,
# then the same plot with the default legend placement.
annotation_key = ['original_annotation']
sc.pl.umap(adata, color=annotation_key, legend_loc='on data')
sc.pl.umap(adata, color=annotation_key)

In [None]:
# QC metrics on the UMAP; each metric has its own upper colour limit (vmax).
qc_color_caps = (
    ('percent.mt', 25),
    ('nCount_RNA', 20000),
    ('nFeature_RNA', 5000),
)
for qc_metric, color_cap in qc_color_caps:
    sc.pl.umap(adata, color=[qc_metric], vmax=color_cap)

In [None]:
# Wide figures: one violin plot per QC metric, grouped by `proj`.
sc.set_figure_params(figsize=(40,20))
for qc_metric in ('nFeature_RNA', 'nCount_RNA', 'percent.mt'):
    sc.pl.violin(adata, keys=qc_metric, groupby='proj')

In [None]:
# nCount_RNA per `proj`, drawn on an explicit axis so the y-range can be
# restricted to 0-2000 before the figure is shown.
fig, ax = plt.subplots()
sc.pl.violin(
    adata,
    keys='nCount_RNA',
    groupby='proj',
    jitter=0.4,
    rotation=90,
    ax=ax,
    show=False,
)
ax.set_ylim(0, 2000)
plt.show()

In [None]:
# nFeature_RNA per `proj`, drawn on an explicit axis so the y-range can be
# restricted to 0-2000 before the figure is shown.
fig, ax = plt.subplots()
sc.pl.violin(
    adata,
    keys='nFeature_RNA',
    groupby='proj',
    jitter=0.4,
    rotation=90,
    ax=ax,
    show=False,
)
ax.set_ylim(0, 2000)
plt.show()

In [5]:
# QC filter: keep cells with percent.mt < 15, nFeature_RNA > 300 and nCount_RNA > 300.
# TODO: run QC again and, if possible, make the cut-offs consistent.
#
# The three conditions are combined into a single boolean mask and the result is
# materialised with .copy(). Previously each condition sliced the previous slice,
# leaving `adata` as a view of a view of a view; the clustering cells below write
# new obs columns into `adata`, which should not happen on a view.
qc_pass = (
    (adata.obs['percent.mt'] < 15)
    & (adata.obs['nFeature_RNA'] > 300)
    & (adata.obs['nCount_RNA'] > 300)
)
adata = adata[qc_pass].copy()

In [None]:
# QC metrics on the UMAP after filtering, each with its own colour limit,
# followed by the summary of the filtered object.
sc.set_figure_params(figsize=(5,5))
qc_color_caps = (
    ('percent.mt', 15),
    ('nCount_RNA', 15000),
    ('nFeature_RNA', 3000),
)
for qc_metric, color_cap in qc_color_caps:
    sc.pl.umap(adata, color=[qc_metric], vmax=color_cap)
adata

In [None]:
# Leiden clustering at three resolutions; each result goes to its own obs column.
leiden_runs = (
    ("leiden_scVI_0_3", 0.3),
    ("leiden_scVI_0_5", 0.5),
    ("leiden_scVI_0_7", 0.7),
)
for obs_column, leiden_resolution in leiden_runs:
    sc.tl.leiden(
        adata,
        resolution=leiden_resolution,
        key_added=obs_column,
        random_state=seed,
    )

In [None]:
# Additional Leiden clustering at resolution 3.0.
sc.tl.leiden(
    adata,
    resolution=3.0,
    key_added="leiden_scVI_3_0",
    random_state=seed,
)

In [None]:
# One UMAP per Leiden resolution, with cluster labels drawn on the embedding.
sc.set_figure_params(figsize=(5,5))
leiden_columns = (
    'leiden_scVI_0_3',
    'leiden_scVI_0_5',
    'leiden_scVI_0_7',
    'leiden_scVI_3_0',
)
for obs_column in leiden_columns:
    sc.pl.umap(adata, color=[obs_column], legend_loc='on data')

In [None]:
# Re-derive the per-species subsets from the current `adata`.
adata_M = adata[adata.obs.species == 'mouse']
adata_R = adata[adata.obs.species == 'rat']
adata_H = adata[adata.obs.species == 'human']

# For each colouring, plot mouse, rat and human in turn.
for color_key in ('proj', 'leiden_scVI_0_7'):
    for species_subset in (adata_M, adata_R, adata_H):
        sc.pl.umap(species_subset, color=[color_key])

# Same clustering on the full object, labels on the embedding.
sc.pl.umap(adata, color=['leiden_scVI_0_7'], legend_loc = "on data")

In [None]:
# Use the resolution-0.3 Leiden clustering as the generic 'leiden' column.
adata.obs['leiden'] = adata.obs['leiden_scVI_0_3']

# Differential expression per cluster from the scVI model.
# NOTE(review): no `adata` is passed, so the model falls back to the AnnData it
# holds internally - confirm that object carries this 'leiden' column after the
# QC subsetting earlier in the notebook.
de_df = model.differential_expression(
    groupby="leiden")

# Collect up to 5 marker genes per cluster from its "<cluster> vs Rest" rows,
# keeping the order in which they appear in de_df.
markers = {}
cats = adata.obs.leiden.cat.categories
for c in cats:
    cid = "{} vs Rest".format(c)
    leiden_scVI_df = de_df.loc[de_df.comparison == cid]
    # lfc_mean > 0.75 already implies lfc_mean > 0, so one threshold is enough.
    leiden_scVI_df = leiden_scVI_df[leiden_scVI_df["lfc_mean"] > 0.75]
    leiden_scVI_df = leiden_scVI_df[leiden_scVI_df["bayes_factor"] > 3]
    leiden_scVI_df = leiden_scVI_df[leiden_scVI_df["non_zeros_proportion1"] > 0.1]
    markers[c] = leiden_scVI_df.index.tolist()[:5]

# Dendrogram and dotplot are produced once, after every cluster has its markers.
# They used to sit inside the loop, which recomputed the dendrogram and drew a
# dotplot of the partially filled `markers` dict on every iteration.
sc.tl.dendrogram(adata, groupby="leiden", use_rep="X_scVI")
sc.pl.dotplot(
    adata,
    markers,
    groupby='leiden',
    dendrogram=True,
    color_map="Blues",
    swap_axes=True,
    use_raw=True,
    standard_scale="var",
)

In [None]:
# Use the resolution-0.5 Leiden clustering as the generic 'leiden' column.
adata.obs['leiden'] = adata.obs['leiden_scVI_0_5']

# Differential expression per cluster from the scVI model.
# NOTE(review): no `adata` is passed, so the model falls back to the AnnData it
# holds internally - confirm that object carries this 'leiden' column after the
# QC subsetting earlier in the notebook.
de_df = model.differential_expression(
    groupby="leiden")

# Collect up to 5 marker genes per cluster from its "<cluster> vs Rest" rows,
# keeping the order in which they appear in de_df.
markers = {}
cats = adata.obs.leiden.cat.categories
for c in cats:
    cid = "{} vs Rest".format(c)
    leiden_scVI_df = de_df.loc[de_df.comparison == cid]
    # lfc_mean > 0.75 already implies lfc_mean > 0, so one threshold is enough.
    leiden_scVI_df = leiden_scVI_df[leiden_scVI_df["lfc_mean"] > 0.75]
    leiden_scVI_df = leiden_scVI_df[leiden_scVI_df["bayes_factor"] > 3]
    leiden_scVI_df = leiden_scVI_df[leiden_scVI_df["non_zeros_proportion1"] > 0.1]
    markers[c] = leiden_scVI_df.index.tolist()[:5]

# Dendrogram and dotplot are produced once, after every cluster has its markers.
# They used to sit inside the loop, which recomputed the dendrogram and drew a
# dotplot of the partially filled `markers` dict on every iteration.
sc.tl.dendrogram(adata, groupby="leiden", use_rep="X_scVI")
sc.pl.dotplot(
    adata,
    markers,
    groupby='leiden',
    dendrogram=True,
    color_map="Blues",
    swap_axes=True,
    use_raw=True,
    standard_scale="var",
)

In [None]:
# Use the resolution-0.7 Leiden clustering as the generic 'leiden' column.
adata.obs['leiden'] = adata.obs['leiden_scVI_0_7']

# Differential expression per cluster from the scVI model.
# NOTE(review): no `adata` is passed, so the model falls back to the AnnData it
# holds internally - confirm that object carries this 'leiden' column after the
# QC subsetting earlier in the notebook.
de_df = model.differential_expression(
    groupby="leiden")

# Collect up to 10 marker genes per cluster from its "<cluster> vs Rest" rows,
# keeping the order in which they appear in de_df.
# NOTE(review): the cells for the other resolutions keep 5 genes per cluster;
# confirm that 10 is intended here.
markers = {}
cats = adata.obs.leiden.cat.categories
for c in cats:
    cid = "{} vs Rest".format(c)
    leiden_scVI_df = de_df.loc[de_df.comparison == cid]
    # lfc_mean > 0.75 already implies lfc_mean > 0, so one threshold is enough.
    leiden_scVI_df = leiden_scVI_df[leiden_scVI_df["lfc_mean"] > 0.75]
    leiden_scVI_df = leiden_scVI_df[leiden_scVI_df["bayes_factor"] > 3]
    leiden_scVI_df = leiden_scVI_df[leiden_scVI_df["non_zeros_proportion1"] > 0.1]
    markers[c] = leiden_scVI_df.index.tolist()[:10]

# Dendrogram and dotplot are produced once, after every cluster has its markers.
# They used to sit inside the loop, which recomputed the dendrogram and drew a
# dotplot of the partially filled `markers` dict on every iteration.
sc.tl.dendrogram(adata, groupby="leiden", use_rep="X_scVI")
sc.pl.dotplot(
    adata,
    markers,
    groupby='leiden',
    dendrogram=True,
    color_map="Blues",
    swap_axes=True,
    use_raw=True,
    standard_scale="var",
)

In [None]:
# Use the resolution-3.0 Leiden clustering as the generic 'leiden' column.
adata.obs['leiden'] = adata.obs['leiden_scVI_3_0']

# Differential expression per cluster from the scVI model.
# NOTE(review): no `adata` is passed, so the model falls back to the AnnData it
# holds internally - confirm that object carries this 'leiden' column after the
# QC subsetting earlier in the notebook.
de_df = model.differential_expression(
    groupby="leiden")

# Collect up to 5 marker genes per cluster from its "<cluster> vs Rest" rows,
# keeping the order in which they appear in de_df.
markers = {}
cats = adata.obs.leiden.cat.categories
for c in cats:
    cid = "{} vs Rest".format(c)
    leiden_scVI_df = de_df.loc[de_df.comparison == cid]
    # lfc_mean > 0.75 already implies lfc_mean > 0, so one threshold is enough.
    leiden_scVI_df = leiden_scVI_df[leiden_scVI_df["lfc_mean"] > 0.75]
    leiden_scVI_df = leiden_scVI_df[leiden_scVI_df["bayes_factor"] > 3]
    leiden_scVI_df = leiden_scVI_df[leiden_scVI_df["non_zeros_proportion1"] > 0.1]
    markers[c] = leiden_scVI_df.index.tolist()[:5]

# Dendrogram and dotplot are produced once, after every cluster has its markers.
# They used to sit inside the loop, which recomputed the dendrogram and drew a
# dotplot of the partially filled `markers` dict on every iteration.
sc.tl.dendrogram(adata, groupby="leiden", use_rep="X_scVI")
sc.pl.dotplot(
    adata,
    markers,
    groupby='leiden',
    dendrogram=True,
    color_map="Blues",
    swap_axes=True,
    use_raw=True,
    standard_scale="var",
)

In [None]:
# Wide figures: one violin plot per QC metric, grouped by the resolution-0.7 clusters.
sc.set_figure_params(figsize=(40,10))
for qc_metric in ('nFeature_RNA', 'nCount_RNA', 'percent.mt'):
    sc.pl.violin(adata, keys=qc_metric, groupby='leiden_scVI_0_7')

In [None]:
# Drop every cell whose resolution-0.7 Leiden label is in the removal list,
# then display the summary of the remaining object.
clusters_remove = ['0', '18', '22', '23', '24', '25']
in_removed_cluster = adata.obs['leiden_scVI_0_7'].isin(clusters_remove)
adata = adata[~in_removed_cluster, :]
adata

In [None]:
# UMAP of the remaining cells, resolution-0.7 cluster labels on the embedding.
sc.set_figure_params(figsize=(5,5))
sc.pl.umap(
    adata,
    color=['leiden_scVI_0_7'],
    legend_loc='on data',
)

In [35]:
# Export the identifiers (obs_names) of the cells that remain in `adata`.
cell_names = adata.obs_names
# Wrap the names in a DataFrame so they can be written with to_csv.
df = pd.DataFrame(cell_names)
# Write column 0 with a header row.
# NOTE(review): the path starts with a "..." placeholder - the real target
# directory is not visible here and must be filled in before re-running.
df.to_csv('.../Atlas6.0_V1_cleaned.csv', columns=[0], header=True)

In [None]:
# The exported cell identifiers are used to subset the raw merged AnnData object before the next integration run