In [None]:
#Atlas 6.1 - cleaned after 6.0

In [None]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import anndata
import scvi
import scanpy as sc
import matplotlib
import csv
import os
# Seed handed to the stochastic scanpy steps below (neighbors, UMAP, Leiden) via random_state.
seed=10
# Relative file names used later (the .h5ad input and output) resolve against this directory.
os.chdir(".../Atlas/objects")
# Print the versions of the loaded packages so the run is traceable.
sc.logging.print_versions()

In [None]:
#original file
# Read the Atlas 6.1 AnnData object from the current working directory
# (3000 highly variable genes, judging by the file name).
adata= scvi.data.read_h5ad('Atlas6.1_3000HVG.h5ad')
# Last expression of the cell: shows the AnnData summary.
adata

In [None]:
# Restore the saved scVI model and attach it to `adata`; use_gpu=False is passed so loading does not request a GPU.
model = scvi.model.SCVI.load(".../Atlas/Models/Model_Atlas6.1_V2", adata=adata, use_gpu=False)

In [None]:
# Latent representation of every cell from the scVI model; stored in obsm so
# that neighbors/UMAP/dendrogram below can use it via use_rep="X_scVI".
latent = model.get_latent_representation()
adata.obsm["X_scVI"] = latent
# Model-based normalized expression kept as an extra layer.
# NOTE(review): 10e4 evaluates to 100,000 (1e5), not 10,000 (1e4) — confirm this
# is the intended library size.
adata.layers["scvi_normalized"] = model.get_normalized_expression(
    library_size=10e4)

In [None]:
# Build the neighbor graph on the scVI latent space (use_rep="X_scVI") and
# compute the UMAP embedding from it; both steps are seeded with `seed`.
# NOTE(review): n_pcs=30 is passed together with use_rep — presumably the latent
# space has at least 30 dimensions; verify, or drop n_pcs.
sc.pp.neighbors(adata, n_pcs=30, use_rep="X_scVI", random_state=seed)
sc.tl.umap(adata, min_dist=0.3, random_state=seed)

In [None]:
# Per-species views of the atlas (mouse / rat / human); later cells reuse them.
species_labels = adata.obs['species']
adata_M = adata[species_labels == 'mouse']
adata_R = adata[species_labels == 'rat']
adata_H = adata[species_labels == 'human']

# One UMAP per species, coloured by the 'proj' column.
for species_view in (adata_M, adata_R, adata_H):
    sc.pl.umap(species_view, color=['proj'])

In [None]:
# Mouse cells on the UMAP, coloured by the original annotation.
# NOTE(review): the two calls below are identical, so the same figure is drawn
# twice — possibly one was meant to use another species view or
# legend_loc='on data'; confirm the intent.
sc.pl.umap(adata_M, color=['original_annotation'])
sc.pl.umap(adata_M, color=['original_annotation'])

In [None]:
# Whole-atlas UMAP by original annotation: first with the labels drawn on the
# embedding, then with the default legend placement.
for legend_options in ({'legend_loc': 'on data'}, {}):
    sc.pl.umap(adata, color=['original_annotation'], **legend_options)

In [None]:
# QC columns on the UMAP; each colour scale is capped at the paired vmax.
qc_color_caps = (
    ('percent.mt', 25),
    ('nCount_RNA', 20000),
    ('nFeature_RNA', 5000),
)
for qc_key, color_cap in qc_color_caps:
    sc.pl.umap(adata, color=[qc_key], vmax=color_cap)

In [None]:
# Wide figures: one violin per value of 'proj'.
sc.set_figure_params(figsize=(40,20))
# Distribution of each QC column, grouped by 'proj'.
for qc_key in ('nFeature_RNA', 'nCount_RNA', 'percent.mt'):
    sc.pl.violin(adata, keys=qc_key, groupby='proj')

In [None]:
# nCount_RNA per 'proj' group, drawn on our own axes so the y-range can be cropped.
fig, ax = plt.subplots()
violin_style = dict(rotation=90, jitter=0.4)
# show=False defers rendering until the axis limits have been set.
sc.pl.violin(adata, keys='nCount_RNA', groupby='proj', ax=ax, show=False, **violin_style)

# Restrict the view to the 0-2000 range.
ax.set_ylim(0, 2000)

plt.show()

In [None]:
# nFeature_RNA per 'proj' group, drawn on our own axes so the y-range can be cropped.
fig, ax = plt.subplots()
violin_style = dict(rotation=90, jitter=0.4)
# show=False defers rendering until the axis limits have been set.
sc.pl.violin(adata, keys='nFeature_RNA', groupby='proj', ax=ax, show=False, **violin_style)

# Restrict the view to the 0-2000 range.
ax.set_ylim(0, 2000)

plt.show()

In [None]:
# Back to small square figures for UMAPs.
sc.set_figure_params(figsize=(5,5))
# Same QC columns as before, with tighter colour-scale caps.
qc_color_caps = (
    ('percent.mt', 15),
    ('nCount_RNA', 15000),
    ('nFeature_RNA', 3000),
)
for qc_key, color_cap in qc_color_caps:
    sc.pl.umap(adata, color=[qc_key], vmax=color_cap)
# Last expression of the cell: shows the AnnData summary.
adata

In [None]:
# Leiden clustering on the neighbor graph at three resolutions; each result is
# stored in its own obs column named after the resolution.
for key_suffix, resolution in (("0_3", 0.3), ("0_5", 0.5), ("0_7", 0.7)):
    sc.tl.leiden(
        adata,
        key_added="leiden_scVI_" + key_suffix,
        resolution=resolution,
        random_state=seed,
    )

In [None]:
# Leiden clustering at three higher resolutions; each result is stored in its
# own obs column named after the resolution.
for key_suffix, resolution in (("1_0", 1.0), ("1_5", 1.5), ("2_0", 2.0)):
    sc.tl.leiden(
        adata,
        key_added="leiden_scVI_" + key_suffix,
        resolution=resolution,
        random_state=seed,
    )

In [None]:
sc.set_figure_params(figsize=(5,5))
# One UMAP per Leiden resolution, with cluster labels drawn on the embedding.
leiden_keys = (
    'leiden_scVI_0_3',
    'leiden_scVI_0_5',
    'leiden_scVI_0_7',
    'leiden_scVI_1_0',
    'leiden_scVI_1_5',
    'leiden_scVI_2_0',
)
for leiden_key in leiden_keys:
    sc.pl.umap(adata, color=[leiden_key], legend_loc='on data')

In [None]:
# Rebuild the per-species views so they include the Leiden columns added above.
species_labels = adata.obs['species']
adata_M = adata[species_labels == 'mouse']
adata_R = adata[species_labels == 'rat']
adata_H = adata[species_labels == 'human']
# Resolution-1.5 clusters, shown separately for each species.
for species_view in (adata_M, adata_R, adata_H):
    sc.pl.umap(species_view, color=['leiden_scVI_1_5'])

In [None]:
# Per-species views of the atlas.
species_labels = adata.obs['species']
adata_M = adata[species_labels == 'mouse']
adata_R = adata[species_labels == 'rat']
adata_H = adata[species_labels == 'human']

# For each colouring ('proj', then the resolution-2.0 clusters) draw one UMAP
# per species, in mouse / rat / human order.
for color_key in ('proj', 'leiden_scVI_2_0'):
    for species_view in (adata_M, adata_R, adata_H):
        sc.pl.umap(species_view, color=[color_key])
# Whole atlas with the resolution-2.0 cluster labels on the embedding.
sc.pl.umap(adata, color=['leiden_scVI_2_0'], legend_loc = "on data")

In [None]:
# Per-species views of the atlas.
species_labels = adata.obs['species']
adata_M = adata[species_labels == 'mouse']
adata_R = adata[species_labels == 'rat']
adata_H = adata[species_labels == 'human']

sc.set_figure_params(figsize=(5,5))
# 'condition_harmonized' column, shown separately for each species.
for species_view in (adata_M, adata_R, adata_H):
    sc.pl.umap(species_view, color=['condition_harmonized'])

In [None]:
#set clustering of interest
# 'leiden' is the working column read by the DE call and the plots below.
adata.obs['leiden'] = adata.obs['leiden_scVI_0_3']
#get marker genes
# Differential expression from the scVI model, grouped by the working column;
# the rows for one cluster are labelled "<cluster> vs Rest" in `comparison`.
de_df = model.differential_expression(
    groupby="leiden")
# Up to 5 genes per cluster, in the order de_df returns them, among those with
# lfc_mean > 0.75, bayes_factor > 3 and non_zeros_proportion1 > 0.1.
markers = {}
cats = adata.obs.leiden.cat.categories
for c in cats:
    cid = "{} vs Rest".format(c)
    leiden_scVI_df = de_df.loc[de_df.comparison == cid]
    is_marker = (
        (leiden_scVI_df["lfc_mean"] > 0.75)
        & (leiden_scVI_df["bayes_factor"] > 3)
        & (leiden_scVI_df["non_zeros_proportion1"] > 0.1)
    )
    markers[c] = leiden_scVI_df[is_marker].index.tolist()[:5]

# Compute the dendrogram and draw the dot plot once, after all clusters have
# their markers. Previously both calls sat inside the loop, so a partial figure
# was rendered (and the dendrogram recomputed) for every cluster.
sc.tl.dendrogram(adata, groupby="leiden", use_rep="X_scVI")
sc.pl.dotplot(
    adata,
    markers,
    groupby='leiden',
    dendrogram=True,
    color_map="Blues",
    swap_axes=True,
    use_raw=True,
    standard_scale="var",
)

In [None]:
#set clustering of interest
# 'leiden' is the working column read by the DE call and the plots below.
adata.obs['leiden'] = adata.obs['leiden_scVI_0_5']
#get marker genes
# Differential expression from the scVI model, grouped by the working column;
# the rows for one cluster are labelled "<cluster> vs Rest" in `comparison`.
de_df = model.differential_expression(
    groupby="leiden")
# Up to 5 genes per cluster, in the order de_df returns them, among those with
# lfc_mean > 0.75, bayes_factor > 3 and non_zeros_proportion1 > 0.1.
markers = {}
cats = adata.obs.leiden.cat.categories
for c in cats:
    cid = "{} vs Rest".format(c)
    leiden_scVI_df = de_df.loc[de_df.comparison == cid]
    is_marker = (
        (leiden_scVI_df["lfc_mean"] > 0.75)
        & (leiden_scVI_df["bayes_factor"] > 3)
        & (leiden_scVI_df["non_zeros_proportion1"] > 0.1)
    )
    markers[c] = leiden_scVI_df[is_marker].index.tolist()[:5]

# Compute the dendrogram and draw the dot plot once, after all clusters have
# their markers. Previously both calls sat inside the loop, so a partial figure
# was rendered (and the dendrogram recomputed) for every cluster.
sc.tl.dendrogram(adata, groupby="leiden", use_rep="X_scVI")
sc.pl.dotplot(
    adata,
    markers,
    groupby='leiden',
    dendrogram=True,
    color_map="Blues",
    swap_axes=True,
    use_raw=True,
    standard_scale="var",
)

In [None]:
#set clustering of interest
# 'leiden' is the working column read by the DE call and the plots below.
adata.obs['leiden'] = adata.obs['leiden_scVI_0_7']
#get marker genes
# Differential expression from the scVI model, grouped by the working column;
# the rows for one cluster are labelled "<cluster> vs Rest" in `comparison`.
de_df = model.differential_expression(
    groupby="leiden")
# Up to 10 genes per cluster, in the order de_df returns them, among those with
# lfc_mean > 0.75, bayes_factor > 3 and non_zeros_proportion1 > 0.1.
markers = {}
cats = adata.obs.leiden.cat.categories
for c in cats:
    cid = "{} vs Rest".format(c)
    leiden_scVI_df = de_df.loc[de_df.comparison == cid]
    is_marker = (
        (leiden_scVI_df["lfc_mean"] > 0.75)
        & (leiden_scVI_df["bayes_factor"] > 3)
        & (leiden_scVI_df["non_zeros_proportion1"] > 0.1)
    )
    markers[c] = leiden_scVI_df[is_marker].index.tolist()[:10]

# Compute the dendrogram and draw the dot plot once, after all clusters have
# their markers. Previously both calls sat inside the loop, so a partial figure
# was rendered (and the dendrogram recomputed) for every cluster.
sc.tl.dendrogram(adata, groupby="leiden", use_rep="X_scVI")
sc.pl.dotplot(
    adata,
    markers,
    groupby='leiden',
    dendrogram=True,
    color_map="Blues",
    swap_axes=True,
    use_raw=True,
    standard_scale="var",
)

In [None]:
#set clustering of interest
# 'leiden' is the working column read by the DE call and the plots below.
adata.obs['leiden'] = adata.obs['leiden_scVI_1_0']
#get marker genes
# Differential expression from the scVI model, grouped by the working column;
# the rows for one cluster are labelled "<cluster> vs Rest" in `comparison`.
de_df = model.differential_expression(
    groupby="leiden")
# Up to 5 genes per cluster, in the order de_df returns them, among those with
# lfc_mean > 0.75, bayes_factor > 3 and non_zeros_proportion1 > 0.1.
markers = {}
cats = adata.obs.leiden.cat.categories
for c in cats:
    cid = "{} vs Rest".format(c)
    leiden_scVI_df = de_df.loc[de_df.comparison == cid]
    is_marker = (
        (leiden_scVI_df["lfc_mean"] > 0.75)
        & (leiden_scVI_df["bayes_factor"] > 3)
        & (leiden_scVI_df["non_zeros_proportion1"] > 0.1)
    )
    markers[c] = leiden_scVI_df[is_marker].index.tolist()[:5]

# Compute the dendrogram and draw the dot plot once, after all clusters have
# their markers. Previously both calls sat inside the loop, so a partial figure
# was rendered (and the dendrogram recomputed) for every cluster.
sc.tl.dendrogram(adata, groupby="leiden", use_rep="X_scVI")
sc.pl.dotplot(
    adata,
    markers,
    groupby='leiden',
    dendrogram=True,
    color_map="Blues",
    swap_axes=True,
    use_raw=True,
    standard_scale="var",
)

In [None]:
#set clustering of interest
# 'leiden' is the working column read by the DE call and the plots below.
adata.obs['leiden'] = adata.obs['leiden_scVI_1_5']
#get marker genes
# Differential expression from the scVI model, grouped by the working column;
# the rows for one cluster are labelled "<cluster> vs Rest" in `comparison`.
de_df = model.differential_expression(
    groupby="leiden")
# Up to 5 genes per cluster, in the order de_df returns them, among those with
# lfc_mean > 0.75, bayes_factor > 3 and non_zeros_proportion1 > 0.1.
markers = {}
cats = adata.obs.leiden.cat.categories
for c in cats:
    cid = "{} vs Rest".format(c)
    leiden_scVI_df = de_df.loc[de_df.comparison == cid]
    is_marker = (
        (leiden_scVI_df["lfc_mean"] > 0.75)
        & (leiden_scVI_df["bayes_factor"] > 3)
        & (leiden_scVI_df["non_zeros_proportion1"] > 0.1)
    )
    markers[c] = leiden_scVI_df[is_marker].index.tolist()[:5]

# Compute the dendrogram and draw the dot plot once, after all clusters have
# their markers. Previously both calls sat inside the loop, so a partial figure
# was rendered (and the dendrogram recomputed) for every cluster.
sc.tl.dendrogram(adata, groupby="leiden", use_rep="X_scVI")
sc.pl.dotplot(
    adata,
    markers,
    groupby='leiden',
    dendrogram=True,
    color_map="Blues",
    swap_axes=True,
    use_raw=True,
    standard_scale="var",
)

In [None]:
#set clustering of interest
# 'leiden' is the working column read by the DE call and the plots below.
adata.obs['leiden'] = adata.obs['leiden_scVI_2_0']
#get marker genes
# Differential expression from the scVI model, grouped by the working column;
# the rows for one cluster are labelled "<cluster> vs Rest" in `comparison`.
de_df = model.differential_expression(
    groupby="leiden")
# Up to 5 genes per cluster, in the order de_df returns them, among those with
# lfc_mean > 0.75, bayes_factor > 3 and non_zeros_proportion1 > 0.1.
markers = {}
cats = adata.obs.leiden.cat.categories
for c in cats:
    cid = "{} vs Rest".format(c)
    leiden_scVI_df = de_df.loc[de_df.comparison == cid]
    is_marker = (
        (leiden_scVI_df["lfc_mean"] > 0.75)
        & (leiden_scVI_df["bayes_factor"] > 3)
        & (leiden_scVI_df["non_zeros_proportion1"] > 0.1)
    )
    markers[c] = leiden_scVI_df[is_marker].index.tolist()[:5]

# Compute the dendrogram and draw the dot plot once, after all clusters have
# their markers. Previously both calls sat inside the loop, so a partial figure
# was rendered (and the dendrogram recomputed) for every cluster.
sc.tl.dendrogram(adata, groupby="leiden", use_rep="X_scVI")
sc.pl.dotplot(
    adata,
    markers,
    groupby='leiden',
    dendrogram=True,
    color_map="Blues",
    swap_axes=True,
    use_raw=True,
    standard_scale="var",
)

In [None]:
#set clustering of interest
# 'leiden' is the working column read by the DE call and the plots below.
adata.obs['leiden'] = adata.obs['leiden_scVI_1_5']
#get marker genes
# Differential expression from the scVI model, grouped by the working column;
# the rows for one cluster are labelled "<cluster> vs Rest" in `comparison`.
de_df = model.differential_expression(
    groupby="leiden")
# Up to 10 genes per cluster, in the order de_df returns them, among those with
# lfc_mean > 0.75, bayes_factor > 3 and non_zeros_proportion1 > 0.1.
markers = {}
cats = adata.obs.leiden.cat.categories
for c in cats:
    cid = "{} vs Rest".format(c)
    leiden_scVI_df = de_df.loc[de_df.comparison == cid]
    is_marker = (
        (leiden_scVI_df["lfc_mean"] > 0.75)
        & (leiden_scVI_df["bayes_factor"] > 3)
        & (leiden_scVI_df["non_zeros_proportion1"] > 0.1)
    )
    markers[c] = leiden_scVI_df[is_marker].index.tolist()[:10]

# Compute the dendrogram and draw the dot plot once, after all clusters have
# their markers. Previously both calls sat inside the loop, so a partial figure
# was rendered (and the dendrogram recomputed) for every cluster.
sc.tl.dendrogram(adata, groupby="leiden", use_rep="X_scVI")
sc.pl.dotplot(
    adata,
    markers,
    groupby='leiden',
    dendrogram=True,
    color_map="Blues",
    swap_axes=True,
    use_raw=True,
    standard_scale="var",
)

In [None]:
#set clustering of interest
# 'leiden' is the working column read by the DE call and the plots below.
adata.obs['leiden'] = adata.obs['leiden_scVI_1_0']
#get marker genes
# Differential expression from the scVI model, grouped by the working column;
# the rows for one cluster are labelled "<cluster> vs Rest" in `comparison`.
de_df = model.differential_expression(
    groupby="leiden")
# Up to 10 genes per cluster, in the order de_df returns them, among those with
# lfc_mean > 0.75, bayes_factor > 3 and non_zeros_proportion1 > 0.1.
markers = {}
cats = adata.obs.leiden.cat.categories
for c in cats:
    cid = "{} vs Rest".format(c)
    leiden_scVI_df = de_df.loc[de_df.comparison == cid]
    is_marker = (
        (leiden_scVI_df["lfc_mean"] > 0.75)
        & (leiden_scVI_df["bayes_factor"] > 3)
        & (leiden_scVI_df["non_zeros_proportion1"] > 0.1)
    )
    markers[c] = leiden_scVI_df[is_marker].index.tolist()[:10]

# Compute the dendrogram and draw the dot plot once, after all clusters have
# their markers. Previously both calls sat inside the loop, so a partial figure
# was rendered (and the dendrogram recomputed) for every cluster.
sc.tl.dendrogram(adata, groupby="leiden", use_rep="X_scVI")
sc.pl.dotplot(
    adata,
    markers,
    groupby='leiden',
    dendrogram=True,
    color_map="Blues",
    swap_axes=True,
    use_raw=True,
    standard_scale="var",
)

In [None]:
# Wide figures: one violin per resolution-1.5 cluster.
sc.set_figure_params(figsize=(40,10))
# Distribution of each QC column, grouped by the resolution-1.5 clusters.
for qc_key in ('nFeature_RNA', 'nCount_RNA', 'percent.mt'):
    sc.pl.violin(adata, keys=qc_key, groupby='leiden_scVI_1_5')

In [None]:
# Wide figures: one violin per resolution-2.0 cluster.
sc.set_figure_params(figsize=(40,10))
# Distribution of each QC column, grouped by the resolution-2.0 clusters.
for qc_key in ('nFeature_RNA', 'nCount_RNA', 'percent.mt'):
    sc.pl.violin(adata, keys=qc_key, groupby='leiden_scVI_2_0')

In [None]:
sc.set_figure_params(figsize=(5,5))

# Gene panel named PTS1 (presumably proximal-tubule S1 markers — confirm);
# one UMAP panel is drawn per gene.
PTS1 = [
    'SLC6A19',
    'SLC5A12',
    'PRODH2',
    'NOX4',
    'SLC34A1',
]
sc.pl.umap(adata, color=PTS1)

In [None]:
sc.set_figure_params(figsize=(5,5))

# Gene panel named PTS2 (presumably proximal-tubule S2 markers — confirm);
# one UMAP panel is drawn per gene.
PTS2 = [
    'SLC34A1',
    'SLC13A3',
    'ACMSD',
    'MIOX',
    'PAH',
    'SLC5A10',
]
sc.pl.umap(adata, color=PTS2)

In [None]:
sc.set_figure_params(figsize=(5,5))

# Gene panel named PTS3 (presumably proximal-tubule S3 markers — confirm);
# one UMAP panel is drawn per gene.
PTS3 = [
    'SLC7A13',
    'GRAMD1B',
    'ACSS2',
    'SLC23A1',
    'SLC6A18',
]
sc.pl.umap(adata, color=PTS3)



In [None]:
#test
# Trial removal: a view without resolution-1.5 cluster '32'; `adata` itself is
# left untouched.
clusters_remove=['32']
in_removed_cluster = adata.obs['leiden_scVI_1_5'].isin(clusters_remove)
adata_test = adata[~in_removed_cluster, :]
sc.set_figure_params(figsize=(5,5))
# Mitochondrial percentage on the remaining cells.
sc.pl.umap(adata_test, color="percent.mt")

In [None]:
# Resolution-2.0 clusters on the trial subset `adata_test` built in the previous cell.
sc.pl.umap(adata_test, color="leiden_scVI_2_0")

In [None]:
# Level-1 annotation: each resolution-1.5 Leiden cluster id is replaced by a
# label; several clusters can share one label.
clusters_by_label = {
    'PTS1_PTS2': (0, 2, 3, 31, 32),
    'injPT': (1, 29, 35),
    'DCT': (4,),
    'PTS3': (5,),
    'EC': (6, 17, 25, 27, 34, 36),
    'CNT': (7,),
    'TAL': (8, 11, 12, 16),
    'Stromal': (9, 24, 37, 38, 40),
    'ICA': (10,),
    'CD_PC': (13, 39),
    'Immune': (14, 22, 30),
    'DTL_ATL': (15,),
    'ICB': (18,),
    'DCT_CNT': (19,),
    'PEC': (20,),
    'CD_Pap': (21,),
    'Podo': (23,),
    'unknown1': (26,),
    'PT_prolif': (28,),
    'unknown2': (33,),
}
label_by_cluster = {
    cluster: label
    for label, clusters in clusters_by_label.items()
    for cluster in clusters
}
# Flat {"0": ..., "1": ..., ..., "40": ...} lookup with string keys in
# ascending numeric order.
Rename_cluster = {
    str(cluster): label_by_cluster[cluster]
    for cluster in sorted(label_by_cluster)
}
# Ids without an entry are left as they are by replace(); the result is stored
# as a categorical column.
adata.obs['annotation_Atlas_level1'] = (
    adata.obs['leiden_scVI_1_5']
    .replace(Rename_cluster)
    .astype('category')
)



In [None]:
sc.set_figure_params(figsize=(5,5))

# Level-1 annotation on the UMAP: default legend first, then labels on the embedding.
for legend_options in ({}, {'legend_loc': 'on data'}):
    sc.pl.umap(adata, color="annotation_Atlas_level1", **legend_options)

In [None]:
#set clustering of interest
# 'leiden' is the working column read by the DE call and the plots below; here
# it holds the level-1 annotation labels rather than raw cluster ids.
adata.obs['leiden'] = adata.obs['annotation_Atlas_level1']
#get marker genes
# Differential expression from the scVI model, grouped by the working column;
# the rows for one group are labelled "<group> vs Rest" in `comparison`.
de_df = model.differential_expression(
    groupby="leiden")
# Up to 5 genes per group, in the order de_df returns them, among those with
# lfc_mean > 0.75, bayes_factor > 3 and non_zeros_proportion1 > 0.1.
markers = {}
cats = adata.obs.leiden.cat.categories
for c in cats:
    cid = "{} vs Rest".format(c)
    leiden_scVI_df = de_df.loc[de_df.comparison == cid]
    is_marker = (
        (leiden_scVI_df["lfc_mean"] > 0.75)
        & (leiden_scVI_df["bayes_factor"] > 3)
        & (leiden_scVI_df["non_zeros_proportion1"] > 0.1)
    )
    markers[c] = leiden_scVI_df[is_marker].index.tolist()[:5]

# Compute the dendrogram and draw the dot plot once, after all groups have
# their markers. Previously both calls sat inside the loop, so a partial figure
# was rendered (and the dendrogram recomputed) for every group.
sc.tl.dendrogram(adata, groupby="leiden", use_rep="X_scVI")
sc.pl.dotplot(
    adata,
    markers,
    groupby='leiden',
    dendrogram=True,
    color_map="Blues",
    swap_axes=True,
    use_raw=True,
    standard_scale="var",
)

In [None]:
# Drop every cell whose level-1 annotation is 'unknown1'.
clusters_remove=['unknown1']
# .copy() turns the subset into an independent AnnData object. Without it
# `adata` is rebound to a view of the full atlas, and the obs columns written
# to it in later cells force an implicit copy.
# NOTE(review): `model` was loaded against the unfiltered object; re-attach it
# before running model-based steps on the filtered data.
adata = adata[~adata.obs['annotation_Atlas_level1'].isin(clusters_remove),:].copy()
# Last expression of the cell: shows the AnnData summary after filtering.
adata

In [None]:
sc.set_figure_params(figsize=(5,5))

# Level-1 annotation after removing 'unknown1': default legend first, then
# labels on the embedding.
for legend_options in ({}, {'legend_loc': 'on data'}):
    sc.pl.umap(adata, color="annotation_Atlas_level1", **legend_options)

In [None]:
# Level-2 annotation, derived from the Leiden clustering at resolution 2.0.
adata.obs['annotation_Atlas_level2']=adata.obs.leiden_scVI_2_0
# Cluster id -> label lookup.
# NOTE(review): these are the same id -> label entries used for the level-1
# annotation of 'leiden_scVI_1_5', but they are applied here to
# 'leiden_scVI_2_0', whose cluster numbering comes from a separate clustering
# run — presumably a dedicated table is needed; TODO confirm.
Rename_cluster = {
"0": 'PTS1_PTS2',
"1": 'injPT',
"2": 'PTS1_PTS2',
"3": 'PTS1_PTS2',
"4": 'DCT',
"5": 'PTS3',
"6": 'EC',
"7": 'CNT',
"8": 'TAL',
"9": 'Stromal',
"10": 'ICA',
"11": 'TAL',
"12": 'TAL',
"13": 'CD_PC',
"14": 'Immune',
"15": 'DTL_ATL',
"16": 'TAL',
"17": 'EC',
"18": 'ICB',
"19": 'DCT_CNT',
"20": 'PEC',
"21": 'CD_Pap',
"22": 'Immune',
"23": 'Podo',
"24": 'Stromal',
"25": 'EC',
"26": 'unknown1',
"27": 'EC',
"28": 'PT_prolif',
"29": 'injPT',
"30": 'Immune',
"31": 'PTS1_PTS2',
"32": 'PTS1_PTS2',
"33": 'unknown2',
"34": 'EC',
"35": 'injPT',
"36": 'EC',
"37": 'Stromal',
"38": 'Stromal',
"39": 'CD_PC',
"40": 'Stromal',
}
# replace() leaves ids without an entry unchanged, so they would end up as bare
# numbers next to the named labels; report them instead of letting that pass
# silently.
unmapped_clusters = sorted(
    set(adata.obs['leiden_scVI_2_0'].astype(str)) - set(Rename_cluster)
)
if unmapped_clusters:
    print(
        "WARNING: leiden_scVI_2_0 clusters without a label in Rename_cluster "
        "(kept as numeric ids): {}".format(unmapped_clusters)
    )
adata.obs['annotation_Atlas_level2'] = adata.obs['annotation_Atlas_level2'].replace(Rename_cluster).astype('category')



In [None]:
#subset clusters of interest - PT
# Output file for the cell names of this subset.
path_subset = '.../Atlas/Cellnames/Atlas6/Atlas6.1_subset_PT'
clusters_subset=['PTS1_PTS2', 'injPT', 'PTS3', 'PEC', 'PT_prolif']
# Keep only cells whose level-1 annotation is one of the listed labels.
in_subset = adata.obs['annotation_Atlas_level1'].isin(clusters_subset)
adata_subset = adata[in_subset, :]
sc.pl.umap(adata_subset, color=['annotation_Atlas_level1'], legend_loc='on data')

# Write the selected cell names as a single-column CSV (column 0, with header
# and row index).
cell_names = adata_subset.obs_names
df = pd.DataFrame(cell_names)
df.to_csv(path_subset, columns=[0], header=True)

In [None]:

#subset clusters of interest - Immune
# Output file for the cell names of this subset.
path_subset = '.../Atlas/Cellnames/Atlas6/Atlas6.1_subset_Immune'
clusters_subset=['Immune']
# Keep only cells whose level-1 annotation is one of the listed labels.
in_subset = adata.obs['annotation_Atlas_level1'].isin(clusters_subset)
adata_subset = adata[in_subset, :]
sc.pl.umap(adata_subset, color=['annotation_Atlas_level1'], legend_loc='on data')

# Write the selected cell names as a single-column CSV (column 0, with header
# and row index).
cell_names = adata_subset.obs_names
df = pd.DataFrame(cell_names)
df.to_csv(path_subset, columns=[0], header=True)


In [None]:

#subset clusters of interest - ECs
# Output file for the cell names of this subset.
path_subset = '.../Atlas/Cellnames/Atlas6/Atlas6.1_subset_EC'
clusters_subset=['EC']
# Keep only cells whose level-1 annotation is one of the listed labels.
in_subset = adata.obs['annotation_Atlas_level1'].isin(clusters_subset)
adata_subset = adata[in_subset, :]
sc.pl.umap(adata_subset, color=['annotation_Atlas_level1'], legend_loc='on data')

# Write the selected cell names as a single-column CSV (column 0, with header
# and row index).
cell_names = adata_subset.obs_names
df = pd.DataFrame(cell_names)
df.to_csv(path_subset, columns=[0], header=True)

In [None]:
#save progress here and reload 
# Writes the current `adata` (embedding, clusterings and annotations added
# above) to an .h5ad file in the current working directory.
adata.write("Atlas6.1_3000HVG_integrated_V2.h5ad")