In [None]:
import numpy as np
import os
import requests
import sys
sys.path.insert(1, '../')
import scanpy as sc
from scipy.sparse import csr_matrix
import scvi
from utils import pp

In [None]:
# Download the KEGG_RIBOSOME gene set from MSigDB; its gene symbols are passed
# to pp() below.
# A timeout keeps the cell from hanging indefinitely if the server stalls, and
# raise_for_status() surfaces HTTP errors directly instead of as a confusing
# JSON-decode / KeyError further down.
ribo_request = requests.get(
    'https://www.gsea-msigdb.org/gsea/msigdb/download_geneset.jsp?geneSetName=KEGG_RIBOSOME&fileType=json',
    timeout=60,
)
ribo_request.raise_for_status()
response = ribo_request.json()

# Gene symbols as a NumPy array.
ribo_genes = np.array(response["KEGG_RIBOSOME"]['geneSymbols'])

In [None]:
# os.listdir() returns entries in arbitrary, filesystem-dependent order.
# Sorting makes the order in which samples are preprocessed and concatenated
# reproducible across machines and re-runs.
files = sorted(os.listdir('../data/raw_data/'))
len(files)


In [None]:
# Run the shared preprocessing helper on every raw file, collecting one
# object per file (same order as `files`).
out = [pp(f'../data/raw_data/{file}', ribo_genes) for file in files]

In [None]:
# Merge the per-file objects into a single AnnData.
adata = sc.concat(out)
# Drop the reference to the list so the per-file objects can be freed.
out = None
# First, loose gene filter: keep genes detected in at least 10 cells.
sc.pp.filter_genes(adata, min_cells=10)
# Store the expression matrix as a SciPy CSR sparse matrix.
adata.X = csr_matrix(adata.X)
# Display the matrix summary as the cell output.
adata.X

In [None]:
# Checkpoint: persist the concatenated object (genes filtered at min_cells=10,
# X stored as CSR) before the stricter filtering and normalization below.
adata.write_h5ad('../data/processed_data/adata.h5ad')

In [None]:
# Per-sample tally: for each 'Sample' group, count() reports the number of
# non-null entries in every other obs column.
adata.obs.groupby('Sample').count()

In [None]:
# Second, stricter gene filter (min_cells=100; the earlier pass used 10).
# This runs after adata.h5ad was written, so that file keeps the looser filter.
sc.pp.filter_genes(adata, min_cells=100)
adata

In [None]:
# Keep a copy of the current matrix (not yet normalized in this notebook) in a
# layer; the scVI setup below reads layer="counts".
# NOTE(review): presumably these are raw counts as returned by pp() — confirm.
adata.layers["counts"] = adata.X.copy()

In [None]:
# Scale every cell to a total of 1e4, then log1p-transform adata.X.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
# Snapshot the object in its current (normalized, log-transformed) state as .raw.
adata.raw = adata

In [None]:
# Register the AnnData with scVI:
#   - layer="counts": model the matrix saved in the "counts" layer rather than
#     the log-normalized adata.X
#   - categorical covariate: the sample each cell came from
#   - continuous covariates: per-cell QC metrics
# The QC metrics were previously passed under a second
# `categorical_covariate_keys=` keyword, which is a SyntaxError (repeated
# keyword argument); they are numeric and belong in `continuous_covariate_keys`.
scvi.model.SCVI.setup_anndata(adata,
                              layer="counts",
                              categorical_covariate_keys=["Sample"],
                              continuous_covariate_keys=["pct_counts_mt", "total_counts", "pct_counts_ribo"])

# Build the model on the registered data and train with default settings.
model = scvi.model.SCVI(adata)
model.train()

In [None]:
# Store the trained model's latent embedding; the neighbor graph below is
# built from this representation (use_rep='X_scVI').
adata.obsm['X_scVI'] = model.get_latent_representation()
# Store scVI's normalized expression, requested at a library size of 1e4, as a
# layer; used later to color the UMAP by marker genes.
adata.layers['scvi_normalized'] = model.get_normalized_expression(library_size=1e4)

In [None]:
# Build the neighbor graph on the scVI latent space, then embed and cluster.
sc.pp.neighbors(adata, use_rep='X_scVI')
sc.tl.umap(adata)
# sc.tl.leiden was previously called with the plotting kwargs `color=` and
# `frameon=` (copied from the sc.pl.umap line), which it does not accept.
# NOTE(review): the intended first-pass resolution is not recoverable from the
# broken call; 0.5 is used as a coarse preview, below the 0.9 used for the
# annotation pass later in the notebook — confirm.
sc.tl.leiden(adata, resolution=0.5)
# Check integration: color by cluster and by sample of origin.
sc.pl.umap(adata, color=['leiden', 'Sample'], frameon=False)

In [None]:
# Checkpoint the integrated object (scVI embedding, UMAP, clusters). The same
# path is written again at the end of the notebook once annotations are added.
adata.write_h5ad('../data/processed_data/scvi_integrated_data.h5ad')

## Find markers and label cell types

In [None]:
sc.tl.leiden(adata, resolution=0.9) # Increasing resolution to separate CD4+ T cells and CD8+ T cells

In [None]:
# Rank genes for each Leiden cluster; the results are kept on `adata` and read
# back with sc.get.rank_genes_groups_df in the next step.
sc.tl.rank_genes_groups(adata, 'leiden')

In [None]:
#sc.pl.rank_genes_groups(adata, n_genes=20, sharey=False)

In [None]:
# Pull the ranking for all clusters (group=None) into one table and keep only
# significant, up-regulated genes: adjusted p-value < 0.05 and log fold change > 0.5.
markers = (
    sc.get.rank_genes_groups_df(adata, None)
    .loc[lambda ranked: (ranked.pvals_adj < 0.05) & (ranked.logfoldchanges > .5)]
)
markers

In [None]:
# Differential expression from the trained scVI model, grouped by Leiden
# cluster; the unfiltered result table is displayed here and filtered next.
markers_scvi = model.differential_expression(groupby='leiden')
markers_scvi

In [None]:
# Keep only rows flagged in the 'is_de_fdr_0.05' column whose mean log fold
# change exceeds 0.5.
markers_scvi = markers_scvi.loc[
    lambda de: (de['is_de_fdr_0.05']) & (de.lfc_mean > .5)
]
markers_scvi

In [None]:
# Color the UMAP by individual marker genes, using the scVI-normalized layer
# and capping the color scale at 5.
sc.pl.umap(adata, color = ['EPCAM', 'MUC1'], frameon = False, layer = 'scvi_normalized', vmax = 5)
# To plot without the scVI-normalized values, drop the `layer = 'scvi_normalized'` argument.

In [None]:
# Manual annotation: Leiden cluster id (as a string) -> cell-type label.
# Several clusters intentionally share a label.
# Fixed the misspelled label for cluster 24 ("Denditic cell" -> "Dendritic cell"),
# which would otherwise propagate into adata.obs['cell_type'] and the saved file.
cell_type = {
    "0": "Macrophage",
    "1": "Fibroblast",
    "2": "CD4+ T-cell",
    "3": "AT2",
    "4": "AT1",
    "5": "CD8+ T-cell",
    "6": "Endothelial cell",
    "7": "Plasma cell",
    "8": "Macrophage",
    "9": "AT2",
    "10": "Fibroblast",
    "11": "Fibroblast",
    "12": "Macrophage",
    "13": "Macrophage",
    "14": "Airway epithelial",
    "15": "Airway epithelial",
    "16": "Monocyte",
    "17": "Airway epithelial",
    "18": "B-cell",
    "19": "Aerocyte",
    "20": "Airway epithelial",
    "21": "Smooth muscle cell",
    "22": "Cycling T/NK",
    "23": "Neuronal cell",
    "24": "Dendritic cell",
    "25": "Pericyte",
    "26": "Fibroblast",
    "27": "Erythroid-like",
    "28": "Macrophage",
}

In [None]:
# Translate each cell's Leiden cluster id into its annotated cell-type label.
adata.obs['cell_type'] = adata.obs.leiden.map(cell_type)
# Plot the column that was just created. The key must be 'cell_type'
# (underscore); the previous 'cell type' (space) matches no obs column or gene.
sc.pl.umap(adata, color = ['cell_type'], frameon = False)

In [None]:
# Attach both filtered marker tables to the object so they are saved with it.
adata.uns['scvi_markers'] = markers_scvi
adata.uns['markers'] = markers
# Overwrites the integrated-data checkpoint written earlier, now including the
# cell-type labels and marker tables.
adata.write_h5ad('../data/processed_data/scvi_integrated_data.h5ad')
# Persist the trained scVI model next to the notebook.
# NOTE(review): on a re-run this may fail if 'model.model' already exists —
# confirm whether overwrite=True is wanted.
model.save('model.model')