In [None]:
import numpy as np
import pandas as pd
import pathlib
import scanpy as sc
import scvi
import seaborn as sns

In [None]:
# Data lives in a `data/` folder that sits next to the notebook's working directory.
DATA_DIR = pathlib.Path.cwd().parent.joinpath('data')
# Raw count matrix for the single sample analysed in this notebook.
SAMPLE_RAW_DATA_PATH = DATA_DIR.joinpath(
    'raw_data',
    'GSM5226574_C51ctr_raw_counts.csv.gz',
)

In [None]:
# Read the count matrix and transpose it.
# NOTE(review): the transpose implies the CSV is stored genes x cells — confirm.
adata = sc.read_csv(filename=SAMPLE_RAW_DATA_PATH).T
adata

## Doublet removal

In [None]:
# Drop genes that are detected in fewer than 10 cells (modifies `adata` in place).
sc.pp.filter_genes(adata, min_cells=10, inplace=True)

In [None]:
# Restrict the matrix to the 2000 most variable genes; `subset=True` discards
# every other gene from `adata` rather than just flagging them.
sc.pp.highly_variable_genes(
    adata,
    flavor='seurat_v3',
    n_top_genes=2000,
    subset=True,
)

In [None]:
# Pin the global scvi-tools seed before any training so that the stochastic
# model fits in this notebook are repeatable on Restart & Run All
# (any fixed integer works; hardware-level nondeterminism may still remain).
scvi.settings.seed = 0

# Register `adata` with scVI, build the model on it and fit it with the
# library's default training settings.
scvi.model.SCVI.setup_anndata(adata)
model = scvi.model.SCVI(adata)
model.train()

In [None]:
# Build the SOLO doublet classifier from the trained scVI model, then fit it.
solo = scvi.external.SOLO.from_scvi_model(model, adata=adata)
solo.train()

In [None]:
# Default prediction output; the `doublet` and `singlet` columns are used further down.
df = solo.predict()
# Hard class label per cell, stored next to the default output.
df['prediction'] = solo.predict(soft=False)

# Trim the last two characters from every index label.
# NOTE(review): this assumes each label carries a two-character suffix that
# should be removed — confirm against the installed scvi-tools version.
df.index = df.index.str[:-2]

df

In [None]:
# Number of non-null entries per column for each predicted class.
df.groupby(by='prediction').count()

In [None]:
# Margin between the two class columns: positive when `doublet` exceeds `singlet`.
df['dif'] = df['doublet'] - df['singlet']
df

In [None]:
# Distribution of the doublet-minus-singlet margin among cells called doublets.
sns.displot(data=df[df['prediction'] == 'doublet'], x='dif')

In [None]:
# Keep only the calls that are both labelled `doublet` and have a margin above 1.
doublets = df.loc[(df['prediction'] == 'doublet') & (df['dif'] > 1)]
doublets

In [None]:
# Reload the full count matrix from disk: the previous `adata` was reduced by
# the gene filter and the highly-variable-gene subset used for doublet detection.
adata = sc.read_csv(filename=SAMPLE_RAW_DATA_PATH).T
adata

In [None]:
# Trim the last two characters of every cell label so the labels line up with
# the (equally trimmed) index of `doublets`.
# NOTE(review): assumes the trimmed labels remain unique — confirm.
adata.obs.index = adata.obs.index.str[:-2]
# Boolean flag: True for cells whose label appears among the confident doublets.
adata.obs['doublet'] = adata.obs_names.isin(doublets.index)

In [None]:
# Inspect the per-cell annotations, which now include the boolean `doublet` column.
adata.obs

In [None]:
# Drop the cells flagged as doublets. Boolean indexing on an AnnData returns a
# view of the parent object; `.copy()` materialises an independent AnnData so
# the later cells that assign into `.var` and run in-place preprocessing do not
# write into a view (which triggers an implicit copy and a warning).
adata = adata[~adata.obs.doublet].copy()

In [None]:
# Show the AnnData summary after the doublet filter.
adata

## Preprocessing

In [None]:
# Flag genes whose name starts with the `MT-` prefix.
adata.var['mt'] = adata.var_names.str.startswith('MT-')

In [None]:
# The endpoint that serves the ribosomal gene list was returning 404, so the
# list was copied by hand from the tutorial and is kept inline here.
# NOTE(review): 'RPSL24D1P11' may be a typo for 'RSL24D1P11' — verify against
# the original gene set before relying on it.
ribo_genes = np.array([
    'FAU', 'MRPL13', 'RPL10', 'RPL10A', 'RPL10L', 'RPL11', 'RPL12',
    'RPL13', 'RPL13A', 'RPL14', 'RPL15', 'RPL17', 'RPL18', 'RPL18A',
    'RPL19', 'RPL21', 'RPL22', 'RPL22L1', 'RPL23', 'RPL23A', 'RPL24',
    'RPL26', 'RPL26L1', 'RPL27', 'RPL27A', 'RPL28', 'RPL29', 'RPL3',
    'RPL30', 'RPL31', 'RPL32', 'RPL34', 'RPL35', 'RPL35A', 'RPL36',
    'RPL36A', 'RPL36AL', 'RPL37', 'RPL37A', 'RPL38', 'RPL39', 'RPL3L',
    'RPL4', 'RPL41', 'RPL5', 'RPL6', 'RPL7', 'RPL7A', 'RPL8', 'RPL9',
    'RPLP0', 'RPLP1', 'RPLP2', 'RPS10', 'RPS11', 'RPS12', 'RPS13',
    'RPS15', 'RPS15A', 'RPS16', 'RPS17', 'RPS18', 'RPS19', 'RPS2',
    'RPS20', 'RPS21', 'RPS23', 'RPS24', 'RPS25', 'RPS26', 'RPS27',
    'RPS27A', 'RPS27L', 'RPS28', 'RPS29', 'RPS3', 'RPS3A', 'RPS4X',
    'RPS4Y1', 'RPS5', 'RPS6', 'RPS7', 'RPS8', 'RPS9', 'RPSA', 'RSL24D1',
    'RPSL24D1P11', 'UBA52',
])

In [None]:
# Flag genes whose name appears in the ribosomal gene list.
adata.var['ribo'] = adata.var_names.isin(ribo_genes)

In [None]:
# Show the genes that were flagged as ribosomal.
adata.var[adata.var['ribo']]

In [None]:
# Compute QC metrics, treating the `mt` and `ribo` gene flags as QC variables;
# `inplace=True` stores the results on `adata` instead of returning them.
sc.pp.calculate_qc_metrics(
    adata,
    qc_vars=['mt', 'ribo'],
    percent_top=None,
    log1p=False,
    inplace=True,
)

In [None]:
# Inspect the per-cell annotations after the in-place QC metric computation.
adata.obs

In [None]:
# Per-gene annotations ordered by `n_cells_by_counts`, lowest first.
adata.var.sort_values(by='n_cells_by_counts')

In [None]:
# Drop genes that are detected in fewer than 3 cells (modifies `adata` in place).
sc.pp.filter_genes(adata, min_cells=3, inplace=True)

In [None]:
# Per-cell annotations ordered by `total_counts`, lowest first.
adata.obs.sort_values(by='total_counts')

In [None]:
# One violin panel per QC metric, with jittered points overlaid.
sc.pl.violin(
    adata,
    keys=['n_genes_by_counts', 'total_counts', 'pct_counts_mt', 'pct_counts_ribo'],
    jitter=0.4,
    multi_panel=True,
)