# Single-Cell-Reciter

## Import the libraries

In [None]:
import os
import anndata
import scvi
import logging
import numpy as np
import pandas as pd
import scanpy as sc
import scrublet as scr
import matplotlib.pyplot as plt

from glob import glob
from pathlib import Path

## Settings

### scvi settings

In [None]:
# Fix the scvi-tools seed so repeated runs of the notebook are comparable.
scvi.settings.seed = 1

In [None]:
# Select the "rich" progress-bar style for scvi-tools.
scvi.settings.progress_bar_style = "rich"

In [None]:
# Have scvi-tools log at INFO level.
scvi.settings.verbosity = logging.INFO

In [None]:
# Turn on pinned memory for data loading during GPU training (going by the
# setting's name — TODO confirm it exists in the installed scvi-tools version).
scvi.settings.dl_pin_memory_gpu_training = True

In [None]:
# Thread count handed to scvi-tools.
# NOTE(review): 20 is hard-coded — adjust to the cores available on the host.
scvi.settings.num_threads = 20

In [None]:
# Disable up-front GPU memory preallocation by JAX (going by the setting's
# name — TODO confirm against the installed scvi-tools version).
scvi.settings.jax_preallocate_gpu_memory = False

### scanpy settings

In [None]:
# Set scanpy's logging verbosity to level 3.
sc.settings.verbosity = 3

In [None]:
# Default resolution for scanpy figures: 150 dpi.
sc.settings.set_figure_params(dpi=150)

### Output directory

In [None]:
# Root folder for every artefact this notebook writes; created on demand.
Result_dir = "Results/"
output_root = Path(Result_dir)
output_root.mkdir(parents=True, exist_ok=True)

# The .h5ad file that will store the analysis results.
results_file = f"{Result_dir}SCC.h5ad"

## Load datasets

### Check local directory

In [None]:
# List the 10x input files. This is an IPython shell escape, so it only runs
# inside Jupyter/IPython, not as plain Python.
!ls ../filtered_feature_bc_matrix_h5/*

### SCC Datasets

In [None]:
# Read every SCC sample into its own AnnData object (one 10x .h5 per sample).
data_SCC_P8_1 = sc.read_10x_h5("../filtered_feature_bc_matrix_h5/P8_1_cSCC.h5")
data_SCC_P8_2 = sc.read_10x_h5("../filtered_feature_bc_matrix_h5/P8_2_cSCC.h5")
data_SCC_P7 = sc.read_10x_h5("../filtered_feature_bc_matrix_h5/P7_cSCC.h5")
data_SCC_P3_1 = sc.read_10x_h5("../filtered_feature_bc_matrix_h5/P3_1_cSCC.h5")
data_SCC_P2 = sc.read_10x_h5("../filtered_feature_bc_matrix_h5/P2_cSCC.h5")
data_SCC_P5 = sc.read_10x_h5("../filtered_feature_bc_matrix_h5/P5_cSCC.h5")
data_SCC_P3_2 = sc.read_10x_h5("../filtered_feature_bc_matrix_h5/P3_2_cSCC.h5")
data_SCC_P4 = sc.read_10x_h5("../filtered_feature_bc_matrix_h5/P4_cSCC.h5")
data_SCC_P1_2 = sc.read_10x_h5("../filtered_feature_bc_matrix_h5/P1_2_cSCC.h5")
data_SCC_P9 = sc.read_10x_h5("../filtered_feature_bc_matrix_h5/P9_cSCC.h5")
data_SCC_P10 = sc.read_10x_h5("../filtered_feature_bc_matrix_h5/P10_cSCC.h5")
data_SCC_P1_1 = sc.read_10x_h5("../filtered_feature_bc_matrix_h5/P1_1_cSCC.h5")
data_SCC_P6 = sc.read_10x_h5("../filtered_feature_bc_matrix_h5/P6_cSCC.h5")

# Sample label -> dataset, so the shared annotation is written only once.
scc_by_sample = {
    'SCC_P8_1': data_SCC_P8_1,
    'SCC_P8_2': data_SCC_P8_2,
    'SCC_P7': data_SCC_P7,
    'SCC_P3_1': data_SCC_P3_1,
    'SCC_P2': data_SCC_P2,
    'SCC_P5': data_SCC_P5,
    'SCC_P3_2': data_SCC_P3_2,
    'SCC_P4': data_SCC_P4,
    'SCC_P1_2': data_SCC_P1_2,
    'SCC_P9': data_SCC_P9,
    'SCC_P10': data_SCC_P10,
    'SCC_P1_1': data_SCC_P1_1,
    'SCC_P6': data_SCC_P6,
}

for sample_label, sample_adata in scc_by_sample.items():
    # De-duplicate gene names, then tag each cell with tissue type and sample.
    sample_adata.var_names_make_unique()
    sample_adata.obs['type'] = 'SCC'
    sample_adata.obs['sample'] = sample_label

# Drop the temporary references so that deleting the per-sample variables
# later can actually release the data.
del scc_by_sample, sample_label, sample_adata

### Normal Datasets

In [None]:
# Read every normal-skin sample into its own AnnData object.
data_nrl_P4 = sc.read_10x_h5("../filtered_feature_bc_matrix_h5/P4.h5")
data_nrl_P1_1 = sc.read_10x_h5("../filtered_feature_bc_matrix_h5/P1_1.h5")
data_nrl_P10 = sc.read_10x_h5("../filtered_feature_bc_matrix_h5/P10.h5")
data_nrl_P3 = sc.read_10x_h5("../filtered_feature_bc_matrix_h5/P3.h5")
data_nrl_P2 = sc.read_10x_h5("../filtered_feature_bc_matrix_h5/P2.h5")
data_nrl_P9 = sc.read_10x_h5("../filtered_feature_bc_matrix_h5/P9.h5")
data_nrl_P8 = sc.read_10x_h5("../filtered_feature_bc_matrix_h5/P8.h5")
data_nrl_P7 = sc.read_10x_h5("../filtered_feature_bc_matrix_h5/P7.h5")
data_nrl_P6 = sc.read_10x_h5("../filtered_feature_bc_matrix_h5/P6.h5")
data_nrl_P5 = sc.read_10x_h5("../filtered_feature_bc_matrix_h5/P5.h5")

# Sample label -> dataset, so the shared annotation is written only once.
normal_by_sample = {
    'nrl_P4': data_nrl_P4,
    'nrl_P1_1': data_nrl_P1_1,
    'nrl_P10': data_nrl_P10,
    'nrl_P3': data_nrl_P3,
    'nrl_P2': data_nrl_P2,
    'nrl_P9': data_nrl_P9,
    'nrl_P8': data_nrl_P8,
    'nrl_P7': data_nrl_P7,
    'nrl_P6': data_nrl_P6,
    'nrl_P5': data_nrl_P5,
}

for sample_label, sample_adata in normal_by_sample.items():
    # De-duplicate gene names, then tag each cell with tissue type and sample.
    sample_adata.var_names_make_unique()
    sample_adata.obs['type'] = 'Normal'
    sample_adata.obs['sample'] = sample_label

# Drop the temporary references so that deleting the per-sample variables
# later can actually release the data.
del normal_by_sample, sample_label, sample_adata

### Merge Datasets

In [None]:
%%time

# Samples in merge order; the position in this list becomes the batch label.
samples_to_merge = [
    data_SCC_P1_1, data_SCC_P10, data_SCC_P1_2, data_SCC_P2, data_SCC_P3_1,
    data_SCC_P3_2, data_SCC_P4, data_SCC_P5, data_SCC_P6, data_SCC_P7,
    data_SCC_P8_1, data_SCC_P8_2, data_SCC_P9, data_nrl_P10, data_nrl_P1_1,
    data_nrl_P2, data_nrl_P3, data_nrl_P4, data_nrl_P5, data_nrl_P6,
    data_nrl_P7, data_nrl_P8, data_nrl_P9,
]

# `AnnData.concatenate` is deprecated in favour of `anndata.concat`. The
# arguments below are chosen to mirror the old method's defaults:
#   join="inner"      keep only the genes present in every sample
#   label / keys      store the positional batch id ("0", "1", ...) in obs['batch']
#   index_unique="-"  append "-<batch>" to each barcode so obs names stay unique
#   merge="same"      keep var annotations that are identical in all samples
# NOTE(review): var columns that differ between samples are dropped by
# merge="same" — confirm all inputs were aligned to the same reference.
adata = anndata.concat(
    samples_to_merge,
    join="inner",
    label="batch",
    keys=[str(batch_idx) for batch_idx in range(len(samples_to_merge))],
    index_unique="-",
    merge="same",
)

# and delete individual datasets to save space
del samples_to_merge
del (data_SCC_P10, data_SCC_P1_1, data_SCC_P1_2, data_SCC_P2,
     data_SCC_P3_1, data_SCC_P3_2, data_SCC_P4, data_SCC_P5,
     data_SCC_P6, data_SCC_P7, data_SCC_P8_1, data_SCC_P8_2,
     data_SCC_P9, data_nrl_P10, data_nrl_P1_1, data_nrl_P2,
     data_nrl_P3, data_nrl_P4, data_nrl_P5, data_nrl_P6,
     data_nrl_P7, data_nrl_P8, data_nrl_P9)


### Save merged dataset

In [None]:
# Persist the merged object (before any QC filtering) to results_file.
adata.write(results_file)

In [None]:
# Number of cells contributed by each sample, followed by the object summary.
cells_per_sample = adata.obs['sample'].value_counts()
print(cells_per_sample)

adata

## Calculate QC

### Settings

In [None]:
# Use 4x4 figures for the QC plots.
# NOTE(review): this call may reset the dpi=150 configured in the scanpy
# settings cell back to set_figure_params' default — confirm that is intended.
sc.set_figure_params(figsize=(4, 4))

# IPython-only inline-backend options: white figure background, retina output.
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

### Data

In [None]:
# Reload the merged dataset from the same path it was written to. Reusing
# `results_file` (defined in the "Output directory" cell) keeps the read and
# write locations from drifting apart if the output folder is ever changed.
adata = sc.read(results_file)
adata

In [None]:
# Check that the reloaded object still holds the expected cells per sample.
cells_per_sample = adata.obs['sample'].value_counts()
print(cells_per_sample)

adata

### Analysis

In [None]:
# Basic filtering thresholds: a cell needs at least this many detected genes,
# and a gene must be detected in at least this many cells.
min_genes_per_cell = 200
min_cells_per_gene = 3

sc.pp.filter_cells(adata, min_genes=min_genes_per_cell)
sc.pp.filter_genes(adata, min_cells=min_cells_per_gene)

In [None]:
# Flag mitochondrial genes (names beginning with 'MT-') in var['mt'] ...
is_mito_gene = adata.var_names.str.startswith('MT-')
adata.var['mt'] = is_mito_gene

# ... and compute per-cell / per-gene QC metrics, including the 'mt' group.
sc.pp.calculate_qc_metrics(
    adata,
    qc_vars=['mt'],
    percent_top=None,
    log1p=False,
    inplace=True,
)

In [None]:
# Violin plots of the three QC metrics, one panel per metric.
qc_metrics = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt']
sc.pl.violin(adata, qc_metrics, jitter=0.4, multi_panel=True)

In [None]:
# Plot each QC metric against the total counts per cell.
for qc_metric in ('pct_counts_mt', 'n_genes_by_counts'):
    sc.pl.scatter(adata, x='total_counts', y=qc_metric)

In [None]:
# QC cut-offs: drop cells with too many detected genes or too high a share of
# mitochondrial counts.
max_genes_per_cell = 2500
max_pct_counts_mt = 5

passes_qc = (
    (adata.obs['n_genes_by_counts'] < max_genes_per_cell)
    & (adata.obs['pct_counts_mt'] < max_pct_counts_mt)
)

# Slicing an AnnData yields a view; take an explicit copy so that later writes
# (e.g. adding layers) act on a real object instead of triggering an implicit
# copy with a warning.
adata = adata[passes_qc, :].copy()

In [None]:
# Keep a copy of the current expression matrix in the "counts" layer; the
# AmortizedLDA setup below is pointed at this layer.
adata.layers["counts"] = adata.X.copy()

## Amortized LDA

In [None]:
# Number of topics to fit; reused below when naming the per-topic columns.
n_topics = 10

# Register the "counts" layer with scvi-tools first, then build the
# (still untrained) Amortized LDA model on that registered data.
scvi.model.AmortizedLDA.setup_anndata(adata, layer = "counts")
model = scvi.model.AmortizedLDA(adata, n_topics = n_topics)

In [None]:
# Display the model built above. The previous version constructed a second,
# throw-away AmortizedLDA here, which cost time and memory and showed an
# object other than the `model` that is actually trained below.
model

In [None]:
# Fit the topic model with scvi-tools' default training settings.
model.train()

In [None]:
# Save the trained model under the results folder.
# NOTE(review): re-running this cell may fail if the directory already exists
# unless `overwrite=True` is passed — confirm against the installed scvi-tools.
model.save("./Results/SCC_Topic_model/")

In [None]:
# Latent representation from the trained model. It is used below as a
# DataFrame with one "topic_<i>" column per topic.
topic_prop = model.get_latent_representation()
topic_prop.head()

In [None]:
# Display the full topic-proportion table.
topic_prop

In [None]:
# Nothing earlier in the notebook creates the parquet folder, and to_parquet
# does not create missing parent directories, so make sure it exists first.
Path("./Results/parquets/").mkdir(parents=True, exist_ok=True)
topic_prop.to_parquet("./Results/parquets/topic_prop")

## UMAP analyses

In [None]:
# Attach the topic proportions to the AnnData object: the whole table goes
# into .obsm, and each topic additionally becomes its own .obs column so it
# can be used to colour plots.
adata.obsm["X_LDA"] = topic_prop
for topic_idx in range(n_topics):
    topic_col = f"topic_{topic_idx}"
    adata.obs[f"LDA_{topic_col}"] = topic_prop[[topic_col]]

### Raw counts UMAP

In [None]:
# Expression-based embedding: PCA -> neighbour graph -> UMAP, then Leiden
# clustering on the same graph.
# NOTE(review): no normalisation or log-transform of adata.X is visible before
# this point, so PCA presumably runs on the filtered counts — confirm intended.
sc.tl.pca(adata, svd_solver="arpack")
sc.pp.neighbors(adata, n_pcs = 30, n_neighbors = 20)
sc.tl.umap(adata)
# NOTE(review): the key is named "leiden_scVI" although the graph above comes
# from PCA, not from an scVI latent space — the name is kept because the
# plotting cell refers to it.
sc.tl.leiden(adata, key_added = "leiden_scVI", resolution = 0.8)

# Save UMAP to custom .obsm field.
# A copy is kept under its own key because sc.tl.umap is called again for the
# topic-space embedding, which presumably replaces "X_umap".
adata.obsm["raw_counts_umap"] = adata.obsm["X_umap"].copy()

In [None]:
# Raw-counts UMAP coloured by the Leiden clusters computed above.
sc.pl.embedding(adata, "raw_counts_umap", color = ["leiden_scVI"], frameon=False)

In [None]:
# Raw-counts UMAP, one panel per topic, coloured by that topic's proportion.
topic_obs_keys = [f"LDA_topic_{topic_idx}" for topic_idx in range(n_topics)]
sc.pl.embedding(adata, "raw_counts_umap", color=topic_obs_keys, frameon=False)

### Topic space UMAP

In [None]:
# Rebuild the neighbour graph from the topic proportions stored in
# obsm["X_LDA"], using the Hellinger metric, then recompute the UMAP on that
# graph.
sc.pp.neighbors(adata, use_rep="X_LDA", n_neighbors = 20, metric="hellinger")
sc.tl.umap(adata)

# Save UMAP to custom .obsm field.
adata.obsm["topic_space_umap"] = adata.obsm["X_umap"].copy()

In [None]:
# Topic-space UMAP, one panel per topic, coloured by that topic's proportion.
topic_obs_keys = [f"LDA_topic_{topic_idx}" for topic_idx in range(n_topics)]
sc.pl.embedding(adata, "topic_space_umap", color=topic_obs_keys, frameon=False)

### Feature by topic

In [None]:
# Per-feature topic table from the trained model. It is used below as a
# DataFrame indexed by feature with one "topic_<i>" column per topic.
feature_by_topic = model.get_feature_by_topic()
feature_by_topic.head()

In [None]:
# Nothing earlier in the notebook creates the parquet folder, and to_parquet
# does not create missing parent directories, so make sure it exists first.
Path("./Results/parquets/").mkdir(parents=True, exist_ok=True)
feature_by_topic.to_parquet("./Results/parquets/feature_by_topic")

### Rank by topic

In [None]:
# For every topic, list the features from highest to lowest weight: one
# column with the feature names and a "<topic>_prop" column with the weights.
rank_by_topic = pd.DataFrame()
topic_columns = [f"topic_{topic_idx}" for topic_idx in range(n_topics)]
for topic_name in topic_columns:
    ranked = feature_by_topic[topic_name].sort_values(ascending=False)
    rank_by_topic[topic_name] = ranked.index
    rank_by_topic[f"{topic_name}_prop"] = ranked.values

In [None]:
# Preview the top-ranked features of each topic.
rank_by_topic.head()

In [None]:
# Nothing earlier in the notebook creates the parquet folder, and to_parquet
# does not create missing parent directories, so make sure it exists first.
Path("./Results/parquets/").mkdir(parents=True, exist_ok=True)
rank_by_topic.to_parquet("./Results/parquets/rank_by_topic")