# Clustering of spots to discover spatial regions of interest

In [None]:
import scanpy as sc
import numpy as np
import pandas as pd
import anndata as ad
import matplotlib.pyplot as plt
from scipy.io import mmread
# Let scanpy use up to four parallel jobs where a routine supports it.
sc.settings.n_jobs = 4
# Default figure style: 6x6-inch panels without an axis frame.
sc.set_figure_params(frameon=False, figsize=(6, 6))


### List all slides

In [None]:
# Slide ID -> directory holding that slide's raw data, grouped by tissue region.
# Folder names are listed explicitly because they do not always mirror the
# slide ID (e.g. 'iCC_1L' is stored under 'ICC-1L').
data_dir_dict = {
    slide_id: f'./raw_data/{region}/{folder}'
    for region, slides in (
        # adjacent normal
        ('Adjacent', (
            ('HCC_1N', 'HCC-1N'),
            ('HCC_2N', 'HCC-2N'),
            ('HCC_3N', 'HCC-3N'),
            ('HCC_4N', 'HCC-4N'),
            ('cHC_1N', 'cHC-1N'),
        )),
        # leading edge
        ('Leading_Edge', (
            ('HCC_1L', 'HCC-1L'),
            ('HCC_2L', 'HCC-2L'),
            ('HCC_3L', 'HCC-3L'),
            ('HCC_4L', 'HCC-4L'),
            ('cHC_1L', 'cHC-1L'),
            ('iCC_1L', 'ICC-1L'),
        )),
        # primary tumor
        ('Primary_Tumor', (
            ('HCC_1T', 'HCC-1T'),
            ('HCC_2T', 'HCC-2T'),
            ('HCC_3T', 'HCC-3T'),
            ('HCC_4T', 'HCC-4T'),
            ('cHC_1T', 'cHC-1T'),
        )),
    )
    for slide_id, folder in slides
}

In [None]:
# Use 80 dpi both for on-screen figures and for figures written to disk.
plt.rcParams.update({'figure.dpi': 80, 'savefig.dpi': 80})

In [None]:
# Load every Visium slide and keep the resulting AnnData objects keyed by slide ID.
adata_dict = {}
for slide_id, slide_dir in data_dir_dict.items():
    slide = sc.read_visium(slide_dir)
    # De-duplicate gene names so that var_names is a unique index.
    slide.var_names_make_unique()
    # Record which slide every spot belongs to.
    slide.obs['sample'] = slide_id
    # Prefix the spot barcodes with the slide ID so they stay unique
    # once several slides are combined.
    slide.obs.index = slide_id + '_' + slide.obs.index
    adata_dict[slide_id] = slide

In [None]:
# Report the shape of each slide as (number of spots, number of genes).
for slide_id, slide in adata_dict.items():
    print(slide_id, slide.shape)

### Plot spatial images of all slides

In [None]:
# Show the tissue image of every slide on a grid of panels, four per row.
# The number of rows is derived from the number of slides, so the figure keeps
# working when slides are added to or removed from ``adata_dict``; the current
# 16 slides give a 4x4 grid on a 20x20-inch figure.
n_cols = 4
n_rows = max(1, (len(adata_dict) + n_cols - 1) // n_cols)
fig, axes = plt.subplots(
    n_rows, n_cols, figsize=(5 * n_cols, 5 * n_rows), squeeze=False
)
panels = axes.ravel()
for ax, (key, value) in zip(panels, adata_dict.items()):
    sc.pl.spatial(value, img_key="hires", ax=ax, alpha_img=1, show=False)
    ax.set_title(key)
# Hide any trailing panels that have no slide to display.
for ax in panels[len(adata_dict):]:
    ax.set_axis_off()

### Quality control of spots
1. remove genes with low counts
2. remove spots with low counts
3. select highly variable genes
4. PCA
5. Leiden clustering
6. UMAP
7. Harmony integration
8. Leiden clustering
9. UMAP

In [None]:
# Flag mitochondrial genes (names starting with "MT-") on every slide and
# compute the QC metrics, including the mitochondrial percentage per spot.
for slide in adata_dict.values():
    slide.var["mt"] = slide.var_names.str.startswith("MT-")
    sc.pp.calculate_qc_metrics(slide, qc_vars=["mt"], inplace=True)

In [None]:
# For each slide, draw violin plots of the QC metrics, then spatial maps of
# total counts and number of detected genes.
for slide_id, slide in adata_dict.items():
    print(slide_id)
    sc.pl.violin(
        slide,
        ["n_genes_by_counts", "total_counts", "pct_counts_mt"],
        jitter=0.0,
        multi_panel=True,
        show=False,
    )
    sc.pl.spatial(
        slide,
        img_key="hires",
        color=["total_counts", "n_genes_by_counts"],
    )


# Concatenate all slides


In [None]:
# Stack all slides into a single AnnData object. Only genes present in every
# slide are kept (inner join); the slide-prefixed barcodes keep obs names unique.
adata = ad.concat(list(adata_dict.values()), join="inner")
adata.shape

In [None]:
# Plot the 20 genes with the highest expression across all spots.
sc.pl.highest_expr_genes(adata, n_top=20)

In [None]:
# Flag mitochondrial genes on the combined object and recompute the QC metrics
# over all slides together.
is_mito = adata.var_names.str.startswith("MT-")
adata.var["mt"] = is_mito
sc.pp.calculate_qc_metrics(adata, qc_vars=["mt"], inplace=True)

### Some standard QC cleaning

In [None]:
# Drop spots with fewer than 200 detected genes, then genes detected in fewer
# than 10 spots.
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=10)
# Keep only spots whose mitochondrial percentage is below 15.
low_mito = adata.obs["pct_counts_mt"] < 15
adata = adata[low_mito, :].copy()
# Scale every spot to 10,000 total counts, then apply log2(1 + x).
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata, base=2)
print(adata.shape)

In [None]:
# Select the 3,000 most variable genes, keep the full log-normalised matrix in
# ``.raw``, and restrict the working object to the selected genes.
sc.pp.highly_variable_genes(adata, n_top_genes=3000)
adata.raw = adata
# Copy the subset so the later steps (PCA, neighbours, clustering) write into a
# real AnnData object rather than into a view of the full one.
adata = adata[:, adata.var["highly_variable"]].copy()
adata.shape

In [None]:
# Embedding before batch correction: PCA -> neighbour graph -> Leiden -> UMAP.
n_pcs = 50
sc.pp.pca(adata, n_comps=n_pcs, svd_solver="arpack", use_highly_variable=True)
sc.pp.neighbors(adata, n_pcs=n_pcs, n_neighbors=30)
# A low resolution gives a coarse clustering.
sc.tl.leiden(adata, key_added="leiden", resolution=0.125)
sc.tl.umap(adata)

In [None]:
# UMAP before batch correction, coloured by cluster and by slide.
sc.pl.umap(adata, color=["leiden", "sample"], show=False)

### Display clusters by sample type

In [None]:
# Plot the UMAP separately per tissue region, which is encoded as the last
# letter of the slide ID: N = adjacent normal, L = leading edge, T = primary tumor.
# The column is looked up by key because ``adata.obs.sample`` resolves to the
# ``DataFrame.sample`` method, not to the 'sample' column.
sample_ids = adata.obs['sample'].astype(str)
adata_n = adata[sample_ids.str.endswith('N').to_numpy(), :]
sc.pl.umap(adata_n, color=["leiden", "sample"], show=False)
adata_l = adata[sample_ids.str.endswith('L').to_numpy(), :]
sc.pl.umap(adata_l, color=["leiden", "sample"], show=False)
adata_t = adata[sample_ids.str.endswith('T').to_numpy(), :]
sc.pl.umap(adata_t, color=["leiden", "sample"], show=False)

### Remove batch effect using Harmony

In [None]:
# Run Harmony on the PCA embedding, treating every slide as its own batch.
# The corrected embedding is stored in ``adata.obsm["X_pca_harmony"]``.
sc.external.pp.harmony_integrate(
    adata,
    key="sample",
    basis="X_pca",
    adjusted_basis="X_pca_harmony",
)

In [None]:
# Rebuild the neighbour graph on the Harmony-corrected components, then
# cluster and embed again.
harmony_rep = "X_pca_harmony"
sc.pp.neighbors(adata, use_rep=harmony_rep, n_neighbors=30, n_pcs=50)
sc.tl.leiden(adata, key_added="leiden_harmony", resolution=0.5)
sc.tl.umap(adata, min_dist=0.3)


In [None]:

# UMAP after batch correction, coloured by the new clusters and by slide.
sc.pl.umap(adata, color=["leiden_harmony", "sample"], show=False)

### Add cluster labels to each slide

In [None]:
# Copy the Harmony-based cluster labels from the combined object back onto each
# slide. Spots are matched through the slide-prefixed barcodes in the obs index;
# spots that were removed during QC have no label and end up as NaN.
for key, slide in adata_dict.items():
    # Exact comparison: a substring/regex match could also pick up spots from
    # another slide whose ID happens to contain this one.
    in_slide = adata.obs['sample'] == key
    slide.obs['leiden_harmony'] = adata.obs.loc[in_slide, 'leiden_harmony']

### Plot spatial images of all slides with cluster labels

In [None]:
# Show every slide with its spots coloured by the Harmony-based cluster label,
# four panels per row. The number of rows is derived from the number of slides,
# so the figure keeps working when slides are added to or removed from
# ``adata_dict``; the current 16 slides give a 4x4 grid on a 20x20-inch figure.
n_cols = 4
n_rows = max(1, (len(adata_dict) + n_cols - 1) // n_cols)
fig, axes = plt.subplots(
    n_rows, n_cols, figsize=(5 * n_cols, 5 * n_rows), squeeze=False
)
panels = axes.ravel()
for ax, (key, value) in zip(panels, adata_dict.items()):
    sc.pl.spatial(
        value,
        img_key="hires",
        color=["leiden_harmony"],
        ax=ax,
        alpha_img=1,
        show=False,
    )
    ax.set_title(key)
# Hide any trailing panels that have no slide to display.
for ax in panels[len(adata_dict):]:
    ax.set_axis_off()