In [None]:
import os

import anndata as ad
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
from harmony import harmonize
from scipy.io import mmread
from sklearn.preprocessing import normalize
# Global scanpy configuration: 6x6 figures drawn without axis frames, and a
# default of 8 parallel jobs for scanpy routines that honor `settings.n_jobs`.
sc.set_figure_params(frameon=False, figsize=(6, 6))
sc.settings.n_jobs = 8

In [None]:
# nHDP inference matrices for slide 4T, one header-less CSV per cell-type
# compartment. Every table is transposed before being converted to an array.
# NOTE(review): rows are presumably spots after the transpose, because the row
# count of the combined matrix is later used as the spot count — confirm.
_NHDP_4T_CSV = './nHDP/binary_nHDP_count_{}_4T_final.csv'
TandNK_ST_inference_T = pd.read_csv(_NHDP_4T_CSV.format('TNK'), header=None).T.to_numpy()
Stroma_ST_inference_T = pd.read_csv(_NHDP_4T_CSV.format('Stromal'), header=None).T.to_numpy()
B_cell_ST_inference_T = pd.read_csv(_NHDP_4T_CSV.format('PlasmaB'), header=None).T.to_numpy()
Epithelial_ST_inference_T = pd.read_csv(_NHDP_4T_CSV.format('Epithelial'), header=None).T.to_numpy()
Myeloid_ST_inference_T = pd.read_csv(_NHDP_4T_CSV.format('Myeloid'), header=None).T.to_numpy()

In [None]:
# nHDP inference matrices for slide 4L, one header-less CSV per cell-type
# compartment. Every table is transposed before being converted to an array.
# NOTE(review): rows are presumably spots after the transpose, because the row
# count of the combined matrix is later used as the spot count — confirm.
_NHDP_4L_CSV = './nHDP/binary_nHDP_count_{}_4L_final.csv'
TandNK_ST_inference_L = pd.read_csv(_NHDP_4L_CSV.format('TNK'), header=None).T.to_numpy()
Stroma_ST_inference_L = pd.read_csv(_NHDP_4L_CSV.format('Stromal'), header=None).T.to_numpy()
B_cell_ST_inference_L = pd.read_csv(_NHDP_4L_CSV.format('PlasmaB'), header=None).T.to_numpy()
Epithelial_ST_inference_L = pd.read_csv(_NHDP_4L_CSV.format('Epithelial'), header=None).T.to_numpy()
Myeloid_ST_inference_L = pd.read_csv(_NHDP_4L_CSV.format('Myeloid'), header=None).T.to_numpy()

In [None]:
# nHDP inference matrices for slide 4N, one header-less CSV per cell-type
# compartment. Every table is transposed before being converted to an array.
# NOTE(review): rows are presumably spots after the transpose, because the row
# count of the combined matrix is later used as the spot count — confirm.
_NHDP_4N_CSV = './nHDP/binary_nHDP_count_{}_4N_final.csv'
TandNK_ST_inference_N = pd.read_csv(_NHDP_4N_CSV.format('TNK'), header=None).T.to_numpy()
Stroma_ST_inference_N = pd.read_csv(_NHDP_4N_CSV.format('Stromal'), header=None).T.to_numpy()
B_cell_ST_inference_N = pd.read_csv(_NHDP_4N_CSV.format('PlasmaB'), header=None).T.to_numpy()
Epithelial_ST_inference_N = pd.read_csv(_NHDP_4N_CSV.format('Epithelial'), header=None).T.to_numpy()
Myeloid_ST_inference_N = pd.read_csv(_NHDP_4N_CSV.format('Myeloid'), header=None).T.to_numpy()

In [None]:
# For each slide, place the five cell-type matrices side by side (column-wise)
# in a fixed order: T/NK, stromal, B/plasma, epithelial, myeloid. The same
# order is used for all three slides so that columns line up across slides.
GEM_combined_T = np.hstack((
    TandNK_ST_inference_T,
    Stroma_ST_inference_T,
    B_cell_ST_inference_T,
    Epithelial_ST_inference_T,
    Myeloid_ST_inference_T,
))
GEM_combined_L = np.hstack((
    TandNK_ST_inference_L,
    Stroma_ST_inference_L,
    B_cell_ST_inference_L,
    Epithelial_ST_inference_L,
    Myeloid_ST_inference_L,
))
GEM_combined_N = np.hstack((
    TandNK_ST_inference_N,
    Stroma_ST_inference_N,
    B_cell_ST_inference_N,
    Epithelial_ST_inference_N,
    Myeloid_ST_inference_N,
))

In [None]:
# Number of rows contributed by each slide. These counts are reused later to
# build the per-row batch labels and to split the cluster labels per slide.
spot_n_T, spot_n_L, spot_n_N = (
    slide_matrix.shape[0]
    for slide_matrix in (GEM_combined_T, GEM_combined_L, GEM_combined_N)
)

In [None]:
# Stack the three slides row-wise in the order T, L, N. Every later step that
# slices rows back per slide relies on this order.
GEM_combined = np.vstack((GEM_combined_T, GEM_combined_L, GEM_combined_N))
GEM_combined.shape

In [None]:
# Normalize column-wise (axis=0): each column is rescaled independently by its
# max norm, across the rows of all three slides together.
GEM_combined = normalize(GEM_combined, norm='max', axis=0)

In [None]:
# Wrap the normalized matrix in an AnnData object so scanpy can operate on it.
# NOTE(review): recent anndata releases deprecate the `dtype` argument — check
# the installed version before upgrading.
adata_combined = ad.AnnData(
    GEM_combined,
    dtype=np.float64,
)

In [None]:
# Optionally remove batch effects between slides, then prepare for clustering.
def prepare_for_umap(adata, n_comps=50, batch_effects = False, df_metadata = None):
    """Run PCA, optionally Harmony-correct it, then compute neighbors and UMAP.

    The function works in place on ``adata``:

    1. ``sc.pp.pca`` writes the principal components to ``adata.obsm['X_pca']``.
    2. If ``batch_effects`` is true, the components are passed to ``harmonize``
       with the ``'Sample ID'`` column of ``df_metadata`` as the batch key, and
       the corrected matrix overwrites ``adata.obsm['X_pca']``.
    3. ``sc.pp.neighbors`` (on ``'X_pca'``) and ``sc.tl.umap`` are run.

    Parameters
    ----------
    adata
        AnnData object to process; modified in place.
    n_comps
        Number of principal components requested from ``sc.pp.pca``.
    batch_effects
        Whether to apply the Harmony correction step.
    df_metadata
        DataFrame with one row per observation of ``adata`` (same order) and a
        ``'Sample ID'`` column. Required only when ``batch_effects`` is true.

    Raises
    ------
    ValueError
        If ``batch_effects`` is true and ``df_metadata`` is missing, lacks the
        ``'Sample ID'`` column, or has a different number of rows than
        ``adata`` has observations.
    """
    batch_key = 'Sample ID'

    # Validate before the (potentially slow) PCA so that misuse fails fast
    # with a clear message instead of deep inside the correction step.
    if batch_effects:
        if df_metadata is None:
            raise ValueError("df_metadata is required when batch_effects=True")
        if batch_key not in df_metadata.columns:
            raise ValueError(
                "df_metadata must contain a '{}' column".format(batch_key)
            )
        if len(df_metadata) != adata.n_obs:
            raise ValueError(
                "df_metadata has {} rows but adata has {} observations".format(
                    len(df_metadata), adata.n_obs
                )
            )

    sc.pp.pca(adata, n_comps)
    print('Finish PCA!')

    if batch_effects:
        # Work on a copy so the uncorrected PCs are not touched until the
        # correction has finished successfully.
        X_PCA = adata.obsm['X_pca'].copy()
        print("Processing shape: ", X_PCA.shape)
        corr_PCA = harmonize(X_PCA, df_metadata, batch_key = batch_key)
        print("Finish correction!")
        # Downstream steps read 'X_pca', so the corrected PCs replace it.
        adata.obsm['X_pca'] = corr_PCA

    sc.pp.neighbors(adata, use_rep = 'X_pca')
    sc.tl.umap(adata)

In [None]:
# One slide label per row of the combined matrix, in the same T, L, N order
# in which the slides were stacked.
df_metadata = pd.DataFrame(
    ['4T'] * spot_n_T + ['4L'] * spot_n_L + ['4N'] * spot_n_N,
    columns=['Sample ID'],
)

In [None]:
# PCA with 50 components, batch correction keyed on the slide labels, then
# neighbors and UMAP; all results are written into `adata_combined`.
prepare_for_umap(
    adata_combined,
    n_comps=50,
    batch_effects=True,
    df_metadata=df_metadata,
)

In [None]:
# Leiden clustering at resolution 0.2; labels are stored in obs['clusters'].
sc.tl.leiden(
    adata_combined,
    resolution=0.2,
    key_added="clusters",
)

In [None]:
# Fixed color per cluster label so the same cluster gets the same color in
# every plot. Position in the list is the cluster label ("0" .. "16").
_cluster_colors = [
    "red", "pink", "blue", "yellow", "purple", "brown", "green", "orange",
    "indigo", "black", "Cyan", "Lime", "Beige", "white", "Gold", "Gray",
    "Aqua",
]
palette = {
    str(cluster_label): color
    for cluster_label, color in enumerate(_cluster_colors)
}

In [None]:
# Store each row's slide label in obs. The 'Sample ID' column is selected
# explicitly so that a 1-D array is assigned; `df_metadata.values` is a 2-D
# array of shape (n_rows, 1). Using an array rather than the Series also
# avoids index alignment against the obs index.
adata_combined.obs['batch_id'] = df_metadata['Sample ID'].to_numpy()

In [None]:
# Recompute the embedding with explicit min_dist/spread; this replaces the
# UMAP coordinates produced earlier inside prepare_for_umap.
sc.tl.umap(adata_combined, spread=1, min_dist=0.3)

# Show the clusters in UMAP coordinates on a white 8x8 canvas.
_umap_rc = {'axes.facecolor': 'white', 'figure.figsize': [8, 8]}
with mpl.rc_context(_umap_rc):
    sc.pl.umap(
        adata_combined,
        color=['clusters'],
        size=30,
        color_map='RdPu',
        ncols=2,
        legend_loc='on data',
        legend_fontsize=20,
        palette=palette,
    )

In [None]:
# Visium dataset directories, grouped by tissue region (Adjacent, Leading_Edge,
# Primary_Tumor) and numbered 1-4 within each region.
_RAW_ROOT = './raw_data'
_SLIDE_NUMBERS = range(1, 5)

HCC_1N, HCC_2N, HCC_3N, HCC_4N = (
    f'{_RAW_ROOT}/Adjacent/HCC-{k}N' for k in _SLIDE_NUMBERS
)
HCC_1L, HCC_2L, HCC_3L, HCC_4L = (
    f'{_RAW_ROOT}/Leading_Edge/HCC-{k}L' for k in _SLIDE_NUMBERS
)
HCC_1T, HCC_2T, HCC_3T, HCC_4T = (
    f'{_RAW_ROOT}/Primary_Tumor/HCC-{k}T' for k in _SLIDE_NUMBERS
)

# Order matters: positions 0-3 are N, 4-7 are L, 8-11 are T, and
# `visium_names` below follows exactly the same order.
visium_paths = [
    HCC_1N, HCC_2N, HCC_3N, HCC_4N,
    HCC_1L, HCC_2L, HCC_3L, HCC_4L,
    HCC_1T, HCC_2T, HCC_3T, HCC_4T,
]
visium_names = [
    f'HCC-{k}{region_suffix}'
    for region_suffix in ('N-Adjacent', 'L-Leading_Edge', 'T-Primary_Tumor')
    for k in _SLIDE_NUMBERS
]

In [None]:
# Load every Visium dataset, keeping the order of `visium_paths`. Iterating
# over the list itself (instead of a hard-coded count) keeps this cell correct
# if slides are added to or removed from `visium_paths`.
# NOTE(review): only the slides at positions 3, 7 and 11 are used by the cells
# visible in this notebook; loading all of them may be unnecessary.
adata_l = [sc.read_visium(visium_path) for visium_path in visium_paths]

In [None]:
# Pick the three slides analysed here by name rather than by list position;
# `visium_names` and `adata_l` share the ordering of `visium_paths`.
_adata_by_name = dict(zip(visium_names, adata_l))
adata_T = _adata_by_name['HCC-4T-Primary_Tumor']
adata_L = _adata_by_name['HCC-4L-Leading_Edge']
adata_N = _adata_by_name['HCC-4N-Adjacent']

In [None]:
# Cluster labels as a plain Python list, in the row order of the combined
# matrix (T rows, then L rows, then N rows).
cluster_res = adata_combined.obs["clusters"].tolist()

In [None]:
# Split the combined label list back into per-slide chunks using the row
# offsets of the T, L, N stacking order, and attach them to each slide.
_t_end = spot_n_T
_l_end = _t_end + spot_n_L
adata_T.obs['cluster'] = cluster_res[:_t_end]
adata_L.obs['cluster'] = cluster_res[_t_end:_l_end]
adata_N.obs['cluster'] = cluster_res[_l_end:]

In [None]:
# 2x3 figure: the top row shows each slide over its hires image at full
# opacity, the bottom row shows the cluster labels over a faded image.
# Columns are N, L, T from left to right.
fig, ax = plt.subplots(2, 3, figsize=(20, 10))
_panels = ((adata_T, '4T', 2), (adata_L, '4L', 1), (adata_N, '4N', 0))

for _slide, _tag, _col in _panels:
    sc.pl.spatial(
        _slide,
        img_key="hires",
        alpha_img=1,
        ax=ax[0, _col],
        show=False,
        title=f'HCC-{_tag}-HE-staining',
    )

for _slide, _tag, _col in _panels:
    sc.pl.spatial(
        _slide,
        color='cluster',
        img_key="hires",
        alpha_img=0.3,
        palette=palette,
        ax=ax[1, _col],
        show=False,
        title=f'HCC-{_tag}-GEM-cluster',
    )

plt.show()

# Extract Data for Further Analysis

In [None]:
# One single-column table of cluster labels per slide. Taking `.values` drops
# the obs index, so only the labels (in obs row order) are kept.
T_cluster, N_cluster, L_cluster = (
    pd.DataFrame(_slide.obs['cluster'].values, columns=['cluster'])
    for _slide in (adata_T, adata_N, adata_L)
)

In [None]:
# Write the per-slide cluster labels to CSV without an index column.
# The output directory is created first so that a missing folder does not
# abort the notebook at its very last step.
os.makedirs('./ClusterMap', exist_ok=True)
T_cluster.to_csv('./ClusterMap/P4T.csv', index=False)
N_cluster.to_csv('./ClusterMap/P4N.csv', index=False)
L_cluster.to_csv('./ClusterMap/P4L.csv', index=False)