In [None]:
import scanpy as sc
import numpy as np
import pandas as pd
import anndata as ad
import matplotlib.pyplot as plt
from scipy.io import mmread
from sklearn.preprocessing import normalize
import anndata as ad
import matplotlib as mpl
from harmony import harmonize
sc.set_figure_params(figsize=(6, 6), frameon=False)
sc.settings.n_jobs=8

In [None]:
# Tumor slide (2T): one binary nHDP count table per cell compartment.
# Every CSV is read without a header row and transposed into a NumPy array
# (presumably spots x GEMs after the transpose -- TODO confirm).
TandNK_ST_inference_T = pd.read_csv(
    './nHDP/binary_nHDP_count_TNK_2T_final.csv', header=None
).T.values
Stroma_ST_inference_T = pd.read_csv(
    './nHDP/binary_nHDP_count_Stromal_2T_final.csv', header=None
).T.values
B_cell_ST_inference_T = pd.read_csv(
    './nHDP/binary_nHDP_count_PlasmaB_2T_final.csv', header=None
).T.values
Epithelial_ST_inference_T = pd.read_csv(
    './nHDP/binary_nHDP_count_Epithelial_2T_final.csv', header=None
).T.values
Myeloid_ST_inference_T = pd.read_csv(
    './nHDP/binary_nHDP_count_Myeloid_2T_final.csv', header=None
).T.values

In [None]:
# Leading-edge slide (2L): one binary nHDP count table per cell compartment.
# Every CSV is read without a header row and transposed into a NumPy array
# (presumably spots x GEMs after the transpose -- TODO confirm).
TandNK_ST_inference_L = pd.read_csv(
    './nHDP/binary_nHDP_count_TNK_2L_final.csv', header=None
).T.values
Stroma_ST_inference_L = pd.read_csv(
    './nHDP/binary_nHDP_count_Stromal_2L_final.csv', header=None
).T.values
B_cell_ST_inference_L = pd.read_csv(
    './nHDP/binary_nHDP_count_PlasmaB_2L_final.csv', header=None
).T.values
Epithelial_ST_inference_L = pd.read_csv(
    './nHDP/binary_nHDP_count_Epithelial_2L_final.csv', header=None
).T.values
Myeloid_ST_inference_L = pd.read_csv(
    './nHDP/binary_nHDP_count_Myeloid_2L_final.csv', header=None
).T.values

In [None]:
# Adjacent slide (2N): one binary nHDP count table per cell compartment.
# Every CSV is read without a header row and transposed into a NumPy array
# (presumably spots x GEMs after the transpose -- TODO confirm).
TandNK_ST_inference_N = pd.read_csv(
    './nHDP/binary_nHDP_count_TNK_2N_final.csv', header=None
).T.values
Stroma_ST_inference_N = pd.read_csv(
    './nHDP/binary_nHDP_count_Stromal_2N_final.csv', header=None
).T.values
B_cell_ST_inference_N = pd.read_csv(
    './nHDP/binary_nHDP_count_PlasmaB_2N_final.csv', header=None
).T.values
Epithelial_ST_inference_N = pd.read_csv(
    './nHDP/binary_nHDP_count_Epithelial_2N_final.csv', header=None
).T.values
Myeloid_ST_inference_N = pd.read_csv(
    './nHDP/binary_nHDP_count_Myeloid_2N_final.csv', header=None
).T.values

In [None]:
# Join the five compartment tables column-wise for each slide.
# The column order is the same for every slide:
# TandNK, Stroma, B cell, Epithelial, Myeloid.
GEM_combined_T = np.concatenate(
    (
        TandNK_ST_inference_T,
        Stroma_ST_inference_T,
        B_cell_ST_inference_T,
        Epithelial_ST_inference_T,
        Myeloid_ST_inference_T,
    ),
    axis=1,
)
GEM_combined_L = np.concatenate(
    (
        TandNK_ST_inference_L,
        Stroma_ST_inference_L,
        B_cell_ST_inference_L,
        Epithelial_ST_inference_L,
        Myeloid_ST_inference_L,
    ),
    axis=1,
)
GEM_combined_N = np.concatenate(
    (
        TandNK_ST_inference_N,
        Stroma_ST_inference_N,
        B_cell_ST_inference_N,
        Epithelial_ST_inference_N,
        Myeloid_ST_inference_N,
    ),
    axis=1,
)

In [None]:
# Number of rows in each per-slide matrix; used later to split the stacked
# results back into their slides.
spot_n_T = len(GEM_combined_T)
spot_n_L = len(GEM_combined_L)
spot_n_N = len(GEM_combined_N)

In [None]:
# Stack the three slides row-wise in the fixed order T, L, N.
GEM_combined = np.concatenate(
    (GEM_combined_T, GEM_combined_L, GEM_combined_N),
    axis=0,
)
GEM_combined.shape

In [None]:
# Normalise column-wise (axis=0) with the 'max' norm, so each column is scaled
# on its own across all stacked rows.
GEM_combined = normalize(GEM_combined, norm='max', axis=0)

In [None]:
# Wrap the stacked, normalised matrix in an AnnData object.
# The cast is done on the array itself because the `dtype=` constructor
# keyword is deprecated in newer anndata releases.
adata_combined = ad.AnnData(X=GEM_combined.astype(np.float64))

In [None]:
# Prepare an AnnData object for clustering: PCA, optional Harmony batch
# correction between the samples named in df_metadata, neighbours graph, UMAP.
def prepare_for_umap(adata, n_comps=50, batch_effects = False, df_metadata = None,
                     batch_key = 'Sample ID'):
    """Compute PCA, optionally batch-correct it, then build neighbours and UMAP.

    No normalisation or scaling is applied here; ``adata.X`` is used as given.
    All results are written into ``adata`` in place (``obsm['X_pca']`` plus
    whatever ``sc.pp.neighbors`` and ``sc.tl.umap`` store).

    Parameters
    ----------
    adata : AnnData whose ``X`` is the matrix to embed.
    n_comps : number of principal components passed to ``sc.pp.pca``.
    batch_effects : when True, the PCA embedding is replaced by the output of
        ``harmonize`` before the neighbours graph is built.
    df_metadata : per-observation table handed to ``harmonize``; required when
        ``batch_effects`` is True and must have one row per observation.
    batch_key : column of ``df_metadata`` that identifies the batch.

    Raises
    ------
    ValueError
        If batch correction is requested without metadata, or the metadata
        row count does not match the number of observations.
    """
    # Validate before the (expensive) PCA so misuse fails immediately.
    if batch_effects:
        if df_metadata is None:
            raise ValueError("df_metadata must be provided when batch_effects=True")
        if len(df_metadata) != adata.n_obs:
            raise ValueError("df_metadata must contain exactly one row per observation in adata")

    sc.pp.pca(adata, n_comps)
    print('Finish PCA!')

    if batch_effects:
        X_PCA = adata.obsm['X_pca'].copy()
        print("Processing shape: ", X_PCA.shape)
        corr_PCA = harmonize(X_PCA, df_metadata, batch_key = batch_key)
        print("Finish correction!")
        # Overwrite the embedding so the neighbours graph uses the corrected PCs.
        adata.obsm['X_pca'] = corr_PCA

    sc.pp.neighbors(adata, use_rep = 'X_pca')
    sc.tl.umap(adata)

In [None]:
# One 'Sample ID' label per row, laid out in the order T, then L, then N.
sample_labels = ['2T'] * spot_n_T + ['2L'] * spot_n_L + ['2N'] * spot_n_N
df_metadata = pd.DataFrame({'Sample ID': sample_labels})

In [None]:
prepare_for_umap(adata_combined, n_comps=50, batch_effects = True, df_metadata = df_metadata)

In [None]:
sc.tl.leiden(adata_combined, key_added="clusters", resolution = 0.2)

In [None]:
# Colour for each cluster label; the label is the list position as a string.
_cluster_colors = [
    "blue", "pink", "yellow", "red", "purple", "brown",
    "green", "orange", "indigo", "black", "Cyan", "Lime",
    "Beige", "white", "Gold", "Gray", "Aqua",
]
palette = {str(cluster_id): color for cluster_id, color in enumerate(_cluster_colors)}

In [None]:
adata_combined.obs['batch_id'] = df_metadata.values

In [None]:
# Recompute the UMAP layout with explicit min_dist / spread.
sc.tl.umap(adata_combined, spread=1, min_dist=0.3)

# show regions in UMAP coordinates
umap_rc = {'axes.facecolor': 'white', 'figure.figsize': [8, 8]}
with mpl.rc_context(umap_rc):
    sc.pl.umap(
        adata_combined,
        color=['clusters'],
        palette=palette,
        size=30,
        color_map='RdPu',
        ncols=2,
        legend_loc='on data',
        legend_fontsize=20,
    )

In [None]:
# Raw Visium folders, four patients per tissue region.
# Adjacent tissue
HCC_1N = './raw_data/Adjacent/HCC-1N'
HCC_2N = './raw_data/Adjacent/HCC-2N'
HCC_3N = './raw_data/Adjacent/HCC-3N'
HCC_4N = './raw_data/Adjacent/HCC-4N'

# Leading edge
HCC_1L = './raw_data/Leading_Edge/HCC-1L'
HCC_2L = './raw_data/Leading_Edge/HCC-2L'
HCC_3L = './raw_data/Leading_Edge/HCC-3L'
HCC_4L = './raw_data/Leading_Edge/HCC-4L'

# Primary tumor
HCC_1T = './raw_data/Primary_Tumor/HCC-1T'
HCC_2T = './raw_data/Primary_Tumor/HCC-2T'
HCC_3T = './raw_data/Primary_Tumor/HCC-3T'
HCC_4T = './raw_data/Primary_Tumor/HCC-4T'

# Both lists share the same order: N1-N4, then L1-L4, then T1-T4.
visium_paths = [
    HCC_1N, HCC_2N, HCC_3N, HCC_4N,
    HCC_1L, HCC_2L, HCC_3L, HCC_4L,
    HCC_1T, HCC_2T, HCC_3T, HCC_4T,
]
visium_names = [
    f'HCC-{patient}{region}'
    for region in ('N-Adjacent', 'L-Leading_Edge', 'T-Primary_Tumor')
    for patient in range(1, 5)
]

In [None]:
# Load every Visium slide in visium_paths, keeping list order so that
# adata_l[k] is the slide read from visium_paths[k].
# Iterating over the list itself (instead of range(12)) keeps the loop correct
# if slides are added to or removed from visium_paths.
adata_l = []
for visium_path in visium_paths:
    adata = sc.read_visium(visium_path)
    adata_l.append(adata)

In [None]:
# Pick the patient-2 slides by name rather than by hard-coded position;
# visium_names is ordered like visium_paths / adata_l, so these resolve to
# positions 9, 5 and 1 respectively.
adata_T = adata_l[visium_names.index('HCC-2T-Primary_Tumor')]
adata_L = adata_l[visium_names.index('HCC-2L-Leading_Edge')]
adata_N = adata_l[visium_names.index('HCC-2N-Adjacent')]

In [None]:
cluster_res = list(adata_combined.obs["clusters"])

In [None]:
# cluster_res is cut at the T / L / N row boundaries, and each piece is
# stored on its own slide.
_end_of_T = spot_n_T
_end_of_L = spot_n_T + spot_n_L
adata_T.obs['cluster'] = cluster_res[:_end_of_T]
adata_L.obs['cluster'] = cluster_res[_end_of_T:_end_of_L]
adata_N.obs['cluster'] = cluster_res[_end_of_L:]

In [None]:
# Top row: H&E image only. Bottom row: GEM clusters over a faded image.
# Columns, left to right: adjacent (N), leading edge (L), tumor (T).
fig, ax = plt.subplots(2, 3, figsize=(20, 10))
slide_panels = [
    (adata_T, 2, 'HCC-2T-HE-staining', 'HCC-2T-GEM-cluster'),
    (adata_L, 1, 'HCC-2L-HE-staining', 'HCC-2L-GEM-cluster'),
    (adata_N, 0, 'HCC-2N-HE-staining', 'HCC-2N-GEM-cluster'),
]
for slide, col, he_title, _ in slide_panels:
    sc.pl.spatial(slide, img_key="hires", alpha_img=1,
                  ax=ax[0, col], show=False, title=he_title)
for slide, col, _, cluster_title in slide_panels:
    sc.pl.spatial(slide, color='cluster', img_key="hires", alpha_img=0.3,
                  palette=palette, ax=ax[1, col], show=False, title=cluster_title)
plt.show()

# Extract Data for Further Analysis

In [None]:
# One single-column table of cluster labels per slide (the obs index is dropped).
T_cluster = pd.DataFrame({'cluster': adata_T.obs['cluster'].values})
N_cluster = pd.DataFrame({'cluster': adata_N.obs['cluster'].values})
L_cluster = pd.DataFrame({'cluster': adata_L.obs['cluster'].values})

In [None]:
# Write each slide's cluster labels to ./ClusterMap without the row index.
for cluster_table, csv_path in (
    (T_cluster, './ClusterMap/P2T.csv'),
    (N_cluster, './ClusterMap/P2N.csv'),
    (L_cluster, './ClusterMap/P2L.csv'),
):
    cluster_table.to_csv(csv_path, index = None)

# Invasive Frontier Labeling

In [None]:
def crop_slide(adata, left, right, up, down):
    """Keep the spots whose spatial coordinates lie inside a rectangle.

    The first coordinate of ``adata.obsm['spatial']`` must lie in
    ``[left, right]`` and the second in ``[up, down]`` (bounds inclusive).

    Returns
    -------
    (invasive_edge_corrs, all_corrs)
        A list with the coordinate rows inside the rectangle, and a copy of
        the full coordinate array.
    """
    all_corrs = adata.obsm['spatial'].copy()
    invasive_edge_corrs = [
        corr
        for corr in all_corrs
        if left <= corr[0] <= right and up <= corr[1] <= down
    ]
    return invasive_edge_corrs, all_corrs

In [None]:
def find_rows(source, target):
    """Return the index of the first row of ``source`` equal to ``target``.

    Raises ``IndexError`` when no row matches.
    """
    row_matches = (source == target).all(axis=1)
    matching_rows, = np.where(row_matches)
    return matching_rows[0]

In [None]:
def get_distance(corr1, corr2):
    """Return the squared Euclidean distance between two coordinates.

    No square root is taken, so the value is only meant for ranking.
    """
    delta = corr1 - corr2
    return np.sum(delta * delta)

In [None]:
def find_neighbours(invasive_edge_corrs, all_corrs):
    """Map every cropped spot to its (up to) six nearest cropped spots.

    Parameters
    ----------
    invasive_edge_corrs : sequence of coordinate rows selected from
        ``all_corrs`` (the cropped region).
    all_corrs : 2-D array holding the coordinates of every spot on the slide.

    Returns
    -------
    dict
        Key: row index of a cropped spot within ``all_corrs``.
        Value: list of row indices (also within ``all_corrs``) of the six
        cropped spots with the smallest squared distance to it, nearest first.
        Fewer than six are returned when the crop holds fewer than seven spots.

    Notes
    -----
    The spot itself is excluded by dropping the first entry of the distance
    ranking, which relies on its zero self-distance sorting first. Ties keep
    the input order (stable sort).
    """
    invasive_edge_dict = dict()
    if len(invasive_edge_corrs) == 0:
        return invasive_edge_dict

    edge_corrs = np.asarray(invasive_edge_corrs)

    # Resolve each cropped spot to its row in all_corrs once, instead of
    # repeating the search for every neighbour of every spot.
    edge_rows = [
        np.where((all_corrs == corr).all(axis=1))[0][0]
        for corr in edge_corrs
    ]

    for local_idx, key in enumerate(edge_rows):
        # Squared distances from this spot to every cropped spot (itself included).
        sq_dists = np.sum((edge_corrs - edge_corrs[local_idx]) ** 2, axis=1)

        # Stable ranking; position 0 is the spot itself, positions 1..6 the neighbours.
        ranking = np.argsort(sq_dists, kind='stable')
        nearest_six_spots = ranking[1:7]

        invasive_edge_dict[key] = [edge_rows[j] for j in nearest_six_spots]

    return invasive_edge_dict

In [None]:
def find_invasive_frontier_idx(invasive_edge_dict, adata):
    """Label frontier spots on a slide from its cluster assignment.

    Within the spots listed in ``invasive_edge_dict``:
      * a cluster '1' spot with at least one cluster '0' neighbour becomes
        'Tumor Frontier';
      * a cluster '0' spot with at least one cluster '1' neighbour becomes
        'Adjacent Frontier'.

    Two columns are written to ``adata.obs``:
      * 'invasive_frontier': 'Tumor Frontier', 'Adjacent Frontier' or 'Others';
      * 'adjacent_non_frontier': 'Adjacent Non-frontier' for every cluster '0'
        spot on the whole slide that is not an 'Adjacent Frontier' spot,
        otherwise 'Others'.

    Parameters
    ----------
    invasive_edge_dict : dict mapping a spot's positional row index to the
        positional row indices of its neighbours.
    adata : object with an ``obs`` table containing a 'cluster' column of
        string labels; modified in place.
    """
    # Work on a plain array: the dict holds *positions*, whereas obs is
    # indexed by labels, so integer indexing on the Series is unreliable.
    clusters = adata.obs['cluster'].to_numpy()

    T_boundary_corr_idx = []
    N_boundary_corr_idx = []
    for corr, neighbours in invasive_edge_dict.items():
        neighbour_clusters = clusters[np.asarray(neighbours, dtype=int)]

        if (clusters[corr] == '1') and np.any(neighbour_clusters == '0'):
            T_boundary_corr_idx.append(corr)

        if (clusters[corr] == '0') and np.any(neighbour_clusters == '1'):
            N_boundary_corr_idx.append(corr)

    # Build the whole column first and assign it once; chained assignment
    # (obs[col][idx] = ...) may write to a temporary copy and be lost.
    invasive_frontier = np.full(adata.shape[0], 'Others', dtype=object)
    invasive_frontier[np.asarray(T_boundary_corr_idx, dtype=int)] = 'Tumor Frontier'
    invasive_frontier[np.asarray(N_boundary_corr_idx, dtype=int)] = 'Adjacent Frontier'
    adata.obs['invasive_frontier'] = invasive_frontier

    is_adjacent = clusters == '0'
    is_adjacent_frontier = invasive_frontier == 'Adjacent Frontier'
    adata.obs['adjacent_non_frontier'] = np.where(
        is_adjacent & ~is_adjacent_frontier, 'Adjacent Non-frontier', 'Others'
    )

In [None]:
def find_invasive_frontier(adata, left, right, up, down):
    """Label frontier spots inside a rectangular window of one slide.

    Crops the slide to the window given by ``left``/``right``/``up``/``down``,
    finds each cropped spot's nearest neighbours, and writes the frontier
    labels into ``adata.obs``. Nothing is returned.
    """
    edge_corrs, slide_corrs = crop_slide(adata, left, right, up, down)
    neighbours_by_spot = find_neighbours(edge_corrs, slide_corrs)
    find_invasive_frontier_idx(neighbours_by_spot, adata)

In [None]:
# Leading-edge slide: window bounds are in the units of obsm['spatial'].
find_invasive_frontier(adata_L, left=6500, right=12000, up=1500, down=17000)

In [None]:
# Tumor slide: window bounds are in the units of obsm['spatial'].
find_invasive_frontier(adata_T, left=1000, right=16500, up=1000, down=17000)

In [None]:
# Colours for the frontier labels. Both adjacent categories share blue.
edge_palette = dict([
    ('Others', 'grey'),
    ('Tumor Frontier', 'red'),
    ('Adjacent Frontier', 'blue'),
    ('Adjacent Non-frontier', 'blue'),
])

In [None]:
# 3 x 2 overview. Left column: leading-edge slide (HCC-2L);
# right column: tumor slide (HCC-2T).
#   row 0 - clusters '0' and '1' only
#   row 1 - invasive-frontier labels
#   row 2 - non-frontier adjacent spots
fig, ax = plt.subplots(3,2, figsize=(18,23))
sc.pl.spatial(adata_L, img_key="hires", color="cluster", groups=["0", "1"], 
              alpha=0.5, size=1.3, ax = ax[0,0], title = 'HCC-2L', show = False)

sc.pl.spatial(adata_L, img_key="hires", color="invasive_frontier", size = 1.3, alpha = 0.7,
              alpha_img=0.7, palette = edge_palette, title = 'HCC-2L Invasive Frontier', ax = ax[1, 0], show = False)

sc.pl.spatial(adata_T, img_key="hires", color="cluster", groups=["0", "1"], 
              alpha=0.5, size=1.3, ax = ax[0,1], title = 'HCC-2T', show = False)

sc.pl.spatial(adata_T, img_key="hires", color="invasive_frontier", size = 1.3, alpha = 0.7, 
              alpha_img=0.7, palette = edge_palette, ax = ax[1,1], title = 'HCC-2T Invasive Frontier', show = False)

# The titles of the two panels below were previously swapped between slides;
# each title now names the slide that is actually drawn.
sc.pl.spatial(adata_L, img_key="hires", color="adjacent_non_frontier", size = 1.3, alpha = 0.7, 
              alpha_img=0.7, palette = edge_palette, ax = ax[2,0], title = 'HCC-2L Non-frontier Adjacent', show = False)

sc.pl.spatial(adata_T, img_key="hires", color="adjacent_non_frontier", size = 1.3, alpha = 0.7, 
              alpha_img=0.7, palette = edge_palette, ax = ax[2,1], title = 'HCC-2T Non-frontier Adjacent', show = False)
plt.show()

# Compare Frontier and Non-frontier Adjacent Area GEM Level

We want to check whether the tumor affects the GEM expression level of the neighboring adjacent area.

In [None]:
import seaborn as sns

In [None]:
# Rebuild the tumor-slide GEM table (five compartments joined column-wise).
GEM_combined_T = np.concatenate(
    (
        TandNK_ST_inference_T,
        Stroma_ST_inference_T,
        B_cell_ST_inference_T,
        Epithelial_ST_inference_T,
        Myeloid_ST_inference_T,
    ),
    axis=1,
)
# Column-wise max normalisation computed on this slide alone, then stored on
# the slide so it is subset together with the spots.
GEM_combined_normed_T = normalize(GEM_combined_T, norm='max', axis=0)
adata_T.obsm['All_GEM_inference_table'] = GEM_combined_normed_T

In [None]:
# Tumor-slide spots labelled as adjacent frontier vs. adjacent non-frontier.
is_adjacent_frontier_T = adata_T.obs["invasive_frontier"] == 'Adjacent Frontier'
is_adjacent_non_frontier_T = adata_T.obs["adjacent_non_frontier"] == 'Adjacent Non-frontier'
ft_aj_T = adata_T[is_adjacent_frontier_T]
nft_aj_T = adata_T[is_adjacent_non_frontier_T]

In [None]:
# Column labels for each compartment, numbered from 1.
# The count is taken from the compartment's own table instead of a hard-coded
# 85, so the labels always match the number of columns actually loaded.
TandNK_GEM_names = ['TandNK GEM ' + str(c + 1) for c in range(TandNK_ST_inference_T.shape[1])]
Stroma_GEM_names = ['Stroma GEM ' + str(c + 1) for c in range(Stroma_ST_inference_T.shape[1])]
B_cell_GEM_names = ['B cell GEM ' + str(c + 1) for c in range(B_cell_ST_inference_T.shape[1])]
Epithelial_GEM_names = ['Epithelial GEM ' + str(c + 1) for c in range(Epithelial_ST_inference_T.shape[1])]
Myeloid_GEM_names = ['Myeloid GEM ' + str(c + 1) for c in range(Myeloid_ST_inference_T.shape[1])]

In [None]:
GEM_names = TandNK_GEM_names + Stroma_GEM_names + B_cell_GEM_names + Epithelial_GEM_names + Myeloid_GEM_names

In [None]:
# Per-group GEM tables with named columns (one row per spot in the subset).
ft_aj_T_df = pd.DataFrame(
    data=ft_aj_T.obsm['All_GEM_inference_table'],
    columns=GEM_names,
)
nft_aj_T_df = pd.DataFrame(
    data=nft_aj_T.obsm['All_GEM_inference_table'],
    columns=GEM_names,
)

In [None]:
df_merged_T = pd.concat([ft_aj_T_df, nft_aj_T_df], ignore_index = True)

In [None]:
# The frontier rows come first in df_merged_T; every row from position
# n_frontier_rows onward belongs to the non-frontier group.
n_frontier_rows = ft_aj_T_df.shape[0]
df_merged_T['cluster'] = 'Adjacent Frontier'
df_merged_T.loc[df_merged_T.index >= n_frontier_rows, 'cluster'] = 'Adjacent Non-frontier'

In [None]:
# One frame per compartment: that compartment's GEM columns plus the group label.
df_merged_TandNK = df_merged_T[[*TandNK_GEM_names, 'cluster']]
df_merged_Stroma = df_merged_T[[*Stroma_GEM_names, 'cluster']]
df_merged_B_cell = df_merged_T[[*B_cell_GEM_names, 'cluster']]
df_merged_Epithelial = df_merged_T[[*Epithelial_GEM_names, 'cluster']]
df_merged_Myeloid = df_merged_T[[*Myeloid_GEM_names, 'cluster']]

In [None]:
def compare_GEM_level(df, title_name):
    """Draw grouped box plots of GEM levels for frontier vs. non-frontier spots.

    Parameters
    ----------
    df : DataFrame with one column per GEM plus a 'cluster' column whose
        values are 'Adjacent Frontier' or 'Adjacent Non-frontier'. The frame
        is expected to have a default (unnamed, single-level) index.
    title_name : text shown as the figure title.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    plt.figure(figsize = (50, 15))

    # Reshape wide -> long: one row per (spot, GEM) with its group label.
    long_df = (
        df.set_index('cluster', append=True)  # keep the group label while stacking
          .stack()                            # GEM columns become rows
          .to_frame()
          .reset_index()
          .rename(columns={'level_2': 'GEM', 0: 'Expression'})
          .drop('level_0', axis='columns')    # original row index is not needed
    )
    sns.boxplot(data=long_df, x='GEM', y='Expression', hue='cluster',
                hue_order=['Adjacent Frontier', 'Adjacent Non-frontier'],
                palette={'Adjacent Frontier': "blue", 'Adjacent Non-frontier': "Aqua"})

    sns.despine(trim=True)
    plt.legend(loc='upper right', prop={'size': 30})
    # Use the caller's title; it was previously fixed to "TandNK GEMs" for every plot.
    plt.title(title_name, fontsize = 40)
    plt.xticks(rotation='vertical', fontsize = 30)

    plt.show() 

In [None]:
compare_GEM_level(df_merged_TandNK, "TandNK GEMs")

In [None]:
compare_GEM_level(df_merged_Stroma, "Stromal GEMs")

In [None]:
compare_GEM_level(df_merged_B_cell, "B cell GEMs")

In [None]:
compare_GEM_level(df_merged_Epithelial, "Epithelial GEMs")

In [None]:
compare_GEM_level(df_merged_Myeloid, "Myeloid GEMs")