Prepare the data before running nHDP. Two versions of the input are produced here: a binary gene-count matrix and a log-transformed UMI matrix.

In [None]:
import scanpy as sc
import numpy as np
import pandas as pd
import anndata as ad
import matplotlib.pyplot as plt
from scipy.io import mmread
# Default scanpy figure parameters: figsize (6, 6), frames switched off.
sc.set_figure_params(figsize=(6, 6), frameon=False)
# scanpy-wide n_jobs setting (presumably the number of parallel jobs -- confirm in scanpy docs).
sc.settings.n_jobs=8

In [None]:
# Sample directories, grouped by tissue region. Each path is later handed to
# sc.read_visium.

# Adjacent (N)
HCC_1N = './raw_data/Adjacent/HCC-1N'
HCC_2N = './raw_data/Adjacent/HCC-2N'
HCC_3N = './raw_data/Adjacent/HCC-3N'
HCC_4N = './raw_data/Adjacent/HCC-4N'

# Leading edge (L)
HCC_1L = './raw_data/Leading_Edge/HCC-1L'
HCC_2L = './raw_data/Leading_Edge/HCC-2L'
HCC_3L = './raw_data/Leading_Edge/HCC-3L'
HCC_4L = './raw_data/Leading_Edge/HCC-4L'

# Primary tumor (T)
HCC_1T = './raw_data/Primary_Tumor/HCC-1T'
HCC_2T = './raw_data/Primary_Tumor/HCC-2T'
HCC_3T = './raw_data/Primary_Tumor/HCC-3T'
HCC_4T = './raw_data/Primary_Tumor/HCC-4T'

# visium_paths and visium_names are index-aligned: the four N samples first,
# then the four L samples, then the four T samples.
visium_paths = [
    HCC_1N, HCC_2N, HCC_3N, HCC_4N,
    HCC_1L, HCC_2L, HCC_3L, HCC_4L,
    HCC_1T, HCC_2T, HCC_3T, HCC_4T,
]
visium_names = [
    f'HCC-{sample_no}{region_code}-{region_name}'
    for region_code, region_name in (
        ('N', 'Adjacent'),
        ('L', 'Leading_Edge'),
        ('T', 'Primary_Tumor'),
    )
    for sample_no in range(1, 5)
]

In [None]:
# Load every sample with sc.read_visium. Iterating visium_paths directly
# (instead of a hard-coded range(12)) keeps adata_l in step with the path list
# if samples are added or removed; adata_l[i] corresponds to visium_paths[i].
adata_l = [sc.read_visium(sample_path) for sample_path in visium_paths]

In [None]:
# Gene names of the first loaded sample, used as the reference list when the
# cell-type gene lists are checked.
# NOTE(review): presumably every sample shares this gene set -- confirm.
ST_genes = adata_l[0].var_names.tolist()

In [None]:
def _read_gene_list(path):
    """Read a header-less text file with pandas and return its values as a flat 1-D array."""
    return pd.read_csv(path, header = None).values.reshape(-1)


# One gene list per cell type.
Epithelial_genes = _read_gene_list('./nHDP/lastest_nHDP/gene_names/merged_Epithelial_gene.txt')
Myeloid_genes = _read_gene_list('./nHDP/lastest_nHDP/gene_names/merged_Myeloid_gene.txt')
PlasmaB_genes = _read_gene_list('./nHDP/lastest_nHDP/gene_names/merged_PlasmaB_gene.txt')
Stromal_genes = _read_gene_list('./nHDP/lastest_nHDP/gene_names/merged_Stromal_gene.txt')
TNK_genes = _read_gene_list('./nHDP/lastest_nHDP/gene_names/merged_TNK_gene.txt')

In [None]:
def find_common_genes_and_padding_genes(ST_genes, celltype_genes):
    """Split ``celltype_genes`` by whether each gene occurs in ``ST_genes``.

    Parameters
    ----------
    ST_genes : iterable of hashable
        Reference gene names.
    celltype_genes : iterable of hashable
        Gene names to classify.

    Returns
    -------
    common_genes : list
        Entries of ``celltype_genes`` that are present in ``ST_genes``.
    padding_genes : list
        Entries of ``celltype_genes`` that are absent from ``ST_genes``.

    Both lists keep the order (and any duplicates) of ``celltype_genes``.
    """
    # Build the lookup once: membership against a list is a linear scan per
    # gene, which is needlessly slow for a full gene list.
    reference = set(ST_genes)
    common_genes = []
    padding_genes = []
    for gene in celltype_genes:
        if gene in reference:
            common_genes.append(gene)
        else:
            padding_genes.append(gene)
    return common_genes, padding_genes

In [None]:
# For each cell type, split its gene list into genes found in ST_genes
# (*_common) and genes missing from it (*_padding).
Epithelial_common, Epithelial_padding = find_common_genes_and_padding_genes(
    ST_genes=ST_genes, celltype_genes=Epithelial_genes)
Myeloid_common, Myeloid_padding = find_common_genes_and_padding_genes(
    ST_genes=ST_genes, celltype_genes=Myeloid_genes)
PlasmaB_common, PlasmaB_padding = find_common_genes_and_padding_genes(
    ST_genes=ST_genes, celltype_genes=PlasmaB_genes)
Stromal_common, Stromal_padding = find_common_genes_and_padding_genes(
    ST_genes=ST_genes, celltype_genes=Stromal_genes)
TNK_common, TNK_padding = find_common_genes_and_padding_genes(
    ST_genes=ST_genes, celltype_genes=TNK_genes)

In [None]:
# Only the *_common lists are used from here on; the padding lists are dropped
# because all genes are expected to exist in ST_genes.
# The order must match celltype_names.
df_genes = [
    Epithelial_common,
    Myeloid_common,
    PlasmaB_common,
    Stromal_common,
    TNK_common,
]

In [None]:
# Cell-type labels used in the output file names; index-aligned with df_genes.
celltype_names = [
    'Epithelial',
    'Myeloid',
    'PlasmaB',
    'Stromal',
    'TNK',
]

# UMI-Log Norm Inference

In [None]:
def log_normalize(arr, scale_factor=10000):
    """Return ``ceil(log(1 + scale_factor * arr / row_total))`` for a 2-D array.

    Parameters
    ----------
    arr : array-like, 2-D
        Values to normalize. Each row is scaled by its own sum, taken over the
        columns of ``arr`` only (i.e. over whatever columns the caller passes).
    scale_factor : number, optional
        Target row total before the log transform. Defaults to 10000.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``arr`` holding the rounded-up log values.

    Notes
    -----
    A row whose total is zero is divided by 1 instead of 0, so an all-zero row
    yields zeros rather than NaN.
    """
    arr = np.asarray(arr)
    total_UMI = arr.sum(axis=1, keepdims=True)
    # Guard against 0/0 for rows with no counts in the selected columns.
    safe_total = np.where(total_UMI == 0, 1, total_UMI)
    arr_transformed = np.log1p(scale_factor * arr / safe_total)
    return np.ceil(arr_transformed)

In [None]:
# Write one CSV per (sample, cell type): rows are adata.obs_names, columns are
# that cell type's common genes, values come from log_normalize.
for i, adata in enumerate(adata_l):
    print('Progress: ', i + 1)
    # Modifies adata in place, so once per sample is enough (it used to be
    # repeated for every cell type).
    adata.var_names_make_unique()
    sample_name = visium_names[i]
    for celltype, common in zip(celltype_names, df_genes):
        adata_union = adata[:, common]
        # .toarray() assumes X is a scipy sparse matrix -- TODO confirm.
        adata_arr = log_normalize(adata_union.X.toarray())
        adata_df = pd.DataFrame(adata_arr, index=adata_union.obs_names, columns=common)
        adata_df.to_csv(f'./nHDP/lastest_nHDP/ST_counts/{sample_name}_{celltype}_final.csv')

# Binary Inference

In [None]:
def binarize_arr(arr):
    """Set every strictly positive entry of ``arr`` to 1, in place.

    Entries that are not greater than zero are left untouched. The input
    object itself is modified and then returned; no copy is made.
    """
    positive = arr > 0
    arr[positive] = 1
    return arr

In [None]:
# Write one CSV per (sample, cell type): rows are adata.obs_names, columns are
# that cell type's common genes, values are binarized by binarize_arr.
for i, adata in enumerate(adata_l):
    print('Progress: ', i + 1)
    # Modifies adata in place, so once per sample is enough (it used to be
    # repeated for every cell type).
    adata.var_names_make_unique()
    sample_name = visium_names[i]
    for celltype, common in zip(celltype_names, df_genes):
        adata_union = adata[:, common]
        # .toarray() assumes X is a scipy sparse matrix -- TODO confirm.
        adata_arr = binarize_arr(adata_union.X.toarray())
        adata_df = pd.DataFrame(adata_arr, index=adata_union.obs_names, columns=common)
        adata_df.to_csv(f'./nHDP/lastest_nHDP/ST_counts/binary_{sample_name}_{celltype}_final.csv')