In [1]:
import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt

import scanpy as sc
import seaborn as sns

In [2]:
# Single-cell dataset: load the .h5ad file and turn its `.raw` slot into a
# standalone AnnData object.
GSE120575_adata = sc.read_h5ad("data/Pred/sc/GSE120575.h5ad").raw.to_adata()

# Bulk cohorts: for every study, the TPM table is followed by its annotation
# table. The first CSV column is used as the row index in all of them.
Riaz_TPM = pd.read_csv('data/Pred/bulk/Riaz_TPM.csv', index_col=0)
Riaz_anno = pd.read_csv('data/Pred/bulk/Riaz_anno.csv', index_col=0)

Van_TPM = pd.read_csv('data/Pred/bulk/Van_TPM.csv', index_col=0)
Van_anno = pd.read_csv('data/Pred/bulk/Van_anno.csv', index_col=0)

Hugo_TPM = pd.read_csv('data/Pred/bulk/Hugo_TPM.csv', index_col=0)
Hugo_anno = pd.read_csv('data/Pred/bulk/Hugo_anno.csv', index_col=0)

Lee_TPM = pd.read_csv('data/Pred/bulk/Lee_TPM.csv', index_col=0)
Lee_anno = pd.read_csv('data/Pred/bulk/Lee_anno.csv', index_col=0)

Gide_TPM = pd.read_csv('data/Pred/bulk/Gide_TPM.csv', index_col=0)
Gide_anno = pd.read_csv('data/Pred/bulk/Gide_anno.csv', index_col=0)

MGH_TPM = pd.read_csv('data/Pred/bulk/MGH_TPM.csv', index_col=0)
MGH_anno = pd.read_csv('data/Pred/bulk/MGH_anno.csv', index_col=0)

# Gene intersection across the bulk cohorts, accumulated one cohort at a time
# in the same order as before (author's note: 16128 genes).
bulk_intersection_genes = Riaz_TPM.columns
for cohort_tpm in (Van_TPM, Gide_TPM, MGH_TPM, Hugo_TPM, Lee_TPM):
    bulk_intersection_genes = bulk_intersection_genes.intersection(cohort_tpm.columns)

# Narrow down to the genes that are also present in the single-cell object
# (author's note: 10617 genes).
sc_intersection_genes = GSE120575_adata.var.index.intersection(bulk_intersection_genes)

# Keep only the cells whose `prepost` value is "Pre", restricted to the shared genes.
is_pre = GSE120575_adata.obs.prepost == "Pre"
GSE120575_adata_pre = GSE120575_adata[is_pre, sc_intersection_genes]

# Response labels.
# Single-cell: the string `response` column is encoded as R -> 0, NR -> 1.
# NOTE(review): the bulk labels below are read from a column named "R";
# confirm both encodings use the same polarity before mixing them.
response_codes = {"R": 0, "NR": 1}
y_GSE120575 = GSE120575_adata_pre.obs["response"].map(response_codes)

# Bulk: each cohort's label vector is the "R" column of its annotation table.
y_Riaz, y_Lee, y_Gide = Riaz_anno["R"], Lee_anno["R"], Gide_anno["R"]
y_MGH, y_Hugo, y_VanAllen = MGH_anno["R"], Hugo_anno["R"], Van_anno["R"]

# Gene symbols from the NCBI melanoma gene list (author's note: 1425).
relate_genes = pd.read_table("./data/Pred/NCBI_Melanoma_gene.txt")["Symbol"]

# Keep only the listed genes that exist in the pre-filtered single-cell object.
melanoma_genes_present = GSE120575_adata_pre.var.index.intersection(relate_genes)
GSE120575_adata_pre = GSE120575_adata_pre[:, melanoma_genes_present]

# Last expression of the cell: display the resulting AnnData summary.
GSE120575_adata_pre

View of AnnData object with n_obs × n_vars = 5928 × 1425
    obs: 'response', 'patient', 'cluster_all', 'isCD8', 'cluster_cd8', 'isDNDP', 'prepost', 'pNum', 'gender', 'age', 'therapy', 'survival_days', 'status', 'biopsyNum', 'plate', 'MNum', 'rMito', 'cluster', 'cluster_n', 'prepost_n', 'response_n', 'cluster_gn'
    uns: 'hvg', 'neighbors', 'pca', 'tsne', 'umap'
    obsm: 'X_pca', 'X_tsne', 'X_umap'
    obsp: 'connectivities', 'distances'

In [15]:
import os

from fisher import pvalue_npy
from utils import get_genepairs

# Create the output folder before the expensive loop starts, so a fresh
# checkout does not fail on `to_csv` only after the pair search has finished.
os.makedirs("result/Pred", exist_ok=True)

# For each setting `a`, build the gene-pair table from the pre-filtered
# single-cell data, attach a two-sided Fisher p-value to every pair, and save it.
# The literal 200000 and `a` are passed straight to `get_genepairs`; their
# meaning is defined in utils — TODO confirm and promote to named constants.
for a in [0, 1, 2]:
    pairs = get_genepairs(
        GSE120575_adata_pre.to_df(),
        GSE120575_adata_pre.obs["response"],
        200000,
        a,
    )

    # Unique column-name prefixes (text before the first "_") of every column
    # after the first two; presumably the two response groups — verify against
    # the frame returned by get_genepairs.
    c = pairs.columns.map(lambda x: x.split("_")[0])[2:].unique()

    # The four count columns `<prefix>_1` / `<prefix>_-1` for the first two
    # prefixes, cast to unsigned integers before being handed to `pvalue_npy`.
    first_pos = pairs[f'{c[0]}_1'].values.astype(np.uint)
    first_neg = pairs[f'{c[0]}_-1'].values.astype(np.uint)
    second_pos = pairs[f'{c[1]}_1'].values.astype(np.uint)
    second_neg = pairs[f'{c[1]}_-1'].values.astype(np.uint)

    # Only the third returned array (two-sided p-values) is kept.
    _, _, twosided = pvalue_npy(first_pos, first_neg, second_pos, second_neg)
    pairs["p"] = twosided

    pairs.to_csv(f"result/Pred/pair_a{a}.csv")

100%|██████████| 1014600/1014600 [02:18<00:00, 7324.37it/s]
100%|██████████| 1014600/1014600 [02:08<00:00, 7913.47it/s]
100%|██████████| 1014600/1014600 [02:07<00:00, 7932.36it/s]


In [3]:
from utils import dftopair, test_by_ml

# For each pair-table setting `a1`: take the 100 pairs with the smallest
# p-values, turn every bulk cohort into pair features, evaluate the classifiers
# on the first j = 10, 20, ..., 100 feature columns, and save all results.
for a1 in [0, 1, 2]:
    pairs = pd.read_csv(f"result/Pred/pair_a{a1}.csv", index_col=0)

    # 100 pairs with the smallest Fisher p-values, in ascending order of p.
    sc_pairs_p = pairs.sort_values(by="p")[:100]

    # Not used in this cell; kept in case later cells read it — TODO confirm
    # and remove if nothing does.
    a2 = 0
    pairs0 = sc_pairs_p["pairs0"]
    pairs1 = sc_pairs_p["pairs1"]

    #ours
    # Pair features for every bulk cohort. The result is sliced as [:, :j]
    # below, so it is a 2-D array; presumably one column per pair in the
    # order of `sc_pairs_p` — verify against utils.dftopair.
    x_Riaz = dftopair(Riaz_TPM, pairs0, pairs1)
    x_Lee = dftopair(Lee_TPM, pairs0, pairs1)
    x_Gide = dftopair(Gide_TPM, pairs0, pairs1)
    x_MGH = dftopair(MGH_TPM, pairs0, pairs1)
    x_Hugo = dftopair(Hugo_TPM, pairs0, pairs1)
    x_VanAllen = dftopair(Van_TPM, pairs0, pairs1)

    # Collect the per-j result frames and concatenate once after the loop,
    # instead of re-copying a growing DataFrame on every iteration.
    results = []
    for j in range(10, 110, 10):
        # Riaz + VanAllen are stacked into one (X, y) pair (presumably the
        # training set); the list holds the evaluation sets: each cohort on
        # its own plus the pooled Lee/Gide/MGH/Hugo cohorts.
        # The leading arguments 3 and 0 are defined by utils.test_by_ml — TODO confirm.
        clfs, result = test_by_ml(
            3, 0, [ "RF","KNN","LR","SVC","MLP","XGB","NB"],
            np.vstack((x_Riaz[:, :j], x_VanAllen[:, :j])),
            np.hstack((y_Riaz, y_VanAllen)),
            [
                (x_Riaz[:, :j], y_Riaz),
                (x_VanAllen[:, :j], y_VanAllen),
                (x_Hugo[:, :j], y_Hugo),
                (x_Lee[:, :j], y_Lee),
                (x_Gide[:, :j], y_Gide),
                (x_MGH[:, :j], y_MGH),
                (np.vstack((x_Lee[:, :j], x_Gide[:, :j], x_MGH[:, :j], x_Hugo[:, :j])),
                 np.hstack((y_Lee, y_Gide, y_MGH, y_Hugo))),
            ],
        )
        result["sig"] = f"Ours_{j}"
        result.insert(0, 'a1', a1)
        result.insert(1, 'pairs_num', j)
        results.append(result)

    result_df = pd.concat(results)
    result_df.to_csv(f"result/Pred/result_a{a1}.csv")