In [1]:
import pandas as pd
import numpy as np
from glob import glob

from scipy.io import loadmat
from ml_model_utils import *

In [2]:
# Filesystem locations for this notebook.  The experiment cells visible below
# read `clinical_dir`, `data_loc` and `results_loc`; `base_dir`, `save_dir`
# and `geno_dir` are defined here but not referenced in those cells.
_shared_root = '/mnt/data0-nfs/shared-datasets/cancer_multimodal/'
_tcga_root = _shared_root + 'TCGA/'

base_dir = _shared_root + 'tcga-brca/'
# Directory holding the precomputed .mat files loaded by the CCA fusion cells.
results_loc = '/mnt/data0-nfs/vs5/repos/structured_cca/isbi_cleaned_up/7_results/'

clinical_dir = _tcga_root + 'clinical/'
save_dir = _tcga_root + 'clinical_parsed/'
geno_dir = _tcga_root + 'genomics/'
# Passed to get_patients_info together with the gene count and fold index.
data_loc = base_dir + 'splits/'

In [3]:
# Shared accumulator: each experiment cell appends one row per run, and the
# rows are later turned into a DataFrame.  Re-run this cell before re-running
# any experiment cell, otherwise rows are appended twice.
results = list()

# Forwarded as the `d` keyword to RF_trainer / RF_LF_trainer (early fusion
# passes 2*d).  NOTE(review): its meaning is defined in ml_model_utils -- confirm.
d = 50

Run classifications using single modality and early/late fusions

In [4]:
# Baselines: for every (gene-set size, fold) pair run the classifier on each
# single modality, on early fusion and on late fusion.  Every run appends one
# row [method, num_genes, error[0], ..., error[4]] to the shared `results` list.
for num_genes in (500, 800, 1000, 3000):
    for fold_num in range(5):
        genomics, imaging, binary = get_patients_info(clinical_dir, data_loc, num_genes, fold_num)

        # Single-modality runs: genomics first, imaging second.
        for method_name, modality in (("Genomics", genomics), ("Imaging", imaging)):
            error, pred, score = RF_trainer(modality["train"], modality["test"],
                                            binary["train"], binary["test"], d=d)
            results.append([method_name, num_genes] + [error[k] for k in range(5)])

        # Early fusion: concatenate both modalities column-wise and train one
        # model on the joint matrix, with d doubled.
        X_train, X_test = (np.concatenate((genomics[split], imaging[split]), axis=1)
                           for split in ("train", "test"))
        error, pred, score = RF_trainer(X_train, X_test, binary["train"], binary["test"], d=2 * d)
        results.append(["EF", num_genes] + [error[k] for k in range(5)])

        # Late fusion: both modalities are handed separately to RF_LF_trainer.
        error, pred, score = RF_LF_trainer(genomics["train"], genomics["test"],
                                           imaging["train"], imaging["test"],
                                           binary["train"], binary["test"], d=d)
        results.append(["LF", num_genes] + [error[k] for k in range(5)])

Run classifications using SCCA-based fusion

In [5]:
# SCCA-based fusion: project each modality with the U / V matrices stored in
# the per-fold .mat file, keep the first 100 columns of each projection,
# concatenate them and classify.  Appends one "SCCAF" row per (num_genes, fold).
for num_genes in (500, 800, 1000, 3000):
    for fold_num in range(5):
        genomics, imaging, binary = get_patients_info(clinical_dir, data_loc, num_genes, fold_num)

        mat_path = '{}brca_k_scca/{}/{}.mat'.format(results_loc, num_genes, fold_num)
        values = loadmat(mat_path)
        U = np.squeeze(values['U'])
        V = np.squeeze(values['V'])

        # The same projection + truncation + concatenation is applied to both splits.
        fused = {}
        for split in ("train", "test"):
            geno_proj = np.dot(genomics[split], U).astype(np.float64)
            img_proj = np.dot(imaging[split], V).astype(np.float64)
            fused[split] = np.concatenate((geno_proj[:, :100], img_proj[:, :100]), axis=1)
        E_train, E_test = fused["train"], fused["test"]

        error, pred, score = RF_trainer(E_train, E_test, binary["train"], binary["test"], d=d)
        results.append(["SCCAF", num_genes] + [error[k] for k in range(5)])

Run classifications using GCCA-based fusion

In [6]:
# GCCA-based fusion, with and without prior.  The two variants differ only in
# the directory the U / V matrices are loaded from and in the label written to
# `results`, so they share one code path instead of two copy-pasted ones.
# Per (num_genes, fold) the "GNCCAF" row is appended before the "Prior-CCAF"
# row, as before.

# Number of leading columns kept from each modality's projection before fusion.
num_proj_cols = 100

# (label stored in `results`, directory suffix appended to str(num_genes))
gcca_variants = (("GNCCAF", "_no_prior/"), ("Prior-CCAF", "_prior/"))

for num_genes in [500, 800, 1000, 3000]:

    for fold_num in range(5):

        genomics, imaging, binary = get_patients_info(clinical_dir, data_loc, num_genes, fold_num)

        for method_name, dir_suffix in gcca_variants:
            values = loadmat(results_loc + 'brca_k_gcca/' + str(num_genes) + dir_suffix + str(fold_num) + '.mat')
            U = np.squeeze(values['U'])
            V = np.squeeze(values['V'])

            # Project each modality with its own matrix (U for genomics, V for imaging).
            Xproject_train = np.dot(genomics["train"], U).astype(np.float64)
            Yproject_train = np.dot(imaging["train"], V).astype(np.float64)

            Xproject_test = np.dot(genomics["test"], U).astype(np.float64)
            Yproject_test = np.dot(imaging["test"], V).astype(np.float64)

            # Fuse by concatenating the truncated projections column-wise.
            E_train = np.concatenate((Xproject_train[:, :num_proj_cols],
                                      Yproject_train[:, :num_proj_cols]), axis=1)
            E_test = np.concatenate((Xproject_test[:, :num_proj_cols],
                                     Yproject_test[:, :num_proj_cols]), axis=1)

            error, pred, score = RF_trainer(E_train, E_test, binary["train"], binary["test"], d=d)
            results.append([method_name, num_genes, error[0], error[1], error[2], error[3], error[4]])

In [7]:
# Column names are matched by position to the rows appended to `results`:
# [method label, num_genes, error[0], ..., error[4]].
# NOTE(review): the metric names assume the trainers return their scores in
# the order Acc, Prec, Rec, F1, AUC -- confirm against ml_model_utils.
columns_all = [
    'Method',
    'num_genes',
    'Acc',
    'Prec',
    'Rec',
    'F1',
    'AUC',
]
results_df = pd.DataFrame(data=results, columns=columns_all)

In [16]:
# Print the results as LaTeX table rows: one row per method, one column per
# gene-set size, each cell formatted as "mean $\pm$ std" (in percent) of the
# chosen metric over all rows of `results_df` for that method / gene count.
methods_all = ['Genomics', 'Imaging', 'EF', 'LF', 'SCCAF', 'GNCCAF', 'Prior-CCAF']
# Internal method label -> name shown in the table.  The trailing tab on
# 'Imaging' only pads the printed column.
mapping_methods = {'Genomics': 'Genomics', 
                   'Imaging': 'Imaging\t',
                   'EF': 'Early Fusion', 
                   'LF': 'Late Fusion', 
                   'SCCAF': '100-SCCA', 
                   'GNCCAF': '100-GCCA',
                   'Prior-CCAF': '100-GCCA-Prior'}
 
num_genes_all = [500, 800, 1000, 3000]
folds = [0,1,2,3,4]

print("Method \t\t\t& " + "\t\t\t& ".join([str(x) for x in num_genes_all]))

for curr_method in methods_all:
    curr_row = [mapping_methods[curr_method]]
    for curr_gene in num_genes_all:
        for metric in ['F1']:
            temp_df = results_df[(results_df['Method'] == curr_method)
                                 & (results_df['num_genes'] == curr_gene)]
            # Reduce only the metric column: calling mean()/std() on the whole
            # frame also hits the string 'Method' column, which raises
            # TypeError on pandas >= 2.0.
            temp_mean = temp_df[metric].mean()
            temp_std = temp_df[metric].std()
            # Raw string: '\p' is not a valid escape sequence in a normal literal.
            curr_row.append('{0:.2f}'.format(temp_mean*100) + r' $\pm$ ' + '{0:.2f}'.format(temp_std*100))
    print('\t & '.join(curr_row) + ' \\\\' )
       

Method 			& 500			& 800			& 1000			& 3000
Genomics	 & 55.44 $\pm$ 1.90	 & 58.39 $\pm$ 2.56	 & 54.85 $\pm$ 2.80	 & 58.36 $\pm$ 2.29 \\
Imaging		 & 60.92 $\pm$ 1.17	 & 60.92 $\pm$ 1.17	 & 60.92 $\pm$ 1.17	 & 60.92 $\pm$ 1.17 \\
Early Fusion	 & 57.06 $\pm$ 5.55	 & 58.61 $\pm$ 3.53	 & 58.98 $\pm$ 1.01	 & 60.97 $\pm$ 1.75 \\
Late Fusion	 & 53.44 $\pm$ 2.19	 & 53.80 $\pm$ 3.20	 & 52.02 $\pm$ 3.73	 & 53.64 $\pm$ 4.04 \\
100-SCCA	 & 57.91 $\pm$ 3.40	 & 59.03 $\pm$ 2.23	 & 59.40 $\pm$ 3.41	 & 56.89 $\pm$ 0.84 \\
100-GCCA	 & 56.36 $\pm$ 3.16	 & 57.11 $\pm$ 3.02	 & 57.92 $\pm$ 0.97	 & 58.69 $\pm$ 2.16 \\
100-GCCA-Prior	 & 56.23 $\pm$ 2.23	 & 58.52 $\pm$ 4.75	 & 57.42 $\pm$ 1.84	 & 59.80 $\pm$ 2.89 \\
