In [None]:
import math
import numpy as np
import pandas as pd
import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Setting both pandas display limits to None removes the row/column truncation.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, None)

In [None]:
# Load Vicky's merged data set; the path is relative to the working directory.
data_file = "merged.csv"
raw_data = pd.read_csv(filepath_or_buffer = data_file)

In [None]:
# Gene-set provenance:
# - The 322-gene list is not available (possibly it comes from "Consensus genes of the
#   literature to predict breast cancer recurrence"?).
# - As a stand-in, the 41-gene list was downloaded from the CBCG website
#   (https://cbcg.dk/causal.html), just to see results.
# - Following the paper, the 41 genes were fed to Genecodis, which grouped them into
#   191 gene sets (each with more than 1 gene).

cbcg41_pd = pd.read_csv('enrich-input1-GO_BP.tsv',sep = '\t')
cbcg41_list_raw = cbcg41_pd['genes'].tolist()
cbcg41_sets = []

for raw_entry in cbcg41_list_raw:
    # Split on the comma and strip whitespace so both "A, B" and "A,B" parse the same way.
    genes = [g.strip() for g in raw_entry.split(',') if g.strip()]
    # Keep only sets with more than one gene: a single value has no usable
    # row-wise standard deviation for the t-statistic computed later.
    if len(genes) > 1:
        cbcg41_sets.append(genes)
print(len(cbcg41_sets))

gene_sets = cbcg41_sets
# De-duplicate in first-seen order. list(set(...)) also de-duplicates, but its order
# changes between kernel sessions, which reshuffles the columns of `data` on every run.
gene_list = list(dict.fromkeys(g for genes in gene_sets for g in genes))
other_cols = ['PATIENT_ID','high_risk']
# NOTE(review): this raises KeyError if any gene symbol is missing from merged.csv.
data = raw_data[gene_list + other_cols].copy()
data.head()

In [None]:
# calculate t-statistic for each gene set
# For every row of `data` and every gene set: t = sqrt(n) * mean / std over the set's genes,
# where n is the number of non-missing values in that row for that set.
module_list = ['Module_' + str(i + 1) for i in range(len(gene_sets))]
module_scores = {}

for module_name, genes in zip(module_list, gene_sets):
    tmp = data[genes]
    # mean() and std() skip NaNs, so sqrt(n) has to use the same per-row count;
    # using len(genes) would overstate n for rows with missing values.
    n_obs = tmp.count(axis = 1)
    # A row whose values are all identical still has std == 0 and yields inf/NaN here.
    module_scores[module_name] = np.sqrt(n_obs) * tmp.mean(axis = 1, skipna = True) / tmp.std(axis = 1, skipna = True)

# Attach all module columns with one concat instead of one insert per module.
# Dropping any existing module columns first keeps the cell safe to re-run.
data = pd.concat(
    [data.drop(columns = module_list, errors = 'ignore'),
     pd.DataFrame(module_scores, index = data.index)],
    axis = 1,
)

In [None]:
#rank modules
# Rank the module scores within each row (1 = smallest score). method = 'first' breaks
# ties by column order, so no two modules in a row share a rank; NaN scores stay NaN.
# One vectorised call replaces the per-row transpose/merge/concat loop, which was
# quadratic in the number of rows and assumed that index label i equals row position i.
module_ranks = data[module_list].rank(axis = 1, method = 'first')

# Same layout as before: module rank columns first, then PATIENT_ID / high_risk,
# on a fresh 0..n-1 index.
rank_data = pd.concat([module_ranks, data[other_cols]], axis = 1).reset_index(drop = True)

rank_list = rank_data.values.tolist()

In [None]:
# K-fold cross-validation
# NOTE(review): only one train/validation split is built here (the last n // k_fold
# shuffled rows are held out); the remaining folds are never evaluated.


# splitting rank data into folds
k_fold = 10
split_seed = 0  # fixed seed so Restart & Run All reproduces the same split

n = rank_data.shape[0]
n_valid = n // k_fold  # whole number of held-out rows
n_train = n - n_valid
# Give every row its own random sort key. np.random.randint(n) returns a single scalar,
# which pandas broadcast to all rows, so the subsequent sort never shuffled anything.
rank_data['random'] = np.random.default_rng(split_seed).permutation(n)
rank_data = rank_data.sort_values('random').reset_index(drop = True)
rank_data['training'] = 1
rank_data.loc[rank_data.index >= n_train, 'training'] = 0

print(rank_data['training'].value_counts())

# The helper columns are dropped so that downstream rows contain only module ranks,
# PATIENT_ID and high_risk.
train_data_h = rank_data[(rank_data['training'] == 1) & (rank_data['high_risk'] == True)].drop(columns = ['random','training'])
train_data_l = rank_data[(rank_data['training'] == 1) & (rank_data['high_risk'] == False)].drop(columns = ['random','training'])
validation_data = rank_data[rank_data['training'] == 0].drop(columns = ['random','training'])

In [None]:
# training the data
def hmm_rank(train_data, n_state, n_emission = None):
    """Estimate rank-emission probabilities from one class of training rows.

    Entry ``[i][j]`` of the result is the fraction of rows in ``train_data`` whose
    column ``'Module_<j+1>'`` equals ``i + 1``, i.e. how often module ``j + 1`` holds
    rank ``i + 1``.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Must contain the columns ``'Module_1'`` ... ``'Module_<n_emission>'``.
    n_state : int
        Number of rank positions to tabulate (ranks ``1`` .. ``n_state``).
    n_emission : int, optional
        Number of module columns. Defaults to ``len(gene_sets)``, read from the
        notebook-level ``gene_sets`` at call time.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_state, n_emission)``.

    Raises
    ------
    ValueError
        If ``train_data`` has no rows, since the frequencies would be undefined.
    """
    if n_emission is None:
        n_emission = len(gene_sets)

    n = train_data.shape[0]
    if n == 0:
        raise ValueError("hmm_rank needs at least one training row to estimate emission probabilities")

    module_cols = ['Module_' + str(j + 1) for j in range(n_emission)]
    ranks = train_data[module_cols]

    emiss_probs = np.zeros((n_state, n_emission))
    for i in range(n_state):
        # Column-wise count of rows holding rank i + 1, divided by the number of rows.
        emiss_probs[i] = ranks.eq(i + 1).sum(axis = 0).to_numpy() / n

    return emiss_probs
    
# Fit two 10-state emission tables: eh from the high-risk training rows, el from the low-risk ones.
eh, el = (hmm_rank(class_rows, 10) for class_rows in (train_data_h, train_data_l))

In [None]:
# prediction
def predict(new_list, emission_h, emission_l):
    """Classify one row by comparing its likelihood under two emission tables.

    A forward recursion is run over a chain that starts in state 0 and can only move
    from state ``i`` to state ``i + 1``. There are as many time steps as states. For
    state ``k`` the emitted symbol is the position of the value ``k + 1`` in
    ``new_list``, found with ``list.index``.

    Parameters
    ----------
    new_list : list
        Row values; must contain each of the values ``1 .. len(emission_h)``.
    emission_h, emission_l : 2-D indexable (state x symbol)
        Emission probabilities for the high-risk and low-risk models.

    Returns
    -------
    tuple
        ``(True, loglik_h)`` when the high-risk log-likelihood is strictly larger,
        otherwise ``(False, loglik_l)``.
    """
    state_count = len(emission_h)
    step_count = state_count

    # All initial mass sits on state 0.
    initial = np.zeros(state_count)
    initial[0] = 1
    # transition[src][dst] is 1 only for dst == src + 1.
    transition = [
        [1 if dst == src + 1 else 0 for dst in range(state_count)]
        for src in range(state_count)
    ]

    forward_h = np.zeros((state_count, step_count))
    forward_l = np.zeros((state_count, step_count))

    for step in range(step_count):
        for state in range(state_count):
            if step > 0:
                inflow_h = sum(forward_h[src][step - 1] * transition[src][state] for src in range(state_count))
                inflow_l = sum(forward_l[src][step - 1] * transition[src][state] for src in range(state_count))
            else:
                inflow_h = initial[state]
                inflow_l = initial[state]

            # Position in the row of the entry equal to state + 1.
            symbol = new_list.index(state + 1)
            forward_h[state][step] = inflow_h * emission_h[state][symbol]
            forward_l[state][step] = inflow_l * emission_l[state][symbol]

    last_step = step_count - 1
    loglik_h = np.log(sum(forward_h[state][last_step] for state in range(state_count)))
    loglik_l = np.log(sum(forward_l[state][last_step] for state in range(state_count)))

    # Report the winning class together with that class's own log-likelihood.
    prefers_high = loglik_h > loglik_l
    return (True, loglik_h) if prefers_high else (False, loglik_l)

validation_list = validation_data.values.tolist()

# Score each validation row and append (prediction, log-likelihood) to that row in place.
for new_list in validation_list:
    high_risk, p = predict(new_list, eh, el)
    new_list += [high_risk, p]

pred = pd.DataFrame(validation_list, columns = [*validation_data.columns, 'prediction', 'loglik'])


# Confusion matrix: predicted class (rows) against the recorded high_risk label (columns).
pd.crosstab(index = pred['prediction'], columns = pred['high_risk'])

In [None]:
# MCC & AUC model evaluation
def _confusion_counts(frame):
    """Return (TP, TN, FP, FN) for `frame`, using 'high_risk' as truth and 'prediction' as the call."""
    actual_pos = frame['high_risk'] == True
    actual_neg = frame['high_risk'] == False
    called_pos = frame['prediction'] == True
    called_neg = frame['prediction'] == False
    return (
        int((actual_pos & called_pos).sum()),  # TP: high risk, predicted high risk
        int((actual_neg & called_neg).sum()),  # TN: low risk, predicted low risk
        int((actual_neg & called_pos).sum()),  # FP: low risk, predicted high risk
        int((actual_pos & called_neg).sum()),  # FN: high risk, predicted low risk
    )

# FP and FN were previously defined the wrong way round. MCC is symmetric in the two,
# but the TPR / FPR values below are not.
TP, TN, FP, FN = _confusion_counts(pred)
mcc_denominator = math.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))
# By convention MCC is 0 when any margin of the confusion matrix is empty.
MCC = (TP * TN - FP * FN) / mcc_denominator if mcc_denominator > 0 else 0.0

# NOTE(review): the curve below uses per-bin rates over 'loglik'-sorted bins, not cumulative
# rates over a score threshold, and 'loglik' is the log-likelihood of whichever class won
# in predict(). Confirm this is the intended AUC estimate rather than a conventional ROC AUC.
n_auc_curve = 10
pred = pred.sort_values('loglik')
n_pred = pred.shape[0]
size = int(n_pred / n_auc_curve)
start_index = 0
tpr = []
fpr = []

for i in range(n_auc_curve):
    # Half-open bin [start_index, end_index). The last bin runs to the end so no rows are
    # dropped, and the first bin no longer receives one extra row.
    end_index = n_pred if i == n_auc_curve - 1 else min((i + 1) * size, n_pred)
    partition = pred.iloc[start_index:end_index]
    bin_tp, bin_tn, bin_fp, bin_fn = _confusion_counts(partition)
    TPR = bin_tp / (bin_tp + bin_fn) if (bin_tp + bin_fn) > 0 else 0  # Sensitivity
    FPR = bin_fp / (bin_fp + bin_tn) if (bin_fp + bin_tn) > 0 else 0  # 1 - Specificity
    tpr.append(TPR)
    fpr.append(FPR)
    start_index = end_index

# Sort by FPR for correct integration order
fpr, tpr = zip( * sorted(zip(fpr, tpr)))

# calculate AUC using trapezoidal rule
# np.trapz is deprecated in NumPy 2.0 in favour of np.trapezoid; use whichever exists.
trapezoid = np.trapezoid if hasattr(np, 'trapezoid') else np.trapz
AUC = trapezoid(tpr, fpr)
print(AUC)
print(MCC)