In [1]:
import csv
import os
import shap
import numpy as np
import pandas as pd
import pickle as pkl
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, StratifiedKFold, cross_val_predict, train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score, roc_curve, auc, precision_recall_fscore_support
from tqdm import tqdm

In [2]:
#UTF-8 encoding issue

def pkl_dump(data, file, output_path):
    """Pickle ``data`` into the file ``file`` inside the directory ``output_path``.

    Args:
        data: Any picklable object.
        file: Name of the file to create; an existing file is overwritten.
        output_path: Directory to write into, with or without a trailing
            path separator.
    """
    # os.path.join adds the separator only when it is missing, so a directory
    # passed without a trailing "/" no longer produces a mangled file name.
    with open(os.path.join(output_path, file), "wb") as fw:
        pkl.dump(data, fw, pkl.HIGHEST_PROTOCOL)

def pkl_load(file, input_path):
    """Load and return the pickled object stored in ``file`` under ``input_path``.

    Args:
        file: Name of the pickle file to read.
        input_path: Directory containing the file, with or without a trailing
            path separator.

    Returns:
        The unpickled object.
    """
    # SECURITY: unpickling can execute arbitrary code; only load files that
    # come from a trusted source.
    # os.path.join adds the separator only when it is missing.
    with open(os.path.join(input_path, file), "rb") as fr:
        return pkl.load(fr)

def load_data(features_path,features_filename,data_filename):
    """Read the feature pickle and the patient-data pickle from ``features_path``.

    The feature pickle must unpack into exactly two items; only the first one
    (``fea2id``) is returned, the second is discarded.

    Returns:
        tuple: ``(fea2id, data)`` where ``data`` is the unpickled content of
        ``data_filename``.
    """
    feature_bundle = pkl_load(f"{features_filename}", features_path)
    fea2id, _ = feature_bundle

    patient_records = pkl_load(f"{data_filename}", features_path)
    return fea2id, patient_records

def convert_to_dataframe(fea2id,patient_data):
    """Expand per-patient feature-id lists into a 0/1 indicator DataFrame.

    Args:
        fea2id: Mapping whose keys become the feature column names. Feature ids
            are taken to be ``1..len(fea2id)``, and the k-th key is assumed to
            correspond to id k (TODO confirm against the encoder that builds
            this mapping).
        patient_data: Sequence of records laid out as
            ``[pid, outcome, feature_id, feature_id, ...]``.

    Returns:
        pandas.DataFrame: One row per record with columns ``pid``, ``outcome``
        and one 0/1 column per key of ``fea2id``.
    """
    feature_ids = range(1, len(fea2id) + 1)
    rows = []

    for record in patient_data:
        # Feature ids start at index 2. Index 1 is the outcome and must not be
        # searched: an outcome of 1.0 compares equal to feature id 1 and would
        # switch that feature on for every positive patient (label leakage).
        present = set(record[2:])
        indicators = [1 if fid in present else 0 for fid in feature_ids]
        rows.append([record[0], record[1]] + indicators)

    return pd.DataFrame(rows, columns=['pid', 'outcome'] + list(fea2id.keys()))

In [3]:
def run_experiment(clf, params, task, nb, nit, model_type, train_test_path, output_path, random_state=None):
    """Tune ``clf`` with a randomized search, evaluate it on the test split and save the artifacts.

    Reads ``expr_data_CC{task}yr_train_test.pkl`` from ``train_test_path`` (a
    pickle that unpacks into train X, train y, test X, test y), runs a 5-fold
    stratified ``RandomizedSearchCV`` scored by ROC AUC, and writes the best
    estimator and its test-set ``predict_proba`` output to ``output_path``.

    Args:
        clf: Estimator to tune; must provide ``predict_proba`` and ``classes_``.
        params: Parameter distributions passed to ``RandomizedSearchCV``.
        task: Value interpolated into the input and output file names.
        nb: Number of parallel jobs (``n_jobs``).
        nit: Number of sampled parameter settings (``n_iter``).
        model_type: Label interpolated into the output file names.
        train_test_path: Directory containing the train/test pickle.
        output_path: Directory receiving the model and prediction pickles.
        random_state: Optional seed for the CV shuffling and the parameter
            sampling. The default ``None`` keeps the original unseeded behaviour.

    Returns:
        list: ``[sensitivity, specificity, auc]`` on the test split, where
        sensitivity and specificity are taken at the ROC point that maximises
        ``tpr - fpr`` (Youden's J).
    """
    print(f"current task: {task} {model_type}")
    model_dump = f"{task}year_{model_type}_model.pkl"

    tr_dx, tr_dy, ts_dx, ts_dy = pkl_load(f"expr_data_CC{task}yr_train_test.pkl", train_test_path)

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    cv_model = RandomizedSearchCV(clf, params, scoring='roc_auc', n_jobs=nb,
                                  cv=folds, verbose=1, n_iter=nit,
                                  random_state=random_state)
    cv_model.fit(tr_dx, tr_dy)
    opt_clf = cv_model.best_estimator_
    pkl_dump(opt_clf, model_dump, output_path)

    preds = opt_clf.predict_proba(ts_dx)
    pkl_dump(preds, f"{task}year_{model_type}_preds.pkl", output_path)

    # The column of the largest class label is treated as the positive class.
    idx = np.argmax(opt_clf.classes_)
    preds_1 = np.asarray(preds)[:, idx]

    auc_score = roc_auc_score(ts_dy, preds_1)
    fprs, tprs, _ = roc_curve(ts_dy, preds_1)
    print("auc_score is : ",auc_score)

    # Operating point that maximises tpr - fpr.
    J_idx = np.argmax(tprs - fprs)
    fpr, tpr = fprs[J_idx], tprs[J_idx]
    auc_score1 = auc(fprs, tprs)

    sen = tpr
    spe = 1 - fpr
    stats = [sen, spe, auc_score1]
    # Previously the metrics were computed and then dropped; hand them back.
    return stats

# 0 YEAR

In [4]:
# Locations for the 0-year data set.
# features_path is the directory load_data reads both pickles from.
features_path = '/mnt/data1/chong/2021-CRC/updated_data/encoding_files/'
features_filename = 'data_CC0yr_expr_features.pkl'
data_filename = 'data_CC0yr_expr.pkl'
# Directory the CSV exports are written to; the later export cells reuse this value.
output_path = '/mnt/data1/chong/2021-CRC/updated_data/'

In [5]:
# Load fea2id (its keys become the feature column names) and the patient records for 0 year.
fea2id,patient_data = load_data(features_path,features_filename,data_filename)

In [6]:
# Build the table with pid, outcome and one 0/1 column per feature.
patient_with_features_0yr = convert_to_dataframe(fea2id,patient_data)

In [7]:
# Preview the first rows of the 0-year table.
patient_with_features_0yr.head()

Unnamed: 0,pid,outcome,age_40_49,SEX_M,Race_03,Hispanic_N,med_p_26225,med_p_8745,med_p_6628,med_p_9863,...,proc_No charge,diag_24200,diag_7014,diag_25201,proc_9929,proc_0695,med_p_298869,lab_41005-0,diag_255.1,med_p_860807
0,11e75060d2b34f18907a0050569ea8fb,0.0,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,11e75060db07ec1ebb6a0050569ea8fb,0.0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,11e750610fb3ce06bc3a0050569ea8fb,1.0,1,0,1,1,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
3,11e75061126eaa76a7850050569ea8fb,1.0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,11e75060e4aded54907a0050569ea8fb,0.0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Export the 0-year table as CSV without the index column.
# NOTE(review): output_path already ends with '/', so the path contains '//', and
# data_filename keeps its '.pkl' suffix (file name: data_CC0yr_expr.pkl.csv).
patient_with_features_0yr.to_csv(f"{output_path}/{data_filename}.csv", index=False)

# 1 YEAR

In [9]:
# Locations for the 1-year data set.
# NOTE(review): case_control_path is not used by any cell visible in this notebook - confirm it is still needed.
case_control_path = '/mnt/data1/chong/2021-CRC/updated_data/psm_result/'
# features_path is the directory load_data reads both pickles from.
features_path = '/mnt/data1/chong/2021-CRC/updated_data/encoding_files/'
features_filename = 'data_CC1yr_expr_features.pkl'
data_filename = 'data_CC1yr_expr.pkl'

In [10]:
# Load fea2id (its keys become the feature column names) and the patient records for 1 year.
fea2id,patient_data = load_data(features_path,features_filename,data_filename)

In [11]:
# Build the table with pid, outcome and one 0/1 column per feature.
patient_with_features_1yr = convert_to_dataframe(fea2id,patient_data)

In [12]:
# Preview the first rows of the 1-year table.
patient_with_features_1yr.head()

Unnamed: 0,pid,outcome,age_40_49,SEX_M,Race_03,Hispanic_N,med_p_26225,med_p_8745,med_p_6628,med_p_9863,...,diag_K580,proc_6823,proc_0518F,diag_24200,diag_7014,diag_25201,proc_9929,proc_0695,med_p_298869,med_p_860807
0,11e75060d2b34f18907a0050569ea8fb,0.0,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,11e75060db07ec1ebb6a0050569ea8fb,0.0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,11e750610fb3ce06bc3a0050569ea8fb,1.0,1,0,1,1,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
3,11e75061126eaa76a7850050569ea8fb,1.0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,11e75060e4aded54907a0050569ea8fb,0.0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Export the 1-year table as CSV without the index column; output_path comes from the 0-year config cell.
# NOTE(review): output_path already ends with '/', so the path contains '//', and
# data_filename keeps its '.pkl' suffix (file name: data_CC1yr_expr.pkl.csv).
patient_with_features_1yr.to_csv(f"{output_path}/{data_filename}.csv", index=False)

# 3 YEAR

In [14]:
# Locations for the 3-year data set.
# NOTE(review): case_control_path and case_control_filename are not used by any
# cell visible in this notebook - confirm they are still needed.
case_control_path = '/mnt/data1/chong/2021-CRC/updated_data/psm_result/'
case_control_filename = 'matched_case_control_CC_3yr.csv'
# features_path is the directory load_data reads both pickles from.
features_path = '/mnt/data1/chong/2021-CRC/updated_data/encoding_files/'
features_filename = 'data_CC3yr_expr_features.pkl'
data_filename = 'data_CC3yr_expr.pkl'

In [15]:
# Load fea2id (its keys become the feature column names) and the patient records for 3 years.
fea2id,patient_data = load_data(features_path,features_filename,data_filename)

In [16]:
# Build the table with pid, outcome and one 0/1 column per feature.
patient_with_features_3yr = convert_to_dataframe(fea2id,patient_data)

In [17]:
# Preview the first rows of the 3-year table.
patient_with_features_3yr.head()

Unnamed: 0,pid,outcome,age_30_39,SEX_F,Race_03,Hispanic_N,med_p_1908,med_p_2599,med_p_5224,med_p_253182,...,diag_34,Race_04,diag_251.8,diag_524.3,diag_78841,diag_6262,proc_6823,med_p_298869,med_p_1300786,med_p_860807
0,11e750610fb3ce06bc3a0050569ea8fb,1.0,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,11e75060e4aded54907a0050569ea8fb,0.0,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,11e75060af8bbd5eab480050569ea8fb,0.0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,11e75061023aa5ceab480050569ea8fb,0.0,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,11e75060abfb056494360050569ea8fb,0.0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Export the 3-year table as CSV without the index column; output_path comes from the 0-year config cell.
# NOTE(review): output_path already ends with '/', so the path contains '//', and
# data_filename keeps its '.pkl' suffix (file name: data_CC3yr_expr.pkl.csv).
patient_with_features_3yr.to_csv(f"{output_path}/{data_filename}.csv", index=False)

# 5 YEAR

In [19]:
# Locations for the 5-year data set.
# NOTE(review): case_control_path and case_control_filename are not used by any
# cell visible in this notebook - confirm they are still needed.
case_control_path = '/mnt/data1/chong/2021-CRC/updated_data/psm_result/'
case_control_filename = 'matched_case_control_CC_5yr.csv'
# features_path is the directory load_data reads both pickles from.
features_path = '/mnt/data1/chong/2021-CRC/updated_data/encoding_files/'
features_filename = 'data_CC5yr_expr_features.pkl'
data_filename = 'data_CC5yr_expr.pkl'

In [20]:
# Load fea2id (its keys become the feature column names) and the patient records for 5 years.
fea2id,patient_data = load_data(features_path,features_filename,data_filename)

In [21]:
# Build the table with pid, outcome and one 0/1 column per feature.
patient_with_features_5yr = convert_to_dataframe(fea2id,patient_data)

In [22]:
# Preview the first rows of the 5-year table.
patient_with_features_5yr.head()

Unnamed: 0,pid,outcome,age_30_39,SEX_F,Race_03,Hispanic_N,med_p_2599,med_p_274783,med_p_9863,med_p_6185,...,diag_443.0,diag_962.0,proc_8319,proc_7791,diag_524.3,diag_1121,diag_78841,med_p_1359546,med_p_856836,med_p_860807
0,11e750610fb3ce06bc3a0050569ea8fb,1.0,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,11e75060af8bbd5eab480050569ea8fb,0.0,0,1,1,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,11e75060a6aa55609be90050569ea8fb,0.0,0,1,1,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,11e750611fcf32c694250050569ea8fb,0.0,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,11e75060d74e55b8baf60050569ea8fb,0.0,1,1,1,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Export the 5-year table as CSV without the index column; output_path comes from the 0-year config cell.
# NOTE(review): output_path already ends with '/', so the path contains '//', and
# data_filename keeps its '.pkl' suffix (file name: data_CC5yr_expr.pkl.csv).
patient_with_features_5yr.to_csv(f"{output_path}/{data_filename}.csv", index=False)