In [8]:
import csv
import os
import shap
import numpy as np
import pandas as pd
import pickle as pkl
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, StratifiedKFold, cross_val_predict, train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score, roc_curve, auc, precision_recall_fscore_support
from sklearn.decomposition import PCA
from tqdm import tqdm
import matplotlib.pyplot as plt

In [2]:
#UTF-8 encoding issue

def pkl_dump(data, file):
    with open(file, "wb") as fw:
        pkl.dump(data, fw)

        
def pkl_load(file):
    with open(file, "rb") as fr:
        data = pkl.load(fr)
    return data

def pkl4_dump(data, file):
    with open(file, "wb") as fw:
        pkl.dump(data, fw, pkl.HIGHEST_PROTOCOL)

        
def pkl4_load(file):
    with open(file, "rb") as fr:
        data = pkl.load(fr)
    return data

In [3]:
os.chdir('/mnt/data1/songziwu/data/crc_data/')
os.getcwd()

'/mnt/data1/songziwu/data/crc_data'

Add performance report on training data cross-validation

In [4]:
def expr(clf, params, tasks, nb=-1, nit=100, model_name='LR'):
    for task in tasks:
        print(f"current task: {task} {model_name}")
        res_output = f"{task}year_{model_name}.txt"
        model_dump = f"{task}year_{model_name}_model.pkl"

        fea2iD, features = pkl_load(f"./03_MQ_Encoding_Files/data_CC{task}yr_expr_features.pkl")
        tr_dx, tr_dy, ts_dx, ts_dy = pkl4_load(f"./03_MQ_Encoding_Files/expr_data_CC{task}yr_train_test.pkl")
        print(tr_dx.shape, ts_dx.shape)
        
        cv_model = RandomizedSearchCV(clf, params, scoring='roc_auc', n_jobs=nb, 
                                      cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=13), 
                                      verbose=1, iid=True, n_iter=nit, random_state=13)
        cv_model.fit(tr_dx, tr_dy)
        best_5_cv = cv_model.best_estimator_
        opt_clf = cv_model.best_estimator_
        pkl_dump(opt_clf, model_dump)

        preds = opt_clf.predict_proba(ts_dx)
        pkl_dump(preds, f"{task}year_{model_name}_preds.pkl")

        idx = np.argmax(opt_clf.classes_)
        preds_1 = list(map(lambda x: x[idx], preds))

        auc_score = roc_auc_score(ts_dy, preds_1)
        fprs, tprs, ths = roc_curve(ts_dy, preds_1)

        J_idx = np.argmax(tprs - fprs)
        fpr, tpr, th = fprs[J_idx], tprs[J_idx], ths[J_idx]
        auc_score1 = auc(fprs, tprs)

        sen = tpr
        spe = 1 - fpr
        stats = [sen, spe, auc_score1]
        
# change below to shap for SVM

#         shap.initjs()
#         explainer = shap.TreeExplainer(opt_clf)
#         shap_values = explainer.shap_values(tr_dx)
#         top10 = shap_values[:10][:]
        
        with open(f"master_{task}SVM_CCstats.csv", "a") as fp:
            wr = csv.writer(fp, dialect='excel')
            wr.writerow(['sen','spef','auc'])
            wr.writerow(stats)
        
       
        with open(f"master_{task}SVM_CCshap.csv","a") as sp:
            sp = csv.writer(ir, dialect='excel')
            sp.writerow(['feature','shap_value'])
            sp.writerows([top10])
            

#         with open(res_output, "w") as f:
#             f.write(f'''
# auc1: {auc_score}
# auc2: {auc_score1}
# sensitivity: {sen}
# specificity: {spe}
# J: {th}
#             ''')

In [4]:
def to_matrix(all_data, col_num, num_fea_cols):
    pids = []
    matrix = []
    for idx, d in enumerate(all_data):
        m = np.zeros(col_num + 1)
        mp = []
        if num_fea_cols == -1:
            d1 = d
        else:
            d1 = d[:-num_fea_cols]
        for i, e in enumerate(d1):
            if i == 0:
                m[0] = e
            else:
                m[e] = 1.
        
        if num_fea_cols == -1:
            d2 = []
        else:
            d2 = d[-num_fea_cols:]
        for e in d2:
            mp.append(e)
        
        pids.append(idx)
        nmn = np.concatenate((m, np.array(mp)))
        matrix.append(nmn)
    print(np.array(matrix).shape)
    print(matrix[1])
    return matrix, pids

def imputation(matrix, pids, findings=True, labs=True):
    cols = ['label'] + features
    cols_to_imp = []
    if findings:
        cols += findings_list
        cols_to_imp += findings_list
    
    if labs:
        cols += valid_loinc
        cols_to_imp += valid_loinc
        
    df = pd.DataFrame(data=matrix, index=pids, columns=cols)
#     print(df.head())
#     cau = dict()
#     cou = dict()
#     for col in cols_to_imp:
#         cau[col] = [e for e in set(df[df['label']==1.0][col]) if not pd.isna(e)]
#         cou[col] = [e for e in set(df[df['label']==0.0][col]) if not pd.isna(e)]
        
    np.random.seed(47)
    for col in cols_to_imp:
        s = list(set([e for e in df[col] if not np.isnan(e)]))
        for idx, c in enumerate(df[col]):
            if pd.isna(c):
                # choose case or control 
#                 if df['label'][idx] == 1.0:
#                     s = cau[col]
#                 elif df['label'][idx] == 0.0:
#                     s = cou[col]


                df[col][idx] = np.random.choice(s, 1)
                
#     print(df.head())
    matrix = np.array(df)
    print(matrix.shape, matrix[0])
    return matrix, cols[1:]

def create_data(matrix):
    np.random.seed(13)
    np.random.shuffle(matrix)
    np.random.seed(47)
    np.random.shuffle(matrix)
    dx = []
    dy = []
    for each in matrix:
        dx.append(each[1:])
        dy.append(each[0])
    dx = np.array(dx)
    dy = np.array(dy)
    print(dx.shape, dy.shape)
    return dx, dy

In [5]:
tuned_parameters = {
    'max_iter': range(100, 4100, 500),
    'tol': [0.0001, 0.001, 0.01, 0.1],
    'C': range(1, 50, 2),
    'class_weight': [None, 'balanced']
    }

In [9]:
clf = svm.SVC(probability=True)

# 0 YEAR

In [None]:
ptIDs = pd.read_csv("01_MQ_Incident_Match_Files/matched_case_control_CC_01yr.csv",usecols=['PATID'],dtype =str)
ptIDs.head()
fea2id, features = pkl_load("./03_MQ_Encoding_Files/data_CC0yr_expr_features.pkl")
data = pkl_load("./03_MQ_Encoding_Files/data_CC0yr_expr.pkl")

In [16]:
for i in range(100):
    train_id, test_id = train_test_split(ptIDs,test_size=0.2)
    test_id.head(), train_id.head(), test_id.shape, train_id.shape
    test_ids = test_id.PATID.to_list()
    train_ids = train_id.PATID.to_list()

    print(data[0][0])

    trains = []
    tests = []
    count = 0
    for dp in data:
        pid = dp[0]
        ndata = dp[1:]
        if pid in train_ids:
            trains.append(ndata)
        elif pid in test_ids:
            tests.append(ndata)
        else:
            count = count + 1

    print(count)
    matrix, pids = to_matrix(trains, len(fea2id), -1)

    tr_dx, tr_dy = create_data(matrix)

    matrix, pids = to_matrix(tests, len(fea2id), -1)

    ts_dx, ts_dy = create_data(matrix)


    pkl4_dump((tr_dx, tr_dy, ts_dx, ts_dy), "./03_MQ_Encoding_Files/expr_data_CC0yr_train_test.pkl")


##Algorithm
    expr(clf, tuned_parameters, tasks = [1], nb = 10, nit = 20, model_name = 'SVM')
    

current task: 0 RF
(5681, 10045) (1421, 10045)
Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


KeyboardInterrupt: 

# 1 YEAR

In [None]:
fea2id, features = pkl_load("./03_MQ_Encoding_Files/data_CC1yr_expr_features.pkl")
data = pkl_load("./03_MQ_Encoding_Files/data_CC1yr_expr.pkl")

In [None]:
for i in range(100):
    train_id, test_id = train_test_split(ptIDs,test_size=0.2)
    test_id.head(), train_id.head(), test_id.shape, train_id.shape
    test_ids = test_id.PATID.to_list()
    train_ids = train_id.PATID.to_list()

# split train and test
    print(data[0][0])
#print(len(data))
    trains = []
    tests = []
    count = 0
    for dp in data:
        pid = dp[0]
        ndata = dp[1:]
        if pid in train_ids:
            trains.append(ndata)
        elif pid in test_ids:
            tests.append(ndata)
        else:
        #print(f"{pid} is not an id in train test")
        count = count + 1
# coln = len(valid_loinc) + len(findings_list)
    print(count)
    matrix, pids = to_matrix(trains, len(fea2id), -1)
# matrix, feas = imputation(matrix, pids)
    tr_dx, tr_dy = create_data(matrix)

    matrix, pids = to_matrix(tests, len(fea2id), -1)
# matrix, feas = imputation(matrix, pids)
    ts_dx, ts_dy = create_data(matrix)

# pkl_dump(feas, "./expr_data_5yr_features.pkl")
    pkl4_dump((tr_dx, tr_dy, ts_dx, ts_dy), "./03_MQ_Encoding_Files/expr_data_CC1yr_train_test.pkl")
# pkl4_dump((), "./expr_data_5yr_test.pkl")
    
    expr(clf, tuned_parameters, tasks = [1], nb = 10, nit = 20, model_name = 'SVM')

# 3 YEAR

In [None]:
ptIDs = pd.read_csv("01_MQ_Incident_Match_Files/matched_case_control_CC_3yr.csv",usecols=['PATID'],dtype =str)
ptIDs.head()
fea2id, features = pkl_load("./03_MQ_Encoding_Files/data_CC3yr_expr_features.pkl")
data = pkl_load("./03_MQ_Encoding_Files/data_CC3yr_expr.pkl")

In [None]:
for i in range(100):
    train_id, test_id = train_test_split(ptIDs,test_size=0.2)
    test_id.head(), train_id.head(), test_id.shape, train_id.shape
    test_ids = test_id.PATID.to_list()
    train_ids = train_id.PATID.to_list()

# split train and test
    print(data[0][0])
#print(len(data))
    trains = []
    tests = []
    count = 0
    for dp in data:
        pid = dp[0]
        ndata = dp[1:]
        if pid in train_ids:
            trains.append(ndata)
        elif pid in test_ids:
            tests.append(ndata)
        else:
        #print(f"{pid} is not an id in train test")
            count = count + 1
# coln = len(valid_loinc) + len(findings_list)
    print(count)
    matrix, pids = to_matrix(trains, len(fea2id), -1)
# matrix, feas = imputation(matrix, pids)
    tr_dx, tr_dy = create_data(matrix)

    matrix, pids = to_matrix(tests, len(fea2id), -1)
# matrix, feas = imputation(matrix, pids)
    ts_dx, ts_dy = create_data(matrix)

# pkl_dump(feas, "./expr_data_5yr_features.pkl")
    pkl4_dump((tr_dx, tr_dy, ts_dx, ts_dy), "./03_MQ_Encoding_Files/expr_data_CC3yr_train_test.pkl")
# pkl4_dump((), "./expr_data_5yr_test.pkl")
    expr(clf, tuned_parameters, tasks = [3], nb = 10, nit = 20, model_name = 'SVM')

# 5 YEAR

In [None]:
ptIDs = pd.read_csv("01_MQ_Incident_Match_Files/matched_case_control_CC_5yr.csv",usecols=['PATID'],dtype =str)
ptIDs.head()
fea2id, features = pkl_load("./03_MQ_Encoding_Files/data_CC5yr_expr_features.pkl")
data = pkl_load("./03_MQ_Encoding_Files/data_CC5yr_expr.pkl")

In [None]:
for i in range(100):
    train_id, test_id = train_test_split(ptIDs,test_size=0.2)
    test_id.head(), train_id.head(), test_id.shape, train_id.shape
    test_ids = test_id.PATID.to_list()
    train_ids = train_id.PATID.to_list()

# split train and test
    print(data[0][0])
#print(len(data))
    trains = []
    tests = []
    count = 0
    for dp in data:
        pid = dp[0]
        ndata = dp[1:]
        if pid in train_ids:
            trains.append(ndata)
        elif pid in test_ids:
            tests.append(ndata)
        else:
        #print(f"{pid} is not an id in train test")
            count = count + 1
# coln = len(valid_loinc) + len(findings_list)
    print(count)
    matrix, pids = to_matrix(trains, len(fea2id), -1)
# matrix, feas = imputation(matrix, pids)
    tr_dx, tr_dy = create_data(matrix)

    matrix, pids = to_matrix(tests, len(fea2id), -1)
# matrix, feas = imputation(matrix, pids)
    ts_dx, ts_dy = create_data(matrix)

# pkl_dump(feas, "./expr_data_5yr_features.pkl")
    pkl4_dump((tr_dx, tr_dy, ts_dx, ts_dy), "./03_MQ_Encoding_Files/expr_data_CC5yr_train_test.pkl")
# pkl4_dump((), "./expr_data_5yr_test.pkl")
    expr(clf, tuned_parameters, tasks = [5], nb = 10, nit = 20, model_name = 'SVM')

## Untuned model

In [11]:


clf = svm.SVC(probability=True)


def expr(clf, tasks, model_name):

    for task in tasks:
        print(f"current task: {task} {model_name}")
        res_output = f"{task}year_{model_name}.txt"
        model_dump = f"{task}year_{model_name}_model.pkl"

        fea2iD, features = pkl_load(f"./03_MQ_Encoding_Files/data_CC{task}yr_expr_features.pkl")
        tr_dx, tr_dy, ts_dx, ts_dy = pkl4_load(f"./03_MQ_Encoding_Files/expr_data_CC{task}yr_train_test.pkl")

        

        clf.fit(tr_dx, tr_dy)

        pkl_dump(clf, model_dump)

        preds = clf.predict_proba(ts_dx)
        pkl_dump(preds, f"{task}year_{model_name}_preds.pkl")

        idx = np.argmax(clf.classes_)
        preds_1 = list(map(lambda x: x[idx], preds))

        auc_score = roc_auc_score(ts_dy, preds_1)
        fprs, tprs, ths = roc_curve(ts_dy, preds_1)
        print("auc_score is : ",auc_score)

#         J_idx = np.argmax(tprs - fprs)
#         fpr, tpr, th = fprs[J_idx], tprs[J_idx], ths[J_idx]
#         auc_score1 = auc(fprs, tprs)

        return auc_score






for i in tqdm(range(5)):
    train_id, test_id = train_test_split(ptIDs,test_size=0.2)
    test_id.head(), train_id.head(), test_id.shape, train_id.shape
    test_ids = test_id.PATID.to_list()
    train_ids = train_id.PATID.to_list()

    #print(data[0][0])

    trains = []
    tests = []
    count = 0
    for dp in data:
        pid = dp[0]
        ndata = dp[1:]
        if pid in train_ids:
            trains.append(ndata)
        elif pid in test_ids:
            tests.append(ndata)
        else:
            count = count + 1

    #print(count)
    matrix, pids = to_matrix(trains, len(fea2id), -1)

    tr_dx, tr_dy = create_data(matrix)

    matrix, pids = to_matrix(tests, len(fea2id), -1)

    ts_dx, ts_dy = create_data(matrix)


    pkl4_dump((tr_dx, tr_dy, ts_dx, ts_dy), "./03_MQ_Encoding_Files/expr_data_CC0yr_train_test.pkl")


##Algorithm
    expr(clf, tasks = [0], model_name = 'SVM')
    




  0%|          | 0/5 [00:00<?, ?it/s][A

(5681, 10046)
[0. 0. 0. ... 0. 0. 0.]
(5681, 10045) (5681,)
(1421, 10046)
[0. 0. 0. ... 0. 0. 0.]
(1421, 10045) (1421,)
current task: 0 SVM



 20%|██        | 1/5 [08:31<34:05, 511.46s/it][A

auc_score is :  0.7248173535302248
(5681, 10046)
[0. 0. 0. ... 0. 0. 0.]
(5681, 10045) (5681,)
(1421, 10046)
[1. 0. 0. ... 0. 0. 0.]
(1421, 10045) (1421,)
current task: 0 SVM



 40%|████      | 2/5 [17:04<25:35, 511.84s/it][A

auc_score is :  0.68383192352658
(5681, 10046)
[0. 0. 0. ... 0. 0. 0.]
(5681, 10045) (5681,)
(1421, 10046)
[1. 0. 0. ... 0. 0. 0.]
(1421, 10045) (1421,)
current task: 0 SVM



 60%|██████    | 3/5 [26:07<17:22, 521.43s/it][A

auc_score is :  0.68383192352658
(5681, 10046)
[0. 0. 0. ... 0. 0. 0.]
(5681, 10045) (5681,)
(1421, 10046)
[1. 0. 0. ... 0. 0. 0.]
(1421, 10045) (1421,)
current task: 0 SVM



 80%|████████  | 4/5 [35:14<08:49, 529.09s/it][A

auc_score is :  0.68383192352658
(5681, 10046)
[0. 0. 0. ... 0. 0. 0.]
(5681, 10045) (5681,)
(1421, 10046)
[1. 0. 0. ... 0. 0. 0.]
(1421, 10045) (1421,)
current task: 0 SVM



100%|██████████| 5/5 [44:25<00:00, 535.48s/it][A

auc_score is :  0.68383192352658


## nit test

In [13]:
clf1 = svm.SVC(probability=True)
def expr1(clf, params, tasks, nb, nit, model_name):
    for task in tasks:
        print(f"current task: {task} {model_name}")
        res_output = f"{task}year_{model_name}.txt"
        model_dump = f"{task}year_{model_name}_model.pkl"

        fea2iD, features = pkl_load(f"./03_MQ_Encoding_Files/data_CC{task}yr_expr_features.pkl")
        tr_dx, tr_dy, ts_dx, ts_dy = pkl4_load(f"./03_MQ_Encoding_Files/expr_data_CC{task}yr_train_test.pkl")
        #print(tr_dx.shape, ts_dx.shape)
        
        cv_model = RandomizedSearchCV(clf, params, scoring='roc_auc', n_jobs=nb, 
                                      cv=StratifiedKFold(n_splits=5, shuffle=True), 
                                      verbose=1, iid=True, n_iter=nit)
        cv_model.fit(tr_dx, tr_dy)
        best_5_cv = cv_model.best_estimator_
        opt_clf = cv_model.best_estimator_
        pkl_dump(opt_clf, model_dump)

        preds = opt_clf.predict_proba(ts_dx)
        pkl_dump(preds, f"{task}year_{model_name}_preds.pkl")

        idx = np.argmax(opt_clf.classes_)
        preds_1 = list(map(lambda x: x[idx], preds))

        auc_score = roc_auc_score(ts_dy, preds_1)
        fprs, tprs, ths = roc_curve(ts_dy, preds_1)
        print("auc_score is : ",auc_score)
        
        return auc_score
#         J_idx = np.argmax(tprs - fprs)
#         fpr, tpr, th = fprs[J_idx], tprs[J_idx], ths[J_idx]
#         auc_score1 = auc(fprs, tprs)

#         sen = tpr
#         spe = 1 - fpr
#         stats = [sen, spe, auc_score1]
        
# change below to shap for SVM

#         shap.initjs()
#         explainer = shap.TreeExplainer(opt_clf)
#         shap_values = explainer.shap_values(tr_dx)
#         top10 = shap_values[:10][:]
        
#         with open(f"master_{task}SVM_CCstats.csv", "a") as fp:
#             wr = csv.writer(fp, dialect='excel')
#             wr.writerow(['sen','spef','auc'])
#             wr.writerow(stats)
        
       
#         with open(f"master_{task}SVM_CCshap.csv","a") as sp:
#             sp = csv.writer(ir, dialect='excel')
#             sp.writerow(['feature','shap_value'])
#             sp.writerows([top10])
            

#         with open(res_output, "w") as f:
#             f.write(f'''
# auc1: {auc_score}
# auc2: {auc_score1}
# sensitivity: {sen}
# specificity: {spe}
# J: {th}
#             ''')

ptIDs = pd.read_csv("01_MQ_Incident_Match_Files/matched_case_control_CC_01yr.csv",usecols=['PATID'],dtype =str)
ptIDs.head()
fea2id, features = pkl_load("./03_MQ_Encoding_Files/data_CC0yr_expr_features.pkl")
data = pkl_load("./03_MQ_Encoding_Files/data_CC0yr_expr.pkl")



for i in tqdm(range(5)):
    train_id, test_id = train_test_split(ptIDs,test_size=0.2)
    test_id.head(), train_id.head(), test_id.shape, train_id.shape
    test_ids = test_id.PATID.to_list()
    train_ids = train_id.PATID.to_list()

    #print(data[0][0])

    trains = []
    tests = []
    count = 0
    for dp in data:
        pid = dp[0]
        ndata = dp[1:]
        if pid in train_ids:
            trains.append(ndata)
        elif pid in test_ids:
            tests.append(ndata)
        else:
            count = count + 1

    print(count)
    matrix, pids = to_matrix(trains, len(fea2id), -1)

    tr_dx, tr_dy = create_data(matrix)

    matrix, pids = to_matrix(tests, len(fea2id), -1)

    ts_dx, ts_dy = create_data(matrix)


    pkl4_dump((tr_dx, tr_dy, ts_dx, ts_dy), "./03_MQ_Encoding_Files/expr_data_CC0yr_train_test.pkl")


##Algorithm
    expr1(clf1, tuned_parameters, tasks = [0], nb = 1, nit = 20, model_name = 'SVM')
    


  0%|          | 0/5 [00:00<?, ?it/s][A

0
(5681, 10046)
[0. 0. 0. ... 0. 0. 0.]
(5681, 10045) (5681,)
(1421, 10046)
[1. 0. 0. ... 0. 0. 0.]
(1421, 10045) (1421,)
current task: 0 SVM
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Solver terminated early (max_iter=600).  Consider pre-processing your data with StandardScaler or MinMaxScaler.
Solver terminated early (max_iter=600).  Consider pre-processing your data with StandardScaler or MinMaxScaler.
Solver terminated early (max_iter=600).  Consider pre-processing your data with StandardScaler or MinMaxScaler.
Solver terminated early (max_iter=600).  Consider pre-processing your data with StandardScaler or MinMaxScaler.
Solver terminated early (max_iter=600).  Consider pre-processing your data with StandardScaler or MinMaxScaler.


KeyboardInterrupt: 

In [12]:
clf2 = svm.SVC(probability=True)
def expr2(clf, params, tasks, nb, nit, model_name):
    for task in tasks:
        print(f"current task: {task} {model_name}")
        res_output = f"{task}year_{model_name}.txt"
        model_dump = f"{task}year_{model_name}_model.pkl"

        fea2iD, features = pkl_load(f"./03_MQ_Encoding_Files/data_CC{task}yr_expr_features.pkl")
        tr_dx, tr_dy, ts_dx, ts_dy = pkl4_load(f"./03_MQ_Encoding_Files/expr_data_CC{task}yr_train_test.pkl")
        #print(tr_dx.shape, ts_dx.shape)
        
        cv_model = RandomizedSearchCV(clf, params, scoring='roc_auc', n_jobs=nb, 
                                      cv=StratifiedKFold(n_splits=5, shuffle=True), 
                                      verbose=1, iid=True, n_iter=nit)
        cv_model.fit(tr_dx, tr_dy)
        best_5_cv = cv_model.best_estimator_
        opt_clf = cv_model.best_estimator_
        pkl_dump(opt_clf, model_dump)

        preds = opt_clf.predict_proba(ts_dx)
        pkl_dump(preds, f"{task}year_{model_name}_preds.pkl")

        idx = np.argmax(opt_clf.classes_)
        preds_1 = list(map(lambda x: x[idx], preds))

        auc_score = roc_auc_score(ts_dy, preds_1)
        fprs, tprs, ths = roc_curve(ts_dy, preds_1)
        print("auc_score is : ",auc_score)
        
        return auc_score
#         J_idx = np.argmax(tprs - fprs)
#         fpr, tpr, th = fprs[J_idx], tprs[J_idx], ths[J_idx]
#         auc_score1 = auc(fprs, tprs)

#         sen = tpr
#         spe = 1 - fpr
#         stats = [sen, spe, auc_score1]
        
# change below to shap for SVM

#         shap.initjs()
#         explainer = shap.TreeExplainer(opt_clf)
#         shap_values = explainer.shap_values(tr_dx)
#         top10 = shap_values[:10][:]
        
#         with open(f"master_{task}SVM_CCstats.csv", "a") as fp:
#             wr = csv.writer(fp, dialect='excel')
#             wr.writerow(['sen','spef','auc'])
#             wr.writerow(stats)
        
       
#         with open(f"master_{task}SVM_CCshap.csv","a") as sp:
#             sp = csv.writer(ir, dialect='excel')
#             sp.writerow(['feature','shap_value'])
#             sp.writerows([top10])
            

#         with open(res_output, "w") as f:
#             f.write(f'''
# auc1: {auc_score}
# auc2: {auc_score1}
# sensitivity: {sen}
# specificity: {spe}
# J: {th}
#             ''')

ptIDs = pd.read_csv("01_MQ_Incident_Match_Files/matched_case_control_CC_01yr.csv",usecols=['PATID'],dtype =str)
ptIDs.head()
fea2id, features = pkl_load("./03_MQ_Encoding_Files/data_CC0yr_expr_features.pkl")
data = pkl_load("./03_MQ_Encoding_Files/data_CC0yr_expr.pkl")



for i in tqdm(range(5)):
    train_id, test_id = train_test_split(ptIDs,test_size=0.2)
    test_id.head(), train_id.head(), test_id.shape, train_id.shape
    test_ids = test_id.PATID.to_list()
    train_ids = train_id.PATID.to_list()

    #print(data[0][0])

    trains = []
    tests = []
    count = 0
    for dp in data:
        pid = dp[0]
        ndata = dp[1:]
        if pid in train_ids:
            trains.append(ndata)
        elif pid in test_ids:
            tests.append(ndata)
        else:
            count = count + 1

    print(count)
    matrix, pids = to_matrix(trains, len(fea2id), -1)

    tr_dx, tr_dy = create_data(matrix)

    matrix, pids = to_matrix(tests, len(fea2id), -1)

    ts_dx, ts_dy = create_data(matrix)


    pkl4_dump((tr_dx, tr_dy, ts_dx, ts_dy), "./03_MQ_Encoding_Files/expr_data_CC0yr_train_test.pkl")


##Algorithm
    expr2(clf2, tuned_parameters, tasks = [0], nb = 10, nit = 10, model_name = 'SVM')
    


  0%|          | 0/5 [00:00<?, ?it/s][A

0
(5681, 10046)
[0. 0. 0. ... 0. 0. 0.]
(5681, 10045) (5681,)
(1421, 10046)
[1. 0. 0. ... 0. 0. 0.]
(1421, 10045) (1421,)
current task: 0 SVM
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed: 75.4min
[Parallel(n_jobs=10)]: Done  50 out of  50 | elapsed: 100.7min finished

 20%|██        | 1/5 [1:48:12<7:12:51, 6492.79s/it][A

auc_score is :  0.6818100543291383
0
(5681, 10046)
[0. 0. 0. ... 0. 0. 0.]
(5681, 10045) (5681,)
(1421, 10046)
[1. 0. 0. ... 0. 0. 0.]
(1421, 10045) (1421,)
current task: 0 SVM
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed: 77.1min
[Parallel(n_jobs=10)]: Done  50 out of  50 | elapsed: 103.8min finished
Solver terminated early (max_iter=3100).  Consider pre-processing your data with StandardScaler or MinMaxScaler.

 40%|████      | 2/5 [3:49:54<5:36:46, 6735.53s/it][A

auc_score is :  0.7328427077857602
0
(5681, 10046)
[0. 0. 0. ... 0. 0. 0.]
(5681, 10045) (5681,)
(1421, 10046)
[1. 0. 0. ... 0. 0. 0.]
(1421, 10045) (1421,)
current task: 0 SVM
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed: 75.7min
[Parallel(n_jobs=10)]: Done  50 out of  50 | elapsed: 100.0min finished
Solver terminated early (max_iter=3100).  Consider pre-processing your data with StandardScaler or MinMaxScaler.

 60%|██████    | 3/5 [5:47:22<3:47:38, 6829.35s/it][A

auc_score is :  0.6501500150015
0
(5681, 10046)
[0. 0. 0. ... 0. 0. 0.]
(5681, 10045) (5681,)
(1421, 10046)
[1. 0. 0. ... 0. 0. 0.]
(1421, 10045) (1421,)
current task: 0 SVM
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed: 76.5min
[Parallel(n_jobs=10)]: Done  50 out of  50 | elapsed: 103.2min finished
Solver terminated early (max_iter=3100).  Consider pre-processing your data with StandardScaler or MinMaxScaler.

 80%|████████  | 4/5 [7:47:53<1:55:49, 6949.77s/it][A

auc_score is :  0.6501500150015
0
(5681, 10046)
[0. 0. 0. ... 0. 0. 0.]
(5681, 10045) (5681,)
(1421, 10046)
[1. 0. 0. ... 0. 0. 0.]
(1421, 10045) (1421,)
current task: 0 SVM
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed: 76.4min
[Parallel(n_jobs=10)]: Done  50 out of  50 | elapsed: 102.6min finished
Solver terminated early (max_iter=3100).  Consider pre-processing your data with StandardScaler or MinMaxScaler.

100%|██████████| 5/5 [9:49:03<00:00, 7045.92s/it]  [A

auc_score is :  0.6501500150015


## Hyperparameter test - max_iter

In [14]:
tuned_parameters_max_iter1 = {
    #'max_iter': range(100, 4100, 500),
    'tol': [0.0001, 0.001, 0.01, 0.1],
    'C': range(1, 50, 2),
    'class_weight': [None, 'balanced']
    }

In [15]:
clf1 = svm.SVC(probability=True)
def expr1(clf, params, tasks, nb, nit, model_name):
    for task in tasks:
        print(f"current task: {task} {model_name}")
        res_output = f"{task}year_{model_name}.txt"
        model_dump = f"{task}year_{model_name}_model.pkl"

        fea2iD, features = pkl_load(f"./03_MQ_Encoding_Files/data_CC{task}yr_expr_features.pkl")
        tr_dx, tr_dy, ts_dx, ts_dy = pkl4_load(f"./03_MQ_Encoding_Files/expr_data_CC{task}yr_train_test.pkl")
        #print(tr_dx.shape, ts_dx.shape)
        
        cv_model = RandomizedSearchCV(clf, params, scoring='roc_auc', n_jobs=nb, 
                                      cv=StratifiedKFold(n_splits=5, shuffle=True), 
                                      verbose=1, iid=True, n_iter=nit)
        cv_model.fit(tr_dx, tr_dy)
        best_5_cv = cv_model.best_estimator_
        opt_clf = cv_model.best_estimator_
        pkl_dump(opt_clf, model_dump)

        preds = opt_clf.predict_proba(ts_dx)
        pkl_dump(preds, f"{task}year_{model_name}_preds.pkl")

        idx = np.argmax(opt_clf.classes_)
        preds_1 = list(map(lambda x: x[idx], preds))

        auc_score = roc_auc_score(ts_dy, preds_1)
        fprs, tprs, ths = roc_curve(ts_dy, preds_1)
        print("auc_score is : ",auc_score)
        
        return auc_score
#         J_idx = np.argmax(tprs - fprs)
#         fpr, tpr, th = fprs[J_idx], tprs[J_idx], ths[J_idx]
#         auc_score1 = auc(fprs, tprs)

#         sen = tpr
#         spe = 1 - fpr
#         stats = [sen, spe, auc_score1]
        
# change below to shap for SVM

#         shap.initjs()
#         explainer = shap.TreeExplainer(opt_clf)
#         shap_values = explainer.shap_values(tr_dx)
#         top10 = shap_values[:10][:]
        
#         with open(f"master_{task}SVM_CCstats.csv", "a") as fp:
#             wr = csv.writer(fp, dialect='excel')
#             wr.writerow(['sen','spef','auc'])
#             wr.writerow(stats)
        
       
#         with open(f"master_{task}SVM_CCshap.csv","a") as sp:
#             sp = csv.writer(ir, dialect='excel')
#             sp.writerow(['feature','shap_value'])
#             sp.writerows([top10])
            

#         with open(res_output, "w") as f:
#             f.write(f'''
# auc1: {auc_score}
# auc2: {auc_score1}
# sensitivity: {sen}
# specificity: {spe}
# J: {th}
#             ''')

ptIDs = pd.read_csv("01_MQ_Incident_Match_Files/matched_case_control_CC_01yr.csv",usecols=['PATID'],dtype =str)
ptIDs.head()
fea2id, features = pkl_load("./03_MQ_Encoding_Files/data_CC0yr_expr_features.pkl")
data = pkl_load("./03_MQ_Encoding_Files/data_CC0yr_expr.pkl")



for i in tqdm(range(5)):
    train_id, test_id = train_test_split(ptIDs,test_size=0.2)
    test_id.head(), train_id.head(), test_id.shape, train_id.shape
    test_ids = test_id.PATID.to_list()
    train_ids = train_id.PATID.to_list()

    #print(data[0][0])

    trains = []
    tests = []
    count = 0
    for dp in data:
        pid = dp[0]
        ndata = dp[1:]
        if pid in train_ids:
            trains.append(ndata)
        elif pid in test_ids:
            tests.append(ndata)
        else:
            count = count + 1

    print(count)
    matrix, pids = to_matrix(trains, len(fea2id), -1)

    tr_dx, tr_dy = create_data(matrix)

    matrix, pids = to_matrix(tests, len(fea2id), -1)

    ts_dx, ts_dy = create_data(matrix)


    pkl4_dump((tr_dx, tr_dy, ts_dx, ts_dy), "./03_MQ_Encoding_Files/expr_data_CC0yr_train_test.pkl")


##Algorithm
    expr1(clf1, tuned_parameters_max_iter1, tasks = [0], nb = 10, nit = 20, model_name = 'SVM')
    



  0%|          | 0/5 [00:00<?, ?it/s][A[A

0
(5681, 10046)
[0. 0. 0. ... 0. 0. 0.]
(5681, 10045) (5681,)
(1421, 10046)
[0. 0. 0. ... 0. 0. 0.]
(1421, 10045) (1421,)
current task: 0 SVM
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed: 53.3min
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed: 224.9min finished


 20%|██        | 1/5 [4:07:34<16:30:16, 14854.19s/it][A[A

auc_score is :  0.7669158572519917
0
(5681, 10046)
[0. 0. 0. ... 0. 0. 0.]
(5681, 10045) (5681,)
(1421, 10046)
[1. 0. 0. ... 0. 0. 0.]
(1421, 10045) (1421,)
current task: 0 SVM
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed: 52.9min
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed: 220.4min finished


 40%|████      | 2/5 [8:10:31<12:18:33, 14771.06s/it][A[A

auc_score is :  0.7221658412492219


Exception ignored in: <Finalize object, dead>
Traceback (most recent call last):
  File "/home/wusongzi/anaconda3/lib/python3.7/multiprocessing/util.py", line 189, in __call__
    res = self._callback(*self._args, **self._kwargs)
  File "/home/wusongzi/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/externals/loky/backend/synchronize.py", line 96, in _cleanup
    sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Exception ignored in: <Finalize object, dead>
Traceback (most recent call last):
  File "/home/wusongzi/anaconda3/lib/python3.7/multiprocessing/util.py", line 189, in __call__
    res = self._callback(*self._args, **self._kwargs)
  File "/home/wusongzi/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/externals/loky/backend/synchronize.py", line 96, in _cleanup
    sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Exception ignored in: <Finalize object, dead>
Traceback (most recent call last):
  File "/home

0
(5681, 10046)
[0. 0. 0. ... 0. 0. 0.]
(5681, 10045) (5681,)
(1421, 10046)
[1. 0. 0. ... 0. 0. 0.]
(1421, 10045) (1421,)
current task: 0 SVM
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed: 53.0min


KeyboardInterrupt: 

In [17]:
tuned_parameters_max_iter2 = {
    'max_iter': range(100, 1000, 100),
    'tol': [0.0001, 0.001, 0.01, 0.1],
    'C': range(1, 50, 2),
    'class_weight': [None, 'balanced']
    }

clf2 = svm.SVC(probability=True)
def expr2(clf, params, tasks, nb, nit, model_name):
    for task in tasks:
        print(f"current task: {task} {model_name}")
        res_output = f"{task}year_{model_name}.txt"
        model_dump = f"{task}year_{model_name}_model.pkl"

        fea2iD, features = pkl_load(f"./03_MQ_Encoding_Files/data_CC{task}yr_expr_features.pkl")
        tr_dx, tr_dy, ts_dx, ts_dy = pkl4_load(f"./03_MQ_Encoding_Files/expr_data_CC{task}yr_train_test.pkl")
        #print(tr_dx.shape, ts_dx.shape)
        
        cv_model = RandomizedSearchCV(clf, params, scoring='roc_auc', n_jobs=nb, 
                                      cv=StratifiedKFold(n_splits=5, shuffle=True), 
                                      verbose=1, iid=True, n_iter=nit)
        cv_model.fit(tr_dx, tr_dy)
        best_5_cv = cv_model.best_estimator_
        opt_clf = cv_model.best_estimator_
        pkl_dump(opt_clf, model_dump)

        preds = opt_clf.predict_proba(ts_dx)
        pkl_dump(preds, f"{task}year_{model_name}_preds.pkl")

        idx = np.argmax(opt_clf.classes_)
        preds_1 = list(map(lambda x: x[idx], preds))

        auc_score = roc_auc_score(ts_dy, preds_1)
        fprs, tprs, ths = roc_curve(ts_dy, preds_1)
        print("auc_score is : ",auc_score)
        
        return auc_score
#         J_idx = np.argmax(tprs - fprs)
#         fpr, tpr, th = fprs[J_idx], tprs[J_idx], ths[J_idx]
#         auc_score1 = auc(fprs, tprs)

#         sen = tpr
#         spe = 1 - fpr
#         stats = [sen, spe, auc_score1]
        
# change below to shap for SVM

#         shap.initjs()
#         explainer = shap.TreeExplainer(opt_clf)
#         shap_values = explainer.shap_values(tr_dx)
#         top10 = shap_values[:10][:]
        
#         with open(f"master_{task}SVM_CCstats.csv", "a") as fp:
#             wr = csv.writer(fp, dialect='excel')
#             wr.writerow(['sen','spef','auc'])
#             wr.writerow(stats)
        
       
#         with open(f"master_{task}SVM_CCshap.csv","a") as sp:
#             sp = csv.writer(ir, dialect='excel')
#             sp.writerow(['feature','shap_value'])
#             sp.writerows([top10])
            

#         with open(res_output, "w") as f:
#             f.write(f'''
# auc1: {auc_score}
# auc2: {auc_score1}
# sensitivity: {sen}
# specificity: {spe}
# J: {th}
#             ''')

ptIDs = pd.read_csv("01_MQ_Incident_Match_Files/matched_case_control_CC_01yr.csv",usecols=['PATID'],dtype =str)
ptIDs.head()
fea2id, features = pkl_load("./03_MQ_Encoding_Files/data_CC0yr_expr_features.pkl")
data = pkl_load("./03_MQ_Encoding_Files/data_CC0yr_expr.pkl")



for i in tqdm(range(5)):
    train_id, test_id = train_test_split(ptIDs,test_size=0.2)
    test_id.head(), train_id.head(), test_id.shape, train_id.shape
    test_ids = test_id.PATID.to_list()
    train_ids = train_id.PATID.to_list()

    #print(data[0][0])

    trains = []
    tests = []
    count = 0
    for dp in data:
        pid = dp[0]
        ndata = dp[1:]
        if pid in train_ids:
            trains.append(ndata)
        elif pid in test_ids:
            tests.append(ndata)
        else:
            count = count + 1

    print(count)
    matrix, pids = to_matrix(trains, len(fea2id), -1)

    tr_dx, tr_dy = create_data(matrix)

    matrix, pids = to_matrix(tests, len(fea2id), -1)

    ts_dx, ts_dy = create_data(matrix)


    pkl4_dump((tr_dx, tr_dy, ts_dx, ts_dy), "./03_MQ_Encoding_Files/expr_data_CC0yr_train_test.pkl")


##Algorithm
    expr2(clf2, tuned_parameters_max_iter2, tasks = [0], nb = 10, nit = 20, model_name = 'SVM')
    




  0%|          | 0/5 [00:00<?, ?it/s][A[A[A

0
(5681, 10046)
[0. 0. 0. ... 0. 0. 0.]
(5681, 10045) (5681,)
(1421, 10046)
[1. 0. 0. ... 0. 0. 0.]
(1421, 10045) (1421,)
current task: 0 SVM
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed: 20.3min
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed: 78.5min finished
Solver terminated early (max_iter=900).  Consider pre-processing your data with StandardScaler or MinMaxScaler.



 20%|██        | 1/5 [1:25:28<5:41:52, 5128.14s/it][A[A[A

auc_score is :  0.6983382979966123
0
(5681, 10046)
[0. 0. 0. ... 0. 0. 0.]
(5681, 10045) (5681,)
(1421, 10046)
[1. 1. 0. ... 0. 0. 0.]
(1421, 10045) (1421,)
current task: 0 SVM
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed: 20.2min
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed: 79.3min finished
Solver terminated early (max_iter=800).  Consider pre-processing your data with StandardScaler or MinMaxScaler.



 40%|████      | 2/5 [2:50:50<4:16:19, 5126.42s/it][A[A[A

auc_score is :  0.6854488431268122
0
(5681, 10046)
[0. 0. 0. ... 0. 0. 0.]
(5681, 10045) (5681,)
(1421, 10046)
[1. 1. 0. ... 0. 0. 0.]
(1421, 10045) (1421,)
current task: 0 SVM
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed: 21.0min
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed: 82.3min finished
Solver terminated early (max_iter=900).  Consider pre-processing your data with StandardScaler or MinMaxScaler.



 60%|██████    | 3/5 [4:19:59<2:53:06, 5193.15s/it][A[A[A

auc_score is :  0.6565180915266163
0
(5681, 10046)
[0. 0. 0. ... 0. 0. 0.]
(5681, 10045) (5681,)
(1421, 10046)
[1. 1. 0. ... 0. 0. 0.]
(1421, 10045) (1421,)
current task: 0 SVM
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed: 21.3min
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed: 81.3min finished
Solver terminated early (max_iter=800).  Consider pre-processing your data with StandardScaler or MinMaxScaler.



 80%|████████  | 4/5 [5:48:05<1:27:00, 5220.90s/it][A[A[A

auc_score is :  0.6854488431268122
0
(5681, 10046)
[0. 0. 0. ... 0. 0. 0.]
(5681, 10045) (5681,)
(1421, 10046)
[1. 1. 0. ... 0. 0. 0.]
(1421, 10045) (1421,)
current task: 0 SVM
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.


KeyboardInterrupt: 

In [None]:
tuned_parameters_max_iter3 = {
    'max_iter': range(100, 500, 50),
    'tol': [0.0001, 0.001, 0.01, 0.1],
    'C': range(1, 50, 2),
    'class_weight': [None, 'balanced']
    }

clf3 = svm.SVC(probability=True)
def expr3(clf, params, tasks, nb, nit, model_name):
    for task in tasks:
        print(f"current task: {task} {model_name}")
        res_output = f"{task}year_{model_name}.txt"
        model_dump = f"{task}year_{model_name}_model.pkl"

        fea2iD, features = pkl_load(f"./03_MQ_Encoding_Files/data_CC{task}yr_expr_features.pkl")
        tr_dx, tr_dy, ts_dx, ts_dy = pkl4_load(f"./03_MQ_Encoding_Files/expr_data_CC{task}yr_train_test.pkl")
        #print(tr_dx.shape, ts_dx.shape)
        
        cv_model = RandomizedSearchCV(clf, params, scoring='roc_auc', n_jobs=nb, 
                                      cv=StratifiedKFold(n_splits=5, shuffle=True), 
                                      verbose=1, iid=True, n_iter=nit)
        cv_model.fit(tr_dx, tr_dy)
        best_5_cv = cv_model.best_estimator_
        opt_clf = cv_model.best_estimator_
        pkl_dump(opt_clf, model_dump)

        preds = opt_clf.predict_proba(ts_dx)
        pkl_dump(preds, f"{task}year_{model_name}_preds.pkl")

        idx = np.argmax(opt_clf.classes_)
        preds_1 = list(map(lambda x: x[idx], preds))

        auc_score = roc_auc_score(ts_dy, preds_1)
        fprs, tprs, ths = roc_curve(ts_dy, preds_1)
        print("auc_score is : ",auc_score)
        
        return auc_score
#         J_idx = np.argmax(tprs - fprs)
#         fpr, tpr, th = fprs[J_idx], tprs[J_idx], ths[J_idx]
#         auc_score1 = auc(fprs, tprs)

#         sen = tpr
#         spe = 1 - fpr
#         stats = [sen, spe, auc_score1]
        
# change below to shap for SVM

#         shap.initjs()
#         explainer = shap.TreeExplainer(opt_clf)
#         shap_values = explainer.shap_values(tr_dx)
#         top10 = shap_values[:10][:]
        
#         with open(f"master_{task}SVM_CCstats.csv", "a") as fp:
#             wr = csv.writer(fp, dialect='excel')
#             wr.writerow(['sen','spef','auc'])
#             wr.writerow(stats)
        
       
#         with open(f"master_{task}SVM_CCshap.csv","a") as sp:
#             sp = csv.writer(ir, dialect='excel')
#             sp.writerow(['feature','shap_value'])
#             sp.writerows([top10])
            

#         with open(res_output, "w") as f:
#             f.write(f'''
# auc1: {auc_score}
# auc2: {auc_score1}
# sensitivity: {sen}
# specificity: {spe}
# J: {th}
#             ''')

ptIDs = pd.read_csv("01_MQ_Incident_Match_Files/matched_case_control_CC_01yr.csv",usecols=['PATID'],dtype =str)
ptIDs.head()
fea2id, features = pkl_load("./03_MQ_Encoding_Files/data_CC0yr_expr_features.pkl")
data = pkl_load("./03_MQ_Encoding_Files/data_CC0yr_expr.pkl")



for i in tqdm(range(5)):
    train_id, test_id = train_test_split(ptIDs,test_size=0.2)
    test_id.head(), train_id.head(), test_id.shape, train_id.shape
    test_ids = test_id.PATID.to_list()
    train_ids = train_id.PATID.to_list()

    #print(data[0][0])

    trains = []
    tests = []
    count = 0
    for dp in data:
        pid = dp[0]
        ndata = dp[1:]
        if pid in train_ids:
            trains.append(ndata)
        elif pid in test_ids:
            tests.append(ndata)
        else:
            count = count + 1

    print(count)
    matrix, pids = to_matrix(trains, len(fea2id), -1)

    tr_dx, tr_dy = create_data(matrix)

    matrix, pids = to_matrix(tests, len(fea2id), -1)

    ts_dx, ts_dy = create_data(matrix)


    pkl4_dump((tr_dx, tr_dy, ts_dx, ts_dy), "./03_MQ_Encoding_Files/expr_data_CC0yr_train_test.pkl")


##Algorithm
    expr3(clf3, tuned_parameters_max_iter3, tasks = [0], nb = 10, nit = 20, model_name = 'SVM')
    





  0%|          | 0/5 [00:00<?, ?it/s][A[A[A[A

0
(5681, 10046)
[0. 0. 0. ... 0. 0. 0.]
(5681, 10045) (5681,)
(1421, 10046)
[1. 0. 0. ... 0. 0. 0.]
(1421, 10045) (1421,)
current task: 0 SVM
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed: 16.6min
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed: 54.2min finished
Solver terminated early (max_iter=450).  Consider pre-processing your data with StandardScaler or MinMaxScaler.




 20%|██        | 1/5 [58:47<3:55:08, 3527.23s/it][A[A[A[A

auc_score is :  0.6683773792669498
0
(5681, 10046)
[0. 0. 0. ... 0. 0. 0.]
(5681, 10045) (5681,)
(1421, 10046)
[0. 0. 0. ... 0. 0. 0.]
(1421, 10045) (1421,)
current task: 0 SVM
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed: 16.3min
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed: 54.7min finished
Solver terminated early (max_iter=450).  Consider pre-processing your data with StandardScaler or MinMaxScaler.




 40%|████      | 2/5 [1:58:22<2:57:04, 3541.59s/it][A[A[A[A

auc_score is :  0.7124256970179322
0
(5681, 10046)
[0. 0. 0. ... 0. 0. 0.]
(5681, 10045) (5681,)
(1421, 10046)
[0. 0. 0. ... 0. 0. 0.]
(1421, 10045) (1421,)
current task: 0 SVM
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed: 17.2min
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed: 58.4min finished
Solver terminated early (max_iter=400).  Consider pre-processing your data with StandardScaler or MinMaxScaler.




 60%|██████    | 3/5 [3:00:52<2:00:08, 3604.02s/it][A[A[A[A

auc_score is :  0.6908960470621995
0
(5681, 10046)
[0. 0. 0. ... 0. 0. 0.]
(5681, 10045) (5681,)
(1421, 10046)
[0. 0. 0. ... 0. 0. 0.]
(1421, 10045) (1421,)
current task: 0 SVM
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.


In [20]:
tuned_parameters_max_iter4 = {
    'max_iter': range(50, 100, 10),
    'tol': [0.0001, 0.001, 0.01, 0.1],
    'C': range(1, 50, 2),
    'class_weight': [None, 'balanced']
    }

clf4 = svm.SVC(probability=True)
def expr4(clf, params, tasks, nb, nit, model_name):
    for task in tasks:
        print(f"current task: {task} {model_name}")
        res_output = f"{task}year_{model_name}.txt"
        model_dump = f"{task}year_{model_name}_model.pkl"

        fea2iD, features = pkl_load(f"./03_MQ_Encoding_Files/data_CC{task}yr_expr_features.pkl")
        tr_dx, tr_dy, ts_dx, ts_dy = pkl4_load(f"./03_MQ_Encoding_Files/expr_data_CC{task}yr_train_test.pkl")
        #print(tr_dx.shape, ts_dx.shape)
        
        cv_model = RandomizedSearchCV(clf, params, scoring='roc_auc', n_jobs=nb, 
                                      cv=StratifiedKFold(n_splits=5, shuffle=True), 
                                      verbose=1, iid=True, n_iter=nit)
        cv_model.fit(tr_dx, tr_dy)
        best_5_cv = cv_model.best_estimator_
        opt_clf = cv_model.best_estimator_
        pkl_dump(opt_clf, model_dump)

        preds = opt_clf.predict_proba(ts_dx)
        pkl_dump(preds, f"{task}year_{model_name}_preds.pkl")

        idx = np.argmax(opt_clf.classes_)
        preds_1 = list(map(lambda x: x[idx], preds))

        auc_score = roc_auc_score(ts_dy, preds_1)
        fprs, tprs, ths = roc_curve(ts_dy, preds_1)
        print("auc_score is : ",auc_score)
        
        return auc_score
#         J_idx = np.argmax(tprs - fprs)
#         fpr, tpr, th = fprs[J_idx], tprs[J_idx], ths[J_idx]
#         auc_score1 = auc(fprs, tprs)

#         sen = tpr
#         spe = 1 - fpr
#         stats = [sen, spe, auc_score1]
        
# change below to shap for SVM

#         shap.initjs()
#         explainer = shap.TreeExplainer(opt_clf)
#         shap_values = explainer.shap_values(tr_dx)
#         top10 = shap_values[:10][:]
        
#         with open(f"master_{task}SVM_CCstats.csv", "a") as fp:
#             wr = csv.writer(fp, dialect='excel')
#             wr.writerow(['sen','spef','auc'])
#             wr.writerow(stats)
        
       
#         with open(f"master_{task}SVM_CCshap.csv","a") as sp:
#             sp = csv.writer(ir, dialect='excel')
#             sp.writerow(['feature','shap_value'])
#             sp.writerows([top10])
            

#         with open(res_output, "w") as f:
#             f.write(f'''
# auc1: {auc_score}
# auc2: {auc_score1}
# sensitivity: {sen}
# specificity: {spe}
# J: {th}
#             ''')

ptIDs = pd.read_csv("01_MQ_Incident_Match_Files/matched_case_control_CC_01yr.csv",usecols=['PATID'],dtype =str)
ptIDs.head()
fea2id, features = pkl_load("./03_MQ_Encoding_Files/data_CC0yr_expr_features.pkl")
data = pkl_load("./03_MQ_Encoding_Files/data_CC0yr_expr.pkl")



for i in tqdm(range(5)):
    train_id, test_id = train_test_split(ptIDs,test_size=0.2)
    test_id.head(), train_id.head(), test_id.shape, train_id.shape
    test_ids = test_id.PATID.to_list()
    train_ids = train_id.PATID.to_list()

    #print(data[0][0])

    trains = []
    tests = []
    count = 0
    for dp in data:
        pid = dp[0]
        ndata = dp[1:]
        if pid in train_ids:
            trains.append(ndata)
        elif pid in test_ids:
            tests.append(ndata)
        else:
            count = count + 1

    print(count)
    matrix, pids = to_matrix(trains, len(fea2id), -1)

    tr_dx, tr_dy = create_data(matrix)

    matrix, pids = to_matrix(tests, len(fea2id), -1)

    ts_dx, ts_dy = create_data(matrix)


    pkl4_dump((tr_dx, tr_dy, ts_dx, ts_dy), "./03_MQ_Encoding_Files/expr_data_CC0yr_train_test.pkl")


##Algorithm
    expr4(clf4, tuned_parameters_max_iter4, tasks = [0], nb = 10, nit = 20, model_name = 'SVM')
    





  0%|          | 0/5 [00:00<?, ?it/s][A[A[A[A

0
(5681, 10046)
[0. 0. 0. ... 0. 0. 0.]
(5681, 10045) (5681,)
(1421, 10046)
[0. 0. 0. ... 0. 0. 0.]
(1421, 10045) (1421,)
current task: 0 SVM
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:  4.9min
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed: 19.5min finished
Solver terminated early (max_iter=90).  Consider pre-processing your data with StandardScaler or MinMaxScaler.




 20%|██        | 1/5 [20:53<1:23:35, 1253.93s/it][A[A[A[A

auc_score is :  0.5510510079632686
0
(5681, 10046)
[0. 0. 0. ... 0. 0. 0.]
(5681, 10045) (5681,)
(1421, 10046)
[0. 0. 0. ... 0. 0. 0.]
(1421, 10045) (1421,)
current task: 0 SVM
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:  4.9min
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed: 19.5min finished
Solver terminated early (max_iter=90).  Consider pre-processing your data with StandardScaler or MinMaxScaler.




 40%|████      | 2/5 [41:49<1:02:43, 1254.45s/it][A[A[A[A

auc_score is :  0.4677378056586864
0
(5681, 10046)
[0. 0. 0. ... 0. 0. 0.]
(5681, 10045) (5681,)
(1421, 10046)
[0. 0. 0. ... 0. 0. 0.]
(1421, 10045) (1421,)
current task: 0 SVM
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.


KeyboardInterrupt: 

## Hyperparameter test - C

In [18]:
tuned_parameters_c1 = {
    'max_iter': range(100, 500, 50),
    'tol': [0.0001, 0.001, 0.01, 0.1],
    #'C': range(1, 50, 2),
    'class_weight': [None, 'balanced']
    }

clf1 = svm.SVC(probability=True)
def expr1(clf, params, tasks, nb, nit, model_name):
    for task in tasks:
        print(f"current task: {task} {model_name}")
        res_output = f"{task}year_{model_name}.txt"
        model_dump = f"{task}year_{model_name}_model.pkl"

        fea2iD, features = pkl_load(f"./03_MQ_Encoding_Files/data_CC{task}yr_expr_features.pkl")
        tr_dx, tr_dy, ts_dx, ts_dy = pkl4_load(f"./03_MQ_Encoding_Files/expr_data_CC{task}yr_train_test.pkl")
        #print(tr_dx.shape, ts_dx.shape)
        
        cv_model = RandomizedSearchCV(clf, params, scoring='roc_auc', n_jobs=nb, 
                                      cv=StratifiedKFold(n_splits=5, shuffle=True), 
                                      verbose=1, iid=True, n_iter=nit)
        cv_model.fit(tr_dx, tr_dy)
        best_5_cv = cv_model.best_estimator_
        opt_clf = cv_model.best_estimator_
        pkl_dump(opt_clf, model_dump)

        preds = opt_clf.predict_proba(ts_dx)
        pkl_dump(preds, f"{task}year_{model_name}_preds.pkl")

        idx = np.argmax(opt_clf.classes_)
        preds_1 = list(map(lambda x: x[idx], preds))

        auc_score = roc_auc_score(ts_dy, preds_1)
        fprs, tprs, ths = roc_curve(ts_dy, preds_1)
        print("auc_score is : ",auc_score)
        
        return auc_score
#         J_idx = np.argmax(tprs - fprs)
#         fpr, tpr, th = fprs[J_idx], tprs[J_idx], ths[J_idx]
#         auc_score1 = auc(fprs, tprs)

#         sen = tpr
#         spe = 1 - fpr
#         stats = [sen, spe, auc_score1]
        
# change below to shap for SVM

#         shap.initjs()
#         explainer = shap.TreeExplainer(opt_clf)
#         shap_values = explainer.shap_values(tr_dx)
#         top10 = shap_values[:10][:]
        
#         with open(f"master_{task}SVM_CCstats.csv", "a") as fp:
#             wr = csv.writer(fp, dialect='excel')
#             wr.writerow(['sen','spef','auc'])
#             wr.writerow(stats)
        
       
#         with open(f"master_{task}SVM_CCshap.csv","a") as sp:
#             sp = csv.writer(ir, dialect='excel')
#             sp.writerow(['feature','shap_value'])
#             sp.writerows([top10])
            

#         with open(res_output, "w") as f:
#             f.write(f'''
# auc1: {auc_score}
# auc2: {auc_score1}
# sensitivity: {sen}
# specificity: {spe}
# J: {th}
#             ''')

ptIDs = pd.read_csv("01_MQ_Incident_Match_Files/matched_case_control_CC_01yr.csv",usecols=['PATID'],dtype =str)
ptIDs.head()
fea2id, features = pkl_load("./03_MQ_Encoding_Files/data_CC0yr_expr_features.pkl")
data = pkl_load("./03_MQ_Encoding_Files/data_CC0yr_expr.pkl")



for i in tqdm(range(5)):
    train_id, test_id = train_test_split(ptIDs,test_size=0.2)
    test_id.head(), train_id.head(), test_id.shape, train_id.shape
    test_ids = test_id.PATID.to_list()
    train_ids = train_id.PATID.to_list()

    #print(data[0][0])

    trains = []
    tests = []
    count = 0
    for dp in data:
        pid = dp[0]
        ndata = dp[1:]
        if pid in train_ids:
            trains.append(ndata)
        elif pid in test_ids:
            tests.append(ndata)
        else:
            count = count + 1

    print(count)
    matrix, pids = to_matrix(trains, len(fea2id), -1)

    tr_dx, tr_dy = create_data(matrix)

    matrix, pids = to_matrix(tests, len(fea2id), -1)

    ts_dx, ts_dy = create_data(matrix)


    pkl4_dump((tr_dx, tr_dy, ts_dx, ts_dy), "./03_MQ_Encoding_Files/expr_data_CC0yr_train_test.pkl")


##Algorithm
    expr1(clf1, tuned_parameters_c1, tasks = [0], nb = 10, nit = 20, model_name = 'SVM')
    





  0%|          | 0/5 [00:00<?, ?it/s][A[A[A[A

0
(5681, 10046)
[0. 0. 0. ... 0. 0. 0.]
(5681, 10045) (5681,)
(1421, 10046)
[1. 1. 0. ... 0. 0. 0.]
(1421, 10045) (1421,)
current task: 0 SVM
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed: 31.0min
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed: 75.3min finished
Solver terminated early (max_iter=900).  Consider pre-processing your data with StandardScaler or MinMaxScaler.




 20%|██        | 1/5 [1:21:51<5:27:25, 4911.29s/it][A[A[A[A

auc_score is :  0.7061149628627549
0
(5681, 10046)
[0. 0. 0. ... 0. 0. 0.]
(5681, 10045) (5681,)
(1421, 10046)
[1. 0. 0. ... 0. 0. 0.]
(1421, 10045) (1421,)
current task: 0 SVM
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed: 30.9min
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed: 75.6min finished
Solver terminated early (max_iter=900).  Consider pre-processing your data with StandardScaler or MinMaxScaler.




 40%|████      | 2/5 [2:43:57<4:05:46, 4915.62s/it][A[A[A[A

auc_score is :  0.6869854753290334
0
(5681, 10046)
[0. 0. 0. ... 0. 0. 0.]
(5681, 10045) (5681,)
(1421, 10046)
[1. 0. 0. ... 0. 0. 0.]
(1421, 10045) (1421,)
current task: 0 SVM
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed: 32.4min
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed: 77.5min finished
Solver terminated early (max_iter=900).  Consider pre-processing your data with StandardScaler or MinMaxScaler.




 60%|██████    | 3/5 [4:08:08<2:45:12, 4956.47s/it][A[A[A[A

auc_score is :  0.6869854753290334
0
(5681, 10046)
[0. 0. 0. ... 0. 0. 0.]
(5681, 10045) (5681,)
(1421, 10046)
[1. 0. 0. ... 0. 0. 0.]
(1421, 10045) (1421,)
current task: 0 SVM
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed: 31.9min
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed: 77.5min finished
Solver terminated early (max_iter=900).  Consider pre-processing your data with StandardScaler or MinMaxScaler.




 80%|████████  | 4/5 [5:33:26<1:23:24, 5004.71s/it][A[A[A[A

auc_score is :  0.6869854753290334
0
(5681, 10046)
[0. 0. 0. ... 0. 0. 0.]
(5681, 10045) (5681,)
(1421, 10046)
[1. 0. 0. ... 0. 0. 0.]
(1421, 10045) (1421,)
current task: 0 SVM
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed: 33.5min
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed: 76.7min finished
Solver terminated early (max_iter=900).  Consider pre-processing your data with StandardScaler or MinMaxScaler.




100%|██████████| 5/5 [6:56:37<00:00, 5000.65s/it]  [A[A[A[A

auc_score is :  0.6869854753290334
