In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

import os

import re
import random
from termcolor import colored
# import dataframe_image as dfi
# import warnings
# import wandb
# warnings.filterwarnings('ignore')
# warnings.warn('DelftStack')
# warnings.warn('Do not show this message')

from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import matthews_corrcoef

from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import lightgbm as lgb

from IPython.display import Audio, display
def allDone():
    display(Audio(url='https://www.mediacollege.com/downloads/sound-effects/beep/beep-10.wav', autoplay=True))

embed_path = 'data_test'
result_path = 'ML_predicted_results'
wt_mt_path = 'data_test/wt_mt'

In [10]:
def train_test_dataset(situation, mode):
    if 'imbalance' in situation:
        mode_train = np.load(f'../data/{situation}/For_ML/{mode}_train.npy')
        mode_test = np.load(f'../data/{situation}/For_ML/{mode}_test.npy')

        X_train = mode_train[:,:-1]
        y_train = mode_train[:,-1].astype(int).tolist()
        X_test = mode_test[:,:-1]
        y_test = mode_test[:,-1].astype(int).tolist()
        
        return X_train, y_train, X_test, y_test
    
    else:
        mode_train1 = np.load(f'../data/{situation}/For_ML/{mode}_train_1.npy')
        mode_train2 = np.load(f'../data/{situation}/For_ML/{mode}_train_2.npy')
        mode_train3 = np.load(f'../data/{situation}/For_ML/{mode}_train_2.npy')
        mode_test = np.load(f'../data/{situation}/For_ML/{mode}_test.npy')
        
        X_train1, X_train2, X_train3 = mode_train1[:,:-1], mode_train2[:,:-1], mode_train3[:,:-1]
        y_train1, y_train2, y_train3 = mode_train1[:,-1].astype(int).tolist(), mode_train2[:,-1].astype(int).tolist(), mode_train3[:,-1].astype(int).tolist()
        X_test, y_test = mode_test[:,:-1], mode_test[:,-1].astype(int).tolist()
        
        
        return X_train1, X_train2, X_train3, y_train1, y_train2, y_train3, X_test, y_test

In [11]:
# majority and mean prediction votes
def majority_vote(model1, model2, model3, mode_test):
    pred1 = model1.predict_proba(mode_test)
    pred2 = model2.predict_proba(mode_test)
    pred3 = model3.predict_proba(mode_test)
    
    # mean
    stack = pred1 + pred2 + pred3
    stack_mean = stack/3
    mean_vote = [np.argmax(i) for i in stack_mean]

    major_binary_1 = [np.argmax(pred1[i]) for i in range(len(pred1))]
    major_binary_2 = [np.argmax(pred2[i]) for i in range(len(pred2))]
    major_binary_3 = [np.argmax(pred3[i]) for i in range(len(pred3))]

    major_vote = []
    for i in range(len(major_binary_1)):
        tmp_result = []
        tmp_result.append(major_binary_1[i])
        tmp_result.append(major_binary_2[i])
        tmp_result.append(major_binary_3[i])
        result = np.mean(tmp_result)
        if result > 0.5:
            major_vote.append(1)
        else:
            major_vote.append(0)
    
    return mean_vote, major_vote

# print out reports
def report_save(y_true, y_pred, path, mode, name, label_names=None, *args, **kv):
    result_path = f'../data/{path}/ML_predicted_results/{mode}_results'
    # print the classification report here
    report = classification_report(y_true, y_pred, target_names=label_names)
    print(colored(f'\n\t\t\t\t *** {name}_report ***:\n\n\n', 'blue', attrs=['bold']), report)
    MCC = matthews_corrcoef(y_true, y_pred)
    print(f"{name} MCC:", MCC)

    # create report dataframe
    report_for_save = classification_report(y_true, y_pred, target_names=label_names, output_dict=True)
    report_csv = pd.DataFrame(report_for_save).transpose()
    report_csv['MCC'] = MCC

    # style.background_gradient or highlight_max
    report_styled = report_csv.style.background_gradient(subset=['precision', 'recall', 'f1-score'])
    
    # Save results
    if not os.path.isdir(f'{result_path}'):
        os.mkdir(f'{result_path}')

    # export dataframe to .png
    # dfi.export(report_styled, f'{result_path}/{name}_report.png')

    report_csv.to_csv(f'{result_path}/{name}_report_save.csv')


In [12]:
def ML_pred_for_imbalance(X_train, y_train, X_test, y_test, path, mode):
    import lightgbm as lgb
    # XGBoost
    # xgb = XGBClassifier(n_estimators = 300, tree_method='gpu_hist', gpu_id=0)
    xgb = XGBClassifier()
    xgb.fit(X_train, y_train)
    y_xgb = xgb.predict(X_test)
    report_save(y_test, y_xgb,path,mode = mode, name =  'XGBoost')
    
    # Catboost
    cbt = CatBoostClassifier(iterations=10, learning_rate=0.09, depth=10)
    cbt.fit(X_train, y_train)
    y_cbt = cbt.predict(X_test)
    report_save(y_test, y_cbt,path, mode = mode, name = 'CatBoost')
    
    # LightGBM
    d_train=lgb.Dataset(X_train, label=y_train)
    params={}
    # params['learning_rate']=0.41282313322582176
    params['learning_rate']=0.1
    params['boosting_type']='gbdt' #GradientBoostingDecisionTree
    params['objective']='cross_entropy' #Binary target feature
    params['metric']='binary_error' #metric for binary classification
    params['max_depth']= 10
    params['n_estimators'] = 300
    # params['num_leaves'] = 34
    # params['reg_lambda'] = 0.9557019573592245
    # params['colsample_by_tree'] = 0.8506663985944544
    lgb = lgb.train(params,d_train,300)
    y_lgb = lgb.predict(X_test)
    y_lgb=y_lgb.round(0)
    y_lgb = y_lgb.astype(int)
    report_save(y_test, y_lgb,path,mode = mode, name = 'LightGBM')
    

In [5]:
def ML_pred_for_balance(X_train1, X_train2, X_train3, y_train1, y_train2, y_train3, X_test, y_test, path, mode):
    
    # XGBoost
    # xgb_1 = XGBClassifier(n_estimators = 300, tree_method='gpu_hist', gpu_id=0)
    xgb_1 = XGBClassifier()
    xgb_2 = XGBClassifier()
    xgb_3 = XGBClassifier()
    xgb_1.fit(X_train1, y_train1)
    xgb_2.fit(X_train2, y_train2)
    xgb_3.fit(X_train3, y_train3)
    xgb_mean_prodictions_binary, xgb_majority_vote_binary = majority_vote(xgb_1,xgb_2,xgb_3, X_test)
    report_save(xgb_mean_prodictions_binary, y_test, path, mode,'XGBoost_mean')
    report_save(xgb_majority_vote_binary, y_test, path, mode,'XGBoost_majority')

    # Catboost
    cbt_1 = CatBoostClassifier(iterations=10, learning_rate=0.09, depth=10)
    cbt_2 = CatBoostClassifier(iterations=10, learning_rate=0.09, depth=10)
    cbt_3 = CatBoostClassifier(iterations=10, learning_rate=0.09, depth=10)

    cbt_1.fit(X_train1, y_train1)
    cbt_2.fit(X_train2, y_train2)
    cbt_3.fit(X_train3, y_train3)
    cbt_mean_prodictions_binary, cbt_majority_vote_binary = majority_vote(cbt_1,cbt_2,cbt_3, X_test)
    report_save(cbt_mean_prodictions_binary, y_test,path,mode, 'Catboost_mean')
    report_save(cbt_majority_vote_binary, y_test,path, mode,'Catboost_majority')

    # LightGBM
    params={}
    params['learning_rate']=0.1
    params['boosting_type']='gbdt' #GradientBoostingDecisionTree
    params['objective']='cross_entropy' #Binary target feature
    params['metric']='binary_error' #metric for binary classification
    params['max_depth']= 10
    params['n_estimators'] = 300
    params['n_iter'] = 500

    lgb_1 = lgb.LGBMClassifier(**params)
    lgb_2 = lgb.LGBMClassifier(**params)
    lgb_3 = lgb.LGBMClassifier(**params)

    lgb_1.fit(X_train1, y_train1)
    lgb_2.fit(X_train2, y_train2)
    lgb_3.fit(X_train3, y_train3)

    lgb_mean_prodictions_binary, lgb_majority_vote_binary = majority_vote(lgb_1,lgb_2,lgb_3, X_test)
    report_save(lgb_mean_prodictions_binary, y_test,path, mode, 'LightGBM_mean')
    report_save(lgb_majority_vote_binary, y_test,path, mode, 'LightGBM_majority')

In [18]:
X1_train1_s4, X1_train2_s4, X1_train3_s4, y1_train1_s4, y1_train2_s4, y1_train3_s4, X1_test_s4, y1_test_s4 = train_test_dataset('balanced_diff_seq', 'mode1')
X2_train1_s4, X2_train2_s4, X2_train3_s4, y2_train1_s4, y2_train2_s4, y2_train3_s4, X2_test_s4, y2_test_s4 = train_test_dataset('balanced_diff_seq', 'mode2')


In [19]:
X1_train1_s4, X1_train2_s4, X1_train3_s4, y1_train1_s4, y1_train2_s4, y1_train3_s4, X1_test_s4, y1_test_s4 = X1_train1_s4[:100,:], X1_train2_s4[:100,:], X1_train3_s4[:100,:], y1_train1_s4[:100], y1_train2_s4[:100], y1_train3_s4[:100], X1_test_s4[:100,:], y1_test_s4[:100]
X2_train2_s4, X2_train2_s4, X2_train3_s4, y2_train2_s4, y2_train2_s4, y2_train3_s4, X2_test_s4, y2_test_s4 = X2_train2_s4[:100,:], X2_train2_s4[:100,:], X2_train3_s4[:100,:], y2_train2_s4[:100], y2_train2_s4[:100], y2_train3_s4[:100], X2_test_s4[:100,:], y2_test_s4[:100]


In [22]:
ML_pred_for_balance(X1_train1_s4, X1_train2_s4, X1_train3_s4, y1_train1_s4, y1_train2_s4, y1_train3_s4, X1_test_s4, y1_test_s4, 'balanced_diff_seq', 'mode1')
# ML_pred_for_balance(X2_train1_s4, X2_train2_s4, X2_train3_s4, y2_train1_s4, y2_train2_s4, y2_train3_s4, X2_test_s4, y2_test_s4, 'balanced_diff_seq', 'mode2')




[1m[34m
				 *** XGBoost_mean_report ***:


[0m               precision    recall  f1-score   support

           0       0.38      0.87      0.53        39
           1       0.55      0.10      0.17        61

    accuracy                           0.40       100
   macro avg       0.46      0.49      0.35       100
weighted avg       0.48      0.40      0.31       100

XGBoost_mean MCC: -0.046523221712558634
[1m[34m
				 *** XGBoost_majority_report ***:


[0m               precision    recall  f1-score   support

           0       0.48      0.88      0.62        49
           1       0.45      0.10      0.16        51

    accuracy                           0.48       100
   macro avg       0.47      0.49      0.39       100
weighted avg       0.47      0.48      0.39       100

XGBoost_majority MCC: -0.038999169712218056
0:	learn: 0.6492546	total: 84ms	remaining: 756ms
1:	learn: 0.5999003	total: 259ms	remaining: 1.04s
2:	learn: 0.5586566	total: 427ms	remaining: 997ms
3:	lear



[1m[34m
				 *** LightGBM_mean_report ***:


[0m               precision    recall  f1-score   support

           0       0.35      0.79      0.48        39
           1       0.27      0.05      0.08        61

    accuracy                           0.34       100
   macro avg       0.31      0.42      0.28       100
weighted avg       0.30      0.34      0.24       100

LightGBM_mean MCC: -0.24310021486421482
[1m[34m
				 *** LightGBM_majority_report ***:


[0m               precision    recall  f1-score   support

           0       0.42      0.82      0.55        45
           1       0.27      0.05      0.09        55

    accuracy                           0.40       100
   macro avg       0.34      0.44      0.32       100
weighted avg       0.34      0.40      0.30       100

LightGBM_majority MCC: -0.1959390020617817


In [166]:
X1_train_s3.shape

(100, 2048)

In [168]:
from collections import Counter
Counter(y1_train_s3)

Counter({0: 79, 1: 21})

In [175]:
ML_pred_for_imbalance(X1_train_s3, y1_train_s3, X1_test_s3, y1_test_s3,'imbalance_diff_seq', mode = 'mode1')
# ML_pred_for_imbalance(X2_train_s3, y2_train_s3, X2_test_s3, y2_test_s3,'imbalance_diff_seq', mode = 'mode2')



[1m[34m
				 *** XGBoost_report ***:


[0m               precision    recall  f1-score   support

           0       0.89      0.79      0.83        89
           1       0.10      0.18      0.12        11

    accuracy                           0.72       100
   macro avg       0.49      0.48      0.48       100
weighted avg       0.80      0.72      0.76       100

XGBoost MCC: -0.024324681077860674
0:	learn: 0.6370890	total: 148ms	remaining: 1.33s
1:	learn: 0.5914948	total: 309ms	remaining: 1.24s
2:	learn: 0.5439692	total: 355ms	remaining: 828ms
3:	learn: 0.4959131	total: 519ms	remaining: 778ms
4:	learn: 0.4574382	total: 706ms	remaining: 706ms
5:	learn: 0.4210059	total: 896ms	remaining: 598ms
6:	learn: 0.3913038	total: 1.1s	remaining: 471ms
7:	learn: 0.3573130	total: 1.16s	remaining: 290ms
8:	learn: 0.3324870	total: 1.34s	remaining: 149ms
9:	learn: 0.3107309	total: 1.53s	remaining: 0us
[1m[34m
				 *** CatBoost_report ***:


[0m               precision    recall  f1-score   su



[1m[34m
				 *** LightGBM_report ***:


[0m               precision    recall  f1-score   support

           0       0.89      0.79      0.83        89
           1       0.10      0.18      0.12        11

    accuracy                           0.72       100
   macro avg       0.49      0.48      0.48       100
weighted avg       0.80      0.72      0.76       100

LightGBM MCC: -0.024324681077860674


In [122]:
xgb_1 = XGBClassifier()
xgb_2 = XGBClassifier()
xgb_3 = XGBClassifier()
xgb_1.fit(X1_train1_s2, y1_train1_s2)
xgb_2.fit(X1_train2_s2, y1_train2_s2)
xgb_3.fit(X1_train3_s2, y1_train3_s2)



In [123]:
pred1 = xgb_1.predict_proba(X1_test_s2)
pred2 = xgb_2.predict_proba(X1_test_s2)
pred3 = xgb_3.predict_proba(X1_test_s2)

In [136]:
# mean
stack = pred1 + pred2 + pred3
stack_mean = stack/3
mean_vote = [np.argmax(i) for i in stack_mean]

major_binary_1 = [np.argmax(pred1[i]) for i in range(len(pred1))]
major_binary_2 = [np.argmax(pred2[i]) for i in range(len(pred2))]
major_binary_3 = [np.argmax(pred3[i]) for i in range(len(pred3))]

major_result = []
for i in range(len(result_1)):
    tmp_result = []
    tmp_result.append(major_binary_1[i])
    tmp_result.append(major_binary_2[i])
    tmp_result.append(major_binary_3[i])
    result = np.mean(tmp_result)
    if result > 0.5:
        major_result.append(1)
    else:
        major_result.append(0)

In [137]:
major_result

[1, 1, 1, 1, 1]

In [132]:
major_binary_1 = [np.argmax(pred1[i]) for i in range(len(pred1))]

In [133]:
major_binary_1

[1, 1, 1, 1, 1]

In [100]:
def get_result(pred):
    result_1 = []
    for i in range(len(pred)):
        result = np.argmax(pred[i])
        result_1.append(result)
    return result_1

In [101]:
result_1 = get_result(pred1)
result_2 = get_result(pred2)
result_3 = get_result(pred3)

In [102]:
result_1

[0, 1, 1, 1, 0]

In [108]:
result_2

[0, 0, 0, 1, 1]

In [109]:
result_3

[1, 1, 0, 1, 1]

In [105]:
max_result = []
for i in range(len(result_1)):
    tmp_result = []
    tmp_result.append(result_1[i])
    tmp_result.append(result_2[i])
    tmp_result.append(result_3[i])
    result = np.mean(tmp_result)
    if result > 0.5:
        max_result.append(1)
    else:
        max_result.append(0)
        

In [107]:
max_result

[0, 1, 0, 1, 1]

In [44]:
stacked_preds = np.vstack([pred1, pred2, pred3])

In [45]:
stacked_preds.shape

(15, 2)

In [46]:
stacked_preds

array([[0.39999998, 0.6       ],
       [0.39999998, 0.6       ],
       [0.39999998, 0.6       ],
       [0.39999998, 0.6       ],
       [0.39999998, 0.6       ],
       [0.27574146, 0.72425854],
       [0.27574146, 0.72425854],
       [0.27574146, 0.72425854],
       [0.27574146, 0.72425854],
       [0.27574146, 0.72425854],
       [0.27574146, 0.72425854],
       [0.27574146, 0.72425854],
       [0.27574146, 0.72425854],
       [0.27574146, 0.72425854],
       [0.27574146, 0.72425854]], dtype=float32)

In [73]:
result = np.argmax(b)
# (stacked_preds > 0.5).astype(int)

[0.39999998 0.6       ]
patho


In [64]:
mean_predictions_proba = stacked_preds.mean(axis=0)

In [None]:
result = np.argmax(b)

In [65]:
mean_predictions_proba

array([0.31716102, 0.68283904], dtype=float32)

In [61]:
y1_test_s2

[0, 1, 1, 0, 1]

In [54]:
mean_predictions_binary = (mean_predictions_proba > 0.5).astype(int)

In [55]:
mean_predictions_binary

array([0, 1])

In [58]:
majority_vote_proba  = binarized_predictions.mean(axis=0)

In [59]:
majority_vote_proba

array([0., 1.])

In [60]:
majority_vote_binary  = (majority_vote_proba > 0.5).astype(int)
majority_vote_binary

array([0, 1])

In [None]:
# majority and mean prediction votes
def majority_vote(model1, model2, model3, mode_test):
    pred1 = model1.predict_proba(mode_test)
    pred2 = model2.predict_proba(mode_test)
    pred3 = model3.predict_proba(mode_test)
    
    stacked_preds = np.vstack([pred1, pred2, pred3])

    # Take mean of all three models # use this for AUC ROC, prec-recall AUC etc.
    mean_predictions_proba = stacked_preds.mean(axis=0) # might be axis =1 just make sure that the output shape is n_predictions long

    # use this for f1 score, precision, recall, MCC etc.
    mean_predictions_binary = (mean_predictions_proba > 0.5).astype(int)

    # convert the mean predictions of each model to binary
    binarized_predictions = (stacked_preds > 0.5).astype(int)

    # Take the average of the binary votes #use this for AUC ROC, prec-recall AUC etc.
    majority_vote_proba  = binarized_predictions.mean(axis=0) # might be axis =1 just make sure that the output shape is n_predictions long

    # Convert the majority vote to binary # use this for f1 score, precision, recall, MCC etc.
    majority_vote_binary  = (majority_vote_proba > 0.5).astype(int)
    
    return mean_predictions_binary, majority_vote_binary


In [18]:
!pip install joblib



In [None]:
def data_for_downstream():
    path = "../data/pcm1/"
    concat = []
    for pkl in os.listdir(path):
        if(".pkl" in pkl):
            file_path = path + pkl
            with open(file_path, 'rb') as file:
                y = pickle.load(file)
                concat += y
    data_y = []
    data_X = []
    for i in range(len(concat)):
        data_X.append(concat[i]['x'][0])
        # data_y.append(int(concat[i]['label']))
    data_X = np.array(data_X)
    return data_X

In [64]:
import xgboost as xgb

In [67]:
model_xgb_2 = xgb.XGBClassifier()
model_xgb_2.load_model("../data/model/XGBoost_model.json")

XGBoostError: [20:57:53] /Users/runner/miniforge3/conda-bld/xgboost-split_1645117899018/work/include/xgboost/json.h:73: Invalid cast, from Integer to Boolean
Stack trace:
  [bt] (0) 1   libxgboost.dylib                    0x0000000176794394 dmlc::LogMessageFatal::~LogMessageFatal() + 124
  [bt] (1) 2   libxgboost.dylib                    0x00000001767bc7cc xgboost::JsonBoolean const* xgboost::Cast<xgboost::JsonBoolean const, xgboost::Value const>(xgboost::Value const*) + 660
  [bt] (2) 3   libxgboost.dylib                    0x00000001768abc98 xgboost::RegTree::LoadModel(xgboost::Json const&) + 4396
  [bt] (3) 4   libxgboost.dylib                    0x0000000176828300 xgboost::gbm::GBTreeModel::LoadModel(xgboost::Json const&) + 720
  [bt] (4) 5   libxgboost.dylib                    0x00000001768160c0 xgboost::gbm::GBTree::LoadModel(xgboost::Json const&) + 448
  [bt] (5) 6   libxgboost.dylib                    0x000000017682b3c0 xgboost::LearnerIO::LoadModel(xgboost::Json const&) + 1368
  [bt] (6) 7   libxgboost.dylib                    0x000000017679d414 XGBoosterLoadModel + 1068
  [bt] (7) 8   libffi.8.dylib                      0x0000000104e0c04c ffi_call_SYSV + 76
  [bt] (8) 9   libffi.8.dylib                      0x0000000104e0974c ffi_call_int + 1208



In [30]:
# Catboost load
cbt = CatBoostClassifier()
file_name = "../data/model/CatBoost_model"
cbt = cbt.load_model(file_name, format = 'cbm')

In [70]:
# LightGBM load
# joblib.load('lgb.pkl')
lgb = joblib.load("../data/model/LightGBM_model.pkl")

In [71]:
y_lgb = lgb.predict(data_X).round(0).astype(int)
# y_lgb=y_lgb.round(0)
# y_lgb = y_lgb.astype(int)

In [72]:
y_lgb

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [48]:
data_X = data_for_downstream()

In [50]:
data_X.shape

(37, 2048)

In [51]:
cat_pred = cbt.predict(data_X)

In [52]:
cat_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [37]:
report = classification_report(y1_test_s3, preds)

In [40]:
name = 'Catboost'
print(colored(f'\n\t\t\t\t *** {name}_report ***:\n\n\n', 'blue', attrs=['bold']), report)

[1m[34m
				 *** Catboost_report ***:


[0m               precision    recall  f1-score   support

           0       0.78      0.98      0.87      3710
           1       0.23      0.02      0.03      1023

    accuracy                           0.77      4733
   macro avg       0.51      0.50      0.45      4733
weighted avg       0.66      0.77      0.69      4733

