In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

import os

import re
import random
from termcolor import colored
# import dataframe_image as dfi
# import warnings
# import wandb
# warnings.filterwarnings('ignore')
# warnings.warn('DelftStack')
# warnings.warn('Do not show this message')

from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import lightgbm as lgb

from IPython.display import Audio, display
def allDone():
    """Display an autoplaying beep (loaded from a remote URL) in the notebook output."""
    beep = Audio(
        url='https://www.mediacollege.com/downloads/sound-effects/beep/beep-10.wav',
        autoplay=True,
    )
    display(beep)

# Module-level path names.
# NOTE(review): none of the visible cells read these; report_save builds its
# own local `result_path`, so this global is not what it writes to.
embed_path = 'data_test'
wt_mt_path = 'data_test/wt_mt'
result_path = 'predicted_results'

In [102]:
# majority and mean prediction votes
def majority_vote(mode1, mode_2, mode_3, mode_test):
    """Combine three fitted binary classifiers by mean probability and by majority vote.

    Args:
        mode1, mode_2, mode_3: fitted models exposing ``predict_proba(X)`` that
            returns an ``(n_samples, 2)`` array (assumes column 1 is the
            positive-class probability, as for a 0/1-labelled binary task).
        mode_test: feature matrix to predict on.

    Returns:
        tuple: ``(mean_predictions_binary, majority_vote_binary)``, two integer
        arrays of length ``n_samples``. The first thresholds the averaged
        positive-class probability at 0.5; the second thresholds each model at
        0.5 and keeps the label chosen by most models.
    """
    # One row per model, one column per sample -> shape (3, n_samples).
    # Only the positive-class column is kept so that averaging over axis 0
    # yields exactly one value per sample.
    stacked_preds = np.vstack([
        np.asarray(model.predict_proba(mode_test))[:, 1]
        for model in (mode1, mode_2, mode_3)
    ])

    # Soft voting: average probability across models (use the probabilities for
    # ROC-AUC / PR-AUC, the binarised version for F1, precision, recall, MCC).
    mean_predictions_proba = stacked_preds.mean(axis=0)
    mean_predictions_binary = (mean_predictions_proba > 0.5).astype(int)

    # Hard voting: binarise each model first, then average the votes.
    binarized_predictions = (stacked_preds > 0.5).astype(int)
    majority_vote_proba = binarized_predictions.mean(axis=0)
    majority_vote_binary = (majority_vote_proba > 0.5).astype(int)

    return mean_predictions_binary, majority_vote_binary

# print out reports
def report_save(y_true, y_pred, name, path, label_names=None, *args, **kv):
    """Print a classification report and save it as CSV.

    The report is written to
    ``../data/{path}/predicted_results/{name}_report_save.csv``; missing
    directories are created.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels.
        name: report label, used in the printed banner and the CSV file name.
        path: dataset sub-directory under ``../data``.
        label_names: optional class display names, forwarded to
            ``classification_report`` as ``target_names``.
        *args, **kv: accepted for call compatibility and ignored.
    """
    result_path = f'../data/{path}/predicted_results'

    # Human-readable report for the notebook output.
    report = classification_report(y_true, y_pred, target_names=label_names)
    print(colored(f'\n\t\t\t\t *** {name}_report ***:\n\n\n', 'blue', attrs=['bold']), report)

    # Same report as a dict -> one row per class/average, one column per metric.
    report_for_save = classification_report(y_true, y_pred, target_names=label_names, output_dict=True)
    report_csv = pd.DataFrame(report_for_save).transpose()

    # makedirs also creates missing parents and tolerates an existing
    # directory, unlike the isdir()/mkdir() pair it replaces.
    os.makedirs(result_path, exist_ok=True)

    report_csv.to_csv(f'{result_path}/{name}_report_save.csv')


In [98]:
def train_test_dataset_imbalance(path, mode):
    """Load ``{mode}_train.csv`` / ``{mode}_test.csv`` from ``path`` and split features from labels.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test)`` where the X frames hold
        the ``wt_emb`` and ``mt_emb`` columns and the y series hold ``label``.
    """
    feature_cols = ['wt_emb', 'mt_emb']
    frames = {
        split: pd.read_csv(f'{path}/{mode}_{split}.csv')
        for split in ('train', 'test')
    }
    train_df, test_df = frames['train'], frames['test']

    return (
        train_df[feature_cols],
        train_df['label'],
        test_df[feature_cols],
        test_df['label'],
    )

In [67]:
def train_test_dataset_balance(path, mode):
    """Load the three training splits and the test split for ``mode``.

    Reads ``{mode}_train_1.csv``, ``{mode}_train_2.csv``, ``{mode}_train_3.csv``
    and ``{mode}_test.csv`` from ``path``.

    Returns:
        tuple: ``(X_train1, X_train2, X_train3, y_train1, y_train2, y_train3,
        X_test, y_test)`` where the X frames hold the ``wt_emb`` and ``mt_emb``
        columns and the y series hold ``label``.
    """
    feature_cols = ['wt_emb', 'mt_emb']

    # Each split is read from its own file; the third split must come from
    # *_train_3.csv, not a second copy of *_train_2.csv.
    train_frames = [
        pd.read_csv(f'{path}/{mode}_train_{split_no}.csv')
        for split_no in (1, 2, 3)
    ]
    mode_test = pd.read_csv(f'{path}/{mode}_test.csv')

    X_train1, X_train2, X_train3 = (frame[feature_cols] for frame in train_frames)
    y_train1, y_train2, y_train3 = (frame['label'] for frame in train_frames)
    X_test, y_test = mode_test[feature_cols], mode_test['label']

    return X_train1, X_train2, X_train3, y_train1, y_train2, y_train3, X_test, y_test
    

In [85]:
def ML_pred_for_imbalance(X_train, y_train, X_test, y_test, path):
    """Fit XGBoost, CatBoost and LightGBM on one training set and save a report for each.

    Args:
        X_train, y_train: training features and labels.
        X_test, y_test: held-out features and labels used for the reports.
        path: dataset sub-directory under ``../data`` that ``report_save``
            writes its CSV reports into.
    """
    # XGBoost
    xgb = XGBClassifier(n_estimators=300, tree_method='gpu_hist', gpu_id=0)
    xgb.fit(X_train, y_train)
    y_xgb = xgb.predict(X_test)
    # Keywords make the (name, path) order explicit; passing them positionally
    # as (path, name) swapped the two.
    report_save(y_test, y_xgb, name='XGBoost', path=path)

    # CatBoost
    cbt = CatBoostClassifier(iterations=500, learning_rate=0.09, depth=10)
    cbt.fit(X_train, y_train)
    y_cbt = cbt.predict(X_test)
    report_save(y_test, y_cbt, name='CatBoost', path=path)

    # LightGBM (native training API)
    d_train = lgb.Dataset(X_train, label=y_train)
    params = {
        'learning_rate': 0.1,
        'boosting_type': 'gbdt',       # gradient-boosted decision trees
        'objective': 'cross_entropy',  # binary target
        'metric': 'binary_error',
        'max_depth': 10,
    }
    # Previously tried values, kept for reference:
    #   learning_rate=0.41282313322582176, num_leaves=34,
    #   reg_lambda=0.9557019573592245, colsample_by_tree=0.8506663985944544

    # The trained booster gets its own name: rebinding `lgb` inside the
    # function made `lgb` local and broke the `lgb.Dataset` call above with
    # UnboundLocalError. The round count is passed once, via num_boost_round.
    booster = lgb.train(params, d_train, num_boost_round=300)

    # The booster's predictions are rounded to the nearest integer label.
    y_lgb = booster.predict(X_test).round(0).astype(int)
    report_save(y_test, y_lgb, name='LightGBM', path=path)
    

In [82]:
def _fit_vote_and_report(models, train_sets, X_test, y_test, path, label):
    """Fit one model per training split, ensemble them, and save both vote reports."""
    for model, (X_train, y_train) in zip(models, train_sets):
        model.fit(X_train, y_train)

    mean_binary, majority_binary = majority_vote(*models, X_test)

    # report_save expects (y_true, y_pred); keywords pin name/path explicitly.
    report_save(y_test, mean_binary, name=f'{label}_mean', path=path)
    report_save(y_test, majority_binary, name=f'{label}_majority', path=path)


def ML_pred_for_balance(X_train1, X_train2, X_train3, y_train1, y_train2, y_train3, X_test, y_test, path):
    """Train three-model ensembles on the three training splits and save their reports.

    For each of XGBoost, CatBoost and LightGBM, one model is fitted per
    training split; the three are combined with ``majority_vote`` and the
    mean-probability and majority-vote predictions are each passed to
    ``report_save``.

    Args:
        X_train1, X_train2, X_train3: features of the three training splits.
        y_train1, y_train2, y_train3: labels of the three training splits.
        X_test, y_test: held-out features and labels used for the reports.
        path: dataset sub-directory under ``../data`` that ``report_save``
            writes its CSV reports into.
    """
    train_sets = [(X_train1, y_train1), (X_train2, y_train2), (X_train3, y_train3)]

    # XGBoost: all three ensemble members share the same configuration.
    xgb_models = [
        XGBClassifier(n_estimators=300, tree_method='gpu_hist', gpu_id=0)
        for _ in range(3)
    ]
    _fit_vote_and_report(xgb_models, train_sets, X_test, y_test, path, 'XGBoost')

    # CatBoost: `iterations` and `n_estimators` are aliases of one setting, so
    # only `iterations` is given (same value as in ML_pred_for_imbalance).
    cbt_models = [
        CatBoostClassifier(iterations=500, learning_rate=0.09, depth=10)
        for _ in range(3)
    ]
    _fit_vote_and_report(cbt_models, train_sets, X_test, y_test, path, 'Catboost')

    # LightGBM: `n_estimators` and `n_iter` are aliases of one setting, so
    # only `n_estimators` is kept.
    # NOTE(review): the original passed both n_estimators=300 and n_iter=500;
    # confirm that 300 is the intended round count.
    params = {
        'learning_rate': 0.1,
        'boosting_type': 'gbdt',       # gradient-boosted decision trees
        'objective': 'cross_entropy',  # binary target
        'metric': 'binary_error',
        'max_depth': 10,
        'n_estimators': 300,
    }
    # LGBMClassifier is reached through the `lgb` module alias; the bare name
    # is never imported in this notebook.
    lgb_models = [lgb.LGBMClassifier(**params) for _ in range(3)]
    _fit_vote_and_report(lgb_models, train_sets, X_test, y_test, path, 'LightGBM')



## Situation 1
Imbalanced + same seq, Train:Test = 3:1

In [99]:
# Situation 1: load the train/test pair for each mode from the imbalanced, same-seq data.
(
    X1_train_s1, y1_train_s1,
    X1_test_s1, y1_test_s1,
) = train_test_dataset_imbalance('../data/imbalance_same_seq', 'mode_1')
(
    X2_train_s1, y2_train_s1,
    X2_test_s1, y2_test_s1,
) = train_test_dataset_imbalance('../data/imbalance_same_seq', 'mode_2')

In [None]:
# Situation 1: train and report for mode_1, then mode_2.
# NOTE(review): both calls pass the same path and the report names are fixed
# per model, so the second call's CSVs overwrite the first's.
ML_pred_for_imbalance(
    X1_train_s1, y1_train_s1,
    X1_test_s1, y1_test_s1,
    'imbalance_same_seq',
)
ML_pred_for_imbalance(
    X2_train_s1, y2_train_s1,
    X2_test_s1, y2_test_s1,
    'imbalance_same_seq',
)

## Situation 2

In [70]:
# Situation 2: load the three training splits and the test split for each mode.
(
    X1_train1_s2, X1_train2_s2, X1_train3_s2,
    y1_train1_s2, y1_train2_s2, y1_train3_s2,
    X1_test_s2, y1_test_s2,
) = train_test_dataset_balance('../data/balanced_same_seq', 'mode_1')
(
    X2_train1_s2, X2_train2_s2, X2_train3_s2,
    y2_train1_s2, y2_train2_s2, y2_train3_s2,
    X2_test_s2, y2_test_s2,
) = train_test_dataset_balance('../data/balanced_same_seq', 'mode_2')


In [None]:
# Situation 2: train and report for mode_1, then mode_2.
# The report directory is 'balanced_same_seq', matching the directory the
# data was loaded from (the other situations also reuse their data directory).
# NOTE(review): both calls pass the same path and the report names are fixed
# per model, so the second call's CSVs overwrite the first's.
ML_pred_for_balance(
    X1_train1_s2, X1_train2_s2, X1_train3_s2,
    y1_train1_s2, y1_train2_s2, y1_train3_s2,
    X1_test_s2, y1_test_s2,
    'balanced_same_seq',
)
ML_pred_for_balance(
    X2_train1_s2, X2_train2_s2, X2_train3_s2,
    y2_train1_s2, y2_train2_s2, y2_train3_s2,
    X2_test_s2, y2_test_s2,
    'balanced_same_seq',
)


## Situation 3

In [103]:
# Situation 3: load the train/test pair for each mode from the imbalanced, diff-seq data.
(
    X1_train_s3, y1_train_s3,
    X1_test_s3, y1_test_s3,
) = train_test_dataset_imbalance('../data/imbalance_diff_seq', 'mode_1')
(
    X2_train_s3, y2_train_s3,
    X2_test_s3, y2_test_s3,
) = train_test_dataset_imbalance('../data/imbalance_diff_seq', 'mode_2')

In [None]:
# Situation 3: train and report for mode_1, then mode_2.
# NOTE(review): both calls pass the same path and the report names are fixed
# per model, so the second call's CSVs overwrite the first's.
ML_pred_for_imbalance(
    X1_train_s3, y1_train_s3,
    X1_test_s3, y1_test_s3,
    'imbalance_diff_seq',
)
ML_pred_for_imbalance(
    X2_train_s3, y2_train_s3,
    X2_test_s3, y2_test_s3,
    'imbalance_diff_seq',
)

## Situation 4

In [77]:
# Situation 4: load the three training splits and the test split for each mode.
(
    X1_train1_s4, X1_train2_s4, X1_train3_s4,
    y1_train1_s4, y1_train2_s4, y1_train3_s4,
    X1_test_s4, y1_test_s4,
) = train_test_dataset_balance('../data/balanced_diff_seq', 'mode_1')
(
    X2_train1_s4, X2_train2_s4, X2_train3_s4,
    y2_train1_s4, y2_train2_s4, y2_train3_s4,
    X2_test_s4, y2_test_s4,
) = train_test_dataset_balance('../data/balanced_diff_seq', 'mode_2')


In [81]:
# Size check: number of rows in the first mode_2 training split of Situation 4.
len(X2_train1_s4)

13193

In [None]:
# Situation 4: train and report for mode_1, then mode_2.
# NOTE(review): both calls pass the same path and the report names are fixed
# per model, so the second call's CSVs overwrite the first's.
ML_pred_for_balance(
    X1_train1_s4, X1_train2_s4, X1_train3_s4,
    y1_train1_s4, y1_train2_s4, y1_train3_s4,
    X1_test_s4, y1_test_s4,
    'balanced_diff_seq',
)
ML_pred_for_balance(
    X2_train1_s4, X2_train2_s4, X2_train3_s4,
    y2_train1_s4, y2_train2_s4, y2_train3_s4,
    X2_test_s4, y2_test_s4,
    'balanced_diff_seq',
)
