Evaluate the dataset on default parameters in many algorithms 

In autoPK, it mentioned 6 starting algorithms 
1) AdaBoost
2) Decision tree
3) Extra tree #I omit them
4) Random Forest
5) Extra trees
6) XGBoost

In [1]:
import pandas as pd
import numpy as np
import time
import sys
import os
import argparse
import joblib
import re

In [2]:
#Use extension to accelerate sklearn over Intel
#Install from pip install scikit-learn-intelex
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [59]:
n_cores = -1

In [34]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier,ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier

Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


In [4]:
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold, StratifiedGroupKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler, label_binarize
from sklearn.metrics import roc_auc_score, f1_score, matthews_corrcoef, balanced_accuracy_score, confusion_matrix

Data import

In [5]:
result_fld_path = '/home/korawich/Desktop/AutoML/Dataset/Bioresponse/Result/'

In [6]:
#Datafile 
#set data path
save_path = '/home/korawich/Desktop/AutoML/Dataset/Bioresponse'
#set save_path
train_save_path = save_path + '/data_train.csv'
val_save_path   = save_path + '/data_val.csv'
test_save_path  = save_path + '/data_test.csv'

#load files
train_df = pd.read_csv(train_save_path, index_col=0)
val_df   = pd.read_csv(val_save_path, index_col=0)
test_df  = pd.read_csv(test_save_path, index_col=0)

#set X&y
X_train = train_df.loc[:, train_df.columns != 'target'].values
y_train  = train_df.loc[:, train_df.columns == 'target'].values

X_val = val_df.loc[:, val_df.columns != 'target'].values
y_val  = val_df.loc[:, val_df.columns == 'target'].values

X_test = test_df.loc[:, test_df.columns != 'target'].values
y_test  = test_df.loc[:, test_df.columns == 'target'].values

In [10]:
print('number of datapoints - all:', X_train.shape[0]+X_val.shape[0]+X_test.shape[0])

print('number of datapoints - train:', X_train.shape[0])
print('number of datapoints - val:', X_val.shape[0])
print('number of datapoints - test:', X_test.shape[0])

print('number of features:', X_train.shape[1])


number of datapoints - all: 3434
number of datapoints - train: 2060
number of datapoints - val: 687
number of datapoints - test: 687
number of features: 419


Running algorithms

In [61]:
for algorithm in ['ADABOOST', 'DecisionTree', 'RandomForest', 'ExtraTrees', 'XGBOOST']:
    print('run algorithm - ', algorithm)
    result_cm = pd.DataFrame() #Confusion matrix for kCV
    blind_cm = pd.DataFrame() #Confusion matrix for blind
    outerblind_cv_metrics = pd.DataFrame()
    cv_metrics = pd.DataFrame()

    #Label Encoding
    le = LabelEncoder()
    le.fit(y_train)
    num_of_class = len(le.classes_)
    y_train = le.transform(y_train)

    #Kfold cross-validation
    predicted_n_actual_pd = pd.DataFrame(columns=['ID', 'predicted', 'actual', 'fold'])
    outerblind_predicted_n_actual_pd = pd.DataFrame(columns=['ID', 'predicted', 'actual'])

    kfold = 5
    random_state=123
    kfold_obj = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=random_state)
    folds = kfold_obj.split(X_train, y_train)

    for fold, (train_idx, val_idx) in enumerate(folds, 1):
        # For example, if '1' from train or test, it would be '1' in X_train
        print('\trun fold - ', fold)
        
        X_train_cv, X_val_cv, y_train_cv, y_val_cv = X_train[train_idx], X_train[val_idx], y_train[train_idx], y_train[val_idx]
    
        if (algorithm == 'ADABOOST'):
            classifier_cv = AdaBoostClassifier(n_estimators=300, random_state=1)
    
        elif (algorithm == 'DecisionTree'):
            classifier_cv = DecisionTreeClassifier(random_state=1)
    
        elif (algorithm == 'RandomForest'):
            classifier_cv = RandomForestClassifier(n_estimators=300, random_state=1, n_jobs=n_cores)
    
        elif (algorithm == 'ExtraTrees'):
            classifier_cv = ExtraTreesClassifier(n_estimators=300, random_state=1, n_jobs=n_cores)
    
        elif (algorithm == 'XGBOOST'):
            classifier_cv = XGBClassifier(n_estimators=300, random_state=1, n_jobs=n_cores)

        classifier_cv.fit(X_train, y_train)

        temp_prediction = classifier_cv.predict(X_val_cv)

        temp_proba = pd.DataFrame(classifier_cv.predict_proba(X_val_cv))
        temp_proba = temp_proba.rename(columns=lambda x: re.sub('^','proba_',str(x)))

        _temp_pd = pd.DataFrame({'ID':val_idx, 'actual':y_val_cv, 'predicted' : temp_prediction, 'fold':fold})
        _temp_pd = pd.concat([_temp_pd,temp_proba],axis=1)
        predicted_n_actual_pd = pd.concat([predicted_n_actual_pd, _temp_pd],ignore_index=True, sort=True)
        fold += 1
    
    predicted_n_actual_pd.sort_values(by='ID', inplace=True)
    
    column_list = predicted_n_actual_pd.columns.tolist()
    proba_columns = [name for name in column_list if 'proba_'in name]
    roc_auc = round(roc_auc_score(predicted_n_actual_pd['actual'].to_list(),predicted_n_actual_pd[proba_columns].iloc[:,1]),3)

    f1 = round(f1_score(predicted_n_actual_pd['actual'].to_list(),predicted_n_actual_pd['predicted'].to_list()),3)

    matthews = round(matthews_corrcoef(predicted_n_actual_pd['actual'].to_list(),predicted_n_actual_pd['predicted'].to_list()),3)
    balanced_accuracy = round(balanced_accuracy_score(predicted_n_actual_pd['actual'].to_list(),predicted_n_actual_pd['predicted'].to_list()),3)

    tn, fp, fn, tp = confusion_matrix(predicted_n_actual_pd['actual'].to_list(), predicted_n_actual_pd['predicted'].to_list()).ravel()
    result_cm = pd.DataFrame(np.column_stack((tn,tp,fn,fp)),columns=['tn','tp','fn','fp'])

    cv_metrics = pd.concat([cv_metrics,pd.DataFrame(np.column_stack(['cv',roc_auc, matthews,balanced_accuracy, f1 ]), columns=['type','roc_auc','matthew','bacc','f1'])], ignore_index=True, sort=True)
    print('\tfinish running cv')
    
    #Test validation

    if (algorithm == 'ADABOOST'):
        classifier_all = AdaBoostClassifier(n_estimators=300, random_state=1)

    elif (algorithm == 'DecisionTree'):
        classifier_all = DecisionTreeClassifier(random_state=1)

    elif (algorithm == 'RandomForest'):
        classifier_all = RandomForestClassifier(n_estimators=300, random_state=1, n_jobs=n_cores)

    elif (algorithm == 'ExtraTrees'):
        classifier_all = ExtraTreesClassifier(n_estimators=300, random_state=1, n_jobs=n_cores)

    elif (algorithm == 'XGBOOST'):
        classifier_all = XGBClassifier(n_estimators=300, random_state=1, n_jobs=n_cores)

    classifier_all.fit(X_train,y_train)
    X_blind = X_test
    y_blind = y_test
    y_blind = le.transform(y_blind)

    ## outerblind-Test
    prediction = classifier_all.predict(X_blind)        
    proba = pd.DataFrame(classifier_all.predict_proba(X_blind))
    proba = proba.rename(columns=lambda x: re.sub('^','proba_',str(x)))

    _temp_pd = pd.DataFrame({'actual':y_blind, 'predicted' : prediction})
    _temp_pd = pd.concat([_temp_pd,proba],axis=1)

    outerblind_predicted_n_actual_pd = pd.concat([outerblind_predicted_n_actual_pd, _temp_pd],ignore_index=True, sort=True)

    outerblind_matthews_corrcoef = round(matthews_corrcoef(y_blind, prediction),3)
    outerblind_balanced_accuracy_score = round(balanced_accuracy_score(y_blind, prediction),3)
    
    column_list = outerblind_predicted_n_actual_pd.columns.tolist()
    proba_columns = [name for name in column_list if 'proba_'in name]
    outerblind_roc_auc_score = round(roc_auc_score(outerblind_predicted_n_actual_pd['actual'].to_list(), outerblind_predicted_n_actual_pd[proba_columns].iloc[:,1]),3)

    outerblind_f1_score = round(f1_score(y_blind, prediction),3)

    outerblind_tn, outerblind_fp, outerblind_fn, outerblind_tp = confusion_matrix(y_blind, prediction).ravel()
    blind_cm = pd.DataFrame(np.column_stack((outerblind_tn,outerblind_tp,outerblind_fn,outerblind_fp)),columns=['tn','tp','fn','fp'])

    outerblind_cv_metrics = pd.concat([outerblind_cv_metrics,pd.DataFrame(np.column_stack(['blind-test',outerblind_roc_auc_score, outerblind_matthews_corrcoef,
        outerblind_balanced_accuracy_score, outerblind_f1_score]),
         columns=['type','roc_auc','matthew','bacc','f1'])], ignore_index=True, sort=True)
    outerblind_cv_metrics.set_index([['blind-test']*len(outerblind_cv_metrics)], inplace=True)

    print('\tfinish running blind test')

    cv_metrics = cv_metrics.round(3)
    outerblind_cv_metrics = outerblind_cv_metrics.round(3)

    print('\tresult from {} :'.format(algorithm))
    print('\tcv')
    print('cv_metrics)
    print('\tblind test')
    print(outerblind_cv_metrics)
    print('finish runnig algorithm - ', algorithm)
    print('\n\n')

run algorithm -  ADABOOST
	run fold -  1
	run fold -  2
	run fold -  3
	run fold -  4
	run fold -  5
	finish running cv
	finish running blind test
	result from ADABOOST :
	cv
	     bacc     f1 matthew roc_auc type
0  0.925  0.926    0.85   0.984   cv
	blind test
             bacc     f1 matthew roc_auc        type
blind-test  0.725  0.722   0.451   0.794  blind-test
finish runnig algorithm -  ADABOOST



run algorithm -  DecisionTree
	run fold -  1
	run fold -  2
	run fold -  3
	run fold -  4
	run fold -  5
	finish running cv
	finish running blind test
	result from DecisionTree :
	cv
	   bacc   f1 matthew roc_auc type
0  1.0  1.0     1.0     1.0   cv
	blind test
             bacc     f1 matthew roc_auc        type
blind-test  0.705  0.704    0.41   0.705  blind-test
finish runnig algorithm -  DecisionTree



run algorithm -  RandomForest
	run fold -  1
	run fold -  2
	run fold -  3
	run fold -  4
	run fold -  5
	finish running cv
	finish running blind test
	result from RandomForest :
	