# Compare four algorithms: Random Forest (RF), Gradient Boosting (GRB), Multi-layer Perceptron (MLP), Decision Tree (DT)

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold

from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
import scipy.stats as stats

In [2]:
# Load the training data written by the earlier preprocessing notebooks
# (categorical feature preprocessing followed by anomaly detection with IF).
# These files are NOT min-max scaled; scaling is applied later in the pipeline.
train_features_csv = "./1-4 5388 traindata_aftcat_aftif.csv"
train_labels_csv = "./1-4 5388 traindata_aftcat_aftif_label.csv"
test_features_csv = "./1-1 5388 testdata_aftcat.csv"

data_aftcat_aftif = pd.read_csv(train_features_csv)
data_aftcat_aftif_y = pd.read_csv(train_labels_csv)

# Test features after categorical preprocessing only.
testdata_aftcat = pd.read_csv(test_features_csv)

In [3]:
# summarise every score column by its average and standard deviation
def score_mean_std(score_sf):
    """Return a two-row table with the mean and spread of each column.

    Parameters
    ----------
    score_sf : pandas.DataFrame
        One column per score, one row per observation (for example one row
        per cross-validation fold).

    Returns
    -------
    pandas.DataFrame
        Same columns as ``score_sf``; row ``"avg"`` holds ``np.average`` of
        each column and row ``"std"`` holds ``np.std`` of each column.
    """
    metric_names = score_sf.columns

    averages = pd.DataFrame(
        [np.average(score_sf[name]) for name in metric_names],
        index=metric_names,
    )
    deviations = pd.DataFrame(
        [np.std(score_sf[name]) for name in metric_names],
        index=metric_names,
    )

    # Each helper frame is a single column; transpose so metrics become columns.
    summary = pd.concat([averages.T, deviations.T], axis=0)
    summary.index = ["avg", "std"]
    return summary

In [4]:
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^ train models: RF, GRB, MLP, DT ^^^^^^^^^^^^^^^^^^^^^^^^^^^
# author:           Kun Yan
# student number:   300259303
# date:             2021-10-03
# Python version:   3.9.7
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

# Pre-processing steps that are placed in front of every classifier.
mm = MinMaxScaler()

# Features whose variance does not exceed the median per-column variance of
# the training frame are dropped.
# NOTE(review): the threshold is computed from the unscaled frame, while the
# selector is used after MinMaxScaler in the pipeline - confirm this is intended.
selector = VarianceThreshold(
    threshold=np.median(data_aftcat_aftif.var().values)
)

# Classifier configurations; the hyper-parameter values are fixed here.
model_rf = RandomForestClassifier(
    n_estimators=61,
    max_depth=24,
    max_features=27,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=90,
    n_jobs=-1,
)
model_grb = GradientBoostingClassifier(
    n_estimators=130,
    learning_rate=0.1,
    max_depth=15,
    max_features=3,
    min_samples_split=100,
    min_samples_leaf=7,
    random_state=10,
)
model_mlp = MLPClassifier(
    hidden_layer_sizes=(160, 160),
    activation='tanh',
    solver='adam',
    max_iter=10000,
    random_state=1,
)
model_dt = DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    max_depth=18,
    max_features=25,
    min_samples_leaf=1,
    min_impurity_decrease=0.0,
    random_state=10,
)

In [5]:
# -------------- train predictive model with pipeline: MinMaxScaler(), VarianceThreshold(), cross_validate()--------
# return score
def train_model(m, se, model, X, y, cv=10):
    """Cross-validate ``model`` behind a scaling + feature-selection pipeline.

    The three steps are chained in a ``Pipeline`` and handed to
    ``cross_validate``, so the whole chain is evaluated as one estimator.
    The estimator and a mean/std summary of the scores are printed.

    Parameters
    ----------
    m : transformer
        First pipeline step (registered as ``'mm'``).
    se : transformer
        Second pipeline step (registered as ``'selector'``).
    model : estimator
        Final pipeline step (registered as ``'model'``).
    X : array-like
        Feature table passed to ``cross_validate``.
    y : array-like
        Labels; flattened to one dimension before use.
    cv : int or cross-validation splitter, optional
        Forwarded to ``cross_validate``. Defaults to 10 folds.

    Returns
    -------
    dict
        The unmodified ``cross_validate`` result (``fit_time``,
        ``score_time`` and one ``test_<metric>`` entry per metric).
    """
    print(model, "\n")
    pipeline = Pipeline(steps=[('mm', m), ('selector', se), ('model', model)])

    # A list (not a set) keeps the metric order, and therefore the column
    # order of the printed summary, identical from one kernel session to the next.
    scoring = ['f1', 'precision', 'recall', 'accuracy', 'roc_auc']

    score_sf = cross_validate(
        pipeline,
        X,
        np.ravel(y),  # accepts a one-column DataFrame, a Series or an array
        cv=cv,
        scoring=scoring,
        n_jobs=-1,
    )
    score_sf_re = score_mean_std(pd.DataFrame(score_sf))

    print("score_sf_re: \n", score_sf_re)
    return score_sf


In [25]:
# -------------------- RF -----------------------
# Cross-validate the random forest and keep its per-fold F1 scores as a
# one-column frame for the paired t-tests further down.
score_sf_rf = train_model(
    m=mm,
    se=selector,
    model=model_rf,
    X=data_aftcat_aftif,
    y=data_aftcat_aftif_y,
)
RF = pd.DataFrame({"RF": score_sf_rf["test_f1"]})
print(RF)

RandomForestClassifier(max_depth=24, max_features=27, n_estimators=61,
                       n_jobs=-1, random_state=90) 

score_sf_re: 
      fit_time  score_time   test_f1  test_precision  test_recall  \
avg  4.389243    0.143868  0.997910        0.997459     0.998364   
std  0.817042    0.034510  0.000866        0.001558     0.000793   

     test_accuracy  test_roc_auc  
avg       0.997999      0.999985  
std       0.000830      0.000011  
         RF
0  0.998180
1  0.997275
2  0.996370
3  0.999545
4  0.997273
5  0.997730
6  0.999091
7  0.997726
8  0.998183
9  0.997730


In [26]:
# -------------------- GRB -----------------------
# Cross-validate gradient boosting and keep its per-fold F1 scores as a
# one-column frame for the paired t-tests further down.
score_sf_grb = train_model(
    m=mm,
    se=selector,
    model=model_grb,
    X=data_aftcat_aftif,
    y=data_aftcat_aftif_y,
)
GRB = pd.DataFrame({"GRB": score_sf_grb["test_f1"]})
print(GRB)

GradientBoostingClassifier(max_depth=15, max_features=3, min_samples_leaf=7,
                           min_samples_split=100, n_estimators=130,
                           random_state=10) 

score_sf_re: 
      fit_time  score_time   test_f1  test_precision  test_recall  \
avg  6.683303    0.116024  0.998137        0.997913     0.998364   
std  1.261962    0.041753  0.000591        0.001342     0.001273   

     test_accuracy  test_roc_auc  
avg       0.998217      0.999986  
std       0.000565      0.000018  
        GRB
0  0.997270
1  0.997726
2  0.997280
3  0.999091
4  0.998636
5  0.998637
6  0.998637
7  0.997724
8  0.998182
9  0.998185


In [27]:
# -------------------- MLP -----------------------
# Cross-validate the multi-layer perceptron and keep its per-fold F1 scores
# as a one-column frame for the paired t-tests further down.
score_sf_mlp = train_model(
    m=mm,
    se=selector,
    model=model_mlp,
    X=data_aftcat_aftif,
    y=data_aftcat_aftif_y,
)
MLP = pd.DataFrame({"MLP": score_sf_mlp["test_f1"]})
print(MLP)

MLPClassifier(activation='tanh', hidden_layer_sizes=(160, 160), max_iter=10000,
              random_state=1) 

score_sf_re: 
        fit_time  score_time   test_f1  test_precision  test_recall  \
avg  103.017205    0.108929  0.997411        0.997096     0.997727   
std   20.311620    0.028797  0.001235        0.001935     0.000932   

     test_accuracy  test_roc_auc  
avg       0.997521      0.999918  
std       0.001184      0.000105  
        MLP
0  0.999090
1  0.996820
2  0.995915
3  0.999546
4  0.995915
5  0.998636
6  0.997726
7  0.997270
8  0.996823
9  0.996367


In [28]:
# -------------------- DT -----------------------
# Cross-validate the decision tree and keep its per-fold F1 scores as a
# one-column frame for the paired t-tests further down.
score_sf_dt = train_model(
    m=mm,
    se=selector,
    model=model_dt,
    X=data_aftcat_aftif,
    y=data_aftcat_aftif_y,
)
DT = pd.DataFrame({"DT": score_sf_dt["test_f1"]})
print(DT)

DecisionTreeClassifier(max_depth=18, max_features=25, random_state=10) 

score_sf_re: 
      fit_time  score_time   test_f1  test_precision  test_recall  \
avg  0.472254    0.079859  0.995910        0.995551     0.996273   
std  0.046220    0.023368  0.000996        0.001483     0.001598   

     test_accuracy  test_roc_auc  
avg       0.996086      0.996094  
std       0.000953      0.000959  
         DT
0  0.994998
1  0.996370
2  0.995007
3  0.998182
4  0.995007
5  0.995455
6  0.996817
7  0.994989
8  0.995915
9  0.996367


In [31]:
# ^^^^ compare the four train models: RF, GRB, MLP, DT using Paired T-test^^^^^^^^^^^^^^^
# author:           Kun Yan
# student number:   300259303
# date:             2021-10-03
# Python version:   3.9.7
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

# Get the t-statistic and p-value for each pair of algorithms from their ten-fold cross-validation scores.
# The two inputs of every test are the original per-fold F1 score arrays of the algorithms being compared.

def sig_diff(p_a_b, alpha=0.05):
    """Print whether a p-value indicates a significant difference.

    Parameters
    ----------
    p_a_b : float or one-element array
        p-value of a test comparing two algorithms. A size-1 array (as
        produced when the test is fed column-shaped input) is accepted too.
    alpha : float, optional
        Significance level; a p-value strictly below it is reported as
        significant. Defaults to 0.05.

    Raises
    ------
    ValueError
        If ``p_a_b`` does not hold exactly one value.
    """
    # .item() collapses a scalar or a size-1 array of any shape to a plain
    # float, so the comparison below never relies on array truthiness.
    p_value = np.asarray(p_a_b, dtype=float).item()

    if p_value < alpha:
        print("There is a significant difference between the two algorithms.\n")
    else:
        print("No significant difference!\n")


# Per-fold F1 frames keyed by the label used in the printed report.
fold_scores = {"RF": RF, "GRB": GRB, "MLP": MLP, "DT": DT}

# Algorithm pairs, listed in the order their results are reported.
pairings = (
    ("RF", "GRB"),
    ("RF", "MLP"),
    ("MLP", "GRB"),
    ("RF", "DT"),
    ("GRB", "DT"),
    ("MLP", "DT"),
)

# NOTE(review): six tests share one 0.05 level with no multiple-comparison
# correction, and the inputs are one-column frames, so each p-value comes back
# as a one-element array - confirm both are acceptable.
paired_results = {}
for left, right in pairings:
    t_stat, p_val = stats.ttest_rel(
        np.array(fold_scores[left]), np.array(fold_scores[right])
    )
    paired_results[(left, right)] = (t_stat, p_val)
    print("The p-value of " + left + " and " + right + ":", p_val)
    sig_diff(p_val)

# Expose every result under its individual name as well.
stat_RF_GRB, p_RF_GRB = paired_results[("RF", "GRB")]
stat_RF_MLP, p_RF_MLP = paired_results[("RF", "MLP")]
stat_MLP_GRB, p_MLP_GRB = paired_results[("MLP", "GRB")]
stat_RF_DT, p_RF_DT = paired_results[("RF", "DT")]
stat_GRB_DT, p_GRB_DT = paired_results[("GRB", "DT")]
stat_MLP_DT, p_MLP_DT = paired_results[("MLP", "DT")]


The p-value of RF and GRB: [0.345183]
No significant difference!

The p-value of RF and MLP: [0.11076935]
No significant difference!

The p-value of MLP and GRB: [0.1041164]
No significant difference!

The p-value of RF and DT: [1.02794833e-05]
There is a significant difference between the two algorithms.

The p-value of GRB and DT: [1.18676005e-05]
There is a significant difference between the two algorithms.

The p-value of MLP and DT: [0.00505091]
There is a significant difference between the two algorithms.



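# Conclusion

Paired t-tests on the ten per-fold F1 scores (significance level 0.05) show no significant difference among RF, GRB and MLP, while DT, which has the lowest average F1, differs significantly from each of the other three. The individual p-values are listed below.
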
The p-value of RF and GRB: [0.345183]
No significant difference!

The p-value of RF and MLP: [0.11076935]
No significant difference!

The p-value of MLP and GRB: [0.1041164]
No significant difference!

The p-value of RF and DT: [1.02794833e-05]
There is a significant difference between the two algorithms.

The p-value of GRB and DT: [1.18676005e-05]
There is a significant difference between the two algorithms.

The p-value of MLP and DT: [0.00505091]
There is a significant difference between the two algorithms.