encoding: utf-8

ref: https://developer.ibm.com/zh/tutorials/learn-classification-algorithms-using-python-and-scikit-learn/

ref: https://www.jianshu.com/p/8767ef42ee47

ref: https://qastack.cn/stats/52274/how-to-choose-a-predictive-model-after-k-fold-cross-validation

In [27]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy import interp
from itertools import cycle
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
import codecs

In [28]:
# load the preprocessed training set and preview its first rows
train_csv_path = 'prepro_result.csv'
df = pd.read_csv(train_csv_path)
df.head()

Unnamed: 0,y,x1,x2,x3,x4,x5,x7,x8,x9,x10,x6_B,x6_C,x6_D,x6_E,x6_F,x6_Fx
0,1,-0.573731,-0.648684,1.007316,-0.749812,0.0,-1.244095,0.328232,-0.679849,0.220437,0,0,0,0,1,0
1,1,0.301589,-0.516504,1.13295,-1.284691,0.0,-1.160686,-1.05929,-2.063638,0.581299,0,0,0,0,1,0
2,0,1.346649,-0.472425,1.338934,-1.362361,0.0,0.762559,-0.54042,-0.34016,0.740294,0,0,0,0,1,0
3,0,-1.312901,0.062985,1.041571,-1.206,0.0,1.457944,-0.192881,1.6996,-1.401193,0,0,0,0,1,0
4,1,1.130095,-1.267077,1.052334,-0.922978,0.0,-0.253463,-1.35288,-0.295863,0.903165,0,0,0,0,1,0


## Cross-Validation

In [29]:
# split into 10 stratified folds
cv = StratifiedKFold(n_splits=10)

# features matrix X: every column except the label 'y'.
# Columns named 'Unnamed*' (presumably a row index written out when the CSV was
# saved -- confirm against the preprocessing step) are dropped as well, exactly
# as is done for the test set, so the row number is never used as a feature and
# the training and test feature matrices end up with the same columns.
X = df.drop(['y'], axis=1)
X = X.loc[:, ~X.columns.str.contains('^Unnamed')]
# labels vector y
y = df['y']

In [30]:
def train_and_test(classifier, cv=cv, X=X, y=y):
    """
    Cross-validate `classifier` and return its mean accuracy over the folds.

    For every (train, test) index pair produced by `cv.split(X, y)` the
    classifier is refitted on the training rows and scored on the held-out
    rows with `accuracy_score`. The classifier object passed in is fitted
    in place.
    """
    fold_scores = []

    for fit_rows, eval_rows in cv.split(X, y):
        # fit on this fold's training rows
        classifier.fit(X.iloc[fit_rows], y.iloc[fit_rows])

        # score on this fold's held-out rows
        predicted = classifier.predict(X.iloc[eval_rows])
        fold_scores.append(accuracy_score(y.iloc[eval_rows], predicted))

    return np.mean(fold_scores)

In [31]:
# logistic regression: mean cross-validated accuracy
lr_classifier = LogisticRegression(
    solver='lbfgs',
    multi_class='auto',
    max_iter=1000,
    random_state=0,
)
lr_accuracy = train_and_test(lr_classifier)
print(lr_accuracy)

0.8019999999999999


In [32]:
# KNN: 5 nearest neighbours, Minkowski metric with p=2 (i.e. Euclidean distance)
knn_classifier = KNeighborsClassifier(
    n_neighbors=5,
    p=2,
    metric='minkowski',
)
knn_accuracy = train_and_test(knn_classifier)
print(knn_accuracy)

0.8099999999999999


In [33]:
# SVM with an RBF kernel: mean cross-validated accuracy
svm_classifier = SVC(
    gamma='auto',
    kernel='rbf',
)
svm_accuracy = train_and_test(svm_classifier)
print(svm_accuracy)

0.8560000000000001


In [34]:
# Random Forest (300 trees)
# accuracy is estimated with cross-validation rather than with the oob argument
rf_classifier = RandomForestClassifier(
    random_state=0,
    n_estimators=300,
)
rf_accuracy = train_and_test(rf_classifier)
print(rf_accuracy)

0.859


In [35]:
def train_and_test_boost(real_classifier, discrete_classifier, cv=cv, X=X, y=y):
    """
    Run two boosted classifiers with cross-validation and compute accuracy.

    On every fold both classifiers are fitted on the same training rows and
    scored on the same held-out rows, each using its final fitted ensemble.

    Parameters
    ----------
    real_classifier, discrete_classifier : estimators exposing fit / predict
        Fitted in place on each fold.
    cv : splitter exposing split(X, y)
    X : feature table indexed with .iloc
    y : label vector indexed with .iloc

    Returns
    -------
    tuple
        (mean accuracy of real_classifier, mean accuracy of discrete_classifier)
        over the folds.
    """
    # accuracy of each fold's final model
    real_accuracy_list = []
    discrete_accuracy_list = []

    for train_index, test_index in cv.split(X, y):
        # split dataset
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # training
        real_classifier.fit(X_train, y_train)
        discrete_classifier.fit(X_train, y_train)

        # Score each final ensemble independently with predict().
        # Walking both staged_predict() generators through a single zip() stops
        # at the shorter one, so whenever one model ends boosting early the
        # other would be scored at an intermediate stage instead of its last.
        real_accuracy_list.append(
            accuracy_score(y_test, real_classifier.predict(X_test)))
        discrete_accuracy_list.append(
            accuracy_score(y_test, discrete_classifier.predict(X_test)))

    return np.mean(real_accuracy_list), np.mean(discrete_accuracy_list)

In [36]:
# decision tree (depth limited to 4)
# random_state is fixed, like for the other seeded models in this notebook,
# so that the cross-validated accuracy below is reproducible across runs
dt_classifier = DecisionTreeClassifier(max_depth=4, random_state=0)
dt_accuracy = train_and_test(dt_classifier)
print(dt_accuracy)

0.8219999999999998


In [37]:
# AdaBoost on top of the decision tree, in two variants:
# one with the library's default algorithm, one with the discrete "SAMME" algorithm
bdt_real = AdaBoostClassifier(
    dt_classifier, n_estimators=500, learning_rate=0.8, random_state=0)

bdt_discrete = AdaBoostClassifier(
    dt_classifier, n_estimators=500, learning_rate=0.6, algorithm="SAMME", random_state=0)

# (mean accuracy of bdt_real, mean accuracy of bdt_discrete)
boost_accuracy = train_and_test_boost(bdt_real, bdt_discrete)
print(boost_accuracy)

(0.8390000000000001, 0.849)


## Train final model

In [38]:
# refit the SVM on the whole training set (X, y); it is used below to label the test set
svm_classifier.fit(X,y)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [39]:
# refit the random forest on the whole training set (X, y); it is used below to label the test set
rf_classifier.fit(X,y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [40]:
# refit the SAMME boosted tree on the whole training set (X, y); it is used below to label the test set
bdt_discrete.fit(X,y)

AdaBoostClassifier(algorithm='SAMME',
                   base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                         class_weight=None,
                                                         criterion='gini',
                                                         max_depth=4,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort='deprecated',
                            

## Make classification

In [41]:
# load the preprocessed test set and preview its first rows
test_csv_path = 'prepro_testset.csv'
df_test = pd.read_csv(test_csv_path)
df_test.head()

Unnamed: 0.1,Unnamed: 0,x1,x2,x3,x4,x5,x7,x8,x9,x10,x6_B,x6_C,x6_D,x6_E,x6_F,x6_Fx
0,1000,1.495516,0.284986,1.476067,-1.429385,1.0,-0.17813,1.127719,2.709625,-0.093291,0,0,0,1,0,0
1,1001,0.251302,-0.094785,-0.608043,0.417831,0.0,-0.696002,-0.931182,-1.127254,-0.633472,0,0,0,0,1,0
2,1002,1.079625,-0.510891,-1.905074,1.789209,0.0,1.108023,1.581727,-0.159929,1.271105,0,0,0,0,1,0
3,1003,0.903984,1.227435,0.744254,-1.195453,1.0,-0.212428,-0.481968,-1.40444,-0.136588,0,0,0,0,0,0
4,1004,-1.319584,0.816054,-0.997765,1.053837,0.0,0.51281,2.038102,-0.683503,-0.022813,0,0,0,0,0,0


In [42]:
# keep only the feature columns: discard every column whose name starts with 'Unnamed'
is_unnamed = df_test.columns.str.contains('^Unnamed')
df_test_X = df_test.loc[:, ~is_unnamed]

### SVM

In [43]:
# predict the (still numerically encoded) class of every test row with the SVM model
y_pred_svm = svm_classifier.predict(df_test_X)

In [44]:
# decode the class codes 0,1,2 into label names
# Atsuto=0, Bob=1, Jörg=2 (any other value is passed through unchanged)
label_names = {0: 'Atsuto', 1: 'Bob', 2: 'Jörg'}
y_pred_svm = [label_names.get(code, code) for code in y_pred_svm]

In [45]:
# write the SVM predictions to disk, one label per line
with open('SVM.txt', 'w', encoding='utf-8') as out_file:
    out_file.writelines("%s\n" % item for item in y_pred_svm)

### RF

In [46]:
# predict the (still numerically encoded) class of every test row with the random forest model
y_pred_rf = rf_classifier.predict(df_test_X)

In [47]:
# decode the class codes 0,1,2 into label names
# Atsuto=0, Bob=1, Jörg=2 (any other value is passed through unchanged)
label_names = {0: 'Atsuto', 1: 'Bob', 2: 'Jörg'}
y_pred_rf = [label_names.get(code, code) for code in y_pred_rf]

In [48]:
# write the random forest predictions to disk, one label per line
with open('RF.txt', 'w', encoding='utf-8') as out_file:
    out_file.writelines("%s\n" % item for item in y_pred_rf)

### BDT

In [49]:
# predict the (still numerically encoded) class of every test row with the SAMME boosted decision tree
y_pred_bdt = bdt_discrete.predict(df_test_X)

In [50]:
# decode the class codes 0,1,2 into label names
# Atsuto=0, Bob=1, Jörg=2 (any other value is passed through unchanged)
label_names = {0: 'Atsuto', 1: 'Bob', 2: 'Jörg'}
y_pred_bdt = [label_names.get(code, code) for code in y_pred_bdt]

In [51]:
# write the boosted decision tree predictions to disk, one label per line
with open('BDT.txt', 'w', encoding='utf-8') as out_file:
    out_file.writelines("%s\n" % item for item in y_pred_bdt)

### Majority vote (SVM, RF, BDT)

In [52]:
# final prediction by majority vote of the SVM, RF and BDT predictions:
# the label shared by at least two models wins; when all three disagree
# the RF prediction is kept.
# The result is built as a new list: binding y_pred_3 to y_pred_rf and then
# assigning into it would overwrite the RF predictions in place.
y_pred_3 = [
    svm_label if svm_label in (rf_label, bdt_label) else rf_label
    for svm_label, rf_label, bdt_label in zip(y_pred_svm, y_pred_rf, y_pred_bdt)
]

# I/O final result
with open('103333.txt', 'w', encoding='utf-8') as f:
    for item in y_pred_3:
        f.write("%s\n" % item)