In [0]:
# import packages
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [0]:
# Colab-only step: attach Google Drive so the CSV files stored under it
# can be read by the data-loading cells.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

## Data Loading

In [0]:
# The file is read with header=None, so columns get integer labels 0..N-1.
adult_csv_path = '/content/drive/My Drive/Colab/adult.csv'
df_adult = pd.read_csv(adult_csv_path, header=None)
df_adult.head()

In [0]:
# The file is read with header=None, so columns get integer labels 0..N-1.
bank_csv_path = '/content/drive/My Drive/Colab/bank_note.csv'
df_bank = pd.read_csv(bank_csv_path, header=None)
df_bank.head()

In [0]:
# The file is read with header=None, so columns get integer labels 0..N-1.
car_csv_path = '/content/drive/My Drive/Colab/car.csv'
df_car = pd.read_csv(car_csv_path, header=None)
df_car.head()

## Data cleaning

In [0]:
# An earlier variant of this cell mapped the label to 0/1 and one-hot
# encoded the categorical columns with pd.get_dummies; the notebook now
# integer-encodes instead.

# Integer-encode EVERY column (features and label alike): the encoder is
# refit on each column in turn and the column is replaced by its class indices.
enc = LabelEncoder()
df_adult = df_adult.apply(lambda column: enc.fit_transform(column))
# Features and label together as one NumPy array; trainAndTest treats the
# last column as the label.
adult_XandY = df_adult.to_numpy()
df_adult.head()

In [0]:
# No encoding step is applied to the bank data; hand it to the models as a
# single NumPy array (features plus label in the last column).
bank_XandY = df_bank.to_numpy()

In [0]:
# Integer-encode every column of the car data: the encoder is refit on each
# column in turn and the column is replaced by its class indices.
enc = LabelEncoder()
df_car = df_car.apply(lambda column: enc.fit_transform(column))
# Features and label together as one NumPy array (label in the last column).
car_XandY = df_car.to_numpy()

## Parameter tuning

In [0]:
def CV(X, Y, model, nfolds = 3):
    """Tune one classifier family with cross-validation and return the winner.

    Parameters
    ----------
    X : array-like
        Training features, passed unchanged to the search object's ``fit``.
    Y : array-like
        Training labels.
    model : str
        Which classifier to tune:
        ``'KNN'`` - exhaustive grid over ``n_neighbors`` and ``weights``;
        ``'RF'``  - randomized search over forest size, depth and
        ``max_features``;
        ``'BT'``  - exhaustive grid over the AdaBoost ensemble size.
    nfolds : int, default 3
        Number of cross-validation folds.

    Returns
    -------
    tuple
        ``(best_estimator_, best_params_)`` of the fitted search.

    Raises
    ------
    ValueError
        If ``model`` is not one of ``'KNN'``, ``'RF'`` or ``'BT'``.
    """
    # Fail loudly on a typo instead of silently returning None, which the
    # caller would only notice as an opaque tuple-unpacking TypeError.
    if model not in ('KNN', 'RF', 'BT'):
        raise ValueError(
            "Unknown model %r; expected one of 'KNN', 'RF', 'BT'" % (model,))

    if model == 'KNN':
        param_grid = {'n_neighbors': [int(x) for x in np.linspace(1, 180, num = 20)],
                      'weights': ['uniform', 'distance']}
        search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=nfolds, n_jobs=-1)
    elif model == 'RF':
        max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
        max_depth.append(None)  # None lets the trees grow without a depth limit
        param_grid = {'n_estimators': [int(x) for x in np.linspace(100, 500, num = 100)],
                      'max_depth': max_depth,
                      # 'auto' was only an alias of 'sqrt' for classifiers and is
                      # rejected by scikit-learn >= 1.3, so it is not listed here.
                      'max_features': ['sqrt', 'log2']}
        # The grid is large, so sample it rather than enumerating it.
        search = RandomizedSearchCV(RandomForestClassifier(), param_grid, cv=nfolds, n_jobs=-1)
    else:  # model == 'BT'
        param_grid = {'n_estimators': [20, 50, 100]}
        search = GridSearchCV(AdaBoostClassifier(), param_grid, cv=nfolds, n_jobs=-1)

    search.fit(X, Y)
    return search.best_estimator_, search.best_params_

## Train models

In [0]:
def trainModel(X_train, X_test, Y_train, Y_test, model):
    """Tune one classifier on the training split and report its accuracy.

    Parameters
    ----------
    X_train, Y_train : array-like
        Training features and labels; used for the hyper-parameter search.
    X_test, Y_test : array-like
        Held-out features and labels; used only for scoring.
    model : str
        Model key understood by ``CV`` (``'KNN'``, ``'RF'`` or ``'BT'``).

    Returns
    -------
    float
        Accuracy of the tuned classifier on the test split. The chosen
        parameters and both accuracies are also printed.
    """
    # search for best parameters
    # CV hands back the search's best_estimator_. With the default refit=True
    # that estimator has already been refit on all of (X_train, Y_train), so
    # fitting it again here would only double the training cost (and, for the
    # random forest, swap the selected model for a different random draw).
    clf, params = CV(X_train, Y_train, model)
    test_acc = clf.score(X_test, Y_test)
    train_acc = clf.score(X_train, Y_train)
    # print the result
    print(params)
    print('Training accuracy: %.3f'% train_acc)
    print('Test accuracy: %.3f\n'% test_acc)
    return test_acc

In [0]:
def trainAndTest(XandY):
    """Compare KNN, random forest and AdaBoost on one dataset.

    For each test-set fraction (20%, 50%, 80%) the data is split three times,
    every model is tuned and scored on each split via ``trainModel``, and the
    mean test accuracy per model is printed.

    Parameters
    ----------
    XandY : numpy.ndarray
        2-D array whose last column is the label and whose remaining columns
        are the features.

    Returns
    -------
    None
        Results are reported with ``print`` only.
    """
    X = XandY[:, :-1]
    Y = XandY[:, -1]
    # (key passed to trainModel, name used in the printed summary)
    models = (('KNN', 'knn'), ('RF', 'random forest'), ('BT', 'adaBoost'))
    n_trials = 3
    for p in [0.2, 0.5, 0.8]:
        print('With test size of:', p*100, '%')
        total_acc = {key: 0 for key, _ in models}
        for trial in range(n_trials):
            # Use a different seed per trial: with one fixed seed every trial
            # would see the identical split and the average would not be over
            # independent splits. Seeds stay deterministic (1, 2, 3), so runs
            # remain reproducible.
            X_train, X_test, Y_train, Y_test = train_test_split(
                X, Y, random_state=trial + 1, test_size=p)
            for key, _ in models:
                total_acc[key] += trainModel(X_train, X_test, Y_train, Y_test, key)
        for key, label in models:
            print('Average testing accuracy for model %s: %.3f'
                  % (label, total_acc[key] / n_trials))
        print('============================================================\n')

## Adult

In [0]:
# Tune and score KNN, random forest and AdaBoost on the adult dataset;
# trainAndTest prints per-trial and averaged accuracies for each test-set size.
trainAndTest(adult_XandY)

## Car

In [0]:
# Tune and score KNN, random forest and AdaBoost on the car dataset;
# trainAndTest prints per-trial and averaged accuracies for each test-set size.
trainAndTest(car_XandY)

## Bank

In [0]:
# Tune and score KNN, random forest and AdaBoost on the bank dataset;
# trainAndTest prints per-trial and averaged accuracies for each test-set size.
trainAndTest(bank_XandY)