# ML Kaggle Competition

In [1]:
import csv
import numpy as np
import pandas as pd
import random

# Labelled training set: every column except the last is a feature, the last is the label.
train = pd.read_csv('data.csv', sep=",")
train_labels = train.iloc[:, -1]
train_data = train.iloc[:, :-1]

# Rows to predict (read without splitting off a label column).
test_data = pd.read_csv('quiz.csv', sep=",")

train_obs = len(train_data)
test_obs = len(test_data)

# Stack training rows on top of the quiz rows so dtype changes are applied to both at once;
# the first `train_obs` rows of `all_data` are the training rows.
all_data = pd.concat([train_data, test_data])

# Change dtype of categorical columns
categorical_columns = ['0','5','7','8','9','14','16','17','18','20','23','25','26','56','57','58']
for column_name in categorical_columns:
    all_data[column_name] = all_data[column_name].astype('category')

In [2]:
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, SVR
from sklearn.tree import ExtraTreeClassifier
from xgboost import XGBClassifier

def cv_run_ada(train_data, train_labels, test_data, test_labels):
    """Fit AdaBoost over extremely-randomized trees and return the held-out error rate.

    Parameters
    ----------
    train_data, train_labels
        Features and labels the model is fitted on.
    test_data, test_labels
        Held-out features and their true labels, used only for scoring.

    Returns
    -------
    float
        1 minus the fraction of held-out rows whose prediction equals the true label.
    """
    # The weak learner is passed positionally: the keyword was renamed from
    # `base_estimator` to `estimator` in newer scikit-learn releases, while the
    # first positional slot is the weak learner under both names.
    model = AdaBoostClassifier(ExtraTreeClassifier()).fit(train_data, train_labels)
    preds = model.predict(test_data)
    error = 1 - sum(preds == test_labels)/float(len(test_labels))
    return error

def cv_run_bag(train_data, train_labels, test_data, test_labels):
    """Fit a bagging ensemble on the training split and score it on the held-out split.

    Each base estimator is given 75% of the features (``max_features=0.75``) and the
    random state is fixed at 1.

    Returns 1 minus the fraction of held-out rows whose prediction equals the true label.
    """
    bagger = BaggingClassifier(max_features=0.75, random_state=1)
    bagger.fit(train_data, train_labels)
    predicted = bagger.predict(test_data)
    n_correct = sum(predicted == test_labels)
    return 1 - n_correct/float(len(test_labels))

def cv_run_et(train_data, train_labels, test_data, test_labels):
    """Fit an extra-trees ensemble on the training split and return the held-out error rate.

    Parameters
    ----------
    train_data, train_labels
        Features (2-D, one column per feature) and labels the model is fitted on.
    test_data, test_labels
        Held-out features and their true labels, used only for scoring.

    Returns
    -------
    float
        1 minus the fraction of held-out rows whose prediction equals the true label.

    Notes
    -----
    The configured ``max_features`` of 1743 is capped at the number of input columns.
    scikit-learn rejects an integer ``max_features`` larger than the feature count, so
    without the cap this helper cannot be used on inputs with fewer columns than that.
    An earlier variant used ``max_features=None`` (all features) with otherwise
    default settings.
    """
    n_features = np.shape(train_data)[1]
    model = ExtraTreesClassifier(n_jobs=-1, min_samples_leaf=1, n_estimators=40,
                                 min_samples_split=3, random_state=1,
                                 max_features=min(1743, n_features),
                                 max_depth=None).fit(train_data, train_labels)
    preds = model.predict(test_data)
    error = 1 - sum(preds == test_labels)/float(len(test_labels))
    return error

def cv_run_gb(train_data, train_labels, test_data, test_labels):
    """Fit gradient boosting (exponential loss, 200 stages) and score it on the held-out split.

    All features are considered at every split (``max_features=None``) and the random
    state is fixed at 1.

    Returns 1 minus the fraction of held-out rows whose prediction equals the true label.
    """
    booster = GradientBoostingClassifier(
        loss='exponential',
        n_estimators=200,
        max_features=None,
        random_state=1,
    )
    booster.fit(train_data, train_labels)
    predicted = booster.predict(test_data)
    n_correct = sum(predicted == test_labels)
    return 1 - n_correct/float(len(test_labels))

def cv_run_knn(train_data, train_labels, test_data, test_labels, n_neigh):
    """Fit a k-nearest-neighbours classifier and score it on the held-out split.

    ``n_neigh`` is forwarded as the ``n_neighbors`` setting of the classifier.

    Returns 1 minus the fraction of held-out rows whose prediction equals the true label.
    """
    knn = KNeighborsClassifier(n_neighbors=n_neigh)
    knn.fit(train_data, train_labels)
    predicted = knn.predict(test_data)
    n_correct = sum(predicted == test_labels)
    return 1 - n_correct/float(len(test_labels))

def cv_run_logistic(train_data, train_labels, test_data, test_labels):
    """Fit an L1-penalised logistic regression and return the held-out error rate.

    Parameters
    ----------
    train_data, train_labels
        Features and labels the model is fitted on.
    test_data, test_labels
        Held-out features and their true labels, used only for scoring.

    Returns
    -------
    float
        1 minus the fraction of held-out rows whose prediction equals the true label.
    """
    # The solver is pinned explicitly: newer scikit-learn defaults to 'lbfgs', which
    # does not support the L1 penalty, whereas 'liblinear' (the former default) does.
    # `n_jobs` is not passed because the liblinear solver ignores it.
    model = LogisticRegression(penalty='l1',
                               C=0.9029677391429398,
                               solver='liblinear',
                               random_state=1).fit(train_data, train_labels)
    preds = model.predict(test_data)
    error = 1 - sum(preds == test_labels)/float(len(test_labels))
    return error

def cv_run_nb(train_data, train_labels, test_data, test_labels):
    """Fit a Gaussian naive Bayes model with default settings and score it on the held-out split.

    Returns 1 minus the fraction of held-out rows whose prediction equals the true label.
    """
    bayes = GaussianNB()
    bayes.fit(train_data, train_labels)
    predicted = bayes.predict(test_data)
    n_correct = sum(predicted == test_labels)
    return 1 - n_correct/float(len(test_labels))

def cv_run_neural(train_data, train_labels, test_data, test_labels):
    """Fit a multi-layer perceptron with default settings and score it on the held-out split.

    No random state is set, so results can vary between runs.

    Returns 1 minus the fraction of held-out rows whose prediction equals the true label.
    """
    network = MLPClassifier()
    network.fit(train_data, train_labels)
    predicted = network.predict(test_data)
    n_correct = sum(predicted == test_labels)
    return 1 - n_correct/float(len(test_labels))

def cv_run_rf(train_data, train_labels, test_data, test_labels):
    """Fit a random forest on the training split and return the held-out error rate.

    Parameters
    ----------
    train_data, train_labels
        Features (2-D, one column per feature) and labels the model is fitted on.
    test_data, test_labels
        Held-out features and their true labels, used only for scoring.

    Returns
    -------
    float
        1 minus the fraction of held-out rows whose prediction equals the true label.

    Notes
    -----
    The configured ``max_features`` of 966 is capped at the number of input columns.
    scikit-learn rejects an integer ``max_features`` larger than the feature count, so
    without the cap this helper cannot be used on inputs with fewer columns than that.
    """
    n_features = np.shape(train_data)[1]
    model = RandomForestClassifier(n_jobs=-1, min_samples_leaf=1, n_estimators=50,
                                   min_samples_split=4, random_state=1,
                                   max_features=min(966, n_features),
                                   max_depth=None).fit(train_data, train_labels)
    preds = model.predict(test_data)
    error = 1 - sum(preds == test_labels)/float(len(test_labels))
    return error

def cv_run_sgd(train_data, train_labels, test_data, test_labels):
    """Fit an SGD-trained linear classifier with perceptron loss and score it on the held-out split.

    No random state is set, so results can vary between runs.

    Returns 1 minus the fraction of held-out rows whose prediction equals the true label.
    """
    perceptron = SGDClassifier(loss='perceptron')
    perceptron.fit(train_data, train_labels)
    predicted = perceptron.predict(test_data)
    n_correct = sum(predicted == test_labels)
    return 1 - n_correct/float(len(test_labels))

def cv_run_svm(train_data, train_labels, test_data, test_labels):
    """Fit a support-vector classifier with default settings and return the held-out error rate.

    Parameters
    ----------
    train_data, train_labels
        Features and labels the model is fitted on.
    test_data, test_labels
        Held-out features and their true labels, used only for scoring.

    Returns
    -------
    float
        1 minus the fraction of held-out rows whose prediction equals the true label.
    """
    # A classifier (SVC) is required here: the error below counts exact label matches,
    # and a regressor such as SVR predicts real-valued outputs that would not, in
    # general, equal any class label.
    model = SVC().fit(train_data, train_labels)
    preds = model.predict(test_data)
    error = 1 - sum(preds == test_labels)/float(len(test_labels))
    return error

def cv_run_xg(train_data, train_labels, test_data, test_labels):
    """Fit an XGBoost classifier with default settings and score it on the held-out split.

    Returns 1 minus the fraction of held-out rows whose prediction equals the true label.
    """
    booster = XGBClassifier()
    booster.fit(train_data, train_labels)
    predicted = booster.predict(test_data)
    n_correct = sum(predicted == test_labels)
    return 1 - n_correct/float(len(test_labels))

## Simple prediction with only numerical variables

Quick benchmarking predictions only using the numerical variables.

In [3]:
# Keep only the non-categorical columns, then cut the stacked frame back into
# its training rows (first `train_obs`) and quiz rows (the remainder).
all_data_num = all_data.drop(columns=categorical_columns)
train_data_num = all_data_num.iloc[:train_obs]
test_data_num = all_data_num.iloc[train_obs:]

In [None]:
# 5-fold cross-validation on the numeric-only features.
random.seed(1)  # fixed seed so the shuffled fold assignment is repeatable
n_folds = 5
num_train = len(train_data_num)
indices = random.sample(range(num_train), num_train)  # shuffled row positions
cv_folds = np.array_split(indices, n_folds)

cv_errors = []
for i, fold in enumerate(cv_folds):
    # `fold` is a subset of `indices`, so the symmetric difference is exactly
    # the set of row positions outside the held-out fold.
    cv_train_index = np.setxor1d(indices, fold)
    cv_test_index = fold

    # The fold indices are row positions, so features and labels are both
    # selected positionally with .iloc.
    cv_train_data = train_data_num.iloc[cv_train_index,:]
    cv_train_labels = train_labels.iloc[cv_train_index]

    cv_test_data = train_data_num.iloc[cv_test_index,:]
    cv_test_labels = train_labels.iloc[cv_test_index]

    print("Starting fold #{}".format(i+1))
    # Model under evaluation. The other helpers take the same four arguments and can
    # be swapped in here: cv_run_ada, cv_run_neural, cv_run_xg, cv_run_svm, cv_run_nb,
    # cv_run_logistic, cv_run_rf, cv_run_bag, cv_run_sgd. cv_run_knn takes the number
    # of neighbours as a fifth argument (20 was the last value tried).
    fold_error = cv_run_et(cv_train_data, cv_train_labels, cv_test_data, cv_test_labels)
    print(" Fold error (#{}): {}".format(i+1, fold_error))
    cv_errors.append(fold_error)

# Mean held-out error across the folds.
print(sum(cv_errors)/float(n_folds))

# AdaBoost, default: 0.269290547594
# AdaBoost (n_estimators=200, learning_rate=0.5): 0.269448216614
# AdaBoost (n_estimators=200, learning_rate=0.1): 0.269053944606
# AdaBoost (base_estimator=ExtraTreeClassifier(max_depth=1)): 0.28954478716
# AdaBoost (base_estimator=ExtraTreeClassifier(max_depth=2)): 0.274588670919
# AdaBoost (base_estimator=ExtraTreeClassifier()): 0.117907238733 (?)
# Neural network, default settings: 0.25234751258
# Neural network, default ('logistic'): 0.244794532236
# XGBoost, default: 0.25718831277
# kNN, k=1: 0.139336326082
# kNN, k=2: 0.135756900122
# kNN, k=3: 0.153677542531
# kNN, k=5: 0.170470769768
# kNN, k=10: 0.192979920115
# kNN, k=20: 0.216529877817
# Gaussian NB: 0.326915637508
# Logistic Regression: 0.278767271043
# Random Forest, default, random_state=1: 0.117308028517
# Random Forest, random_state=1, criterion='entropy': 0.117615504054
# Random Forest, n_estimators=20: 0.114233264756
# Random Forest, max_features=None: 0.116054499698
# Random Forest, n_estimators=20, max_features=None: 0.11560505746
# Extra Trees, default: 0.111379161105
# Extra Trees, n_estimators=15: 0.11488761941
# Extra Trees, n_estimators=20: 0.112687928843
# Extra Trees, n_estimators=30: 0.112538147811
# Extra Trees, n_estimators=50: 0.112459332881
# Extra Trees, max_features=None: 0.11061440569
# Bagging, default: 0.114848167656
# Bagging, n_estimators=20: 0.113973028885
# SGD, loss='perceptron': 0.350607692135
# Gradient Boosting, default: 0.256360526245
# Gradient Boosting, loss='exponential', n_estimators=200, max_features=None, random_state=1: 0.253285713067

In [None]:
# Baseline model: default AdaBoost fitted on every numeric-only training row,
# then used to predict the quiz rows.
model = AdaBoostClassifier()
model.fit(train_data_num, train_labels)
preds = model.predict(test_data_num)

# Submission writer for the baseline predictions (left disabled):
# with open('simple_results.csv', 'w', newline='') as f:
#     writer = csv.writer(f)
#     writer.writerow(("Id","Prediction"))
#     writer.writerows(zip(range(1,len(preds)+1), preds))

## Incorporating Categorical Variable with One Hot Encoding

Combine training and testing data so that the encoder sees every category, then split again once the columns are ready.

In [4]:
# One-hot encode the categorical columns on the stacked (train + quiz) frame so both
# parts end up with the same indicator columns, then cut it back into the two parts.
categorical_frame = all_data[categorical_columns]
all_data_cat = pd.get_dummies(categorical_frame)
train_data_cat = all_data_cat.iloc[:train_obs]
test_data_cat = all_data_cat.iloc[train_obs:]

# Final feature matrices: numeric columns followed by the indicator columns.
train_data_combo = pd.concat([train_data_num, train_data_cat], axis=1)
test_data_combo = pd.concat([test_data_num, test_data_cat], axis=1)

In [5]:
# 5-fold cross-validation on the numeric + one-hot encoded features.
random.seed(1)  # fixed seed so the shuffled fold assignment is repeatable
n_folds = 5
num_train = len(train_data_combo)
indices = random.sample(range(num_train), num_train)  # shuffled row positions
cv_folds = np.array_split(indices, n_folds)

cv_errors = []
for i, fold in enumerate(cv_folds):
    # `fold` is a subset of `indices`, so the symmetric difference is exactly
    # the set of row positions outside the held-out fold.
    cv_train_index = np.setxor1d(indices, fold)
    cv_test_index = fold

    # The fold indices are row positions, so features and labels are both
    # selected positionally with .iloc.
    cv_train_data = train_data_combo.iloc[cv_train_index,:]
    cv_train_labels = train_labels.iloc[cv_train_index]

    cv_test_data = train_data_combo.iloc[cv_test_index,:]
    cv_test_labels = train_labels.iloc[cv_test_index]

    print("Starting fold #{}".format(i+1))
    # Model under evaluation; cv_run_knn with 2 neighbours was the previous candidate.
    fold_error = cv_run_rf(cv_train_data, cv_train_labels, cv_test_data, cv_test_labels)
    print(" Fold error (#{}): {}".format(i+1, fold_error))
    cv_errors.append(fold_error)

# Mean held-out error across the folds.
print(sum(cv_errors)/float(n_folds))
# AdaBoost (n_estimators=200, learning_rate=0.5): 0.117260743164
# Extra Trees, default: 0.0631676876621
#    max_features = sqrt(5932) ~ 77
# Extra Trees, max_features=100: 0.0618510371579
# Extra Trees, max_features=200: 0.060558039058
# Extra Trees, max_features=300: 0.0605028501147
# Extra Trees, max_features=400: 0.0586185419081
# Extra Trees, max_features=400, n_estimators=20: 0.0582874082484
# Extra Trees, max_features=700: 0.0591231265325
# Extra Trees, max_features=1000, n_jobs=-1, random_state=1: 0.0587289197947
# Extra Trees, max_features=1500, n_jobs=-1, random_state=1: 0.0560088826335
# Extra Trees, max_features=2500, n_jobs=-1, random_state=1: 0.0558354488247
# Extra Trees, max_features=2500, n_estimators=20, n_jobs=-1, random_state=1: 0.0560798378573
# Extra Trees, max_features=None, n_jobs=-1, random_state=1: 0.0557408324323 (0.94658)
# Extra Trees, n_jobs: -1, min_samples_leaf: 2, n_estimators: 97,
#                                  min_samples_split: 2, random_state: 1,
#                                  max_features: 2551, max_depth: None: 0.0576172547022
# Extra Trees, n_jobs=-1, min_samples_leaf=1, n_estimators=40,
#                                  min_samples_split=3, random_state=1,
#                                  max_features=1743, max_depth=None: 0.0544951412539
# Random Forest, default, random_state=1: 0.0604555488096
# Random Forest, max_features=200: 0.0577512973628
# Random Forest, n_jobs=-1, min_samples_leaf=1, n_estimators=77,
#                                    min_samples_split=2, random_state=1, max_features=771,
#                                    max_depth=None: 0.0534938540261
# Random Forest: n_jobs=-1, min_samples_leaf=1, n_estimators=71,
#                                    min_samples_split=4, random_state=1, max_features=1148,
#                                    max_depth=None: 0.054361106674
# Random Forest: n_jobs=-1, min_samples_leaf=1, n_estimators=50,
#                                    min_samples_split=4, random_state=1, max_features=966,
#                                    max_depth=None: 0.054250737614
# Logistic: 0.106388517105
# Logistic, penalty='l2', C=4, n_jobs=-1: 0.106625055755
# Logistic, penalty='l1', C=0.9029677391429398, n_jobs=-1: 0.106506785964
# Bagging, max_features=0.07, random_state=1: 0.130285290212
# Bagging, max_features=0.20, random_state=1: 0.0653121753236
# Bagging, max_features=0.50, random_state=1: 0.0560640870545
# Bagging, max_features=0.75, random_state=1: 0.0568524849905

Starting fold #1
 Fold error (#1): 0.05376852727846104
Starting fold #2
 Fold error (#2): 0.05392620624408706
Starting fold #3
 Fold error (#3): 0.05487444317420276
Starting fold #4
 Fold error (#4): 0.054204281152678724
Starting fold #5
 Fold error (#5): 0.05448023022036508
0.054250737614


In [10]:
# Earlier candidate, kept for reference:
# model = AdaBoostClassifier(n_estimators=200, learning_rate=0.5).fit(train_data_combo, train_labels)

# Fit the random forest on every training row (numeric + one-hot features)
# and predict the quiz rows.
model = RandomForestClassifier(
    n_estimators=77,
    max_features=771,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    n_jobs=-1,
    random_state=1,
)
model.fit(train_data_combo, train_labels)
preds = model.predict(test_data_combo)

# Submission file: a header row, then one (1-based Id, predicted label) row per quiz row.
with open('results/20160413(rf).csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(("Id","Prediction"))
    writer.writerows(enumerate(preds, start=1))

## Ignoring two large columns

Columns 23 and 58 contribute 3,031 and 2,090 categories/columns (total 5,121). Removing them should make training a lot easier for some methods.

In [15]:
# all_data['23'].cat.categories
# all_data['58'].cat.categories
# Categorical columns to encode, leaving out '23' and '58'.
categorical_columns_tiny = ['0','5','7','8','9','14','16','17','18','20','25','26','56','57']

# One-hot encode on the stacked (train + quiz) frame, then cut it back into the two parts.
categorical_frame_tiny = all_data[categorical_columns_tiny]
all_data_cat_tiny = pd.get_dummies(categorical_frame_tiny)
train_data_cat_tiny = all_data_cat_tiny.iloc[:train_obs]
test_data_cat_tiny = all_data_cat_tiny.iloc[train_obs:]

# Reduced feature matrices: numeric columns followed by the indicator columns.
train_data_combo_tiny = pd.concat([train_data_num, train_data_cat_tiny], axis=1)
test_data_combo_tiny = pd.concat([test_data_num, test_data_cat_tiny], axis=1)

In [35]:
# 5-fold cross-validation on the reduced (numeric + small one-hot) feature set.
random.seed(1)  # fixed seed so the shuffled fold assignment is repeatable
n_folds = 5
num_train = len(train_data_combo_tiny)
indices = random.sample(range(num_train), num_train)  # shuffled row positions
cv_folds = np.array_split(indices, n_folds)

cv_errors = []
for i, fold in enumerate(cv_folds):
    # `fold` is a subset of `indices`, so the symmetric difference is exactly
    # the set of row positions outside the held-out fold.
    cv_train_index = np.setxor1d(indices, fold)
    cv_test_index = fold

    # The fold indices are row positions, so features and labels are both
    # selected positionally with .iloc.
    cv_train_data = train_data_combo_tiny.iloc[cv_train_index,:]
    cv_train_labels = train_labels.iloc[cv_train_index]

    cv_test_data = train_data_combo_tiny.iloc[cv_test_index,:]
    cv_test_labels = train_labels.iloc[cv_test_index]

    print("Starting fold #{}".format(i+1))
    # Model under evaluation. Other candidates tried on this feature set:
    # cv_run_neural, cv_run_rf, and cv_run_knn with 10 neighbours.
    fold_error = cv_run_bag(cv_train_data, cv_train_labels, cv_test_data, cv_test_labels)
    print(" Fold error (#{}): {}".format(i+1, fold_error))
    cv_errors.append(fold_error)

# Mean held-out error across the folds.
print(sum(cv_errors)/float(n_folds))

# XGBoost, default: 0.113224072446
# kNN, k=1: 0.08194771365
# kNN, k=2: 0.0818609774763
# kNN, k=5: 0.108130910389
# kNN, k=10: 0.124293373591
# Bagging, default: 0.0595804119027
# Bagging, n_estimators=20: 0.0583583592049

Starting fold #1
 Fold error (#1): 0.05845947650583416
Starting fold #2
 Fold error (#2): 0.05905077262693159
Starting fold #3
 Fold error (#3): 0.059486734734103375
Starting fold #4
 Fold error (#4): 0.05649071628493707
Starting fold #5
 Fold error (#5): 0.058304095872590334
0.0583583592049
