In [12]:
import csv
import numpy as np
import pandas as pd
import random

#########################
# Load the datasets
# The last column of data.csv is the label; every column before it is a feature.
train = pd.read_csv('data.csv')
train_labels = train.iloc[:, -1]
train_data = train.iloc[:, :-1]

test_data = pd.read_csv('quiz.csv')

# Train and quiz rows are stacked (train first) so that the dtype changes and
# the dummy encoding below are applied to both sets at once; train_obs is the
# row position where the quiz rows begin.
train_obs = len(train_data)
test_obs = len(test_data)
all_data = pd.concat([train_data, test_data])

# Change dtype of categorical columns
categorical_columns = ['0','5','7','8','9','14','16','17','18','20','23','25','26','56','57','58']
for column in categorical_columns:
    all_data[column] = all_data[column].astype('category')

#########################
# Only numerical data
all_data_num = all_data.drop(categorical_columns, axis=1)
train_data_num = all_data_num.iloc[:train_obs]
test_data_num = all_data_num.iloc[train_obs:]

#########################
# Only categorial data (one indicator column per category level)
all_data_cat = pd.get_dummies(all_data[categorical_columns])
train_data_cat = all_data_cat.iloc[:train_obs]
test_data_cat = all_data_cat.iloc[train_obs:]

#########################
# Ignoring two large columns ('slim'): same categorical columns minus '23' and '58'
categorical_columns_slim = [column for column in categorical_columns
                            if column not in ('23', '58')]

all_data_cat_slim = pd.get_dummies(all_data[categorical_columns_slim])
train_data_cat_slim = all_data_cat_slim.iloc[:train_obs]
test_data_cat_slim = all_data_cat_slim.iloc[train_obs:]

#########################
# Combined sets: numerical columns side by side with the dummy columns
train_data_combo = pd.concat([train_data_num, train_data_cat], axis=1)
test_data_combo = pd.concat([test_data_num, test_data_cat], axis=1)

train_data_combo_slim = pd.concat([train_data_num, train_data_cat_slim], axis=1)
test_data_combo_slim = pd.concat([test_data_num, test_data_cat_slim], axis=1)

In [252]:
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
# from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, SVR
from sklearn.tree import ExtraTreeClassifier
# from xgboost import XGBClassifier

def cv_run_ada(train_data, train_labels, test_data, test_labels):
    """Fit a default AdaBoostClassifier on the training split and score it.

    Returns a ``(preds, error)`` pair: the labels predicted for ``test_data``
    and one minus the share of those predictions equal to ``test_labels``.
    """
    classifier = AdaBoostClassifier()
    fitted = classifier.fit(train_data, train_labels)
    preds = fitted.predict(test_data)
    n_correct = sum(preds == test_labels)
    accuracy = n_correct / float(len(test_labels))
    return preds, 1 - accuracy

def cv_run_bag(train_data, train_labels, test_data, test_labels):
    """Fit a BaggingClassifier (max_features=0.75, random_state=1) and score it.

    Returns a ``(preds, error)`` pair: the labels predicted for ``test_data``
    and one minus the share of those predictions equal to ``test_labels``.
    """
    classifier = BaggingClassifier(random_state=1, max_features=0.75)
    fitted = classifier.fit(train_data, train_labels)
    preds = fitted.predict(test_data)
    n_correct = sum(preds == test_labels)
    accuracy = n_correct / float(len(test_labels))
    return preds, 1 - accuracy

def cv_run_et(train_data, train_labels, test_data, test_labels):
    """Fit the tuned ExtraTreesClassifier on the training split and score it.

    The forest uses 20 estimators, max_features=400, all cores (n_jobs=-1)
    and random_state=1.

    Returns a ``(preds, error)`` pair: the labels predicted for ``test_data``
    and one minus the share of those predictions equal to ``test_labels``.
    """
    classifier = ExtraTreesClassifier(n_estimators=20, max_features=400,
                                      random_state=1, n_jobs=-1)
    fitted = classifier.fit(train_data, train_labels)
    preds = fitted.predict(test_data)
    n_correct = sum(preds == test_labels)
    accuracy = n_correct / float(len(test_labels))
    return preds, 1 - accuracy

def cv_run_et2(train_data, train_labels, test_data, test_labels):
    """Fit an ExtraTreesClassifier with max_features=None and score it.

    Differs from ``cv_run_et`` only in its constructor arguments
    (max_features=None, n_jobs=-1, random_state=1; estimator count left at
    the library default).

    Returns a ``(preds, error)`` pair: the labels predicted for ``test_data``
    and one minus the share of those predictions equal to ``test_labels``.
    """
    classifier = ExtraTreesClassifier(max_features=None, random_state=1, n_jobs=-1)
    fitted = classifier.fit(train_data, train_labels)
    preds = fitted.predict(test_data)
    n_correct = sum(preds == test_labels)
    accuracy = n_correct / float(len(test_labels))
    return preds, 1 - accuracy

def cv_run_gb(train_data, train_labels, test_data, test_labels):
    """Fit a GradientBoostingClassifier on the training split and score it.

    The booster is built with loss='exponential', 200 estimators,
    max_features=None and random_state=1.

    Returns a ``(preds, error)`` pair: the labels predicted for ``test_data``
    and one minus the share of those predictions equal to ``test_labels``.
    """
    classifier = GradientBoostingClassifier(loss='exponential', n_estimators=200,
                                            max_features=None, random_state=1)
    fitted = classifier.fit(train_data, train_labels)
    preds = fitted.predict(test_data)
    n_correct = sum(preds == test_labels)
    accuracy = n_correct / float(len(test_labels))
    return preds, 1 - accuracy

def cv_run_knn(train_data, train_labels, test_data, test_labels, n_neigh):
    """Fit a KNeighborsClassifier with ``n_neigh`` neighbours and score it.

    ``n_neigh`` is forwarded as the classifier's ``n_neighbors`` argument.

    Returns a ``(preds, error)`` pair: the labels predicted for ``test_data``
    and one minus the share of those predictions equal to ``test_labels``.
    """
    classifier = KNeighborsClassifier(n_neighbors=n_neigh)
    fitted = classifier.fit(train_data, train_labels)
    preds = fitted.predict(test_data)
    n_correct = sum(preds == test_labels)
    accuracy = n_correct / float(len(test_labels))
    return preds, 1 - accuracy

def cv_run_logistic(train_data, train_labels, test_data, test_labels):
    """Fit a default LogisticRegression on the training split and score it.

    Returns a ``(preds, error)`` pair: the labels predicted for ``test_data``
    and one minus the share of those predictions equal to ``test_labels``.
    """
    classifier = LogisticRegression()
    fitted = classifier.fit(train_data, train_labels)
    preds = fitted.predict(test_data)
    n_correct = sum(preds == test_labels)
    accuracy = n_correct / float(len(test_labels))
    return preds, 1 - accuracy

def cv_run_nb(train_data, train_labels, test_data, test_labels):
    """Fit a default GaussianNB on the training split and score it.

    Returns a ``(preds, error)`` pair: the labels predicted for ``test_data``
    and one minus the share of those predictions equal to ``test_labels``.
    """
    classifier = GaussianNB()
    fitted = classifier.fit(train_data, train_labels)
    preds = fitted.predict(test_data)
    n_correct = sum(preds == test_labels)
    accuracy = n_correct / float(len(test_labels))
    return preds, 1 - accuracy

# def cv_run_neural(train_data, train_labels, test_data, test_labels):
#     model = MLPClassifier().fit(train_data, train_labels)
#     preds = model.predict(test_data)
#     error = 1 - sum(preds == test_labels)/float(len(test_labels))
#     return preds, error

def cv_run_rf(train_data, train_labels, test_data, test_labels):
    """Fit a RandomForestClassifier (max_features=200, random_state=1) and score it.

    Returns a ``(preds, error)`` pair: the labels predicted for ``test_data``
    and one minus the share of those predictions equal to ``test_labels``.
    """
    classifier = RandomForestClassifier(random_state=1, max_features=200)
    fitted = classifier.fit(train_data, train_labels)
    preds = fitted.predict(test_data)
    n_correct = sum(preds == test_labels)
    accuracy = n_correct / float(len(test_labels))
    return preds, 1 - accuracy

def cv_run_rf2(train_data, train_labels, test_data, test_labels):
    """Fit a RandomForestClassifier (max_features=None, random_state=1) and score it.

    Differs from ``cv_run_rf`` only in passing max_features=None.

    Returns a ``(preds, error)`` pair: the labels predicted for ``test_data``
    and one minus the share of those predictions equal to ``test_labels``.
    """
    classifier = RandomForestClassifier(random_state=1, max_features=None)
    fitted = classifier.fit(train_data, train_labels)
    preds = fitted.predict(test_data)
    n_correct = sum(preds == test_labels)
    accuracy = n_correct / float(len(test_labels))
    return preds, 1 - accuracy

def cv_run_sgd(train_data, train_labels, test_data, test_labels):
    """Fit an SGDClassifier with loss='perceptron' and score it.

    No random_state is passed to the classifier.

    Returns a ``(preds, error)`` pair: the labels predicted for ``test_data``
    and one minus the share of those predictions equal to ``test_labels``.
    """
    classifier = SGDClassifier(loss='perceptron')
    fitted = classifier.fit(train_data, train_labels)
    preds = fitted.predict(test_data)
    n_correct = sum(preds == test_labels)
    accuracy = n_correct / float(len(test_labels))
    return preds, 1 - accuracy

def cv_run_svm(train_data, train_labels, test_data, test_labels):
    """Fit a default support-vector classifier on the training split and score it.

    Uses ``SVC`` rather than ``SVR``: the error below counts exact matches
    between predictions and labels, which only makes sense for a classifier
    that outputs class labels, not for a regressor's continuous output.

    Returns a ``(preds, error)`` pair: the labels predicted for ``test_data``
    and one minus the share of those predictions equal to ``test_labels``.
    """
    model = SVC().fit(train_data, train_labels)
    preds = model.predict(test_data)
    error = 1 - sum(preds == test_labels)/float(len(test_labels))
    return preds, error

# def cv_run_xg(train_data, train_labels, test_data, test_labels):
#     model = XGBClassifier().fit(train_data, train_labels)
#     preds = model.predict(test_data)
#     error = 1 - sum(preds == test_labels)/float(len(test_labels))
#     return preds, error

In [170]:
#########################
# Cross validation (first stage)
# Shuffle the row positions once with a fixed seed, then cut the shuffled
# order into n_folds consecutive chunks; each chunk serves as one hold-out fold.
random.seed(1)
n_folds = 5
num_train = len(train_data_combo)
indices = random.sample(xrange(num_train), num_train)
cv_folds = np.array_split(indices, n_folds)

# One block of out-of-fold predictions per fold (one column per model),
# collected in fold order so the stacked rows follow the order of `indices`.
fold_blocks = []
for i, fold in enumerate(cv_folds):
    # fold is a subset of indices, so the symmetric difference is exactly
    # the set of row positions outside this fold.
    cv_test_index = fold
    cv_train_index = np.setxor1d(indices, fold)

    # Labels are looked up by index label; this agrees with the positional
    # .iloc selection below as long as train_labels keeps a default 0..n-1 index.
    cv_train_labels = train_labels[cv_train_index]
    cv_test_labels = train_labels[cv_test_index]

    cv_train_data = train_data_combo.iloc[cv_train_index,:]
    cv_test_data = train_data_combo.iloc[cv_test_index,:]

    cv_train_data_slim = train_data_combo_slim.iloc[cv_train_index,:]
    cv_test_data_slim = train_data_combo_slim.iloc[cv_test_index,:]

    print("Starting fold #{}".format(i+1))
    # The numbering gaps are intentional: model 3 (bagging) and models 6-8
    # (KNN with 2, 5 and 10 neighbours) were tried earlier and are disabled.
    preds_1, error_1 = cv_run_et(cv_train_data, cv_train_labels, cv_test_data, cv_test_labels)
    preds_2, error_2 = cv_run_rf(cv_train_data, cv_train_labels, cv_test_data, cv_test_labels)
    preds_4, error_4 = cv_run_logistic(cv_train_data, cv_train_labels, cv_test_data, cv_test_labels)
    # KNN is the only model fed the 'slim' feature set.
    preds_5, error_5 = cv_run_knn(cv_train_data_slim, cv_train_labels, cv_test_data_slim, cv_test_labels, 1)

    fold_preds = np.column_stack((preds_1, preds_2, preds_4, preds_5))
    fold_blocks.append(fold_preds)

    print("Fold #{} errors:".format(i+1))
    print(error_1, error_2, error_4, error_5)
    print('')

cv_preds = np.vstack(fold_blocks)

Starting fold #1
Fold #1 errors:
(0.057237464522232773, 0.058577735730053648, 0.10410753705455689, 0.084752444023967199)

Starting fold #2
Fold #2 errors:
(0.056685588142541765, 0.058971933144118527, 0.10726111636707658, 0.08230842005676442)

Starting fold #3
Fold #3 errors:
(0.055859975558796848, 0.057160878306461105, 0.10588559940079634, 0.079867544447510541)

Starting fold #4
Fold #4 errors:
(0.056293609808018341, 0.059526156029487076, 0.10702881696692557, 0.082942405487444359)

Starting fold #5
Fold #5 errors:
(0.056727244057239723, 0.057752197737217625, 0.1092364095084164, 0.08451925730279497)



In [239]:
#########################
# CV Errors
# cv_preds was stacked fold by fold, i.e. in the shuffled order stored in
# `indices`, so the labels are put into that same order before comparing.
cv_labels = train_labels[indices]
for i in range(cv_preds.shape[1]):
    print("Method #{}: {}".format(i, 1 - sum(cv_preds[:,i] == cv_labels)/float(len(cv_labels))))
# Keep only the raw label array for the second stage. Series.values replaces
# Series.as_matrix(), which is deprecated and removed in pandas 1.0.
cv_labels = cv_labels.values

Method #0: 0.0565607827369
Method #1: 0.058397786135
Method #2: 0.106703879783
Method #3: 0.0828780245512


In [255]:
# Second-stage cross validation: the features are the first-stage
# out-of-fold predictions (one column per base model).
cv_preds_stack = pd.DataFrame(cv_preds)

n = len(cv_labels)
# sklearn.cross_validation was removed in scikit-learn 0.20. Prefer the
# model_selection API and fall back to the legacy module on old installs.
# Both produce the same five consecutive, unshuffled folds.
try:
    from sklearn.model_selection import KFold
    # split() returns a one-shot generator; materialise it so kf can be
    # iterated more than once, like the legacy KFold object.
    kf = list(KFold(n_splits=5).split(cv_preds_stack))
except ImportError:
    from sklearn.cross_validation import KFold
    kf = KFold(n, n_folds=5)

cv_errors = []
for i, (train, test) in enumerate(kf):
    cv_train_data = cv_preds_stack.iloc[train,:]
    cv_train_labels = cv_labels[train]
    cv_test_data = cv_preds_stack.iloc[test,:]
    cv_test_labels = cv_labels[test]
    
    print("Starting fold #{}".format(i+1))
    preds_0, error_0 = cv_run_ada(cv_train_data, cv_train_labels, cv_test_data, cv_test_labels)
    preds_1, error_1 = cv_run_et2(cv_train_data, cv_train_labels, cv_test_data, cv_test_labels)
    preds_2, error_2 = cv_run_rf2(cv_train_data, cv_train_labels, cv_test_data, cv_test_labels)
    preds_3, error_3 = cv_run_bag(cv_train_data, cv_train_labels, cv_test_data, cv_test_labels)
    preds_4, error_4 = cv_run_logistic(cv_train_data, cv_train_labels, cv_test_data, cv_test_labels)
    preds_5, error_5 = cv_run_knn(cv_train_data, cv_train_labels, cv_test_data, cv_test_labels, 1)
    preds_6, error_6 = cv_run_knn(cv_train_data, cv_train_labels, cv_test_data, cv_test_labels, 2)
    preds_7, error_7 = cv_run_knn(cv_train_data, cv_train_labels, cv_test_data, cv_test_labels, 5)
    preds_8, error_8 = cv_run_knn(cv_train_data, cv_train_labels, cv_test_data, cv_test_labels, 10)
    print(error_0, error_1, error_2, error_3, error_4, error_5, error_6, error_7, error_8)
    # Only the extra-trees stacker's error (error_1) is averaged below.
    cv_errors.append(error_1)
    
print(sum(cv_errors)/float(len(cv_errors)))

Starting fold #1
(0.056054872280037804, 0.055660674865972926, 0.055660674865972926, 0.055778934090192411, 0.056054872280037804, 0.076277199621570535, 0.076158940397350938, 0.057671081677704183, 0.055069378744875386)
Starting fold #2
(0.056843267108167783, 0.05534531693472089, 0.05534531693472089, 0.05534531693472089, 0.055581835383159861, 0.057158625039419708, 0.05861715547146007, 0.055069378744875386, 0.055227057710501404)
Starting fold #3
(0.0551109709465053, 0.053691804312689717, 0.052942799700398169, 0.052942799700398169, 0.053810068198841043, 0.061891433752513159, 0.059053100484881882, 0.053967753380376071, 0.054559072811132592)
Starting fold #4
(0.054953285764970272, 0.055820554263413147, 0.055544605195726682, 0.055544605195726682, 0.054953285764970272, 0.06094532266330277, 0.061024165254070284, 0.056214767217250716, 0.056884929238774751)
Starting fold #5
(0.054637915401900106, 0.054519651515748779, 0.054519651515748779, 0.054519651515748779, 0.054637915401900106, 0.0554263413095

## Train all models for export

In [259]:
print('Model 1')
# Refit the extra-trees base model on the full training set (same constructor
# arguments as cv_run_et) and predict the quiz rows.
model = ExtraTreesClassifier(n_estimators=20, max_features=400,
                             random_state=1, n_jobs=-1)
model = model.fit(train_data_combo, train_labels)
preds_1 = model.predict(test_data_combo)

Model 1


In [260]:
print('Model 2')
# Refit the random-forest base model on the full training set (same
# constructor arguments as cv_run_rf) and predict the quiz rows.
model = RandomForestClassifier(random_state=1, max_features=200)
model = model.fit(train_data_combo, train_labels)
preds_2 = model.predict(test_data_combo)

Model 2


In [261]:
print('Model 3')
# Refit the default logistic-regression base model on the full training set
# and predict the quiz rows.
model = LogisticRegression()
model = model.fit(train_data_combo, train_labels)
preds_3 = model.predict(test_data_combo)

Model 3


In [262]:
print('Model 4')
# Refit the 1-nearest-neighbour base model; it is trained and applied on the
# 'slim' feature set, unlike models 1-3.
model = KNeighborsClassifier(n_neighbors=1)
model = model.fit(train_data_combo_slim, train_labels)
preds_4 = model.predict(test_data_combo_slim)

Model 4


In [276]:
# One column per base model, in the order extra-trees, random forest,
# logistic regression, 1-NN.
base_model_preds = [preds_1, preds_2, preds_3, preds_4]
preds = np.column_stack(base_model_preds)

In [277]:
# Second stage: fit a random forest on the out-of-fold base-model predictions,
# then apply it to the base-model predictions for the quiz rows.
model = RandomForestClassifier(random_state=1, max_features=None)
model = model.fit(cv_preds_stack, cv_labels)
results = model.predict(preds)

In [270]:
# Export the stacked model's final predictions, one row per quiz observation
# with a 1-based Id. `results` is written rather than `preds`: `preds` is the
# multi-column matrix of base-model predictions fed into the stacker, so each
# "Prediction" cell would have been an array instead of a single label.
with open('results_20160401-1(stack).csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(("Id","Prediction"))
    writer.writerows(enumerate(results, 1))