# ML Kaggle Competition

In [25]:
import csv
import numpy as np
import pandas as pd
import random
from sklearn.ensemble import AdaBoostClassifier

# Load the training file; its last column is split off as the labels and
# every column before it is kept as the training features.
train = pd.read_csv('data.csv', sep=",")
train_labels = train.iloc[:, -1]
train_data = train.iloc[:, :-1]

# Load the quiz file, which is used as the test data.
test_data = pd.read_csv('quiz.csv', sep=",")

# Row counts of each set; train_obs marks where the test rows begin once the
# two frames are stacked.
train_obs = len(train_data)
test_obs = len(test_data)

# Stack training rows on top of test rows so that column-level changes are
# applied to both at once.
all_data = pd.concat([train_data, test_data])

# Change dtype of categorical columns
categorical_columns = ['0','5','7','8','9','14','16','17','18','20','23','25','26','56','57','58']
for column in categorical_columns:
    all_data[column] = all_data[column].astype('category')

## Simple prediction with only numerical variables

In [26]:
# Drop the categorical columns, then cut the stacked frame back into its
# training rows (the first train_obs rows) and its test rows (the remainder).
all_data_num = all_data.drop(categorical_columns, axis='columns')
train_data_num = all_data_num.iloc[:train_obs]
test_data_num = all_data_num.iloc[train_obs:]

def cv_run(train_data, train_labels, test_data, test_labels):
    """Fit an AdaBoost classifier and return its error rate on held-out data.

    Parameters
    ----------
    train_data, train_labels
        Features and labels the classifier is fitted on.
    test_data, test_labels
        Held-out features and the true labels they are scored against.

    Returns
    -------
    The misclassification rate: 1 minus the fraction of predictions that
    equal the corresponding entry of ``test_labels``.
    """
    fitted = AdaBoostClassifier().fit(train_data, train_labels)
    predicted = fitted.predict(test_data)

    # Element-wise comparison, then count the matches.
    n_correct = sum(predicted == test_labels)
    accuracy = n_correct/float(len(test_labels))
    return 1 - accuracy

# Number of cross-validation folds. It drives both the split and the final
# average, so the two cannot drift apart.
n_folds = 10

# Seed the shuffle so the fold assignment, and therefore the reported CV
# error, is the same after a kernel restart.
random.seed(0)

num_train = len(train_data_num)
# A random permutation of the row positions, cut into n_folds nearly equal parts.
indices = random.sample(range(num_train), num_train)
cv_folds = np.array_split(indices, n_folds)

fold_errors = []
for i, fold in enumerate(cv_folds):
    # Train on every row position that is not in the held-out fold.
    cv_train_index = np.setdiff1d(indices, fold)
    cv_test_index = fold

    # Features and labels are both selected by position (.iloc) so the split
    # does not depend on the index values carried by train_labels.
    cv_train_data = train_data_num.iloc[cv_train_index, :]
    cv_train_labels = train_labels.iloc[cv_train_index]

    cv_test_data = train_data_num.iloc[cv_test_index, :]
    cv_test_labels = train_labels.iloc[cv_test_index]

    print("Starting fold #{}".format(i+1))
    fold_errors.append(cv_run(cv_train_data, cv_train_labels, cv_test_data, cv_test_labels))

# Mean misclassification rate over the folds.
cv_error = sum(fold_errors)/float(n_folds)
print(cv_error)

# Refit on the full numerical training set and predict the test rows.
model = AdaBoostClassifier().fit(train_data_num, train_labels)
preds = model.predict(test_data_num)

# Writing these predictions to disk is currently disabled:
# with open('simple_results.csv', 'w', newline='') as f:
#     writer = csv.writer(f)
#     writer.writerow(("Id","Prediction"))
#     writer.writerows(zip(range(1,len(preds)+1), preds))

Starting fold #1
Starting fold #2
Starting fold #3
Starting fold #4
Starting fold #5
Starting fold #6
Starting fold #7
Starting fold #8
Starting fold #9
Starting fold #10
0.269290547594


## Incorporating Categorical Variables with One-Hot Encoding

In [27]:
# One-hot encode the categorical columns over the stacked train+test rows,
# then cut the encoded frame at train_obs into its training and test parts.
all_data_cat = pd.get_dummies(all_data[categorical_columns])
train_data_cat = all_data_cat.iloc[:train_obs]
test_data_cat = all_data_cat.iloc[train_obs:]

In [30]:
# Put the numerical columns and the one-hot encoded columns side by side.
train_data_combo = pd.concat([train_data_num, train_data_cat], axis='columns')
test_data_combo = pd.concat([test_data_num, test_data_cat], axis='columns')

In [35]:
def cv_run(train_data, train_labels, test_data, test_labels):
    """Train AdaBoost on one split and score it on the other.

    Fits an ``AdaBoostClassifier`` on ``train_data`` / ``train_labels``,
    predicts ``test_data``, and returns the share of predictions that differ
    from ``test_labels`` (computed as 1 minus the matching share).
    """
    # NOTE(review): this repeats the cv_run defined in the numerical-only
    # cell above; the later definition shadows the earlier one.
    preds = AdaBoostClassifier().fit(train_data, train_labels).predict(test_data)
    return 1 - sum(preds == test_labels)/float(len(test_labels))

# Number of cross-validation folds; drives both the split and the average.
n_folds = 5

# Seed the shuffle so the fold assignment, and therefore the reported CV
# error, is the same after a kernel restart.
random.seed(0)

num_train = len(train_data_combo)
# A random permutation of the row positions, cut into n_folds nearly equal parts.
indices = random.sample(range(num_train), num_train)
cv_folds = np.array_split(indices, n_folds)

fold_errors = []
for i, fold in enumerate(cv_folds):
    # Train on every row position that is not in the held-out fold.
    cv_train_index = np.setdiff1d(indices, fold)
    cv_test_index = fold

    # Features and labels are both selected by position (.iloc) so the split
    # does not depend on the index values carried by train_labels.
    cv_train_data = train_data_combo.iloc[cv_train_index, :]
    cv_train_labels = train_labels.iloc[cv_train_index]

    cv_test_data = train_data_combo.iloc[cv_test_index, :]
    cv_test_labels = train_labels.iloc[cv_test_index]

    print("Starting fold #{}".format(i+1))
    fold_errors.append(cv_run(cv_train_data, cv_train_labels, cv_test_data, cv_test_labels))

# Mean misclassification rate over the folds.
cv_error = sum(fold_errors)/float(n_folds)
print(cv_error)

Starting fold #1
Starting fold #2
Starting fold #3
Starting fold #4
Starting fold #5
0.128519289526


In [36]:
# Refit on the full combined (numerical + one-hot) training set and predict
# the test rows.
model = AdaBoostClassifier().fit(train_data_combo, train_labels)
preds = model.predict(test_data_combo)

# Write one row per prediction, with ids counting up from 1, under an
# "Id","Prediction" header.
with open('expanded_results.csv', 'w', newline='') as results_file:
    writer = csv.writer(results_file)
    writer.writerow(("Id","Prediction"))
    writer.writerows(enumerate(preds, start=1))