# ML Kaggle Competition

In [108]:
import csv
import numpy as np
import pandas as pd
import random
from sklearn.ensemble import AdaBoostClassifier

# Load the training set (features plus a trailing label column) and the quiz
# set that predictions are later generated for.
train = pd.read_csv('data.csv', sep=",")
test_data = pd.read_csv('quiz.csv', sep=",")

# Cast the categorical columns to pandas' `category` dtype in BOTH frames so
# that they carry the same dtypes. The quiz frame is `test_data`; there is no
# `test` variable in this notebook.
categorical_columns = ['0','5','7','8','9','14','16','17','18','20','23','25','26','56','57','58']
for column in categorical_columns:
    train[column] = train[column].astype('category')
    test_data[column] = test_data[column].astype('category')

# Every column except the last is a feature; the last column is the label.
train_data = train.iloc[:,:-1]
train_labels = train.iloc[:,-1]

# Numerical-only views: the categorical columns are dropped from both frames.
train_data_num = train_data.drop(categorical_columns, axis=1)
test_data_num = test_data.drop(categorical_columns, axis=1)

## Simple prediction with only numerical variables

In [109]:
def cv_run(train_data, train_labels, test_data, test_labels):
    """Fit an AdaBoost classifier and return its error rate on held-out data.

    Args:
        train_data: Feature rows used to fit the classifier.
        train_labels: Labels matching ``train_data``.
        test_data: Feature rows to predict on.
        test_labels: True labels matching ``test_data``.

    Returns:
        The misclassification rate, i.e. ``1 - accuracy``, on the test rows.
    """
    fitted = AdaBoostClassifier().fit(train_data, train_labels)
    predictions = fitted.predict(test_data)

    # Element-wise comparison; summing the booleans counts the correct ones.
    n_correct = sum(predictions == test_labels)
    accuracy = n_correct / float(len(test_labels))
    return 1 - accuracy

# K-fold cross-validation of `cv_run` on the numerical-only training features.
N_FOLDS = 10   # number of cross-validation folds
CV_SEED = 42   # fixed seed so the fold assignment is reproducible across runs

# Shuffle the row positions with a local RNG (the global `random` state is
# left untouched) and cut the shuffled positions into N_FOLDS chunks.
num_train = len(train_data_num)
indices = random.Random(CV_SEED).sample(range(num_train), num_train)
cv_folds = np.array_split(indices, N_FOLDS)

cv_error = 0
for i, fold in enumerate(cv_folds):
    # Train on every row position that is not in the held-out fold.
    cv_train_index = np.setdiff1d(indices, fold)
    cv_test_index = fold

    # The indices are row POSITIONS, so features and labels are both selected
    # with .iloc; label-based lookup would misalign on a non-default index.
    cv_train_data = train_data_num.iloc[cv_train_index,:]
    cv_train_labels = train_labels.iloc[cv_train_index]

    cv_test_data = train_data_num.iloc[cv_test_index,:]
    cv_test_labels = train_labels.iloc[cv_test_index]

    print("Starting fold #{}".format(i+1))
    fold_error = cv_run(cv_train_data, cv_train_labels, cv_test_data, cv_test_labels)
    cv_error = cv_error + fold_error

# Average the per-fold error over the number of folds actually run.
cv_error = cv_error/float(len(cv_folds))
print(cv_error)

Starting fold #1
Starting fold #2
Starting fold #3
Starting fold #4
Starting fold #5
Starting fold #6
Starting fold #7
Starting fold #8
Starting fold #9
Starting fold #10


In [111]:
# Fit on all training rows (numerical features only) and predict the quiz rows.
model = AdaBoostClassifier().fit(train_data_num, train_labels)
preds = model.predict(test_data_num)

# Pair each prediction with a 1-based Id and write them under a header row.
submission_rows = enumerate(preds, start=1)
with open('simple_results.csv', 'w', newline='') as results_file:
    results_writer = csv.writer(results_file)
    results_writer.writerow(("Id","Prediction"))
    results_writer.writerows(submission_rows)

## Incorporating Categorical Variables with One-Hot Encoding

In [60]:
# One-hot encode the categorical columns: each category value becomes its own
# indicator column named `<column>_<value>`. The result is displayed, not stored.
categorical_features = train_data[categorical_columns]
pd.get_dummies(categorical_features)

Unnamed: 0,0_dctc,0_def,0_dem,0_demnum,0_el,0_indef,0_null,0_num,0_numpro,0_poss,...,58_xcomp_prep_until,58_xcomp_prep_with,58_xcomp_prepc_below,58_xcomp_prepc_beneath,58_xcomp_prepc_from,58_xcomp_prepc_of,58_xcomp_prepc_to,58_xcomp_rcmod,58_xcomp_root,58_xcomp_xcomp
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
