In [1]:
import csv
import numpy as np
import pandas as pd
import random

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Load the labelled data set and the quiz set from CSV files in the working directory.
train = pd.read_csv('data.csv', sep=",")
test_data = pd.read_csv('quiz.csv', sep=",")

# The last column of data.csv is taken as the label; every column before it is a feature.
train_data = train.iloc[:,:-1]
train_labels = train.iloc[:,-1]

# Set 20% of the labelled rows aside as a hold-out split (fixed seed, so the split is repeatable).
train_data, holdout_data, train_labels, holdout_label = train_test_split(
    train_data, train_labels, test_size=0.2, random_state=1)

# Stack training and quiz rows so that one-hot encoding sees both at once and
# therefore produces the same dummy columns for each.
# NOTE(review): assumes quiz.csv has the same feature columns as data.csv minus the label - confirm.
all_data = pd.concat([train_data, test_data])
train_obs = len(train_data)
test_obs = len(test_data)

# Change dtype of categorical columns
categorical_columns = ['0','5','7','8','9','14','16','17','18','20','23','25','26','56','57','58']
for column in categorical_columns:
    all_data[column] = all_data[column].astype('category')

# Everything that is not listed as categorical is passed through unchanged.
print('Processing numerical data...')
all_data_num = all_data.drop(categorical_columns, axis=1)
train_data_num = all_data_num.iloc[:train_obs]
test_data_num = all_data_num.iloc[train_obs:]
holdout_data_num = holdout_data.drop(categorical_columns, axis=1)

# One-hot encode the categorical columns.
print('Processing categorical data...')
all_data_cat = pd.get_dummies(all_data[categorical_columns])
train_data_cat = all_data_cat.iloc[:train_obs]
test_data_cat = all_data_cat.iloc[train_obs:]

# Encode the hold-out rows with the category levels learned from train + quiz,
# so the hold-out dummies line up with the columns of all_data_cat. A hold-out
# value that is not one of those levels becomes missing and therefore gets an
# all-zero group of dummies for that column.
holdout_categorical = pd.DataFrame(
    {column: pd.Categorical(holdout_data[column],
                            categories=all_data[column].cat.categories)
     for column in categorical_columns},
    index=holdout_data.index,
    columns=categorical_columns)
holdout_data_cat = pd.get_dummies(holdout_categorical)

# Combine the two
print('Combining data...')
train_data_combo = pd.concat([train_data_num, train_data_cat], axis=1)
test_data_combo = pd.concat([test_data_num, test_data_cat], axis=1)
# Reindex so the hold-out matrix has exactly the training columns, in the same
# order; this makes it usable with any estimator fitted on train_data_combo.
holdout_data_combo = pd.concat([holdout_data_num, holdout_data_cat], axis=1).reindex(
    columns=train_data_combo.columns, fill_value=0)

print('Processing finished.')

Processing numerical data...
Processing categorical data...
Combining data...
Processing finished.


In [None]:
# Search space for the extra-trees model: 3 * 3 * 3 * 2 * 2 = 108 candidates.
# random_state and n_jobs are single-valued entries, so they are simply set to
# the same value on every candidate rather than searched over.
# NOTE(review): integer max_features values are absolute feature counts -
# confirm train_data_combo has at least 1600 columns.
params = dict(
    n_estimators=[20, 40, 60],
    max_features=[200, 800, 1600],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 4],
    min_samples_leaf=[1, 3],
    random_state=[1],
    n_jobs=[-1],
)
classifier = ExtraTreesClassifier()
# The search itself runs one fit at a time (n_jobs=1); each candidate forest is
# given n_jobs=-1 through the grid. 4-fold cross-validation, verbose progress log.
GridLogiClassifier = GridSearchCV(
    estimator=classifier,
    param_grid=params,
    n_jobs=1,
    cv=4,
    verbose=2,
)
GridLogiClassifier.fit(train_data_combo, train_labels)

Fitting 4 folds for each of 108 candidates, totalling 432 fits
[CV] n_jobs=-1, min_samples_leaf=1, n_estimators=20, min_samples_split=2, random_state=1, max_features=200, max_depth=None 
[CV]  n_jobs=-1, min_samples_leaf=1, n_estimators=20, min_samples_split=2, random_state=1, max_features=200, max_depth=None -  54.9s
[CV] n_jobs=-1, min_samples_leaf=1, n_estimators=20, min_samples_split=2, random_state=1, max_features=200, max_depth=None 