In [1]:
import csv
import numpy as np
import pandas as pd
import random

from scipy.stats import randint as sp_randint
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

In [2]:
train = pd.read_csv('data.csv', sep=",")
test_data = pd.read_csv('quiz.csv', sep=",")

train_data = train.iloc[:,:-1]
train_labels = train.iloc[:,-1]

train_data, holdout_data, train_labels, holdout_label = train_test_split(train_data, train_labels, 
                                                                         test_size=0.2, random_state=1)
all_data = pd.concat([train_data, test_data])
train_obs = len(train_data)
test_obs = len(test_data)

# Change dtype of categorical columns
categorical_columns = ['0','5','7','8','9','14','16','17','18','20','23','25','26','56','57','58']
for i in range(0,len(categorical_columns)):
    all_data[categorical_columns[i]] = all_data[categorical_columns[i]].astype('category')
    
# Only numerical data
print('Processing numerical data...')
all_data_num = all_data.drop(categorical_columns, axis=1)
train_data_num = all_data_num.iloc[0:train_obs,]
test_data_num = all_data_num.iloc[train_obs:,]

# Only categorical data
print('Processing categorical data...')
all_data_cat = pd.get_dummies(all_data[categorical_columns])
train_data_cat = all_data_cat.iloc[0:train_obs,]
test_data_cat = all_data_cat.iloc[train_obs:,]

# Combine the two
print('Combining data...')
train_data_combo = pd.concat([train_data_num, train_data_cat], axis=1)
test_data_combo = pd.concat([test_data_num, test_data_cat], axis=1)

print('Processing finished!')

Processing numerical data...
Processing categorical data...
Combining data...
Processing finished.


In [3]:
params = {
    'n_estimators': sp_randint(20, 100),
    'max_features': sp_randint(200, 3000),
    'max_depth': [None, 10, 20],
    'min_samples_split': sp_randint(2, 5),
    'min_samples_leaf': sp_randint(1, 5),
    'random_state': [1],
    'n_jobs': [-1]
}
classifier = ExtraTreesClassifier()
search_results = RandomizedSearchCV(estimator = classifier,
                                        param_distributions = params,
                                        n_iter = 50, n_jobs = 1,
                                        cv = 3, verbose = 3 )
# GridLogiClassifier = GridSearchCV(estimator = classifier, param_grid=params, n_jobs=1, cv=4, verbose=2)
search_results.fit(train_data_combo, train_labels)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] n_jobs=-1, min_samples_leaf=4, n_estimators=68, min_samples_split=2, random_state=1, max_features=2629, max_depth=20 
[CV]  n_jobs=-1, min_samples_leaf=4, n_estimators=68, min_samples_split=2, random_state=1, max_features=2629, max_depth=20, score=0.923102 - 4.9min
[CV] n_jobs=-1, min_samples_leaf=4, n_estimators=68, min_samples_split=2, random_state=1, max_features=2629, max_depth=20 
[CV]  n_jobs=-1, min_samples_leaf=4, n_estimators=68, min_samples_split=2, random_state=1, max_features=2629, max_depth=20, score=0.924253 - 4.0min
[CV] n_jobs=-1, min_samples_leaf=4, n_estimators=68, min_samples_split=2, random_state=1, max_features=2629, max_depth=20 
[CV]  n_jobs=-1, min_samples_leaf=4, n_estimators=68, min_samples_split=2, random_state=1, max_features=2629, max_depth=20, score=0.925611 - 4.0min
[CV] n_jobs=-1, min_samples_leaf=4, n_estimators=45, min_samples_split=2, random_state=1, max_features=2456, max_depth=20 
[C

[Parallel(n_jobs=1)]: Done  31 tasks       | elapsed: 82.1min
[Parallel(n_jobs=1)]: Done 127 tasks       | elapsed: 314.1min



[CV] n_jobs=-1, min_samples_leaf=1, n_estimators=44, min_samples_split=4, random_state=1, max_features=1924, max_depth=10 
[CV]  n_jobs=-1, min_samples_leaf=1, n_estimators=44, min_samples_split=4, random_state=1, max_features=1924, max_depth=10, score=0.887296 - 1.4min
[CV] n_jobs=-1, min_samples_leaf=1, n_estimators=44, min_samples_split=4, random_state=1, max_features=1924, max_depth=10 
[CV]  n_jobs=-1, min_samples_leaf=1, n_estimators=44, min_samples_split=4, random_state=1, max_features=1924, max_depth=10, score=0.886346 - 1.5min
[CV] n_jobs=-1, min_samples_leaf=3, n_estimators=51, min_samples_split=4, random_state=1, max_features=1226, max_depth=20 
[CV]  n_jobs=-1, min_samples_leaf=3, n_estimators=51, min_samples_split=4, random_state=1, max_features=1226, max_depth=20, score=0.921358 - 1.9min
[CV] n_jobs=-1, min_samples_leaf=3, n_estimators=51, min_samples_split=4, random_state=1, max_features=1226, max_depth=20 
[CV]  n_jobs=-1, min_samples_leaf=3, n_estimators=51, min_sampl

[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed: 361.3min finished


RandomizedSearchCV(cv=3, error_score='raise',
          estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
          fit_params={}, iid=True, n_iter=50, n_jobs=1,
          param_distributions={'n_jobs': [-1], 'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x104c26d10>, 'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10978b890>, 'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x109f67410>, 'random_state': [1], 'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x104c34750>, 'max_depth': [None, 10, 20]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring=None, 

In [5]:
search_results.best_score_

0.94095733672353132

In [6]:
search_results.grid_scores_

[mean: 0.92432, std: 0.00103, params: {'n_jobs': -1, 'min_samples_leaf': 4, 'n_estimators': 68, 'min_samples_split': 2, 'random_state': 1, 'max_features': 2629, 'max_depth': 20},
 mean: 0.92386, std: 0.00149, params: {'n_jobs': -1, 'min_samples_leaf': 4, 'n_estimators': 45, 'min_samples_split': 2, 'random_state': 1, 'max_features': 2456, 'max_depth': 20},
 mean: 0.93975, std: 0.00134, params: {'n_jobs': -1, 'min_samples_leaf': 3, 'n_estimators': 95, 'min_samples_split': 4, 'random_state': 1, 'max_features': 1770, 'max_depth': None},
 mean: 0.93868, std: 0.00133, params: {'n_jobs': -1, 'min_samples_leaf': 3, 'n_estimators': 23, 'min_samples_split': 3, 'random_state': 1, 'max_features': 1371, 'max_depth': None},
 mean: 0.88300, std: 0.00219, params: {'n_jobs': -1, 'min_samples_leaf': 2, 'n_estimators': 26, 'min_samples_split': 3, 'random_state': 1, 'max_features': 1906, 'max_depth': 10},
 mean: 0.94083, std: 0.00082, params: {'n_jobs': -1, 'min_samples_leaf': 1, 'n_estimators': 40, 'min_