In [1]:
import csv
import numpy as np
import pandas as pd
import random

from scipy.stats import randint as sp_randint
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

In [2]:
train = pd.read_csv('data.csv', sep=",")
test_data = pd.read_csv('quiz.csv', sep=",")

train_data = train.iloc[:,:-1]
train_labels = train.iloc[:,-1]

train_data, holdout_data, train_labels, holdout_label = train_test_split(train_data, train_labels, 
                                                                         test_size=0.2, random_state=1)
all_data = pd.concat([train_data, test_data])
train_obs = len(train_data)
test_obs = len(test_data)

# Change dtype of categorical columns
categorical_columns = ['0','5','7','8','9','14','16','17','18','20','23','25','26','56','57','58']
for i in range(0,len(categorical_columns)):
    all_data[categorical_columns[i]] = all_data[categorical_columns[i]].astype('category')
    
# Only numerical data
print('Processing numerical data...')
all_data_num = all_data.drop(categorical_columns, axis=1)
train_data_num = all_data_num.iloc[0:train_obs,]
test_data_num = all_data_num.iloc[train_obs:,]

## Feature engineering
train_data_num['div'] = (train_data_num.loc[:,'60'] / train_data_num.loc[:,'59'])
train_data_num['div'] = train_data_num['div'].fillna(0)
test_data_num['div'] = (test_data_num.loc[:,'60'] / test_data_num.loc[:,'59'])
test_data_num['div'] = test_data_num['div'].fillna(0)

# Only categorical data
print('Processing categorical data...')
all_data_cat = pd.get_dummies(all_data[categorical_columns])
train_data_cat = all_data_cat.iloc[0:train_obs,]
test_data_cat = all_data_cat.iloc[train_obs:,]

# Combine the two
print('Combining data...')
train_data_combo = pd.concat([train_data_num, train_data_cat], axis=1)
test_data_combo = pd.concat([test_data_num, test_data_cat], axis=1)

print('Processing finished!')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Processing numerical data...
Processing categorical data...
Combining data...
Processing finished!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [4]:
params = {
    'n_estimators': sp_randint(35, 100),
    'max_features': sp_randint(750, 3000),
    'max_depth': [None],
    'min_samples_split': sp_randint(2, 5),
    'min_samples_leaf': sp_randint(1, 5),
    'random_state': [1],
    'n_jobs': [-1]
}
classifier = RandomForestClassifier()
search_results3 = RandomizedSearchCV(estimator = classifier,
                                        param_distributions = params,
                                        n_iter = 30, n_jobs = 1,
                                        cv = 3, verbose = 3 )
# GridLogiClassifier = GridSearchCV(estimator = classifier, param_grid=params, n_jobs=1, cv=4, verbose=2)
search_results3.fit(train_data_combo, train_labels)

Fitting 3 folds for each of 30 candidates, totalling 90 fits
[CV] n_jobs=-1, min_samples_leaf=2, n_estimators=53, min_samples_split=2, random_state=1, max_features=2183, max_depth=None 
[CV]  n_jobs=-1, min_samples_leaf=2, n_estimators=53, min_samples_split=2, random_state=1, max_features=2183, max_depth=None, score=0.937382 - 2.2min
[CV] n_jobs=-1, min_samples_leaf=2, n_estimators=53, min_samples_split=2, random_state=1, max_features=2183, max_depth=None 
[CV]  n_jobs=-1, min_samples_leaf=2, n_estimators=53, min_samples_split=2, random_state=1, max_features=2183, max_depth=None, score=0.938976 - 2.0min
[CV] n_jobs=-1, min_samples_leaf=2, n_estimators=53, min_samples_split=2, random_state=1, max_features=2183, max_depth=None 
[CV]  n_jobs=-1, min_samples_leaf=2, n_estimators=53, min_samples_split=2, random_state=1, max_features=2183, max_depth=None, score=0.940423 - 2.0min
[CV] n_jobs=-1, min_samples_leaf=1, n_estimators=43, min_samples_split=4, random_state=1, max_features=2962, max_d

[Parallel(n_jobs=1)]: Done  31 tasks       | elapsed: 71.9min
[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed: 194.3min finished





RandomizedSearchCV(cv=3, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params={}, iid=True, n_iter=30, n_jobs=1,
          param_distributions={'n_jobs': [-1], 'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x11e3060d0>, 'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x11e30bc10>, 'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x11e306050>, 'random_state': [1], 'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x11e30be10>, 'max_depth': [None]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scori

## With first engineered feature

In [5]:
search_results3.best_score_

0.94214981915658969

In [7]:
search_results3.best_params_

{'max_depth': None,
 'max_features': 783,
 'min_samples_leaf': 1,
 'min_samples_split': 3,
 'n_estimators': 66,
 'n_jobs': -1,
 'random_state': 1}

In [6]:
search_results3.grid_scores_

[mean: 0.93893, std: 0.00124, params: {'n_jobs': -1, 'min_samples_leaf': 2, 'n_estimators': 53, 'min_samples_split': 2, 'random_state': 1, 'max_features': 2183, 'max_depth': None},
 mean: 0.93964, std: 0.00079, params: {'n_jobs': -1, 'min_samples_leaf': 1, 'n_estimators': 43, 'min_samples_split': 4, 'random_state': 1, 'max_features': 2962, 'max_depth': None},
 mean: 0.93538, std: 0.00096, params: {'n_jobs': -1, 'min_samples_leaf': 4, 'n_estimators': 76, 'min_samples_split': 4, 'random_state': 1, 'max_features': 1201, 'max_depth': None},
 mean: 0.94015, std: 0.00084, params: {'n_jobs': -1, 'min_samples_leaf': 2, 'n_estimators': 97, 'min_samples_split': 4, 'random_state': 1, 'max_features': 913, 'max_depth': None},
 mean: 0.93971, std: 0.00116, params: {'n_jobs': -1, 'min_samples_leaf': 2, 'n_estimators': 99, 'min_samples_split': 3, 'random_state': 1, 'max_features': 1950, 'max_depth': None},
 mean: 0.93495, std: 0.00090, params: {'n_jobs': -1, 'min_samples_leaf': 4, 'n_estimators': 44, 

## Finer second run

In [4]:
search_results2.best_score_

0.94262287003912526

In [5]:
search_results2.grid_scores_

[mean: 0.94025, std: 0.00196, params: {'n_jobs': -1, 'min_samples_leaf': 1, 'n_estimators': 51, 'min_samples_split': 4, 'random_state': 1, 'max_features': 2832, 'max_depth': None},
 mean: 0.94141, std: 0.00177, params: {'n_jobs': -1, 'min_samples_leaf': 1, 'n_estimators': 56, 'min_samples_split': 4, 'random_state': 1, 'max_features': 2075, 'max_depth': None},
 mean: 0.93812, std: 0.00177, params: {'n_jobs': -1, 'min_samples_leaf': 3, 'n_estimators': 45, 'min_samples_split': 4, 'random_state': 1, 'max_features': 1227, 'max_depth': None},
 mean: 0.93822, std: 0.00192, params: {'n_jobs': -1, 'min_samples_leaf': 3, 'n_estimators': 66, 'min_samples_split': 3, 'random_state': 1, 'max_features': 1465, 'max_depth': None},
 mean: 0.93503, std: 0.00226, params: {'n_jobs': -1, 'min_samples_leaf': 4, 'n_estimators': 51, 'min_samples_split': 4, 'random_state': 1, 'max_features': 2954, 'max_depth': None},
 mean: 0.94000, std: 0.00155, params: {'n_jobs': -1, 'min_samples_leaf': 2, 'n_estimators': 68,

## Coarse first run

In [4]:
search_results.best_score_

0.94149937419310337

In [5]:
search_results.grid_scores_

[mean: 0.93670, std: 0.00145, params: {'n_jobs': -1, 'min_samples_leaf': 3, 'n_estimators': 68, 'min_samples_split': 2, 'random_state': 1, 'max_features': 508, 'max_depth': None},
 mean: 0.92415, std: 0.00291, params: {'n_jobs': -1, 'min_samples_leaf': 2, 'n_estimators': 20, 'min_samples_split': 2, 'random_state': 1, 'max_features': 1141, 'max_depth': 20},
 mean: 0.92504, std: 0.00128, params: {'n_jobs': -1, 'min_samples_leaf': 3, 'n_estimators': 96, 'min_samples_split': 4, 'random_state': 1, 'max_features': 1680, 'max_depth': 20},
 mean: 0.92342, std: 0.00102, params: {'n_jobs': -1, 'min_samples_leaf': 4, 'n_estimators': 39, 'min_samples_split': 2, 'random_state': 1, 'max_features': 2042, 'max_depth': 20},
 mean: 0.93903, std: 0.00162, params: {'n_jobs': -1, 'min_samples_leaf': 3, 'n_estimators': 45, 'min_samples_split': 2, 'random_state': 1, 'max_features': 2462, 'max_depth': None},
 mean: 0.94040, std: 0.00086, params: {'n_jobs': -1, 'min_samples_leaf': 2, 'n_estimators': 72, 'min_s