In [1]:
# Quietly install (or upgrade) PyDrive, the Google Drive client imported in the next cell.
!pip install -U -q PyDrive

In [2]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [3]:
# Sign in the Colab user; run this before requesting the application-default
# credentials below.
auth.authenticate_user()
gauth = GoogleAuth()
# Hand PyDrive the application-default Google credentials.
gauth.credentials = GoogleCredentials.get_application_default()
# Drive client used to download the dataset.
drive = GoogleDrive(gauth)

In [37]:
import sqlite3
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ParameterGrid, GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score

In [5]:
!mkdir data

In [6]:
# Google Drive id of the SQLite file to fetch.
file_id = '1KiOvhsdjJqaUCLJa5adZXEtQ_72s8Eb6'
# Create a handle for that Drive file and save its content into data/.
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('data/checking-logs.sqlite')

In [7]:
# Open the downloaded SQLite database.
conn = sqlite3.connect('data/checking-logs.sqlite')

In [8]:
# Read the checker rows whose uid matches 'user_%' and whose status is 'ready'.
# timestamp is parsed to datetime so the .dt accessor can be used on it.
df = pd.read_sql(
    sql="""
    select uid, labname, numTrials, timestamp
    from checker
    where uid like 'user_%' and status = 'ready'
    """,
    con=conn,
    parse_dates=['timestamp'],
)

In [9]:
# Release the SQLite connection now that the query result is in memory.
conn.close()

In [10]:
# Keep the weekday aside as its own Series and replace the raw timestamp
# column with the hour of day.
dayofweek = df['timestamp'].dt.dayofweek
df = df.assign(hour=df['timestamp'].dt.hour).drop(columns='timestamp')

In [11]:
# One-hot encode the two categorical columns; new columns are named uid_* and labname_*.
df = pd.get_dummies(df, prefix=['uid', 'labname'], columns=['uid', 'labname'])

In [12]:
# Attach the weekday after encoding so it ends up as the last column.
df['dayofweek'] = dayofweek

In [13]:
# Preview the first rows of the encoded frame.
df.head()

Unnamed: 0,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,uid_user_16,uid_user_17,uid_user_18,uid_user_19,uid_user_2,uid_user_20,uid_user_21,uid_user_22,uid_user_23,uid_user_24,uid_user_25,uid_user_26,uid_user_27,uid_user_28,uid_user_29,uid_user_3,uid_user_30,uid_user_31,uid_user_4,uid_user_6,uid_user_7,uid_user_8,labname_code_rvw,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1,dayofweek
0,1,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,4
1,2,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,4
2,3,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,4
3,4,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,4
4,5,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,4


In [14]:
# Hold out 20% of the rows for testing. dayofweek is the target and is also
# used for stratification; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='dayofweek'),
    df['dayofweek'],
    test_size=0.2,
    random_state=21,
    stratify=df['dayofweek'],
)

In [15]:
# Base SVC for the grid search. probability=True enables probability estimates,
# which makes every fit slower, while the search below scores only accuracy.
# NOTE(review): drop probability=True unless predict_proba is needed later — confirm.
svc = SVC(probability=True)

In [16]:
# Search space for the SVC. gamma has no effect with the linear kernel, so
# each linear combination is evaluated once per gamma value.
param_grid = dict(
    C=[0.01, 0.1, 1, 1.5, 5, 10],
    kernel=['linear', 'rbf', 'sigmoid'],
    gamma=['scale', 'auto'],
    class_weight=['balanced', None],
    random_state=[21],
)

# Exhaustive search scored by accuracy, running on all available cores.
cv_svc = GridSearchCV(
    estimator=svc,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
)

In [17]:
# Cross-validate every SVC combination on the training split.
cv_svc.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=True, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.01, 0.1, 1, 1.5, 5, 10],
                         'class_weight': ['balanced', None],
                         'gamma': ['scale', 'auto'],
                         'kernel': ['linear', 'rbf', 'sigmoid'],
                         'random_state': [21]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [18]:
# SVC parameter combination with the best mean cross-validated accuracy.
cv_svc.best_params_

{'C': 10,
 'class_weight': None,
 'gamma': 'auto',
 'kernel': 'rbf',
 'random_state': 21}

In [19]:
# Mean cross-validated accuracy of the best SVC combination.
cv_svc.best_score_

0.8761090458488228

In [20]:
# rank_test_score is 1 for the best combination, so descending order puts the
# worst-performing combinations at the top of the table.
cv_svc_results = (
    pd.DataFrame(cv_svc.cv_results_)
    .sort_values('rank_test_score', ascending=False)
)

In [21]:
# First five rows of the sorted SVC results table.
cv_svc_results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_class_weight,param_gamma,param_kernel,param_random_state,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
29,1.342677,0.007338,0.036376,0.002276,1.0,balanced,auto,sigmoid,21,"{'C': 1, 'class_weight': 'balanced', 'gamma': ...",0.066667,0.07037,0.044444,0.063197,0.055762,0.060088,0.009188,72
17,1.435409,0.013446,0.036436,0.001753,0.1,balanced,auto,sigmoid,21,"{'C': 0.1, 'class_weight': 'balanced', 'gamma'...",0.062963,0.066667,0.062963,0.05948,0.05948,0.06231,0.002678,71
41,1.338649,0.003645,0.034903,0.00091,1.5,balanced,auto,sigmoid,21,"{'C': 1.5, 'class_weight': 'balanced', 'gamma'...",0.066667,0.085185,0.081481,0.078067,0.085502,0.07938,0.006913,70
65,1.107973,0.036297,0.034604,0.000615,10.0,balanced,auto,sigmoid,21,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.122222,0.140741,0.12963,0.100372,0.085502,0.115693,0.020052,69
53,1.147234,0.03106,0.034218,0.000885,5.0,balanced,auto,sigmoid,21,"{'C': 5, 'class_weight': 'balanced', 'gamma': ...",0.144444,0.148148,0.137037,0.126394,0.092937,0.129792,0.019869,68


In [22]:
# Decision tree with default settings; the grid search supplies the hyper-parameters.
tree = DecisionTreeClassifier()

In [23]:
# Search space for the decision tree: both split criteria, depths 1 to 49,
# with and without class balancing, fixed seed.
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=np.arange(1, 50),
    class_weight=['balanced', None],
    random_state=[21],
)

# Exhaustive search scored by accuracy, running on all available cores.
cv_tree = GridSearchCV(
    estimator=tree,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
)

In [24]:
# Cross-validate every decision-tree combination on the training split.
cv_tree.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid=...s=-1,
             param_grid={'class_weight': ['balanced', None],
                         'criter

In [25]:
# Decision-tree parameter combination with the best mean cross-validated accuracy.
cv_tree.best_params_

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 27,
 'random_state': 21}

In [26]:
# Mean cross-validated accuracy of the best decision-tree combination.
cv_tree.best_score_

0.8723750516315573

In [27]:
# rank_test_score is 1 for the best combination, so descending order puts the
# worst-performing combinations at the top of the table.
cv_tree_results = (
    pd.DataFrame(cv_tree.cv_results_)
    .sort_values('rank_test_score', ascending=False)
)

In [28]:
# First five rows of the sorted decision-tree results table.
cv_tree_results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_criterion,param_max_depth,param_random_state,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.009912,0.004615,0.002047,7.5e-05,balanced,gini,1,21,"{'class_weight': 'balanced', 'criterion': 'gin...",0.262963,0.318519,0.266667,0.32342,0.260223,0.286358,0.028376,195
49,0.007292,0.002577,0.001689,8.2e-05,balanced,entropy,1,21,"{'class_weight': 'balanced', 'criterion': 'ent...",0.262963,0.318519,0.266667,0.32342,0.260223,0.286358,0.028376,195
147,0.005173,0.000221,0.00178,0.000104,,entropy,1,21,"{'class_weight': None, 'criterion': 'entropy',...",0.37037,0.351852,0.359259,0.35316,0.342007,0.35533,0.009338,193
98,0.005262,0.000318,0.001765,7.7e-05,,gini,1,21,"{'class_weight': None, 'criterion': 'gini', 'm...",0.37037,0.351852,0.359259,0.35316,0.342007,0.35533,0.009338,193
2,0.009252,0.001881,0.002299,0.00051,balanced,gini,3,21,"{'class_weight': 'balanced', 'criterion': 'gin...",0.388889,0.303704,0.403704,0.427509,0.345725,0.373906,0.044064,192


In [29]:
# Random forest with default settings; the grid search supplies the hyper-parameters.
forest = RandomForestClassifier()

In [30]:
# Search space for the random forest: four ensemble sizes, both split criteria,
# depths 1 to 49, with and without class balancing, fixed seed.
param_grid = dict(
    n_estimators=[5, 10, 50, 100],
    criterion=['gini', 'entropy'],
    max_depth=np.arange(1, 50),
    class_weight=['balanced', None],
    random_state=[21],
)

# Exhaustive search scored by accuracy, running on all available cores.
cv_forest = GridSearchCV(
    estimator=forest,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
)

In [31]:
# Cross-validate every random-forest combination on the training split.
cv_forest.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=No...
             param_grid={'class_weight': ['balanced', None],
                         'criterion': ['gini', 'en

In [32]:
# Random-forest parameter combination with the best mean cross-validated accuracy.
cv_forest.best_params_

{'class_weight': None,
 'criterion': 'gini',
 'max_depth': 28,
 'n_estimators': 50,
 'random_state': 21}

In [33]:
# Mean cross-validated accuracy of the best random-forest combination.
cv_forest.best_score_

0.9042902381935839

In [34]:
# rank_test_score is 1 for the best combination, so descending order puts the
# worst-performing combinations at the top of the table.
cv_forest_results = (
    pd.DataFrame(cv_forest.cv_results_)
    .sort_values('rank_test_score', ascending=False)
)

In [35]:
# First five rows of the sorted random-forest results table.
cv_forest_results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_criterion,param_max_depth,param_n_estimators,param_random_state,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
196,0.023273,0.005396,0.003734,0.00087,balanced,entropy,1,5,21,"{'class_weight': 'balanced', 'criterion': 'ent...",0.262963,0.292593,0.225926,0.282528,0.289963,0.270794,0.024718,784
0,0.022808,0.00201,0.003892,0.000566,balanced,gini,1,5,21,"{'class_weight': 'balanced', 'criterion': 'gin...",0.262963,0.292593,0.285185,0.282528,0.29368,0.28339,0.011062,783
4,0.019278,0.00017,0.003199,0.000192,balanced,gini,2,5,21,"{'class_weight': 'balanced', 'criterion': 'gin...",0.311111,0.377778,0.377778,0.35316,0.312268,0.346419,0.029749,782
200,0.022352,0.002623,0.003592,0.000209,balanced,entropy,2,5,21,"{'class_weight': 'balanced', 'criterion': 'ent...",0.318519,0.366667,0.381481,0.35316,0.345725,0.35311,0.021165,781
588,0.018728,0.000634,0.003569,0.000174,,entropy,1,5,21,"{'class_weight': None, 'criterion': 'entropy',...",0.355556,0.366667,0.374074,0.345725,0.327138,0.353832,0.016467,780


In [38]:
# Expand the grid into an explicit list of parameter dicts.
# param_grid is reassigned for each model in this notebook, so this cell relies
# on it still holding the random-forest grid, i.e. on in-order execution.
grid = list(ParameterGrid(param_grid))

In [41]:
# Manual re-implementation of the random-forest grid search: one 5-fold
# cross-validation per parameter combination, collecting mean/std accuracy.
data = []

for params in tqdm(grid):
    estimator = RandomForestClassifier(**params)
    scores = cross_val_score(estimator, X_train, y_train, cv=5, n_jobs=-1)
    # Build a fresh record instead of aliasing `params`: writing the scores
    # into the dict stored in `grid` would make a re-run of this cell pass
    # 'mean_accuracy'/'std_accuracy' to RandomForestClassifier and fail.
    data.append({
        **params,
        'mean_accuracy': np.mean(scores),
        'std_accuracy': np.std(scores),
    })

HBox(children=(FloatProgress(value=0.0, max=784.0), HTML(value='')))




In [42]:
# One row per parameter combination, best mean accuracy first.
cv_forest_results_1 = (
    pd.DataFrame(data)
    .sort_values('mean_accuracy', ascending=False)
)

In [43]:
# Five best combinations found by the manual search.
cv_forest_results_1.head()

Unnamed: 0,class_weight,criterion,max_depth,n_estimators,random_state,mean_accuracy,std_accuracy
502,,gini,28,50,21,0.90429,0.010961
515,,gini,31,100,21,0.903547,0.01438
543,,gini,38,100,21,0.902806,0.01046
571,,gini,45,100,21,0.902806,0.01046
539,,gini,37,100,21,0.902806,0.01046
