In [1]:
# Install/upgrade PyDrive quietly; the next cell imports it as `pydrive`.
!pip install -U -q PyDrive

In [2]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [3]:
# Authenticate the Colab user, then build a PyDrive client that reuses the
# application-default Google credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [38]:
import os
import joblib
import sqlite3
import pandas as pd
import numpy as np
from copy import deepcopy
from itertools import product
from tqdm.notebook import tqdm
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.exceptions import NotFittedError
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier, VotingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

In [5]:
!mkdir data

In [6]:
# Fetch the Drive file with this id and store its content as the local
# SQLite database used by the following cells.
file_id = '1KiOvhsdjJqaUCLJa5adZXEtQ_72s8Eb6'
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('data/checking-logs.sqlite')

In [7]:
# Open the downloaded SQLite file; the connection is closed again after the query cell.
conn = sqlite3.connect('data/checking-logs.sqlite')

In [8]:
# Load the checker rows whose uid matches 'user_%' and whose status is 'ready',
# parsing the timestamp column into datetimes.
df = pd.read_sql(
    sql="""
    select uid, labname, numTrials, timestamp
    from checker
    where uid like 'user_%' and status = 'ready'
    """,
    con=conn,
    parse_dates=['timestamp'],
)

In [9]:
# The data now lives in `df`, so the database connection can be released.
conn.close()

In [10]:
# Preview the first rows of the loaded log.
df.head()

Unnamed: 0,uid,labname,numTrials,timestamp
0,user_4,project1,1,2020-04-17 05:19:02.744528
1,user_4,project1,2,2020-04-17 05:22:45.549397
2,user_4,project1,3,2020-04-17 05:34:24.422370
3,user_4,project1,4,2020-04-17 05:43:27.773992
4,user_4,project1,5,2020-04-17 05:46:32.275104


In [11]:
# Check column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1686 entries, 0 to 1685
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   uid        1686 non-null   object        
 1   labname    1686 non-null   object        
 2   numTrials  1686 non-null   int64         
 3   timestamp  1686 non-null   datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 52.8+ KB


In [12]:
class FeatureExtractor():
    """Derive calendar features from the ``timestamp`` column of a frame."""

    def transform(self, X):
        """Return a new frame with ``hour`` and ``weekday`` in place of ``timestamp``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain a datetime-like ``timestamp`` column (it is accessed
            through the ``.dt`` accessor).

        Returns
        -------
        pandas.DataFrame
            All columns of ``X`` except ``timestamp``, followed by ``hour``
            and ``weekday`` (Monday is 0).  ``X`` itself is not modified.

        Raises
        ------
        KeyError
            If ``X`` has no ``timestamp`` column.
        """
        stamps = X['timestamp']
        # drop() and assign() both return new frames, so the caller's X keeps
        # its original columns instead of silently gaining hour/weekday.
        return X.drop('timestamp', axis=1).assign(
            hour=stamps.dt.hour,
            weekday=stamps.dt.weekday,
        )

In [13]:
class MyOneHotEncoder():
    """Separate a target column and one-hot encode the object-dtype features."""

    def transform(self, X, target):
        """Split ``X`` into encoded features and the target series.

        Parameters
        ----------
        X : pandas.DataFrame
            Input frame containing the ``target`` column.
        target : str
            Name of the column to return as ``y``.

        Returns
        -------
        tuple
            ``(features, y)`` where ``features`` is ``X`` without ``target``
            and with every object-dtype column replaced by dummy columns
            prefixed with the original column name.
        """
        labels = X[target]
        features = X.drop(columns=target)
        # Only object-dtype columns are treated as categorical.
        object_cols = features.select_dtypes(include='object').columns.tolist()
        encoded = pd.get_dummies(features, columns=object_cols,
                                 prefix=object_cols)
        return encoded, labels

In [63]:
class ModelSelection():
    """Fit several hyper-parameter searches and remember the best one.

    Each entry of ``estimators`` must, after ``fit(X, y)``, expose
    ``cv_results_`` (with a ``'mean_train_score'`` entry), ``best_score_``,
    ``best_params_`` and ``best_estimator_``.  ``names`` holds the display
    name of the estimator at the same position.
    """

    def __init__(self, estimators, names):
        """Store deep copies of the searches and their names; nothing is fitted yet."""
        self.estimators = deepcopy(estimators)
        self.names = deepcopy(names)
        self.cv_results_ = None
        self.best_score_ = None
        self.final_score_ = None
        self.best_estimator_ = None
        self.best_estimator_name_ = None
        # Winner's name without any score suffix; final_score() rebuilds the
        # public name from it so repeated calls do not stack suffixes.
        self._best_base_name = None

    def _check_is_fitted(self):
        """Raise NotFittedError unless fit() has recorded results."""
        if not self.cv_results_:
            raise NotFittedError('ModelSelection instance is not fitted yet')

    def fit(self, X, y):
        """Fit every search on ``(X, y)`` and keep the best validation score.

        Progress and per-estimator scores are printed.  On equal validation
        scores the later estimator wins.  All results of a previous ``fit``
        are discarded first.
        """
        self.cv_results_ = {'model': [], 'params': [], 'valid_score': []}
        # Start from a clean slate so a refit cannot keep a stale winner.
        self.best_score_ = None
        self.final_score_ = None
        self.best_estimator_ = None
        self.best_estimator_name_ = None
        self._best_base_name = None
        for i in tqdm(range(len(self.estimators))):
            search = self.estimators[i]
            name = self.names[i]
            print(f'Estimator: {name}')
            search.fit(X, y)
            # Highest mean train score over all candidates, not necessarily
            # the one belonging to the best parameters.
            train = max(search.cv_results_['mean_train_score'])
            valid = search.best_score_
            self.cv_results_['model'].append(name)
            self.cv_results_['params'].append(search.best_params_)
            self.cv_results_['valid_score'].append(valid)
            print(f'Best params: {search.best_params_}')
            print(f'Best training accuracy: {train:.3f}')
            print(f'Validation accuracy for best params: {valid:.3f}')
            print('\n')
            # `is None` rather than truthiness: a score of 0.0 is a real score.
            if self.best_score_ is None or valid >= self.best_score_:
                self.best_score_ = valid
                self.best_estimator_ = search.best_estimator_
                self.best_estimator_name_ = name
                self._best_base_name = name
        print(f'Best estimator: {self.best_estimator_name_}')

    def best_results(self):
        """Return one row per estimator, sorted by validation score (best first).

        Raises NotFittedError if ``fit`` has not been called.
        """
        self._check_is_fitted()
        return pd.DataFrame(self.cv_results_).sort_values('valid_score',
                                                          ascending=False)

    def final_score(self, X, y):
        """Score the best estimator on ``(X, y)`` with accuracy and return it.

        Also stores the score in ``final_score_`` and sets
        ``best_estimator_name_`` to ``'<name>_<score:.3f>'``.  Calling this
        again replaces the suffix instead of appending another one.

        Raises NotFittedError if ``fit`` has not been called.
        """
        self._check_is_fitted()
        y_pred = self.best_estimator_.predict(X)
        self.final_score_ = accuracy_score(y, y_pred)
        self.best_estimator_name_ = (
            f'{self._best_base_name}_{self.final_score_:.3f}'
        )
        return self.final_score_

    def save_model(self, path):
        """Dump the best estimator to ``<path>/<best_estimator_name_>.joblib``.

        Raises NotFittedError if ``fit`` has not been called.
        """
        self._check_is_fitted()
        joblib.dump(self.best_estimator_,
                    os.path.join(path, f'{self.best_estimator_name_}.joblib'))

In [64]:
class BestEnsemble():
    """Grid-search a voting and a stacking ensemble and keep the better one."""

    def __init__(self, estimators, voting_params, stacking_params):
        """Build both ensembles from deep copies of ``estimators``.

        ``voting_params`` and ``stacking_params`` are the parameter grids
        passed to ``GridSearchCV`` for the respective ensemble.
        """
        self.voting = VotingClassifier(deepcopy(estimators))
        self.stacking = StackingClassifier(deepcopy(estimators))
        self.voting_params = deepcopy(voting_params)
        self.stacking_params = deepcopy(stacking_params)
        self.cv_results_ = None
        self.best_score_ = None
        self.final_score_ = None
        self.best_estimator_ = None
        self.best_estimator_name_ = None
        # Winner's name without any score suffix; final_score() rebuilds the
        # public name from it so repeated calls do not stack suffixes.
        self._best_base_name = None

    def _check_is_fitted(self):
        """Raise NotFittedError unless fit() has recorded results."""
        if not self.cv_results_:
            raise NotFittedError('BestEnsemble instance is not fitted yet')

    def _search(self, label, ensemble, param_grid, X, y):
        """Grid-search one ensemble, print and record its scores, return the search."""
        print(f'Estimator: {label}')
        cv = GridSearchCV(ensemble, param_grid, scoring='accuracy',
                          n_jobs=-1, verbose=2, return_train_score=True)
        cv.fit(X, y)
        # Highest mean train score over all candidates, not necessarily the
        # one belonging to the best parameters.
        train_score = max(cv.cv_results_['mean_train_score'])
        # The label is recorded only after a successful fit so the three
        # result lists always have equal length.
        self.cv_results_['ensemble'].append(label)
        self.cv_results_['params'].append(cv.best_params_)
        self.cv_results_['valid_score'].append(cv.best_score_)
        print(f'Best params: {cv.best_params_}')
        print(f'Best training accuracy: {train_score:.3f}')
        print(f'Validation accuracy for best params: {cv.best_score_:.3f}\n')
        return cv

    def fit(self, X, y):
        """Search both ensembles on ``(X, y)`` and keep the best validation score.

        The voting ensemble is searched first; on equal scores the stacking
        ensemble wins.  Results of a previous ``fit`` are discarded.
        """
        self.cv_results_ = {'ensemble': [], 'params': [], 'valid_score': []}
        self.best_score_ = None
        self.final_score_ = None
        self.best_estimator_ = None
        self.best_estimator_name_ = None
        self._best_base_name = None
        candidates = (
            ('Voting Ensemble', 'Voting Classifier',
             self.voting, self.voting_params),
            ('Stacking Ensemble', 'Stacking Classifier',
             self.stacking, self.stacking_params),
        )
        for label, name, ensemble, param_grid in candidates:
            cv = self._search(label, ensemble, param_grid, X, y)
            if self.best_score_ is None or cv.best_score_ >= self.best_score_:
                self.best_score_ = cv.best_score_
                self.best_estimator_ = cv.best_estimator_
                self.best_estimator_name_ = name
                self._best_base_name = name
        print(f'Best estimator: {self.best_estimator_name_}')

    def best_results(self):
        """Return one row per ensemble, sorted by validation score (best first).

        Raises NotFittedError if ``fit`` has not been called.
        """
        self._check_is_fitted()
        return pd.DataFrame(self.cv_results_).sort_values('valid_score',
                                                          ascending=False)

    def final_score(self, X, y):
        """Score the best ensemble on ``(X, y)`` with accuracy and return it.

        Also stores the score in ``final_score_`` and sets
        ``best_estimator_name_`` to ``'<name>_<score:.3f>'``.  Calling this
        again replaces the suffix instead of appending another one.

        Raises NotFittedError if ``fit`` has not been called.
        """
        self._check_is_fitted()
        y_pred = self.best_estimator_.predict(X)
        self.final_score_ = accuracy_score(y, y_pred)
        self.best_estimator_name_ = (
            f'{self._best_base_name}_{self.final_score_:.3f}'
        )
        return self.final_score_

    def save_model(self, path):
        """Dump the best ensemble to ``<path>/<best_estimator_name_>.joblib``.

        Raises NotFittedError if ``fit`` has not been called.
        """
        self._check_is_fitted()
        joblib.dump(self.best_estimator_,
                    os.path.join(path, f'{self.best_estimator_name_}.joblib'))

In [18]:
# Replace the timestamp column with hour / weekday features.
# NOTE(review): `df` is overwritten here, so re-running this cell raises a
# KeyError because the 'timestamp' column is already gone.
feature_extractor = FeatureExtractor()

df = feature_extractor.transform(df)

In [19]:
# Preview the frame after feature extraction.
df.head()

Unnamed: 0,uid,labname,numTrials,hour,weekday
0,user_4,project1,1,5,4
1,user_4,project1,2,5,4
2,user_4,project1,3,5,4
3,user_4,project1,4,5,4
4,user_4,project1,5,5,4


In [20]:
# 'weekday' becomes the target y; the object-dtype columns of the remaining
# features are one-hot encoded.
one_hot = MyOneHotEncoder()

X, y = one_hot.transform(df, 'weekday')

In [21]:
# Preview the encoded feature matrix.
X.head()

Unnamed: 0,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,uid_user_16,uid_user_17,uid_user_18,uid_user_19,uid_user_2,uid_user_20,uid_user_21,uid_user_22,uid_user_23,uid_user_24,uid_user_25,uid_user_26,uid_user_27,uid_user_28,uid_user_29,uid_user_3,uid_user_30,uid_user_31,uid_user_4,uid_user_6,uid_user_7,uid_user_8,labname_code_rvw,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,1,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,2,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,3,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,4,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,5,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [22]:
# Show the target series (weekday per row).
y

0       4
1       4
2       4
3       4
4       4
       ..
1681    3
1682    3
1683    3
1684    3
1685    3
Name: weekday, Length: 1686, dtype: int64

In [23]:
# Hold out 20% of the rows as a test set, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=21, stratify=y)

In [44]:
# SVC search: 6 C values x 3 kernels x 2 gammas x 2 class weights = 72 candidates.
# NOTE(review): probability=True is not required for accuracy scoring and may
# slow fitting noticeably — confirm whether it is needed for this search.
svc = SVC(probability=True)

param_grid = {'C': [0.01, 0.1, 1, 1.5, 5, 10],
              'kernel': ['linear', 'rbf', 'sigmoid'],
              'gamma': ['scale', 'auto'],
              'class_weight': ['balanced', None],
              'random_state': [21]}

# return_train_score=True is required: ModelSelection.fit reads 'mean_train_score'.
cv_svc = GridSearchCV(svc, param_grid, scoring='accuracy', n_jobs=-1,
                      verbose=2, return_train_score=True)

In [45]:
# Decision-tree search: 2 criteria x depths 1..49 x 2 class weights = 196 candidates.
tree = DecisionTreeClassifier()

param_grid = {'criterion': ['gini','entropy'],
              'max_depth': np.arange(1, 50),
              'class_weight': ['balanced', None],
              'random_state': [21]}

# return_train_score=True is required: ModelSelection.fit reads 'mean_train_score'.
cv_tree = GridSearchCV(tree, param_grid, scoring='accuracy', n_jobs=-1,
                       verbose=2, return_train_score=True)

In [46]:
# Random-forest search: 4 sizes x 2 criteria x depths 1..49 x 2 class weights
# = 784 candidates.
forest = RandomForestClassifier()

param_grid = {'n_estimators': [5, 10, 50, 100],
              'criterion': ['gini','entropy'],
              'max_depth': np.arange(1, 50),
              'class_weight': ['balanced', None],
              'random_state': [21]}

# return_train_score=True is required: ModelSelection.fit reads 'mean_train_score'.
cv_forest = GridSearchCV(forest, param_grid, scoring='accuracy', n_jobs=-1,
                         verbose=2, return_train_score=True)

In [47]:
# Bundle the three grid searches; names are matched to searches by position.
model_selection = ModelSelection([cv_svc, cv_tree, cv_forest],
                                 ['SVC', 'Decision Tree', 'Random Forest'])

In [48]:
# Run all three searches on the training split. Slow: in the recorded output
# the SVC search alone took about 28 minutes.
model_selection.fit(X_train, y_train)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

Estimator: SVC
Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   26.6s
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed: 28.4min finished


Best params: {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'random_state': 21}
Best training accuracy: 0.953
Validation accuracy for best params: 0.876


Estimator: Decision Tree
Fitting 5 folds for each of 196 candidates, totalling 980 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 396 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 980 out of 980 | elapsed:    6.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


Best params: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 27, 'random_state': 21}
Best training accuracy: 1.000
Validation accuracy for best params: 0.872


Estimator: Random Forest
Fitting 5 folds for each of 784 candidates, totalling 3920 fits


[Parallel(n_jobs=-1)]: Done 128 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done 612 tasks      | elapsed:   43.2s
[Parallel(n_jobs=-1)]: Done 1424 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 2556 tasks      | elapsed:  3.4min


Best params: {'class_weight': None, 'criterion': 'gini', 'max_depth': 28, 'n_estimators': 50, 'random_state': 21}
Best training accuracy: 1.000
Validation accuracy for best params: 0.904



Best estimator: Random Forest


[Parallel(n_jobs=-1)]: Done 3920 out of 3920 | elapsed:  5.2min finished


In [49]:
# Per-model best params and validation score, best first.
model_selection.best_results()

Unnamed: 0,model,params,valid_score
2,Random Forest,"{'class_weight': None, 'criterion': 'gini', 'm...",0.90429
0,SVC,"{'C': 10, 'class_weight': None, 'gamma': 'auto...",0.876109
1,Decision Tree,"{'class_weight': 'balanced', 'criterion': 'gin...",0.872375


In [65]:
# Base models for the ensembles, configured with the non-default values from
# the best params printed by the grid searches above.
svc = SVC(C=10, gamma='auto', probability=True, random_state=21)
tree = DecisionTreeClassifier(max_depth=27, class_weight='balanced',
                              random_state=21)
forest = RandomForestClassifier(n_estimators=50, max_depth=28, random_state=21)

# (name, estimator) pairs, passed to both VotingClassifier and StackingClassifier.
estimators = [('svc', svc), ('tree', tree), ('forest', forest)]

In [66]:
# Voting grid: 2 voting modes x (no weights + every weight triple from 1..5)
# = 2 x 126 = 252 candidates.
voting_params = {'voting': ['hard', 'soft'],
                 'weights': [None] + list(product(range(1, 6), repeat=3))}

# Stacking grid: 6 cv settings x passthrough on/off = 12 candidates.
stacking_params = {'cv': [2, 3, 4, 5, 6, 7],
                   'passthrough': [True, False]}

In [67]:
# Build the voting and stacking ensembles from the three base models.
ensemble_selection = BestEnsemble(estimators, voting_params, stacking_params)

In [68]:
# Grid-search both ensembles on the training split (about 16 minutes in the
# recorded output).
ensemble_selection.fit(X_train, y_train)

Estimator: Voting Ensemble
Fitting 5 folds for each of 252 candidates, totalling 1260 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   25.1s
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 361 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 644 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done 1009 tasks      | elapsed: 11.1min
[Parallel(n_jobs=-1)]: Done 1260 out of 1260 | elapsed: 13.8min finished


Best params: {'voting': 'hard', 'weights': (2, 3, 2)}
Best training accuracy: 1.000
Validation accuracy for best params: 0.907

Estimator: Stacking Ensemble
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  2.5min finished


Best params: {'cv': 4, 'passthrough': True}
Best training accuracy: 0.999
Validation accuracy for best params: 0.909

Best estimator: Stacking Classifier


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [69]:
# Per-ensemble best params and validation score, best first.
ensemble_selection.best_results()

Unnamed: 0,ensemble,params,valid_score
1,Stacking Ensemble,"{'cv': 4, 'passthrough': True}",0.908751
0,Voting Ensemble,"{'voting': 'hard', 'weights': (2, 3, 2)}",0.906518


In [70]:
# Accuracy of the winning ensemble on the held-out test split; this call also
# appends the score to the name used when saving the model.
ensemble_selection.final_score(X_test, y_test)

0.9319526627218935

In [71]:
# Persist the winning ensemble as data/<best_estimator_name_>.joblib.
ensemble_selection.save_model('data')