In [1]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

import pickle
from drowsiness_detection.data import get_train_test_splits
from drowsiness_detection.helpers import binarize

In [2]:
threshold = 7

# load train and test data
train, test = get_train_test_splits()
print(train.shape, test.shape)

# X still contains NaNs
train = np.nan_to_num(train, nan=-1)
test = np.nan_to_num(test, nan=-1)

# split in full data for CV
full = np.concatenate([train, test])
X = full[:, :-1]
y = full[:, -1]
# binarize y to represent not drowsy vs drowsy
y = binarize(y, threshold)

# X_train, y_train = train[:, :-1], train[:, -1]
# X_test, y_test = test[:, :-1], test[:, -1]
#
# y_train, y_test = binarize(y_train, threshold), binarize(y_test, threshold)

(32834, 68) (16102, 68)


In [10]:

# define models and parameters
model = LogisticRegression()
solvers = ['saga', 'liblinear']
penalty = ['l2', "l1"]
c_values = [100, 10, 1.0, 0.1, 0.01]
max_iters = [1000]
# define grid search
grid = dict(solver=solvers, penalty=penalty, C=c_values, max_iter=max_iters)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy', error_score=0)
grid_result = grid_search.fit(X, y)
log_reg_grid_result = grid_result
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.694683 using {'C': 100, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'liblinear'}
0.473251 (0.004809) with: {'C': 100, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'saga'}
0.691638 (0.004721) with: {'C': 100, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'liblinear'}
0.473251 (0.004809) with: {'C': 100, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'saga'}
0.694683 (0.004664) with: {'C': 100, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'liblinear'}
0.473251 (0.004809) with: {'C': 10, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'saga'}
0.691338 (0.004178) with: {'C': 10, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'liblinear'}
0.473251 (0.004809) with: {'C': 10, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'saga'}
0.694608 (0.004730) with: {'C': 10, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'liblinear'}
0.473251 (0.004809) with: {'C': 1.0, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'saga'}
0.691815 (0.004230) with: {'C': 1.0, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'l

In [12]:
with open("log_reg_result.pkl", "wb") as fp:
    pickle.dump(file=fp, obj=log_reg_grid_result)

In [None]:
model = RandomForestClassifier()
# define grid search
param_grid = {'n_estimators': [10, 100, 1000], 'max_depth': [2, 5, 10, None], 'min_samples_leaf': [1, 10, 100, 1000], "max_features": ['sqrt', 'log2'],
              'criterion': ['gini', 'entropy']}
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='accuracy', error_score=0)
grid_result = grid_search.fit(X, y)
rf_grid_result = grid_result
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [4]:
with open("rf_result.pkl", "wb") as fp:
    pickle.dump(file=fp, obj=rf_grid_result)

NameError: name 'rf_grid_result' is not defined