In [5]:
import numpy as np
from random import choice
from drowsiness_detection import config
from drowsiness_detection.data import get_train_test_splits, feature_array_to_df, FEATURE_NAMES
from drowsiness_detection.helpers import binarize
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from prettytable import PrettyTable

In [20]:
def accuracy_vanilla_logistic_regression(X_train, y_train, X_test, y_test, max_iter=1000):
    """Fit an otherwise-default logistic regression and return its test accuracy.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed unchanged to ``LogisticRegression.fit``.
    X_test, y_test
        Held-out features and the labels the predictions are compared against.
    max_iter : int, default 1000
        Iteration cap forwarded to ``LogisticRegression``. Raise it (or scale the
        features beforehand) when the solver reports that it hit the iteration
        limit before converging.

    Returns
    -------
    float
        Fraction of test samples whose predicted label equals the true label.
    """
    log_reg = LogisticRegression(max_iter=max_iter)
    log_reg.fit(X_train, y_train)
    y_pred = log_reg.predict(X_test)
    # Flatten the labels so a column-vector y_test (shape (n, 1)) is compared
    # element-wise instead of silently broadcasting to an (n, n) matrix.
    y_true = np.ravel(y_test)
    return np.mean(y_pred == y_true)

In [21]:
# Fetch the train/test arrays and report their dimensions.
train, test = get_train_test_splits()
print(train.shape, test.shape)

# The last column holds the target; every preceding column is a feature.
X_train, X_test = train[:, :-1], test[:, :-1]
y_train, y_test = train[:, -1], test[:, -1]

# Turn the target into two classes (not drowsy vs. drowsy) at the cutoff.
threshold = 7
y_train = binarize(y_train, threshold)
y_test = binarize(y_test, threshold)

(32834, 68) (16102, 68)


In [22]:
# Count the missing (NaN) entries in every training feature column.
nan_counts = np.isnan(X_train).sum(axis=0)

table = PrettyTable()
# Name the columns explicitly; without this PrettyTable falls back to the
# placeholder headers "Field 1" / "Field 2".
table.field_names = ["Feature", "NaN count"]
# NOTE(review): zip() stops at the shorter input, so this assumes FEATURE_NAMES
# has exactly one entry per column of X_train — confirm.
table.add_rows(zip(FEATURE_NAMES, nan_counts))
print(table)

+--------------------------------------------+---------+
|                  Field 1                   | Field 2 |
+--------------------------------------------+---------+
|                     LC                     |    0    |
|                 _mean_IED                  |   8179  |
|                  _std_IED                  |   8179  |
|                _median_IED                 |   8179  |
|                 _skew_IED                  |   8179  |
|               _kurtosis_IED                |   8179  |
|                    TEC                     |    0    |
|                 _mean_BTD                  |   8115  |
|                  _std_BTD                  |   8115  |
|                _median_BTD                 |   8115  |
|                 _skew_BTD                  |   8115  |
|               _kurtosis_BTD                |   8115  |
|                 _mean_-AVR                 |   8342  |
|                 _std_-AVR                  |   8342  |
|                _median_-AVR  

In [23]:
# Baseline imputation: substitute the constant -1 for every NaN in both splits,
# then score the plain logistic regression on the filled data.
X_train_minus1, X_test_minus1 = (
    np.nan_to_num(split, nan=-1) for split in (X_train, X_test)
)
accuracy_vanilla_logistic_regression(
    X_train=X_train_minus1, y_train=y_train, X_test=X_test_minus1, y_test=y_test
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.6766240218606384

In [24]:
# Mean imputation: the imputer is fitted on the training split only and the
# same fitted imputer is then applied to both the training and the test split.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean').fit(X_train)
X_train_mean, X_test_mean = (
    imp_mean.transform(split) for split in (X_train, X_test)
)
accuracy_vanilla_logistic_regression(
    X_train=X_train_mean, y_train=y_train, X_test=X_test_mean, y_test=y_test
)

0.6833312631971183