In [1]:
!pip install seaborn 
!pip install sklearn
!pip install statsmodels



# Logistic Regression

In [2]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

# logistic regression
import statsmodels.formula.api as smf
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from patsy import dmatrices
from sklearn.linear_model import Lasso, LassoCV, LogisticRegression

from sklearn.metrics import confusion_matrix, accuracy_score 

  from pandas.core import datetools


In [0]:
paths = 'http://bigdata.cs.byu.edu/Lectures/EEGEyeState.csv'
df = pd.read_table(paths,sep=',',encoding='latin1')

# split dependent and independent variables
X = list(df)[:-1]
y = ['eyeDetection']

# split into dataframe
y, X = dmatrices('eyeDetection~AF3+F7+F3+FC5+T7+P7+O1+O2+P8+T8+FC6+F4+F8+AF4',df,return_type='dataframe') 

# create test and train.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.3,random_state=40)

# X_test+y_test
df_c = pd.concat([y_train,X_train], axis=1)
df_s = pd.concat([y_test,X_test], axis=1)
df_f = pd.DataFrame(df_s)

In [4]:
form = smf.glm(formula = "eyeDetection ~ AF3 + F7 + F3 + FC5 + T7 + P7 + O1 + O2 + P8 + T8 + FC6 + F4 + F8 + AF4", data=df_c,family=sm.families.Binomial()).fit()

  t = np.exp(-z)


In [6]:
print(form.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:           eyeDetection   No. Observations:                10486
Model:                            GLM   Df Residuals:                    10471
Model Family:                Binomial   Df Model:                           14
Link Function:                  logit   Scale:                             1.0
Method:                          IRLS   Log-Likelihood:                -6686.8
Date:                Mon, 12 Nov 2018   Deviance:                       13374.
Time:                        23:47:10   Pearson chi2:                 1.05e+04
No. Iterations:                    10                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.3646      6.209      0.220      0.826     -10.805      13.534
AF3            0.0065      0.002      2.708      0.0

## Model Evaluation

In [7]:
# run evaluations on Test dataset.

form1 = smf.glm(formula = "eyeDetection ~ AF3 + F7 + F3 + FC5 + T7 + P7 + O1 + P8 + FC6 + F4 + AF4", data=df_f,family=sm.families.Binomial()).fit()

  t = np.exp(-z)


In [8]:
# Confusion Matrix
pred_class = (form1.fittedvalues > .5)*1
conf_matr = pd.crosstab(df['eyeDetection'],pred_class, margins = True)
conf_matr

  t = np.exp(-z)


col_0,0,1,All
eyeDetection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1875,601,2476
1,1068,950,2018
All,2943,1551,4494


In [9]:
#sensitivity 
print('Our Sensitivity value is:', conf_matr.iloc[1,1]/conf_matr.iloc[1,2])
#specificity
print('Our Specificity is:',conf_matr.iloc[0,0]/conf_matr.iloc[0,2])
#positive predictive value 
print('Our PPV is:',conf_matr.iloc[1,1]/conf_matr.iloc[2,1])
#negative predictive value 
print('Our NPV is:',conf_matr.iloc[0,0]/conf_matr.iloc[2,0])
#accuracy
print('Our Accuracy is:', (conf_matr.iloc[0,0]+conf_matr.iloc[1,1]) / (conf_matr.iloc[0,0]+conf_matr.iloc[1,1]+conf_matr.iloc[1,0]+conf_matr.iloc[0,1]))


Our Sensitivity value is: 0.4707631318136769
Our Specificity is: 0.7572697899838449
Our PPV is: 0.6125080593165699
Our NPV is: 0.6371049949031601
Our Accuracy is: 0.6286159323542501


In [10]:
# AUC

from sklearn.metrics import roc_auc_score

roc_auc_score(df_f['eyeDetection'], pred_class) # AUC is 61%

0.6140164608987609

## Summary

The overall results of this model display an accuracy of 63%, sensitivity of 37%, and specificity of 75%. This model outperformed the neural network accuracy by over 8%. And specified the model’s capacity to predict when the eye is not open when it truly was not, by over 45%. However, the model’s ability (sensitivity) to detect whether the eye was open when true, was  near-equal to that of the neural network.

These results suggest the model not to be practical for this specific test. Though logistic regression serves as a great model to predict binary predictors (0 and 1), the accuracy of this test and area under the curve indicate moderate predictive capability. Some aspects that may improve the model performance may

The model may be further improved in two ways. 1) Setting cutoff values for logistic regression to categorize eye movement may better capture precise results. However, with a mean value of eyeDetection around .5, varying the cutoff value may not change the model performance in this specific case. 2) Using reported p-values after fitting the model to determine statistically significant effects may also yield better results. From our model, the least ‘important’ indicators (beta values) seem to be O2, O1, and F8 as they seem to be insignificant on the effect of eye detection.
