# Machine Learning 1
# Exercises 4

# Wy Ming Lin

### Loading and cleaning data

Import packages, do some initial data cleaning steps listed on the exercise sheet.

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import statsmodels.regression.linear_model as se
import statsmodels.api as sm
import sklearn.model_selection
from sklearn.linear_model import LogisticRegression
import statistics
import sklearn.discriminant_analysis as da
from prettytable import PrettyTable
from statsmodels.stats.outliers_influence import variance_inflation_factor

# this is just to get rid of all the future warnings that are irrelevant
# and to make my results easier to see
import warnings
warnings.filterwarnings("ignore")

# read the raw EEG recordings from disk
datapath = '/Users/wyminglin/Desktop/ML_exercise_data/EEG_data.csv'
rawdata = pd.read_csv(datapath)

# the two identifier columns are stored as integers
for id_column in ('SubjectID', 'VideoID'):
    rawdata[id_column] = rawdata[id_column].astype('int')

# give the two label columns descriptive names:
# predefinedlabel -> ExpectedConfusion, user-definedlabeln -> ReportedConfusion
for old_label, new_label in (('predefinedlabel', 'ExpectedConfusion'),
                             ('user-definedlabeln', 'ReportedConfusion')):
    rawdata.columns = rawdata.columns.str.replace(old_label, new_label)

# collapse to one row per (SubjectID, VideoID) pair by taking the median of each column
data = rawdata.copy().groupby(['SubjectID', 'VideoID']).median()

# names of the EEG power-band columns used as predictors throughout the notebook
predictors = ['Delta', 'Theta', 'Alpha1', 'Alpha2', 'Beta1', 'Beta2', 'Gamma1', 'Gamma2']

# accumulators for the accuracies and SDs that go into the summary table at the end
means = []
SDs = []

As we will be running multiple rounds of cross validation, I define a function here that makes the process easier and pain-free.

In [2]:
def run_model(predictors,target,n_folds,test_type,random_state=None):
    """
    Estimate the test accuracy of a classifier with repeated k-fold cross validation.

    The chosen classifier is refit on the training part of every split and
    scored on the held-out part; the mean and standard deviation of those
    held-out accuracies are printed and returned.

    Parameters
    ----------
    predictors : pandas DataFrame (n_observations, n_predictors)
        Rows are selected positionally with ``.iloc``.

    target : pandas Series (n_observations)
        Class labels, in the same row order as ``predictors``
        (also selected positionally with ``.iloc``).

    n_folds : int
        number of folds of the k-fold cross validation; the number of
        repeats is left at the RepeatedKFold default

    test_type : str
        - logreg (or log_reg) : logistic regression
        - linear : linear discriminant analysis
        - quadratic : quadratic discriminant analysis

    random_state : int or None, optional
        Passed on to RepeatedKFold. The default ``None`` keeps the original
        behaviour (different random splits on every call); pass an int to
        make the splits reproducible.

    Returns
    -------
    test_mean : mean accuracy of the scores of all rounds of tests, rounded to three decimal places

    test_sd : standard deviation of the scores of all rounds of tests, rounded to three decimal places

    Raises
    ------
    ValueError
        If ``test_type`` is not one of the supported model names.

    """

    # define the model first, so that a misspelled test_type fails immediately
    # with a clear message instead of a NameError inside the loop
    if test_type == 'linear':
        model = da.LinearDiscriminantAnalysis()
    elif test_type == 'quadratic':
        model = da.QuadraticDiscriminantAnalysis()
    elif test_type in ('logreg', 'log_reg'):
        model = LogisticRegression()
    else:
        raise ValueError(
            "unknown test_type {!r}; expected 'logreg', 'linear' or 'quadratic'".format(test_type)
        )

    # create folds
    kf = sklearn.model_selection.RepeatedKFold(n_splits=n_folds,random_state=random_state)

    # accuracy on the held-out part of every split
    test_acc = []

    # split the data
    for train_index, test_index in kf.split(X=predictors,y=target):

        # refit the model on the training part of this split
        model.fit(predictors.iloc[train_index],target.iloc[train_index])

        # score the model on the held-out part of this split
        test_acc.append(model.score(predictors.iloc[test_index],target.iloc[test_index]))

    # calculate mean and (sample) standard deviation over all splits
    mean_test = np.mean(test_acc)
    sd_test = statistics.stdev(test_acc)

    print('Accuracy of repeated {}-folds cross validation on test data:'.format(n_folds))
    print('\tmean = {}'.format(mean_test))
    print('\tstandard deviation = {}'.format(sd_test))


    return round(mean_test,3), round(sd_test,3)

### 1. Logistic regression

#### E1. Is there class imbalance for outcome variables ReportedConfusion and ExpectedConfusion? In the rest of the exercises, always use ReportedConfusion as outcome variable.

In [3]:
# show the class counts of both outcome variables, separated by an empty line
for position, label in enumerate(['ExpectedConfusion', 'ReportedConfusion']):
    if position > 0:
        print('')
    print(data[label].value_counts())

1.0    50
0.0    50
Name: ExpectedConfusion, dtype: int64

1.0    51
0.0    49
Name: ReportedConfusion, dtype: int64


ExpectedConfusion and ReportedConfusion are both binary variables that take either the value 0.0 or 1.0. For each of them, the two values occur in roughly 50 percent of the observations (50/50 and 51/49, respectively). This shows that both variables are balanced.

#### E2. Run logistic regression with all 100 observations of the 8 continuous predictors, columns 6-13 of the data (Delta to Gamma2). Report training accuracy, i.e. the fraction of observations that are correctly classified. 

In this exercise, a logistic regression model will be trained on all 100 observations in the training data and will then be tested on the training data.

In [4]:
# predictor columns and the outcome variable used in all following exercises
x_data = data[predictors]
y_data = data['ReportedConfusion']

# fit a logistic regression on all observations and score it on those same
# observations, i.e. this is the training accuracy
model = LogisticRegression()
E2_score = model.fit(x_data,y_data).score(x_data,y_data)

# store the score for the summary table; there is no SD for a single fit,
# so an empty string keeps the two lists the same length
means.append(E2_score)
SDs.append('')

print('Mean training accuracy: {}'.format(E2_score))

Mean training accuracy: 0.66


#### E3. Run logistic regression with all 100 of the 8 continuous predictors, but use repeated 10-fold cross validation to calculate the prediction accuracy on the test set. Report both the mean and the standard deviation of the prediction accuracy.

In [5]:
# logistic regression on all 8 predictors, scored with repeated 10-fold cross validation
E3_test_mean, E3_test_sd = run_model(predictors=x_data,
                                     target=y_data,
                                     n_folds=10,
                                     test_type='logreg')

# keep the rounded mean and SD for the summary table
means += [E3_test_mean]
SDs += [E3_test_sd]

Accuracy of repeated 10-folds cross validation on test data:
	mean = 0.5990000000000001
	standard deviation = 0.14248515783063231


The mean accuracy on the held-out test data is lower than the accuracy the model achieved on the training data in E2.

#### E4. Can you improve the prediction accuracy on the test set by reducing the number of predictors? You can get some ideas about which predictors to pick from your analysis of predictor correlation in E7 of exercises set 3. Report both the mean and the standard deviation of the prediction accuracy.

Based on the results of E7 of exercise set 3, we see the following patterns:
- Delta, Theta, Alpha1 and Alpha2 are highly correlated with each other
- Beta2 and Gamma1 are highly correlated with each other
- Beta1 and Gamma2 are stand alone and are not correlated much with the other frequency bands 

We will therefore include Beta1 and Gamma2 in the model as well as Theta and Beta2 in the model (Theta and Beta2 being representative frequency bands from their highly correlated blocks in the correlation matrix) and run the logistic regression model again based on this reduced set of predictors.

In [6]:
# reduced predictor set: Theta, Beta1, Beta2 and Gamma2
predictors_reduced = ['Theta','Beta1','Beta2','Gamma2']
x_reduced = data.loc[:, predictors_reduced]

# logistic regression on the reduced set, scored with repeated 10-fold cross validation
E4_test_mean, E4_test_sd = run_model(predictors=x_reduced,
                                     target=y_data,
                                     n_folds=10,
                                     test_type='logreg')

# keep the rounded mean and SD for the summary table
means += [E4_test_mean]
SDs += [E4_test_sd]

Accuracy of repeated 10-folds cross validation on test data:
	mean = 0.612
	standard deviation = 0.14652886087971567


When we take into consideration only Theta, Beta1, Beta2 and Gamma2, we see a slight increase in the mean accuracy of the logistic regression using repeated 10-fold cross validation when compared to the mean accuracy from E3 (all 8 predictors, scored on test data).

#### E5. Does skewness correction of the predictors (as you did in E6 of set 3) improve the prediction accuracy on the test set? Run logistic regression with all 100 observations of the 8 continuous predictors, and use repeated 10-fold cross validation to calculate the prediction accuracy on the test set. Report both the mean and the standard deviation of the prediction accuracy.

Based on the results from exercise set 3, we will correct all frequency bands except Delta for skewness.

In [7]:
# work on a copy so the uncorrected data stays available
data_corrected = data.copy()

# Box-Cox transform every band except the first entry of `predictors` (Delta);
# the fitted lambda returned alongside the transformed values is not needed
for fband in predictors[1:]:
    transformed, _fitted_lambda = stats.boxcox(data[fband])
    data_corrected[fband] = transformed

x_corrected = data_corrected.loc[:, predictors]

# logistic regression on the skewness-corrected predictors
E5_test_mean, E5_test_sd = run_model(predictors=x_corrected,
                                     target=y_data,
                                     n_folds=10,
                                     test_type='logreg')
means += [E5_test_mean]
SDs += [E5_test_sd]

Accuracy of repeated 10-folds cross validation on test data:
	mean = 0.6120000000000001
	standard deviation = 0.13727506854649335


Skewness correction for all predictors except Delta does not do much to increase the accuracy when compared to the result from E3 (all 8 predictors, scored on test data) or E4 (a fit with a reduced number of predictors).

### 2. Linear and quadratic discriminant analysis

#### E6. Run both linear and quadratic discriminant analysis with all 100 observations of the 8 continuous predictors and calculate predictive accuracy. Comment on the performance differences between linear and quadratic discriminant and between logistic regression and linear discriminant analysis. Given the mean and standard deviation of the prediction accuracy, which method would you prefer?

In [8]:
# the split into training and test data (repeated 10-fold cross validation)
# happens inside run_model

# linear discriminant analysis
print('Linear Discriminant Analysis')
E6lin_test_mean, E6lin_test_sd = run_model(x_data,y_data,10,'linear')
means.append(E6lin_test_mean)
SDs.append(E6lin_test_sd)
print('')
# quadratic discriminant analysis
# (heading corrected: it previously read 'Quadratic Linear Analysis')
print('Quadratic Discriminant Analysis')
E6quad_test_mean, E6quad_test_sd = run_model(x_data,y_data,10,'quadratic')
means.append(E6quad_test_mean)
SDs.append(E6quad_test_sd)

Linear Discriminant Analysis
Accuracy of repeated 10-folds cross validation on test data:
	mean = 0.605
	standard deviation = 0.15851265762160013

Quadratic Linear Analysis
Accuracy of repeated 10-folds cross validation on test data:
	mean = 0.6150000000000001
	standard deviation = 0.15658927811380324


The linear and quadratic discriminant analysis models don't really improve the results compared to the logistic regression models run in the previous exercises. It is interesting to note that the mean accuracy of the linear discriminant analysis is slightly lower than that of the quadratic discriminant analysis.

This observation suggests that the variability of the observations does not differ much between the classes. Consequently, quadratic discriminant analysis doesn't provide more accurate classification boundaries. Because quadratic discriminant analysis fits a non-linear (quadratic) decision boundary, it only gives better results when the true decision boundary is moderately non-linear. This might not be the case here.

#### E7. Does skewness correction of the predictors (as you did in E6 of set 3) improve the prediction accuracy on the test set? Run both linear and quadratic discriminant analyses with all 100 observations of the 8 continuous predictors, and use repeated 10-fold cross-validation to calculate the prediction accuracy on the test set. Report both the mean and the standard deviation of the prediction accuracy.

#### I would also want you to run a generalized mixed-effects model (GLMM) but Python does not have a direct implementation in StatsModels (it does have the fancier Bayesian generalization of GLMMs but that is too advanced for where we are now in the course). Running a GLMM in R, I found that a mixed-effects logistic regression improved predictive accuracy to 0.875. Only the per subject intercept contributed to this increase in performance. We can obtain the same benefit by preprocessing the data and subtract the mean power for each subject in each frequency band. Use this "baseline corrected" power to do the following exercises.

In [9]:
# run both discriminant analyses on the skewness-corrected predictors;
# results are collected per model type and unpacked into named variables below
E7_results = {}
E7_runs = [('Linear Discriminant Analysis', 'linear'),
           ('Quadratic Discriminant Analysis', 'quadratic')]
for position, (heading, kind) in enumerate(E7_runs):
    if position > 0:
        print('')  # blank line between the two reports
    print(heading)
    E7_results[kind] = run_model(x_corrected,y_data,10,kind)
    means.append(E7_results[kind][0])
    SDs.append(E7_results[kind][1])

E7lin_test_mean, E7lin_test_sd = E7_results['linear']
E7quad_test_mean, E7quad_test_sd = E7_results['quadratic']

Linear Discriminant Analysis
Accuracy of repeated 10-folds cross validation on test data:
	mean = 0.6709999999999999
	standard deviation = 0.13355663119955422

Quadratic Discriminant Analysis
Accuracy of repeated 10-folds cross validation on test data:
	mean = 0.499
	standard deviation = 0.16544705277739846


When corrected for skewness, the accuracy of the linear discriminant analysis model improves a little, but the accuracy of the quadratic discriminant analysis model drops to about chance level. Skewness correction cannot fix the low accuracy, because that is due to factors other than just skewed data.

### 3. Better preprocessing makes a better classifier

Baseline correcting will be done by subtracting the mean power for each frequency band in each subject as described previously.

In [10]:
data_bc_raw = data.copy()

# per-subject mean of every predictor, broadcast back to one value per row
# (transform keeps the row order and index of data_bc_raw)
subject_means = data_bc_raw.groupby('SubjectID')[predictors].transform('mean')

# apply baseline correction: subtract each subject's mean power per frequency band.
# This is done in one vectorized step on a copy, so nothing is assigned into a
# groupby slice and no DataFrame is grown inside a loop.
data_bc = data_bc_raw.copy()
data_bc[predictors] = data_bc_raw[predictors] - subject_means

# as before, the corrected frame gets a plain 0..n-1 index; rows stay in the
# same order as `data`, which matters because run_model selects rows by position
data_bc = data_bc.reset_index(drop=True)

x_data_bc = data_bc[predictors]

#### E8. Using the baseline corrected EEG power, rerun the analysis with (1) logistic regression, (2) linear, and (3) quadratic discriminant analysis.

In [11]:
# run all three classifiers on the baseline-corrected predictors;
# results are collected per model type and unpacked into named variables below
E8_results = {}
E8_runs = [('Logistic regression on baseline corrected data', 'logreg'),
           ('Linear discriminant analysis on baseline corrected data', 'linear'),
           ('Quadratic discriminant analysis on baseline corrected data', 'quadratic')]
for position, (heading, kind) in enumerate(E8_runs):
    if position > 0:
        print('')  # blank line between consecutive reports
    print(heading)
    E8_results[kind] = run_model(x_data_bc,y_data,10,kind)
    means.append(E8_results[kind][0])
    SDs.append(E8_results[kind][1])

E8log_test_mean, E8log_test_sd = E8_results['logreg']
E8lin_test_mean, E8lin_test_sd = E8_results['linear']
E8quad_test_mean, E8quad_test_sd = E8_results['quadratic']

Logistic regression on baseline corrected data
Accuracy of repeated 10-folds cross validation on test data:
	mean = 0.8810000000000001
	standard deviation = 0.10607315943024961

Linear discriminant analysis on baseline corrected data
Accuracy of repeated 10-folds cross validation on test data:
	mean = 0.8910000000000001
	standard deviation = 0.0911154224158578

Quadratic discriminant analysis on baseline corrected data
Accuracy of repeated 10-folds cross validation on test data:
	mean = 0.7850000000000001
	standard deviation = 0.12175069742400792


As seen in the results above, mean-subtracted baseline correction improves the accuracy of all three models. The logistic regression and linear discriminant analysis models reach accuracies in the high 80s (percent), whereas the quadratic discriminant analysis model gives a considerably lower accuracy. This is again likely because the variability of the observations differs little between the classes. In quadratic discriminant analysis, the predictor variables are not assumed to have a common variance. Because quadratic discriminant analysis fits a non-linear (quadratic) decision boundary, it may give better results when the true decision boundary is moderately non-linear. This might not be the case here.

#### E9. Report the summary of your analysis in a formatted table, similar to the one I got (I did some extra things, that you are of course free to try too). 

In [12]:
# PrettyTable will be used to make the formatted table

# column names
col_names = ['Exercise','Model', 'Predictors', 'PredAcc', 'PredSD']

# one label per result, in the order in which the results were appended
# to the means and SDs lists throughout the notebook
exercises = ['E2','E3','E4','E5','E6','E6','E7','E7','E8','E8','E8']

models_used = ['logistic regression training',
               'logistic regression 10 fold cv',
               'logistic regression 10 fold cv',
               'logistic regression 10 fold cv',
               'linear discriminant analysis',
               'quadratic discriminant analysis',
               'linear discriminant analysis',
               'quadratic discriminant analysis',
               'logistic regression',
               'linear discriminant analysis',
               'quadratic discriminant analysis']

# the E4 entry names the bands actually used in predictors_reduced
# (Gamma2; the label previously said Gamma1)
predictors_used = ['all 8 EEG power bands',
                   'all 8 EEG power bands',
                   'Theta, Beta1, Beta2 and Gamma2 power bands',
                   'all 8 EEG power bands skewness-corrected',
                   'all 8 EEG power bands',
                   'all 8 EEG power bands',
                   'all 8 EEG power bands skewness-corrected',
                   'all 8 EEG power bands skewness-corrected',
                   'all 8 EEG power bands mean subtracted',
                   'all 8 EEG power bands mean subtracted',
                   'all 8 EEG power bands mean subtracted']

# more results than labels means an analysis cell was run more than once and
# appended duplicates; fail with an explanation instead of an IndexError
if len(means) > len(exercises) or len(SDs) > len(exercises):
    raise ValueError('more results than table rows: restart the kernel and run every cell exactly once')

# build table using the means and SDs lists created throughout the script
resultsTable = PrettyTable()
resultsTable.field_names = col_names
for row in zip(exercises, models_used, predictors_used, means, SDs):
    resultsTable.add_row(list(row))

# print all results!
print('For best results, view in full screen\n')
print(resultsTable)

For best results, view in full screen

+----------+---------------------------------+--------------------------------------------+---------+--------+
| Exercise |              Model              |                 Predictors                 | PredAcc | PredSD |
+----------+---------------------------------+--------------------------------------------+---------+--------+
|    E2    |   logistic regression training  |           all 8 EEG power bands            |   0.66  |        |
|    E3    |  logistic regression 10 fold cv |           all 8 EEG power bands            |  0.599  | 0.142  |
|    E4    |  logistic regression 10 fold cv | Theta, Beta1, Beta2 and Gamma1 power bands |  0.612  | 0.147  |
|    E5    |  logistic regression 10 fold cv |  all 8 EEG power bands skewness-corrected  |  0.612  | 0.137  |
|    E6    |   linear discriminant analysis  |           all 8 EEG power bands            |  0.605  | 0.159  |
|    E6    | quadratic discriminant analysis |           all 8 EEG power 