# Exercises set 4

In [1]:
# SETUP
import pandas as pd
import numpy as np
# import sklearn as sk
from sklearn.model_selection import train_test_split
import statistics

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn import metrics, cross_validation

# from scipy import stats
# import matplotlib.pyplot as plt
# import seaborn as sns
# from IPython.display import Markdown, display

# import sklearn as sk
# from sklearn.linear_model import LinearRegression
# import statsmodels.api as sm

# from sklearn import datasets, linear_model, metrics
# from sklearn.model_selection import train_test_split, KFold, LeaveOneOut



## 1. Logistic regression

In [2]:
# 1. Read EEG_data.csv into a DataFrame

df_raw = pd.read_csv('EEG_data.csv')
# df_raw.head()

In [3]:
# 2. Change type of "SubjectID" and "VideoID" to integer

df_raw['SubjectID'] = df_raw.SubjectID.astype(int)
df_raw['VideoID'] = df_raw.VideoID.astype(int)

# View updated types to show the success of the change
print(df_raw.dtypes)

SubjectID               int32
VideoID                 int32
Attention             float64
Mediation             float64
Raw                   float64
Delta                 float64
Theta                 float64
Alpha1                float64
Alpha2                float64
Beta1                 float64
Beta2                 float64
Gamma1                float64
Gamma2                float64
predefinedlabel       float64
user-definedlabeln    float64
dtype: object


In [4]:
# 3. Group by 'SubjectID' and 'VideoID' taking the median

df = df_raw.groupby(['SubjectID','VideoID']).median()
df.head() # show the summarised DataFrame

Unnamed: 0_level_0,Unnamed: 1_level_0,Attention,Mediation,Raw,Delta,Theta,Alpha1,Alpha2,Beta1,Beta2,Gamma1,Gamma2,predefinedlabel,user-definedlabeln
SubjectID,VideoID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,0,57.0,53.0,41.0,304329.0,81382.5,13622.0,14065.5,14312.5,33887.5,29328.5,13104.5,0.0,0.0
0,1,47.0,50.0,33.0,578197.0,109448.0,23867.5,16974.5,18187.0,31988.0,29606.5,10958.5,0.0,1.0
0,2,43.5,48.0,7.5,463542.0,96455.0,18521.0,13002.0,14092.5,30222.0,27109.0,10977.5,0.0,1.0
0,3,52.0,53.0,37.0,471965.5,64971.0,15899.5,12748.5,11729.0,33487.5,31548.0,11812.5,0.0,0.0
0,4,53.0,47.0,28.0,301557.0,45817.5,14845.5,10775.5,13702.0,31554.0,28606.0,11745.5,0.0,0.0


In [5]:
# 4. Rename and change type of 'predefinedlabel' and 'user-definedlabeln'

# Rename columns 'predefinedlabel' and 'user-definedlabeln'
df = df.rename(columns={'predefinedlabel': 'ExpectedConfusion', 'user-definedlabeln': 'ReportedConfusion'})

# Change type of "SubjectID" and "VideoID" to integer
df['ExpectedConfusion'] = df.ExpectedConfusion.astype(int)
df['ReportedConfusion'] = df.ReportedConfusion.astype(int)

# View updated types to show the success of the change
print(df.dtypes)

Attention            float64
Mediation            float64
Raw                  float64
Delta                float64
Theta                float64
Alpha1               float64
Alpha2               float64
Beta1                float64
Beta2                float64
Gamma1               float64
Gamma2               float64
ExpectedConfusion      int32
ReportedConfusion      int32
dtype: object


In [6]:
# Explore the response variables 'ExpectedConfusion' and 'ReportedConfusion'
targets = ['ExpectedConfusion','ReportedConfusion']
df[targets].iloc[0:10]

Unnamed: 0_level_0,Unnamed: 1_level_0,ExpectedConfusion,ReportedConfusion
SubjectID,VideoID,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,0,0
0,1,0,1
0,2,0,1
0,3,0,0
0,4,0,0
0,5,1,1
0,6,1,1
0,7,1,0
0,8,1,1
0,9,1,0


The 'ExpectedConfusion' is predefined for each video:

Videos 0 - 4 are clear, 'ExpectedConfusion' = 0 

Videos 5 - 9 are confusing, 'ExpectedConfusion' = 1

The 'ReportedConfusion' is given by the student after watching the video.

### E1.

Is there imbalance for 'ExpectedConfusion' and 'ReportedConfusion' ?

As 'ExpectedConfusion' is predefined depending on the video and half of the videos (from 0 to 4) are clear and the other half (from 5 to 9) are tagged as confussion, this outcome variable is perfectly balances, i.e. 50:50

As 'ReportedConfusion' is given by the subjects, it may be umbalanced, let's check it!

In [9]:
# Recheck that 'ExpectedConfusion' is balanced
expected_ones = df['ExpectedConfusion'].sum()
print('There are {} confusing videos out of 100 recordings.'.format(expected_ones))
# Check if 'ReportedConfusion' is balanced
reported_ones = df['ReportedConfusion'].sum()
print('There are {} confusing videos out of 100 recordings.'.format(reported_ones))


There are 50 confusing videos out of 100 recordings.
There are 51 confusing videos out of 100 recordings.


In [10]:
# A useful command to do the same is .value_counts()
df['ReportedConfusion'].value_counts()

1    51
0    49
Name: ReportedConfusion, dtype: int64

The data for ExpectedConfusion is luckly very balanced!

### E2.

In [11]:
# Select the target 
target = 'ReportedConfusion'

# Select the 8 predictors (power bands)
predictors = df.columns[3:11]

In [12]:
# all parameters not specified are set to their defaults
logisticRegr = LogisticRegression()

# Model is learning the relationship between digits (x_train) and labels (y_train)
logisticRegr.fit(df[predictors], df[target])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

Now that we trained our logistic regression, we can make predictions using the testing data to determine the training accuracy.

### Prediction of a single observation

In [13]:
# We take one row from the test set
x_test.iloc[[6]]

NameError: name 'x_test' is not defined

In [None]:
# Predict its outcome
logisticRegr.predict(x_test.iloc[[6]])

In [324]:
# Check its real outcome
y_test.iloc[[6]]

SubjectID  VideoID
1          6          0
Name: ReportedConfusion, dtype: int32

The prediction for this observation is correct!

### Prediction of a all observations

In [326]:
# Predict its outcome
logisticRegr.predict(df[predictors])

array([1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0])

In [327]:
# Check their real outcome
df[target]

SubjectID  VideoID
0          0          0
           1          1
           2          1
           3          0
           4          0
           5          1
           6          1
           7          0
           8          1
           9          0
1          0          0
           1          1
           2          1
           3          1
           4          1
           5          0
           6          0
           7          0
           8          0
           9          0
2          0          0
           1          1
           2          0
           3          0
           4          1
           5          1
           6          0
           7          1
           8          1
           9          0
                     ..
7          0          1
           1          1
           2          0
           3          1
           4          1
           5          1
           6          0
           7          1
           8          0
           9         

Some of them coincide, but not all. With this we can determine the Training Accuracy.

$TrainingAccuracy = \frac{nº of correct predictions}{nº of predictions}$

The fraction of observations that are well-classified will give the training accuracy.

In [338]:
# The command logisticRegr.score(x_test, y_test) computes it for us
accuracy = logisticRegr.score(df[predictors], df[target])
print(accuracy)

0.66


Our model is predicts shit!!!!! :D Because it only predicts correctly 66% (TRAINING ACCURACY)of the observations.

### E3.
Cross-validation

In [359]:
# Function for REPEATED CROSS-VALIDATION

def model_repeatedKfolds(predictors, target, K, n_repeats, model_type, result_type):
    """    
    Parameters
    ----------
    predictors : array (n_observations, n_predictors)

    target : array (n_observations)

    K : int
        number of folds of k-means cross validation

    model_type : str
        - logreg : logistic regression
        - linear : linear discriminant analysis
        - quadratic: quadratic discriminant analysis

    result_type : str
        Displays what kind of results to display
        - train : displays mean training accuracy and standard deviation
        - test : displays mean test accuracy and standard deviation
    """
        
    # split the data in all possible folds
    kf = RepeatedKFold(n_splits=K, n_repeats=n_repeats)
        
    # define model
    if model_type == 'linear':
        model = da.LinearDiscriminantAnalysis()
    elif model_type == 'quadratic':
        model = da.QuadraticDiscriminantAnalysis()
    elif model_type == 'logreg':
        model = LogisticRegression()

    # initialize lists to store results
    train_acc = []
    test_acc = []

    # split the data
    for train_index, test_index in kf.split(X=predictors,y=target):

#       print('TRAIN:', train_index, 'TEST:', test_index)

         # gather training set data
        training_predictors = predictors.iloc[train_index]
        training_targets = target.iloc[train_index]

        # train the data
        train_fit = model.fit(training_predictors,training_targets)

        # score the model on the training data
        train_acc.append(train_fit.score(training_predictors,training_targets))

        # gather test set data
        test_predictors = predictors.iloc[test_index]
        test_targets = target.iloc[test_index]

        # score the model on the test data
        test_acc.append(model.score(test_predictors,test_targets))

    # calculate mean and sd
    mean_train = np.mean(train_acc)
    sd_train = statistics.stdev(train_acc)
    mean_test = np.mean(test_acc)
    sd_test = statistics.stdev(test_acc)

    if result_type == 'on_trainset':
        print('Accuracy of k-means cross validation with {} folds on training data:'.format(K))
        print('\tmean = {}'.format(mean_train))
        print('\tstandard deviation = {}'.format(sd_train))
    if result_type == 'on_testset':
        print('Accuracy of k-means cross validation with {} folds on test data:'.format(K))
        print('\tmean = {}'.format(mean_test))
        print('\tstandard deviation = {}'.format(sd_test))
    if result_type == 'on_both':
        print('Accuracy of k-means cross validation with {} folds on training data:'.format(K))
        print('\tmean = {}'.format(mean_train))
        print('\tstandard deviation = {}'.format(sd_train))
        print('')
        print('Accuracy of k-means cross validation with {} folds on test data:'.format(K))
        print('\tmean = {}'.format(mean_test))
        print('\tstandard deviation = {}'.format(sd_test))

As we have 100 observations and divide them in 10 folds, for each iteration we'll have 90 observations for training and 10 for testing.

Now we are ready to run the logistic regression for each iteration: But WyMing wrote a functions that makes everything for me!

In [363]:
model_repeatedKfolds(predictors=df[predictors], target=df[target], K=10, n_repeats=10, model_type='logreg', result_type='on_trainset')

Accuracy of k-means cross validation with 10 folds on training data:
	mean = 0.6626666666666666
	standard deviation = 0.024695173667018316


In [331]:
from sklearn.cross_validation import cross_val_score

accuracy_iterations = cross_val_score(logisticRegr, X=df[predictors], y=df[target] , cv=10 ,scoring='accuracy')
print ('Accuracy for each iteration: {}'.format(accuracy_iterations))
print ('Mean: {}'.format(accuracy_iterations.mean()))
print ('SD: {}'.format(accuracy_iterations.std()))

Accuracy for each iteration: [0.63636364 0.7        0.7        0.7        0.7        0.5
 0.7        0.6        0.6        0.55555556]
Mean: 0.6391919191919191
SD: 0.06923695356679323


Previously, we just computed the accuracy for one split. Now with cross-validation, we computed it for 10 different possible splittings. Thanks to this we got a more conclusive value for the accuracy, which is the mean of the accuracy of each iterations, 63%.

###  E4.

From hw3 E7, how should the heatmap help me choosing which predictor should I drop?

In [318]:
# Choose the predictors that you want to use
predictors_new = df.columns[3:11]
predictors_new = ['Theta', 'Alpha1', 'Alpha2', 'Beta1', 'Beta2', 'Gamma1', 'Gamma2']
predictors_new

['Theta', 'Alpha1', 'Alpha2', 'Beta1', 'Beta2', 'Gamma1', 'Gamma2']

In [319]:
accuracy_iterations_new = cross_val_score(logisticRegr, X=df[predictors_new], y=df[target] , cv=10 ,scoring='accuracy')
print ('Accuracy for each iteration: {}'.format(accuracy_iterations_new))
print ('Mean: {}'.format(accuracy_iterations_new.mean()))
print ('SD: {}'.format(accuracy_iterations_new.std()))

Accuracy for each iteration: [0.63636364 0.7        0.7        0.6        0.7        0.4
 0.7        0.6        0.5        0.44444444]
Mean: 0.5980808080808081
SD: 0.10746302244844845


After trying different combinations of predictors, we observed that dropping alpha1 and alpha2 gives a better accuracy:

Dropping alpha1 and alpha2:

Mean: 0.6584848484848485

SD: 0.09372458333148866


When dropping other predictors like Delta, the accuracy gets worse

Dropping delta:

Mean: 0.6173737373737372

SD: 0.12106202828458776







QUESTIONS:

Is it really correct to compute accuracy using the training said? We are asked to do so in E2 because we use all the data for training and then we're not left with any data to test.