# Import libraries

In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import time
import random
import gc

from pandas import *

# SK-learn libraries.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import re

# scipy
from scipy.signal import butter, lfilter


from sklearn.cross_validation import StratifiedKFold
from sklearn.feature_selection import RFE
from sklearn.datasets import make_classification




# Import and Merge The Data

###Change the path variables to run on your machine

In [32]:
# Folder that holds the subj*_series*_data.csv / *_events.csv files read below.
Main_Path = r"C:\Users\marks\Google Drive\EEG Kaggle\train"
# Folder with the official test data (not referenced by the cells visible here).
Official_Test_Data_Path = r"C:\Users\marks\Google Drive\EEG Kaggle\test"

###Set which subjects and series to use 
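Each setting below is a bracketed character class that is inserted into the glob (shell-style wildcard) pattern used to select the data files for the chosen subjects and series, for training and for testing. A class matches exactly one character of the file name, for example: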
'[1-2]' =  1 and 2,   
'[1,2]' = 1 and 2,    
'[1-3]' = 1 THRU 3,    
'[1,3]' = 1 AND 3,    
'[4]' = 4, etc.   

In [None]:
# Single-character glob classes that are spliced into the file-name patterns
# built further down ("subj<class>_series<class>*data.csv" and "*events.csv").
subjects_to_use = "[1-2]"        # subjects whose files are loaded
series_for_training = "[1-4]"    # series merged into the training set
series_for_testing = "[5]"       # series merged into the testing set

### Import and merge the specified data

In [None]:
#Remove the channels we don't want 
def Remove_Channels(df):
    """Hook for dropping unwanted channels from a freshly loaded data frame.

    The positional drop is currently disabled (kept below for reference), so
    the frame is handed back untouched.
    """
    #df.drop(df.columns[[15,16,20,21,22,25,26,27,28,29,30,31,32]], axis=1, inplace=True)
    return df


Overall_Start_Time = time.time() #start the overall timer 


# NOTE: despite the "RegEx" in their names these are glob patterns; the
# bracketed selectors are glob character classes.
Training_Data_RegEx_str = "/subj" + subjects_to_use + "_series"+series_for_training + "*data.csv"
Training_Events_RegEx_str = "/subj" + subjects_to_use + "_series"+series_for_training + "*events.csv"

Testing_Data_RegEx_str = "/subj" + subjects_to_use + "_series"+series_for_testing + "*data.csv"
Testing_Events_RegEx_str = "/subj" + subjects_to_use + "_series"+series_for_testing + "*events.csv"


def _merge_csv_files(filenames, transform=None, lengths=None):
    """Read every CSV in filenames and concatenate them in the given order.

    Args:
        filenames: paths of the CSV files to read.
        transform: optional callable applied to each frame right after it is
            read (used for the channel-removal hook).
        lengths: optional list; the row count of each frame is appended to it.

    Returns:
        The concatenation (pd.concat) of all frames.

    Raises:
        ValueError: if filenames is empty, i.e. the glob pattern matched
            nothing.
    """
    if not filenames:
        raise ValueError("No CSV files matched - check Main_Path and the "
                         "subject/series selections")
    merge_start = time.time()
    # Single-argument print(...) produces the same text on Python 2 and 3.
    print("%s Merging..." % time.time())
    frames = []
    for file_ in filenames:
        print("%s %s" % (time.time(), file_))
        frame = pd.read_csv(file_, index_col=None, header=0)
        if transform is not None:
            frame = transform(frame)
        frames.append(frame)
        if lengths is not None:
            lengths.append(len(frame))
    merged = pd.concat(frames)
    print("Merge Complete: %s total seconds" % (time.time() - merge_start))
    return merged


# glob returns matches in arbitrary order. The data files and the event files
# are globbed separately, so both lists are sorted: they share the
# "subjX_seriesY_" prefix, which keeps row i of the merged data lined up with
# row i of the merged events.

#training data for the subjects and series defined above
train_data_filenames = sorted(glob.glob(Main_Path + Training_Data_RegEx_str))
Train_Array_Lengths = []
train_data = _merge_csv_files(train_data_filenames, Remove_Channels, Train_Array_Lengths)

#now with event data
train_event_filenames = sorted(glob.glob(Main_Path + Training_Events_RegEx_str))
train_events = _merge_csv_files(train_event_filenames)

#do the same thing and now get your testing data (held-out series, also read from Main_Path)
test_data_filenames = sorted(glob.glob(Main_Path + Testing_Data_RegEx_str))
Test_Array_Lengths = []
test_data = _merge_csv_files(test_data_filenames, Remove_Channels, Test_Array_Lengths)

#now with event data
test_event_filenames = sorted(glob.glob(Main_Path + Testing_Events_RegEx_str))
test_events = _merge_csv_files(test_event_filenames)

# Separate the data into the 6 events and into training and test sets

In [None]:
# Event columns, listed in the order in which they are unpacked below.
_event_names = ('HandStart', 'FirstDigitTouch', 'BothStartLoadPhase',
                'LiftOff', 'Replace', 'BothReleased')

# One sparse label series per event (0 is the fill value) for the training set.
(Train_Labels_HandStart,
 Train_Labels_FirstDigitTouch,
 Train_Labels_BothStartLoadPhase,
 Train_Labels_LiftOff,
 Train_Labels_Replace,
 Train_Labels_BothReleased) = [
    train_events[_name].to_sparse(fill_value=0) for _name in _event_names]
# Train_Labels_Combined = train_events.HandStart.map(str) + ',' + train_events.FirstDigitTouch.map(str) + ',' + train_events.BothStartLoadPhase.map(str) + ',' + train_events.LiftOff.map(str) + ',' + train_events.Replace.map(str) + ',' + train_events.BothReleased.map(str)

# ... and the same six label series for the testing set.
(Test_Labels_HandStart,
 Test_Labels_FirstDigitTouch,
 Test_Labels_BothStartLoadPhase,
 Test_Labels_LiftOff,
 Test_Labels_Replace,
 Test_Labels_BothReleased) = [
    test_events[_name].to_sparse(fill_value=0) for _name in _event_names]

# Skip the first column and keep every remaining column as float32.
Train_Data = train_data.iloc[:, 1:].astype('float32')
Test_Data = test_data.iloc[:, 1:].astype('float32')

# The merged frames are no longer needed; drop them to save memory.
del train_data, test_data
del train_events, test_events
gc.collect() #Garbage collection (i.e. get rid of any outstanding unused memory)

# Define functions for feature extraction

In [None]:

#bin the time 
def Bin_Time(num_rows,num_bins):
    """Build a one-hot matrix that assigns consecutive rows to time bins.

    Args:
        num_rows: number of rows (time steps) to bin.
        num_bins: number of equally sized bins.

    Returns:
        Array of shape (num_rows, num_bins). Row r has a 1 in column
        r // (num_rows // num_bins) and 0 elsewhere. When num_rows is not a
        multiple of num_bins, the trailing num_rows % num_bins rows belong to
        no bin and stay all-zero.

    Raises:
        ZeroDivisionError: if num_bins is 0.
    """
    # Explicit floor division: plain "/" yields a float under Python 3 (or
    # "from __future__ import division"), which cannot be used as a slice bound.
    Bin_Size = num_rows // num_bins
    Bins = np.zeros(shape=(num_rows, num_bins))
    for i in range(num_bins):
        Bins[i * Bin_Size:(i + 1) * Bin_Size, i] = 1
    return Bins


##filters borrowed from Nihar. 
def butter_bandpass(lowcut, highcut, fs, order=5):
    """Design a Butterworth band-pass filter.

    Both cut-off frequencies are divided by the Nyquist frequency (half of
    fs) before being passed to scipy.signal.butter.

    Returns:
        The (b, a) coefficient pair produced by butter().
    """
    nyquist = 0.5 * fs
    band_edges = [lowcut / nyquist, highcut / nyquist]
    return butter(order, band_edges, btype='band')

def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
    """Band-pass filter data with a Butterworth filter.

    The coefficients come from butter_bandpass(lowcut, highcut, fs, order)
    and are applied with scipy.signal.lfilter using its default axis.
    """
    numerator, denominator = butter_bandpass(lowcut, highcut, fs, order=order)
    return lfilter(numerator, denominator, data)


#run PCA and keep the smallest number of PCs that explain the given amount of variance. 
def extractFeatures_PCA(Train_Data,Test_Data, PercentVarExplained):
    """Project train and test data onto the leading principal components.

    The PCA is fitted on Train_Data only. The smallest number of leading
    components whose cumulative explained-variance ratio reaches
    PercentVarExplained is kept; if the threshold is never reached, all
    components are kept.

    Args:
        Train_Data: 2-D array-like used to fit the PCA.
        Test_Data: 2-D array-like projected with the same fitted PCA.
        PercentVarExplained: target cumulative explained-variance ratio
            (a fraction such as 0.93, not a percentage).

    Returns:
        (train_projection, test_projection) as float32 arrays with one column
        per retained component.
    """
    pca = PCA()
    pca.fit(Train_Data)

    Cumulative_Variance = np.cumsum(pca.explained_variance_ratio_)
    # searchsorted returns the first index whose cumulative ratio is >= the
    # target; index k means k + 1 components are needed. The old loop added 1
    # on top of a slice that already held i components (one component too
    # many) and left NumPCs undefined when the target was never reached.
    First_Hit = int(np.searchsorted(Cumulative_Variance, PercentVarExplained))
    NumPCs = min(First_Hit + 1, len(Cumulative_Variance))

    PCA_Projections = pca.transform(Train_Data)[:,0:NumPCs]
    PCA_Projections_Test = pca.transform(Test_Data)[:,0:NumPCs]
    return np.float32(PCA_Projections),np.float32(PCA_Projections_Test)



#return rolling mean of each column in a pandas dataframe with a given window. Returns an array of the same shape. 
def df_rolling_mean(df,window):
    """Rolling mean of every column of df.

    Args:
        df: pandas DataFrame of numeric columns.
        window: number of trailing rows in each window.

    Returns:
        float32 numpy array with the same shape as df. min_periods is 0, so
        partial windows at the start are evaluated too; any remaining NaN is
        replaced by 0.
    """
    if hasattr(df, 'rolling'):
        # pandas >= 0.18 window API (the module-level pd.rolling_* functions
        # were removed from later pandas releases).
        rolled = df.rolling(window, min_periods=0).mean()
    else:
        # Older pandas only provides the module-level function.
        rolled = pd.rolling_mean(df, window, min_periods=0)
    # Only the values are returned, so column labels are irrelevant; the old
    # per-column rename loop copied the whole frame once per column for nothing.
    return np.array(rolled.fillna(0).astype('float32'))



#return rolling variance of each column in a pandas dataframe with a given window. Returns an array of the same shape. 
def df_rolling_var(df,window):
    """Rolling variance of every column of df.

    Args:
        df: pandas DataFrame of numeric columns.
        window: number of trailing rows in each window.

    Returns:
        float32 numpy array with the same shape as df. min_periods is 0;
        positions where pandas yields NaN (e.g. a window holding a single
        observation) are replaced by 0.
    """
    if hasattr(df, 'rolling'):
        # pandas >= 0.18 window API (the module-level pd.rolling_* functions
        # were removed from later pandas releases).
        rolled = df.rolling(window, min_periods=0).var()
    else:
        # Older pandas only provides the module-level function.
        rolled = pd.rolling_var(df, window, min_periods=0)
    # Only the values are returned, so column labels are irrelevant; the old
    # per-column rename loop copied the whole frame once per column for nothing.
    return np.array(rolled.fillna(0).astype('float32'))


#return rolling min of each column in a pandas dataframe with a given window. Returns an array of the same shape. 
def df_rolling_min(df,window):
    """Rolling minimum of every column of df.

    Args:
        df: pandas DataFrame of numeric columns.
        window: number of trailing rows in each window.

    Returns:
        float32 numpy array with the same shape as df. min_periods is 0, so
        partial windows at the start are evaluated too; any remaining NaN is
        replaced by 0.
    """
    if hasattr(df, 'rolling'):
        # pandas >= 0.18 window API (the module-level pd.rolling_* functions
        # were removed from later pandas releases).
        rolled = df.rolling(window, min_periods=0).min()
    else:
        # Older pandas only provides the module-level function.
        rolled = pd.rolling_min(df, window, min_periods=0)
    # Only the values are returned, so column labels are irrelevant; the old
    # per-column rename loop copied the whole frame once per column for nothing.
    return np.array(rolled.fillna(0).astype('float32'))


#return rolling max of each column in a pandas dataframe with a given window. Returns an array of the same shape. 
def df_rolling_max(df,window):
    """Rolling maximum of every column of df.

    Args:
        df: pandas DataFrame of numeric columns.
        window: number of trailing rows in each window.

    Returns:
        float32 numpy array with the same shape as df. min_periods is 0, so
        partial windows at the start are evaluated too; any remaining NaN is
        replaced by 0.
    """
    if hasattr(df, 'rolling'):
        # pandas >= 0.18 window API (the module-level pd.rolling_* functions
        # were removed from later pandas releases).
        rolled = df.rolling(window, min_periods=0).max()
    else:
        # Older pandas only provides the module-level function.
        rolled = pd.rolling_max(df, window, min_periods=0)
    # Only the values are returned, so column labels are irrelevant; the old
    # per-column rename loop copied the whole frame once per column for nothing.
    return np.array(rolled.fillna(0).astype('float32'))



#return rolling skew of each column in a pandas dataframe with a given window. Returns an array of the same shape. 
def df_rolling_skew(df,window):
    """Rolling skewness of every column of df.

    Args:
        df: pandas DataFrame of numeric columns.
        window: number of trailing rows in each window.

    Returns:
        float32 numpy array with the same shape as df. min_periods is 0;
        positions where pandas yields NaN (windows with too few observations
        for a skewness estimate) are replaced by 0.
    """
    if hasattr(df, 'rolling'):
        # pandas >= 0.18 window API (the module-level pd.rolling_* functions
        # were removed from later pandas releases).
        rolled = df.rolling(window, min_periods=0).skew()
    else:
        # Older pandas only provides the module-level function.
        rolled = pd.rolling_skew(df, window, min_periods=0)
    # Only the values are returned, so column labels are irrelevant; the old
    # per-column rename loop copied the whole frame once per column for nothing.
    return np.array(rolled.fillna(0).astype('float32'))



#return rolling kurtosis of each column in a pandas dataframe with a given window. Returns an array of the same shape. 
def df_rolling_kurt(df,window):
    """Rolling kurtosis of every column of df.

    Args:
        df: pandas DataFrame of numeric columns.
        window: number of trailing rows in each window.

    Returns:
        float32 numpy array with the same shape as df. min_periods is 0;
        positions where pandas yields NaN (windows with too few observations
        for a kurtosis estimate) are replaced by 0.
    """
    if hasattr(df, 'rolling'):
        # pandas >= 0.18 window API (the module-level pd.rolling_* functions
        # were removed from later pandas releases).
        rolled = df.rolling(window, min_periods=0).kurt()
    else:
        # Older pandas only provides the module-level function.
        rolled = pd.rolling_kurt(df, window, min_periods=0)
    # Only the values are returned, so column labels are irrelevant; the old
    # per-column rename loop copied the whole frame once per column for nothing.
    return np.array(rolled.fillna(0).astype('float32'))



#return rolling quantile of each column in a pandas dataframe with a given window and quantile. Returns an array of the same shape. 
def df_rolling_quantile(df,window,quantile):
    """Rolling quantile of every column of df.

    Args:
        df: pandas DataFrame of numeric columns.
        window: number of trailing rows in each window.
        quantile: quantile to compute, as a fraction between 0 and 1.

    Returns:
        float32 numpy array with the same shape as df. min_periods is 0, so
        partial windows at the start are evaluated too; any remaining NaN is
        replaced by 0.
    """
    # The quantile must be forwarded to pandas: the previous version never
    # passed it on, and also failed building a column label from
    # 'Pct' + <number>.
    if hasattr(df, 'rolling'):
        # pandas >= 0.18 window API (the module-level pd.rolling_* functions
        # were removed from later pandas releases).
        rolled = df.rolling(window, min_periods=0).quantile(quantile)
    else:
        # Older pandas only provides the module-level function.
        rolled = pd.rolling_quantile(df, window, quantile, min_periods=0)
    # Only the values are returned, so no column relabelling is needed.
    return np.array(rolled.fillna(0).astype('float32'))


#BE CAREFUL NOT TO SUPPLY TOO MANY COLUMNS TO THIS FUNCTION. Returns N*(N-1)/2 columns, where N = initial columns. 
#return rolling pairwise correlation of the columns in a pandas dataframe with a given window. 
def df_rolling_corr(df,window):
    """Rolling correlation for every pair of distinct columns of df.

    The previous version passed a single column to pandas, which correlated
    each column only with itself rather than with the other columns.

    Args:
        df: pandas DataFrame of numeric columns.
        window: number of trailing rows in each window.

    Returns:
        DataFrame indexed like df with one column per pair (i, j), i < j,
        named 'corr_<column i>_<column j>'. NaN values are left in place.
        With fewer than two input columns the result has no columns.
    """
    Num_Cols = len(df.columns)
    # pandas >= 0.18 has the .rolling() window API; older releases only have
    # the module-level pd.rolling_corr function.
    has_window_api = hasattr(df, 'rolling')
    pair_columns = []
    pair_names = []
    for i in range(Num_Cols):
        left = df.iloc[:, i]
        for j in range(i + 1, Num_Cols):
            right = df.iloc[:, j]
            if has_window_api:
                rolled = left.rolling(window, min_periods=0).corr(right)
            else:
                rolled = pd.rolling_corr(left, right, window, min_periods=0)
            pair_columns.append(rolled)
            pair_names.append('corr_%s_%s' % (df.columns[i], df.columns[j]))
    if not pair_columns:
        # pd.concat refuses an empty list; return an empty frame instead.
        return pd.DataFrame(index=df.index)
    result = pd.concat(pair_columns, axis=1)
    result.columns = pair_names
    return result



#BE CAREFUL NOT TO SUPPLY TOO MANY COLUMNS TO THIS FUNCTION. Returns N*(N-1)/2 columns, where N = initial columns. 
#return rolling pairwise covariance of the columns in a pandas dataframe with a given window. 
def df_rolling_cov(df,window):
    """Rolling covariance for every pair of distinct columns of df.

    The previous version passed a single column to pandas, which computed the
    covariance of each column only with itself rather than with the other
    columns.

    Args:
        df: pandas DataFrame of numeric columns.
        window: number of trailing rows in each window.

    Returns:
        DataFrame indexed like df with one column per pair (i, j), i < j,
        named 'cov_<column i>_<column j>'. NaN values are left in place.
        With fewer than two input columns the result has no columns.
    """
    Num_Cols = len(df.columns)
    # pandas >= 0.18 has the .rolling() window API; older releases only have
    # the module-level pd.rolling_cov function.
    has_window_api = hasattr(df, 'rolling')
    pair_columns = []
    pair_names = []
    for i in range(Num_Cols):
        left = df.iloc[:, i]
        for j in range(i + 1, Num_Cols):
            right = df.iloc[:, j]
            if has_window_api:
                rolled = left.rolling(window, min_periods=0).cov(right)
            else:
                rolled = pd.rolling_cov(left, right, window, min_periods=0)
            pair_columns.append(rolled)
            pair_names.append('cov_%s_%s' % (df.columns[i], df.columns[j]))
    if not pair_columns:
        # pd.concat refuses an empty list; return an empty frame instead.
        return pd.DataFrame(index=df.index)
    result = pd.concat(pair_columns, axis=1)
    result.columns = pair_names
    return result




#Feature Creation 

This section computes rolling statistics (mean, skewness, minimum and maximum) over several window lengths for each column of the data, then joins them, together with the raw columns, into a single 2-D feature array. 

In [None]:
Windows = [10,100,500,700]

Test_Features = []
Train_Features = []

Start_Time = time.time()
for w in Windows:
    Raw_Data_Rolling_Window = w
    
    
    RawData_Rolling_Means_Train = df_rolling_mean(pd.DataFrame(Train_Data),Raw_Data_Rolling_Window)
    RawData_Rolling_Skewness_Train = df_rolling_skew(pd.DataFrame(Train_Data),Raw_Data_Rolling_Window)
    RawData_Rolling_Min_Train = df_rolling_min(pd.DataFrame(Train_Data),Raw_Data_Rolling_Window)
    RawData_Rolling_Max_Train = df_rolling_max(pd.DataFrame(Train_Data),Raw_Data_Rolling_Window)


    RawData_Rolling_Means_Test = df_rolling_mean(pd.DataFrame(Test_Data),Raw_Data_Rolling_Window)
    RawData_Rolling_Skewness_Test = df_rolling_skew(pd.DataFrame(Test_Data),Raw_Data_Rolling_Window)
    RawData_Rolling_Min_Test = df_rolling_min(pd.DataFrame(Test_Data),Raw_Data_Rolling_Window)
    RawData_Rolling_Max_Test = df_rolling_max(pd.DataFrame(Test_Data),Raw_Data_Rolling_Window)

    print "Calculations With Window of", w, "Complete: ", int(time.time()-Start_Time), " Seconds"

    
    #Append all the features into 1
    Train_Features.append(np.concatenate((RawData_Rolling_Means_Train,RawData_Rolling_Skewness_Train,RawData_Rolling_Min_Train,RawData_Rolling_Max_Train),axis = 1))
    Test_Features.append(np.concatenate((RawData_Rolling_Means_Test,RawData_Rolling_Skewness_Test,RawData_Rolling_Min_Test,RawData_Rolling_Max_Test),axis = 1))
    

del RawData_Rolling_Means_Train
del RawData_Rolling_Means_Test

del RawData_Rolling_Skewness_Train
del RawData_Rolling_Skewness_Test

del RawData_Rolling_Min_Train
del RawData_Rolling_Min_Test

del RawData_Rolling_Max_Train
del RawData_Rolling_Max_Test
gc.collect() #Garbage collection (i.e. get rid of any outstanding unused memory)

Test_Features = np.float32(np.concatenate((Test_Features),axis = 1))
gc.collect() 

Train_Features =np.float32( np.concatenate((Train_Features),axis = 1))
gc.collect()

Test_Features = np.float32(np.concatenate((Test_Features,Test_Data),axis = 1))
gc.collect()

Train_Features =np.float32( np.concatenate((Train_Features,Train_Data),axis = 1))

del Train_Data
del Test_Data

gc.collect()

print Train_Features.shape[1], "Total Resultant Features"



## Add additional features here

####Nihar

####Jeff

####Carson

####Ji

#Principal Component Analysis

Reduce the dimensionality of our feature set before modeling begins. 

In [None]:
#set the fraction of the total variance we want explained by our subset of principal components. 
Pct_Variance_Explained = .93


Start_Time = time.time() # begin timer

# we must first scale the data. The scaler is fitted on the training features only.
# NOTE(review): the float16 cast is kept from the original, presumably to save
# memory; it limits the precision of the standardised values - confirm it is
# still wanted.
Scale_Center = StandardScaler()
Z_Train_Features = np.float16(Scale_Center.fit_transform(np.array(Train_Features)))
del Train_Features 
gc.collect()  #Garbage collection (i.e. get rid of any outstanding unused memory)


# Apply the mean/scale learned from the training features. Calling
# fit_transform here (as before) re-fitted the scaler on the test set, so train
# and test ended up standardised with different statistics.
Z_Test_Features = np.float16(Scale_Center.transform(np.array(Test_Features)))
del Test_Features 
gc.collect()


PCs_Train,PCs_Test = extractFeatures_PCA(Z_Train_Features,Z_Test_Features,Pct_Variance_Explained)
del Z_Train_Features,Z_Test_Features 
gc.collect()
# Single-argument print(...) produces the same text on Python 2 and 3.
print("PCA Complete: %d  Seconds" % int(time.time()-Start_Time))

print('')
print("%d Resultant Principal Components" % PCs_Train.shape[1])

# Test Different Models

##Logistic Regression

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Hyper-parameters for the logistic regression models.
# They are defined once here so they can be tuned in a single place; project 2
# showed how strongly the C value can affect the results.
C_Value = 1
penalty = "l2"
Convergence_tol = 1e-3

#Create a function that trains and runs a logistic regression model. Then prints and returns the AUC score
def predictCategory_LogReg(train_data, train_label, test_data, test_label, Category, C_Value,penalty, Convergence_tol):
    """Fit a logistic regression for one event and report its test performance.

    Args:
        train_data: training feature matrix.
        train_label: training labels for the event.
        test_data: testing feature matrix.
        test_label: testing labels for the event.
        Category: event name, used only in the printed messages.
        C_Value: value passed to LogisticRegression as C.
        penalty: penalty passed to LogisticRegression.
        Convergence_tol: value passed to LogisticRegression as tol.

    Returns:
        The ROC AUC of the predicted probabilities on the test set.
    """
    Logistic_Reg = LogisticRegression(C = C_Value, penalty = penalty,tol=Convergence_tol)
    Logistic_Reg.fit(train_data, train_label)
    # Formatted single-argument print: print(a, b, c) under Python 2 printed
    # the repr of a tuple instead of a readable line.
    print("%s Score = %s" % (Category, Logistic_Reg.score(test_data, test_label)))

    # Column 1 is the probability of the second entry of Logistic_Reg.classes_
    # (the positive label when the labels are 0/1).
    prob = Logistic_Reg.predict_proba(test_data)
    # Compute the AUC once and reuse it for both the message and the result.
    auc = roc_auc_score(test_label, prob[:,1])
    print("%s AUC %s" % (Category, auc))

    #FPR, TPR, thresholds = roc_curve(Test_Labels_HandStart, prob[:,1])
    return auc


Start_Time = time.time()   
AUC_HandStart = predictCategory_LogReg(PCs_Train,Train_Labels_HandStart.to_dense(),PCs_Test,Test_Labels_HandStart.to_dense(), 'HandStart', C_Value,penalty, Convergence_tol)
AUC_FirstDigitTouch = predictCategory_LogReg(PCs_Train,Train_Labels_FirstDigitTouch.to_dense(),PCs_Test,Test_Labels_FirstDigitTouch.to_dense(), 'FirstDigitTouch', C_Value,penalty, Convergence_tol)
AUC_BothStartLoadPhase = predictCategory_LogReg(PCs_Train,Train_Labels_BothStartLoadPhase.to_dense(),PCs_Test,Test_Labels_BothStartLoadPhase.to_dense(), 'BothStartLoadPhase', C_Value,penalty, Convergence_tol)
AUC_LiftOff = predictCategory_LogReg(PCs_Train,Train_Labels_LiftOff.to_dense(),PCs_Test,Test_Labels_LiftOff.to_dense(), 'LiftOff', C_Value,penalty, Convergence_tol)
AUC_Replace = predictCategory_LogReg(PCs_Train,Train_Labels_Replace.to_dense(),PCs_Test,Test_Labels_Replace.to_dense(), 'Replace', C_Value,penalty, Convergence_tol)
AUC_BothReleased = predictCategory_LogReg(PCs_Train,Train_Labels_BothReleased.to_dense(),PCs_Test,Test_Labels_BothReleased.to_dense(), 'BothReleased', C_Value,penalty, Convergence_tol)
print int(time.time()-Start_Time), " Seconds to complete"
print "Overall Logistic Regression Score = ", np.mean((AUC_HandStart, AUC_FirstDigitTouch,AUC_BothStartLoadPhase,AUC_LiftOff,AUC_Replace,AUC_BothReleased))

## SVM Model

This is set up to get started. It has not been tested. MM 08/04/2015

In [None]:
# #set parameters for SVM
# #Setting them here will allow the optimization of the parameters. We saw in project 2 how much the C-value can affect the results. 
# C_Value = 1
# kernel = 'rbf'
# degree = 3

# #Create a function that trains and runs a model. Then prints and returns the AUC score
# def predictCategory_SVM(train_data, train_label, test_data, test_label, Category, C_Value,kernel, degree):
#     SVM = SVC(C = C_Value, kernel = kernel,degree=degree)    
#     SVC.fit(train_data, train_label)

#     prob = SVC.predict_proba(test_data)
#     print(Category, "AUC", roc_auc_score(test_label, prob[:,1]))

#     #FPR, TPR, thresholds = roc_curve(Test_Labels_HandStart, prob[:,1])
#     return roc_auc_score(test_label, prob[:,1])
    
# AUC_HandStart = predictCategory_SVM(Feature_Array_Train,Train_Labels_HandStart,Feature_Array_Test,Test_Labels_HandStart, 'HandStart', Category, C_Value,kernel, degree)
# AUC_FirstDigitTouch = predictCategory_SVM(Feature_Array_Train,Train_Labels_FirstDigitTouch,Feature_Array_Test,Test_Labels_FirstDigitTouch, 'FirstDigitTouch', Category, C_Value,kernel, degree)
# AUC_BothStartLoadPhase = predictCategory_SVM(Feature_Array_Train,Train_Labels_BothStartLoadPhase,Feature_Array_Test,Test_Labels_BothStartLoadPhase, 'BothStartLoadPhase', Category, C_Value,kernel, degree)
# AUC_LiftOff = predictCategory_SVM(Feature_Array_Train,Train_Labels_LiftOff,Feature_Array_Test,Test_Labels_LiftOff, 'LiftOff', Category, C_Value,kernel, degree)
# AUC_Replace = predictCategory_SVM(Feature_Array_Train,Train_Labels_Replace,Feature_Array_Test,Test_Labels_Replace, 'Replace', Category, C_Value,kernel, degree)
# AUC_BothReleased = predictCategory_SVM(Feature_Array_Train,Train_Labels_BothReleased,Feature_Array_Test,Test_Labels_BothReleased, 'BothReleased', Category, C_Value,kernel, degree


# print "Overall SVM Score = ", np.mean(AUC_HandStart, AUC_FirstDigitTouch,AUC_BothStartLoadPhase,AUC_LiftOff,AUC_Replace,AUC_BothReleased)

##Neural Network

This is set up to get started. It has not been tested. MM 08/04/2015

In [None]:
# #set parameters for Neural Network model
# #Setting them here will allow the optimization of the parameters. We saw in project 2 how much the C-value can affect the results. 
# n_components = 10
# learning_rate = .1
# batch_size = 20000
# n_iter =1


# #Create a function that trains and runs a model. Then prints and returns the AUC score
# def predictCategory_Neural_Network(train_data, train_label, test_data, test_label, Category, C_Value,kernel, degree):
#     SVM = SVC(C = C_Value, kernel = kernel,degree=degree)    
#     SVC.fit(train_data, train_label)

#     prob = SVC.predict_proba(test_data)
#     print(Category, "AUC", roc_auc_score(test_label, prob[:,1]))

#     #FPR, TPR, thresholds = roc_curve(Test_Labels_HandStart, prob[:,1])
#     return roc_auc_score(test_label, prob[:,1])
    
# AUC_HandStart = predictCategory_Neural_Network(Feature_Array_Train,Train_Labels_HandStart,Feature_Array_Test,Test_Labels_HandStart, 'HandStart', Category, C_Value,kernel, degree)
# AUC_FirstDigitTouch = predictCategory_Neural_Network(Feature_Array_Train,Train_Labels_FirstDigitTouch,Feature_Array_Test,Test_Labels_FirstDigitTouch, 'FirstDigitTouch', Category, C_Value,kernel, degree)
# AUC_BothStartLoadPhase = predictCategory_Neural_Network(Feature_Array_Train,Train_Labels_BothStartLoadPhase,Feature_Array_Test,Test_Labels_BothStartLoadPhase, 'BothStartLoadPhase', Category, C_Value,kernel, degree)
# AUC_LiftOff = predictCategory_Neural_Network(Feature_Array_Train,Train_Labels_LiftOff,Feature_Array_Test,Test_Labels_LiftOff, 'LiftOff', Category, C_Value,kernel, degree)
# AUC_Replace = predictCategory_Neural_Network(Feature_Array_Train,Train_Labels_Replace,Feature_Array_Test,Test_Labels_Replace, 'Replace', Category, C_Value,kernel, degree)
# AUC_BothReleased = predictCategory_Neural_Network(Feature_Array_Train,Train_Labels_BothReleased,Feature_Array_Test,Test_Labels_BothReleased, 'BothReleased', Category, C_Value,kernel, degree


# print "Overall Neural Network Score = ", np.mean(AUC_HandStart, AUC_FirstDigitTouch,AUC_BothStartLoadPhase,AUC_LiftOff,AUC_Replace,AUC_BothReleased)


## Decision Trees and Random Forest

In [None]:
print int(time.time()-Overall_Start_Time), " Seconds to run all code"