I've included functions here to merge the test csv files into:

- test_data.csv

I've also included a script that separates the id field into subject, series, and frame number.

Using pandas, these operations take minutes each, speed isn't an issue. However, memory is an issue. The test dataset is fine to use with all functions, but the training dataset is too large to merge.

Import lines

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import datetime
import random

# scipy
from scipy.signal import butter, lfilter

# SK-learn libraries.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV


Define a function for merging csv files in a folder.

In [8]:
# The "search_sample" parameter accepts a template
# string to look for in csvpath. For example, 
# *.csv would merge all .csv files found in
# csvpath.

def csv_merger(csvpath, search_sample, newcsv):
    # Find all of the relevant files.
    filenames = glob.glob(csvpath + search_sample)
    
    # Prepare an empty dataframe.
    print datetime.datetime.now(), "Merging..."
    new_dataframe = pd.DataFrame()

    # Read each file into a pandas dataframe, and
    # append it to the initial dataframe.
    for file_ in filenames:
        print datetime.datetime.now(), file_
        new_dataframe = new_dataframe.append(pd.read_csv(file_,index_col=None, header=0))

    # Save the results to a csv.
    new_dataframe.to_csv(newcsv, index=False)

    print datetime.datetime.now(), "Merge Complete"

Merge the test data into a single csv. Requires the following folder structure:
  data\ test\ subj#_series$_data.csv

In [7]:
path = r'data\test'
searchstring = "/*data.csv"
newcsvname = 'test_data.csv'

csv_merger(path, searchstring, newcsvname)

2015-07-25 09:48:15.784000 Merging...
2015-07-25 09:48:15.790000 data\test\subj10_series10_data.csv
2015-07-25 09:48:16.194000 data\test\subj10_series9_data.csv
2015-07-25 09:48:16.609000 data\test\subj11_series10_data.csv
2015-07-25 09:48:17.075000 data\test\subj11_series9_data.csv
2015-07-25 09:48:17.547000 data\test\subj12_series10_data.csv
2015-07-25 09:48:18.041000 data\test\subj12_series9_data.csv
2015-07-25 09:48:18.566000 data\test\subj1_series10_data.csv
2015-07-25 09:48:19.004000 data\test\subj1_series9_data.csv
2015-07-25 09:48:19.487000 data\test\subj2_series10_data.csv
2015-07-25 09:48:20.060000 data\test\subj2_series9_data.csv
2015-07-25 09:48:20.625000 data\test\subj3_series10_data.csv
2015-07-25 09:48:21.139000 data\test\subj3_series9_data.csv
2015-07-25 09:48:21.682000 data\test\subj4_series10_data.csv
2015-07-25 09:48:22.231000 data\test\subj4_series9_data.csv
2015-07-25 09:48:22.771000 data\test\subj5_series10_data.csv
2015-07-25 09:48:23.352000 data\test\subj5_serie

In [1]:
# Merge the training data only if you have at least 16 GB of memory.
# I'm not sure if that's enough still.

# path = r'data\train'
# searchstring = "/*data.csv"
# newcsvname = 'train_data.csv'
# csv_merger(path, searchstring, newcsvname)

# print "\n-----------------\n"

# Next, merge the training labels only if you have at least 16 GB of memory.
# I'm not sure if that's enough still.

# path = r'data\train'
# searchstring = "/*events.csv"
# newcsvname = 'train_labels.csv'
# csv_merger(path, searchstring, newcsvname)

Use the split_id function to split the id field into separate fields for subject, session, and frame number.

In [2]:
# This function will read a csv or a dataframe,
# and either return a dataframe or save a new csv
# depending on which parameters you use. For example,
# if you enter a csvpath but leave all other parameters
# blank, it will read that csv and return a pandas
# dataframe.

def split_id(original_frame=pd.DataFrame(), csvpath='none', newcsv='none'):
    
    # If a csvpath is not provided, read the provided dataframe directly.
    # Otherwise, read the csv into a pandas dataframe.
    if csvpath == 'none':
        originalframe = original_frame
    else:
        originalframe = pd.read_csv(csvpath, header=0)
    print "\noriginal columns: ", originalframe.columns.values

    # Create a new dataframe, separating out the subject, session, and frame number 
    # from the id column.
    newframe = pd.DataFrame(originalframe.id.str.split('_').tolist())
    print "\nnew column numbers: ", newframe.columns.values

    # Rename the columns.
    newframe = newframe.rename(columns={0: 'subject', 1: 'series', 2: 'frame'})
    print "\nnew columns named: ", newframe.columns.values

    # Merge the new frame with the original one.
    newframe = pd.concat([newframe, originalframe], axis=1)
    print "\nnew frame columns: ", newframe.columns.values
    
    # If a new csv file name wasn't specified, return the dataframe.
    # Otherwise, save the results as a csv.
    if newcsv=='none':
        return newframe
    else:
        newframe.to_csv(newcsv, index=False)
        print "\nComplete!"

# csvpath = 'data/train/subj1_series1_data.csv'
# newcsv = 'new_results.csv'
# split_id(csvpath=csvpath, newcsv=newcsv)
# print split_id(csvpath=csvpath).head()


Use subject 1 series 1 and subject 1 series 2 for training. Use subject 1 series 3 for development. Develop your own features. Use logistic regression as your model. We will then compare results and see which features work well.

I'm experimenting with this part. -Nihar

In [3]:
# Bandpass filter------------------------------------------
def butter_bandpass(lowcut, highcut, fs, order=5):
    nyq = 0.5 * fs
    low = lowcut / nyq
    high = highcut / nyq
    b, a = butter(order, [low, high], btype='band')
    return b, a

def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
    b, a = butter_bandpass(lowcut, highcut, fs, order=order)
    y = lfilter(b, a, data)
    return y

# Pre-process the training and development data.
# Then save to CSV. ----------------------------------------

# Load subject 1 series 1 and subject 1 series 2 for training.
train_data_raw = pd.concat([pd.read_csv("data/train/subj1_series1_data.csv", header=0),
                       pd.read_csv("data/train/subj1_series2_data.csv", header=0)])

# Load subject 1 series 3 for development.
dev_data_raw = pd.read_csv("data/train/subj1_series3_data.csv", header=0)

# Create a training dataframe.
train_data=pd.DataFrame(train_data_raw['id'])

# Process each column and add it to the training dataframe.
for column in train_data_raw.columns.values[1:-1]:
    print "Processing", column
    train_data[column] = pd.Series(butter_bandpass_filter(train_data_raw[column], 8, 12, 500), index=train_data.index)


# Create a development dataframe.
dev_data=pd.DataFrame(dev_data_raw['id'])

# Process each column and add it to the dev dataframe.
for column in dev_data_raw.columns.values[1:-1]:
    print "Processing", column
    dev_data[column] = pd.Series(butter_bandpass_filter(dev_data_raw[column], 8, 12, 500), index=dev_data.index)


#Check and then save the results.    
print train_data.head()
train_data.to_csv('train_data.csv',index=False)
dev_data.to_csv('dev_data.csv',index=False)

print "complete!"

Processing Fp1
Processing Fp2
Processing F7
Processing F3
Processing Fz
Processing F4
Processing F8
Processing FC5
Processing FC1
Processing FC2
Processing FC6
Processing T7
Processing C3
Processing Cz
Processing C4
Processing T8
Processing TP9
Processing CP5
Processing CP1
Processing CP2
Processing CP6
Processing TP10
Processing P7
Processing P3
Processing Pz
Processing P4
Processing P8
Processing PO9
Processing O1
Processing Oz
Processing O2
Processing Fp1
Processing Fp2
Processing F7
Processing F3
Processing Fz
Processing F4
Processing F8
Processing FC5
Processing FC1
Processing FC2
Processing FC6
Processing T7
Processing C3
Processing Cz
Processing C4
Processing T8
Processing TP9
Processing CP5
Processing CP1
Processing CP2
Processing CP6
Processing TP10
Processing P7
Processing P3
Processing Pz
Processing P4
Processing P8
Processing PO9
Processing O1
Processing Oz
Processing O2
                id       Fp1       Fp2        F7        F3        Fz  \
0  subj1_series1_0 -0.000000  0.

In [14]:
# Load subject 1 series 1 and subject 1 series 2 for training.
train_data = pd.read_csv("train_data.csv", index_col=0, header=0)

train_labels = pd.concat([pd.read_csv("data/train/subj1_series1_events.csv", header=0),
                         pd.read_csv("data/train/subj1_series2_events.csv", header=0)])
print train_labels.columns.values

# Note that I use index_col=0 to force the id line out of the dataframe (it becomes the index).
# Otherwise, LogisticRegression() would try to use it as a variable and fail. The better
# solution would be to subset it without the id field, but I had trouble with that and was
# short on time. - Nihar


# Train a seperate model for each category.
categories = train_labels.columns.values[1:-1]
models = []
for category in categories:
    print "Training model for", category
    models.append(LogisticRegression(penalty='l2'))
    models[-1].fit(train_data, train_labels[category])
    print " Coefficients:", models[-1].coef_

# Empty memory
train_data = pd.DataFrame()
train_labels = pd.DataFrame()
    
# Load the development set to check accuracy.
dev_data = pd.read_csv("dev_data.csv", index_col=0, header=0)
dev_labels = pd.read_csv("data/train/subj1_series3_events.csv", header=0)
print dev_labels.columns.values

print dev_data.shape
print dev_labels.shape

# Score the model with the development data.
print "\nPredictions--------\n"
categories = dev_labels.columns.values[1:-1]
for model, category in zip(models, categories):
    print "\n", category
    print " Coefficients:", model.coef_
    print " Accuracy=", model.score(dev_data, dev_labels[category])

print "Complete!"

['id' 'HandStart' 'FirstDigitTouch' 'BothStartLoadPhase' 'LiftOff'
 'Replace' 'BothReleased']
Training model for HandStart
 Coefficients: [[ 0.00039294 -0.00027013 -0.00052135  0.00143955 -0.00265022  0.001553
   0.00047171 -0.00144589 -0.00052323 -0.00568038  0.0012132   0.00238694
   0.00120376  0.00652494  0.0028678   0.00107466 -0.00196816  0.00045018
   0.00021917 -0.00539265 -0.00249501 -0.00210144 -0.00035245  0.00069852
  -0.00045426  0.00107202  0.00240749  0.00155399 -0.00229597  0.00235885
  -0.0018295 ]]
Training model for FirstDigitTouch
 Coefficients: [[ -6.80788281e-04   2.89367813e-04  -7.91349291e-04  -1.29861660e-03
    2.02978102e-03   1.90340423e-03   3.84414983e-04  -1.63285819e-03
    5.85165889e-03   6.23963246e-04  -1.49376257e-03   2.12158594e-03
    1.07951551e-05   1.46881238e-03  -6.34171264e-04  -5.57913111e-04
    2.56858502e-04   1.26008967e-03  -2.35707710e-03   4.54412238e-03
   -1.02045342e-03  -3.03089510e-04   1.69975635e-04  -2.99317310e-03
   -8.36

In [2]:
# Load a few random training data files into a pandas dataframe.
path =r'data\train'
train_data_filenames, train_labels_filenames = glob.glob(path + "/*data.csv"), glob.glob(path + "/*events.csv")
list1_ = []
list2_ = []
start = time.time()
print "Loading training files..."


for i in random.sample(range(len(train_data_filenames)), 2):
    df1 = pd.read_csv(train_data_filenames[i], index_col=0, header=0)
    df2 = pd.read_csv(train_labels_filenames[i], index_col=0, header=0)
    list1_.append(df1)
    list2_.append(df2)


# pd.concat(list1_).to_csv('train_data.csv', index=False)
# pd.concat(list2_).to_csv('train_labels.csv', index=False)
# train_data = np.loadtxt(open('train_data.csv','rb'), delimiter=",",skiprows=1)
# train_labels = np.loadtxt(open('train_labels.csv','rb'), delimiter = ",", skiprows=1)

train_data = pd.concat(list1_)[1:][range(32)].values.astype(float)
train_labels = pd.concat(list2_)[1:][range(6)].values.astype(float)

#train_data = pd.read_csv()
#(list1_)[1:][range(32)].values.astype(float)
#train_labels = pd.concat(list2_)[1:][range(6)].values.astype(float)

print "Training model..."
# Trying KNN model to start
kborhood = KNeighborsClassifier(n_neighbors=5)
kborhood.fit(train_data, train_labels)
#gmnb = MultinomialNB().fit(train_data, train_labels)

print "Loading test data..."
# Load the test data
# Will need to change index_col to 0 instead of 1 after I re-create the test data csv. 
test_data = pd.read_csv('test_data.csv', index_col=1, header=0)[1:][range(32)].values.astype(float)

print "Making predictions..."
# Make predictions
predictions = pd.DataFrame(kborhood.predict(test_data[:]))

# Save results
predictions.to_csv('results.csv', index=False)
print "Complete! Script took", (time.time() - start), "seconds"

Loading training files...
Training model...
Loading test data...
Making predictions...
Complete! Script took 1292.69400001 seconds
