In [44]:
import theano
import tensorflow
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.utils.np_utils import to_categorical
from keras import backend

import pandas as pd
import numpy as np
import random
import itertools
import h5py

In [89]:
# Settings for KerasWrapper Class
keras_settings = {}
keras_settings['epochs'] = 20
keras_settings['batch_size'] = 100
keras_settings['verbose'] = 0
keras_settings['validation_split'] = 0.0
keras_settings['validation_data'] = None
keras_settings['shuffle'] = True

keras_settings['optimizer'] = 'adadelta'
keras_settings['loss'] = 'binary_crossentropy'
keras_settings['metrics'] = ['accuracy']

# Wrapper class for Sequential Neural Network
class KerasSequential(Sequential):
    # settings contains several variables for the fit and score functions
    def __init__(self, settings = None, layers=None, name=None):
        Sequential.__init__(self, layers, name)
        self.settings = settings
    
    # Fits sequential model to test batch
    def fit(self, X, y):
        Sequential.compile(self,
                           optimizer = self.settings['optimizer'],
                           loss = self.settings['loss'],
                           metrics = self.settings['metrics'])
        return Sequential.fit(self, np.array(X), y,
                 epochs = self.settings['epochs'],
                 batch_size = self.settings['batch_size'],
                 verbose = self.settings['verbose'],
                 validation_split = self.settings['validation_split'],
                 validation_data = self.settings['validation_data'],
                 shuffle = self.settings['shuffle'])
    
    # Returns the accuracy of the test batch
    def score(self, X_test, y_test):
        score =  Sequential.evaluate(self, np.array(X_test), y_test,
                               batch_size = self.settings['batch_size'],
                               verbose = self.settings['verbose'])
        return score[1] if self.settings['metrics'] != None else score
    
    # Reset
    def reset(self):
        return Sequential.reset_states(self)

In [46]:
# declare utility functions
TARGET_COL = 622
FEATURE = 0
TARGET = 1

def get_csv(path):
    data = pd.read_csv(filepath_or_buffer=path, header=None)
    # all train data
    X = data.iloc[:,4:-1]
    # all test data
    Y = data.iloc[:, -1:][TARGET_COL]
    return (X, Y, data)

def part_list(lst, n):
    """
        part_list: Partition lst balanced parts
        in: 
            lst - list that needs to be partitioned
            n - integer number of partitions
        out:
            partitioned list
    """
    parts, rest = divmod(len(lst), n)
    lstiter = iter(lst)
    for j in xrange(n):
        plen = len(lst)/n + (1 if rest > 0 else 0)
        rest -= 1
        yield list(itertools.islice(lstiter, plen))

def build_group_df(data, patients):
    """
        build_group_df: helper for build_cross_validation_sets
        in: 
            data - RAW data
            patients - list of patient ids
        out:
            df with concatenated pixel data relevant to each patient in patients
    """
    return pd.concat([data[data[0] == patient] for patient in patients], ignore_index=True)

def build_cross_validation_sets(data, k):
    """
        build_cross_validation_sets: helper for cross_validate
        in:
            data: RAW data
            k - desire number of groups
        out:
            list of tuples: (feature_df, target_series)
    """
    # manifest constants, get unique patients, and random shuffle
    unique_patients = data[0].unique().tolist()
    random.shuffle(unique_patients)

    #create k groups
    k_groups = list(part_list(unique_patients, k))
    
    # [df1, df2, df3, df4] with each dfi repersenting the ith group in k total groups
    k_df = [build_group_df(data, group) for group in k_groups]
    # (features, target) for each df
    k_df_split = [(data.iloc[:,4:-1], data.iloc[:, -1:][TARGET_COL]) for data in k_df]
    
    return k_df_split
    
def cross_validate(model, data, k = 5):
    """
        cross_validate: performs cross validation
        in:
            model - input model
            data - RAW data
            k - desired number of groups
        out:
            (mean of scores, list of scores)
    """
    # manifest constants
    score_list = []
    
    # get split data
    k_df_split = build_cross_validation_sets(data, k)
    
    for (i, (X, y)) in enumerate(k_df_split):
        # get all dfs not k
        non_kth_group = k_df_split[:]
        del non_kth_group[i]
        
        # build x and y train data
        X_train = pd.concat([data[FEATURE] for data in non_kth_group])
        y_train = pd.concat([data[TARGET] for data in non_kth_group])
        
        # build x and y test data
        X_test = X
        y_test = y
        
        # train model on non_kth_group
        model.fit(X_train, y_train)
        
        # test model on kth group
        score = model.score(X_test, y_test)
        
        # add score to score list
        score_list.append(score)
        
    return (np.mean(score_list), score_list)

#data.sort_values(0)
#build_cross_validation_sets(data, 5)
#cross_validate(LRmodel, data, 5)

In [51]:
# Load Data
X, y, data = get_csv('train_data.csv')

# Split data into random 80/20 train / test split
indices = np.arange(len(X))
np.random.shuffle(indices)
cut = int(len(X)*0.8)
X_train = X.iloc[indices[:cut],:]
y_train = y[indices[:cut]]
X_test = X.iloc[indices[cut:],:]
y_test = y[indices[cut:]]

In [90]:
# Different models to test
'''
model1 - 2x Dense
model2 - 2x Dense, Dense Dropout
model3 - 2x Dense, Dense Dropout, Input Dropout
model4 - 4x Dense
model5 - 4x Dense, Dense Dropout
model5 - 4x Dense, Dense Dropout, Input Dropout
'''
models = {}
models['model1'] = KerasSequential(keras_settings,
                       [Dense(200, activation='relu', input_dim=618),
                        Dense(200, activation='relu'),
                        Dense(1, activation='sigmoid')
                       ])
models['model2'] = KerasSequential(keras_settings,
                       [Dense(200, activation='relu', input_dim=618),
                        Dropout(0.5),
                        Dense(200, activation='relu'),
                        Dropout(0.5),
                        Dense(1, activation='sigmoid')
                       ])
models['model3'] = model = KerasSequential(keras_settings,
                       [Dropout(0.2, input_shape=(618,)),
                        Dense(200, activation='relu'),
                        Dropout(0.5),
                        Dense(200, activation='relu'),
                        Dropout(0.5),
                        Dense(1, activation='sigmoid')
                       ])
models['model4'] = KerasSequential(keras_settings,
                       [Dense(200, activation='relu', input_dim=618),
                        Dense(200, activation='relu'),
                        Dense(200, activation='relu'),
                        Dense(200, activation='relu'),
                        Dense(1, activation='sigmoid')
                       ])
models['model5'] = KerasSequential(keras_settings,
                       [Dense(200, activation='relu', input_dim=618),
                        Dropout(0.5),
                        Dense(200, activation='relu'),
                        Dropout(0.5),
                        Dense(200, activation='relu'),
                        Dropout(0.5),
                        Dense(200, activation='relu'),
                        Dropout(0.5),
                        Dense(1, activation='sigmoid')
                       ])
models['model6'] = KerasSequential(keras_settings,
                       [Dropout(0.2, input_shape=(618,)),
                        Dense(200, activation='relu'),
                        Dropout(0.5),
                        Dense(200, activation='relu'),
                        Dropout(0.5),
                        Dense(200, activation='relu'),
                        Dropout(0.5),
                        Dense(200, activation='relu'),
                        Dropout(0.5),
                        Dense(1, activation='sigmoid')
                       ])

In [None]:
# Test all variations of neural network model
for model_name in models:
    print(model_name)
    m = models[model_name]
    
    # Cross validation accuracy
    print("\tCross validation accuracy:\t{:0.4f}".format(cross_validate(m, data, 5)[0]))
    
    m.reset()
    
    # 80/20 split accuracy
    m.fit(X_train, y_train)
    score = m.score(X_test, y_test)
    print("\tRandom split accuracy:\t\t{:0.4f}".format(score))

model3
	Cross validation accuracy:	0.8239
	Random split accuracy:		0.8585
model2
	Cross validation accuracy:	0.7924
	Random split accuracy:		0.9010
model1
	Cross validation accuracy:	0.8299
	Random split accuracy:		0.9530
model6


In [None]:
# Best variation of the network is the 4th model (4x dense layer, no dropout)
# Save neural network layer weights
m = models['model4']
m.reset()
cross_validate(m, data, 5)[0]
m.save('keras_classifier.h5')