In [159]:
# import libraries
import pandas as pd
import numpy as np
import sklearn as sk
import random
import itertools
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, cross_val_score

In [167]:
# declare utility functions
TARGET_COL = 622
FEATURE = 0
TARGET = 1

def get_csv(path):
    data = pd.read_csv(filepath_or_buffer=path, header=None)
    # all train data
    X = data.iloc[:,4:-1]
    # all test data
    Y = data.iloc[:, -1:][TARGET_COL]
    return (X, Y, data)

def part_list(lst, n):
    """
        part_list: Partition lst balanced parts
        in: 
            lst - list that needs to be partitioned
            n - integer number of partitions
        out:
            partitioned list
    """
    parts, rest = divmod(len(lst), n)
    lstiter = iter(lst)
    for j in xrange(n):
        plen = len(lst)/n + (1 if rest > 0 else 0)
        rest -= 1
        yield list(itertools.islice(lstiter, plen))

def build_group_df(data, patients):
    """
        build_group_df: helper for build_cross_validation_sets
        in: 
            data - RAW data
            patients - list of patient ids
        out:
            df with concatenated pixel data relevant to each patient in patients
    """
    return pd.concat([data[data[0] == patient] for patient in patients], ignore_index=True)

def build_cross_validation_sets(data, k):
    """
        build_cross_validation_sets: helper for cross_validate
        in:
            data: RAW data
            k - desire number of groups
        out:
            list of tuples: (feature_df, target_series)
    """
    # manifest constants, get unique patients, and random shuffle
    unique_patients = data[0].unique().tolist()
    random.shuffle(unique_patients)

    #create k groups
    k_groups = list(part_list(unique_patients, k))
    
    # [df1, df2, df3, df4] with each dfi repersenting the ith group in k total groups
    k_df = [build_group_df(data, group) for group in k_groups]
    # (features, target) for each df
    k_df_split = [(data.iloc[:,3:-1], data.iloc[:, -1:][TARGET_COL]) for data in k_df]
    
    return k_df_split
    
def cross_validate(model, data, k = 5):
    """
        cross_validate: performs cross validation
        in:
            model - input model
            data - RAW data
            k - desired number of groups
        out:
            (mean of scores, list of scores)
    """
    # manifest constants
    score_list = []
    
    # get split data
    k_df_split = build_cross_validation_sets(data, k)
    
    for (i, (X, y)) in enumerate(k_df_split):
        # get all dfs not k
        non_kth_group = k_df_split[:]
        del non_kth_group[i]
        
        # build x and y train data
        X_train = pd.concat([data[FEATURE] for data in non_kth_group])
        y_train = pd.concat([data[TARGET] for data in non_kth_group])
        
        # build x and y test data
        X_test = X
        y_test = y
        
        # train model on non_kth_group
        model.fit(X_train, y_train)
        
        # test model on kth group
        score = model.score(X_test, y_test)
        
        # add score to score list
        score_list.append(score)
        
    return (np.mean(score_list), score_list)

#data.sort_values(0)
#build_cross_validation_sets(data, 5)
cross_validate(LRmodel, data, 5)

(0.75556997526008052,
 [0.74952865761689291,
  0.7935023134178234,
  0.78410898965791564,
  0.6877054569362262,
  0.76300445867154454])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,613,614,615,616,617,618,619,620,621,622
0,528,158,112,23,0.15038,0.14633,0.14252,0.15382,0.14866,0.13823,...,0.14905,0.14827,0.14062,0.13338,0.13431,0.13813,0.13907,0.13445,0.12464,1
1,367,89,173,20,0.2125,0.19726,0.19421,0.2152,0.20417,0.20218,...,0.12359,0.12034,0.11629,0.11444,0.11111,0.11049,0.10893,0.10631,0.10478,1
2,283,190,55,17,0.027484,0.03619,0.046603,0.019973,0.032946,0.047286,...,0.11417,0.11166,0.11141,0.11181,0.11096,0.1099,0.10913,0.10831,0.10667,0
3,424,166,90,29,0.17421,0.17088,0.17446,0.19096,0.18238,0.17638,...,0.10908,0.10319,0.096503,0.091461,0.088297,0.085691,0.084188,0.082517,0.081374,1
4,18,221,249,25,0.055317,0.058417,0.059609,0.042918,0.049833,0.056509,...,0.020098,0.020716,0.021334,0.021952,0.02257,0.023188,0.023806,0.024424,0.025042,1


In [107]:
# use path to get data
path = "/home/carter/Documents/Brain-Lesion-Predictive-Model/Data/MRI-DATA/train_data.csv"
(X, Y, data) = get_csv(path)

In [74]:
X_train, X_test, y_train, y_test = train_test_split(X, Y)

In [75]:
# Model creation
LRmodel = LogisticRegression()
LRmodel.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [76]:
LRmodel.score(X_test, y_test)

0.82704

In [89]:
score = cross_val_score(LRmodel, X, Y, cv=5)

In [78]:
score.mean()

0.82015982864637249

In [86]:
# Model creation
MLPmodel = MLPClassifier()
MLPmodel.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [87]:
MLPmodel.score(X_test, y_test)

0.90688000000000002

In [88]:
MLP_score = cross_val_score(MLPmodel, X, Y, cv=5)
MLP_score.mean()

0.87937718495537198