In [1]:
import numpy as np
import pandas as pd
import scipy as sp
from sklearn import linear_model
from sklearn.decomposition import PCA
from sklearn.cross_validation import KFold
from sklearn import svm
import matplotlib
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.neighbors import KNeighborsClassifier as KNN


%matplotlib inline

In [2]:
# Read the three comma-delimited text files in one pass: training predictors,
# training labels, and the predictors of the test set.
x, y, x_final_test = (
    np.loadtxt(path, delimiter=',')
    for path in (
        'datasets/train_predictors.txt',
        'datasets/train_labels.txt',
        'datasets/test_predictors.txt',
    )
)

In [25]:
# Build a class-balanced training set: keep every case (label 1) and an
# equally sized random subset of the controls (label 0).

# Append the labels as the last column so rows can be filtered by class
y = y.reshape(len(y), 1)
data = np.concatenate((x, y), axis=1)
# The label is always the last column; index it as -1 rather than by a
# hard-coded position so the cell does not depend on the number of predictors.
cases = data[data[:, -1] == 1]
controls = data[data[:, -1] == 0]

# Shuffle the controls and keep as many of them as there are cases.
# np.random.permutation yields every index exactly once; the previous
# np.random.randint call drew indices WITH replacement, so the subsample could
# contain the same control several times while leaving others out.
n_control = controls.shape[0]
perm = np.random.permutation(n_control)
controls_perm = controls[perm]
controls_subsample = controls_perm[:cases.shape[0]]

# Stack the sampled controls on top of the cases
# (balanced as long as there are at least as many controls as cases)
train = np.concatenate((controls_subsample, cases), axis=0)
# Separate predictors and labels
x_train = train[:, :-1]
y_train = train[:, -1]

In [44]:
# Load the array of base classifiers used by the stacking cells.
# NOTE(review): the displayed repr shows that the array holds estimator
# objects, so np.load has to unpickle them -- only load this file from a
# trusted source, because unpickling can execute arbitrary code.
models = np.load('datasets/models.npy')
# Display the loaded models
models

array([ LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001),
       SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       QuadraticDiscriminantAnalysis(priors=None, reg_param=0.0,
               store_covariances=False, tol=

In [13]:
# Stacking: every base model contributes one column of predicted labels, and a
# linear SVM is trained on those columns as the meta-classifier.
# NOTE(review): x_test and y_test are not defined in any cell shown in this
# notebook; they must already exist in the kernel for this cell to run.
new_x_train = np.zeros([len(x_train), len(models)])
new_x_test = np.zeros([len(x_test), len(models)])
# Allocated here but not filled in this cell
new_x_final_test = np.zeros([len(x_final_test), len(models)])

# Iterate over however many models were loaded (this used to be a hard-coded
# range(10)), so no column is silently left at zero and no index can run past
# the end of `models`.
for i in range(len(models)):
    models[i].fit(x_train, y_train)
    new_x_train[:, i] = models[i].predict(x_train)
    new_x_test[:, i] = models[i].predict(x_test)
    print(i)  # progress indicator; the call form is valid on Python 2 and 3

# Fit SVM model with C = 1000 on the base-model predictions

model_svm = svm.SVC(C=1000, kernel='linear')
model_svm.fit(new_x_train, y_train)
y_pred_svm = model_svm.predict(new_x_test)

# Fraction of x_test rows whose stacked prediction matches y_test
np.mean(y_pred_svm == y_test)

0.99521390177453795

In [29]:
# Zero-filled (rows x number-of-models) holders for the base-model predictions
# on the training rows and on the final test rows.
new_x = np.zeros(shape=(len(x_train), len(models)))
new_x_final_test = np.zeros(shape=(len(x_final_test), len(models)))

In [39]:
# Fit each base model on x_train / y_train and store its predicted labels as
# one column of the stacked feature matrices.
# Previously this cell handled a single hand-edited index (i=1) and had to be
# re-run once per model (the old comment listed 0,2,4,5,6,7,8,9 as the other
# indices), so a fresh "Run All" left every other column at zero.
# Index 3 is skipped on purpose: the following cell copies column 1 into
# column 3 instead of fitting that model again.
copied_columns = (3,)
for i in range(len(models)):
    if i in copied_columns:
        continue
    models[i].fit(x_train, y_train)
    new_x[:, i] = models[i].predict(x_train)
    new_x_final_test[:, i] = models[i].predict(x_final_test)

In [16]:
# Reuse the predictions stored in column 1 for column 3 in both stacked
# matrices (train rows first, then final-test rows).
for stacked in (new_x, new_x_final_test):
    stacked[:, 3] = stacked[:, 1]

In [40]:
# Meta-classifier: SVC with C = 1000 and the kernel left at the library
# default, trained on the stacked base-model predictions and then applied to
# the stacked predictions for the final test set.
model_svm = svm.SVC(C=1000).fit(new_x, y_train)
y_pred_svm = model_svm.predict(new_x_final_test)


hi
hi


In [41]:
# Sanity check: number of predictions produced for the final test set
y_pred_svm.shape[0]

33149

In [42]:
# Write the stacked-SVM predictions as a single integer 'label' column whose
# row index starts at 1.
feature_list = ['label']
SVM1 = pd.DataFrame(
    y_pred_svm.astype(int),
    index=range(1, len(y_pred_svm) + 1),
    columns=feature_list,
)
SVM1.to_csv("SVM2.csv", sep=',')

In [None]:
# Repeated random train/validation split of the full data set.
# NOTE(review): the loop body only rebuilds the split, so once the loop ends
# just the split from the final iteration remains in
# x_train / y_train / x_valid / y_valid.

# Size of the data set
n = x.shape[0]

# No. of subsamples
num_samples = 20
for i in range(num_samples):

    # Shuffle the row indices WITHOUT replacement. The previous
    # np.random.randint call sampled with replacement, so the same row could
    # land in both the training slice and the validation slice, which leaks
    # training rows into validation.
    perm = np.random.permutation(n)
    x_subsample = x[perm]
    y_subsample = y[perm]

    # Split the dataset: the first `cutoff` shuffled rows are used for
    # training and the remaining rows for validation
    cutoff = 10000
    x_train = x_subsample[:cutoff, :]
    y_train = y_subsample[:cutoff]
    x_valid = x_subsample[cutoff:, :]
    y_valid = y_subsample[cutoff:]

In [None]:
#### STANDARDIZATION OF PREDICTORS ####
# Scale both sets with the statistics of the ORIGINAL training predictors.
# The mean has to be captured before x is overwritten: the previous version
# standardized x first and then subtracted x.mean() from x_test, i.e. the mean
# of the already-standardized data (about 0) instead of the training mean.
# NOTE(review): x_test is not defined in any cell shown in this notebook
# (only x_final_test is loaded) -- confirm which array is meant here.
x_mean = x.mean(axis=0)

# Check for 0 standard deviation
# If std = 0, change it to 1 to avoid NaN in standardization
x_std = x.std(axis=0)
x_std[x_std == 0] = 1

# Standardize parameters
x = (x - x_mean) / x_std
x_test = (x_test - x_mean) / x_std

In [None]:
############### MODELING AND ANALYSIS ###############
# Tune the L1 (lasso) penalty of a logistic regression with repeated random
# train/validation splits, then plot the mean accuracy per penalty setting.

# Size of the data set
n = x.shape[0]

# No. of subsamples
num_samples = 20

# Range of exponents; the regularization parameter is C = 10**exponent
lambda_min = -7
lambda_max = 7

# One grid point per integer exponent (derived from the range instead of a
# hard-coded 15, so the grid and the result arrays cannot drift apart)
num_lambdas = lambda_max - lambda_min + 1
num_predictors = x.shape[1]

lambdas = np.linspace(lambda_min, lambda_max, num_lambdas)

# accuracy_*[i, j] holds the accuracy of split i at exponent lambdas[j]
accuracy_train = np.zeros([num_samples, num_lambdas])
accuracy_valid = np.zeros([num_samples, num_lambdas])


for i in range(num_samples):

    # Shuffle the row indices WITHOUT replacement. The previous
    # np.random.randint call sampled with replacement, so the same row could
    # appear in both the training and the validation slice and inflate the
    # validation accuracy.
    perm = np.random.permutation(n)
    x_subsample = x[perm]
    # Flatten so the labels are 1-D even if y was reshaped to a column earlier
    y_subsample = y[perm].ravel()

    # Split the dataset: the first `cutoff` shuffled rows are used for
    # training and the remaining rows for validation
    cutoff = 10000
    x_train = x_subsample[:cutoff, :]
    y_train = y_subsample[:cutoff]
    x_valid = x_subsample[cutoff:, :]
    y_valid = y_subsample[cutoff:]

    # Fit logistic regression with varying C (lasso penalty) on the train set.
    # enumerate supplies the column index directly, replacing the hard-coded
    # `j+7` offset that was only right for lambda_min == -7.
    for j, exponent in enumerate(lambdas):
        # liblinear is named explicitly because the L1 penalty needs a solver
        # that supports it
        reg = LogReg(C=10.0**exponent, penalty='l1', solver='liblinear')
        reg.fit(x_train, y_train)
        accuracy_train[i, j] = reg.score(x_train, y_train)
        accuracy_valid[i, j] = reg.score(x_valid, y_valid)

# Mean accuracy over the random splits for each value of C
average_accuracy_train = accuracy_train.mean(axis=0)
average_accuracy_valid = accuracy_valid.mean(axis=0)

# Plot train and validation accuracy against the natural log of C
plt.figure(figsize=(10, 5))
h = np.log(10**lambdas)
plt.plot(h, average_accuracy_train, 'bo-', label='train')
plt.plot(h, average_accuracy_valid, 'ro-', label='validation')
plt.title('Accuracy of Logistic regression with Lasso penalty')
plt.xlabel('log(C)')
plt.ylabel('Accuracy')
plt.legend(loc='best')
plt.show()

In [148]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with library-default settings, trained on the full training
# data; labels are flattened to 1-D before fitting.
y = y.ravel()
clfForest = RandomForestClassifier()

# fit() returns the estimator itself, so clf and clfForest are the same object
clf = clfForest.fit(x, y)

# Accuracy on the data the forest was trained on, then predictions for x_test
training_accuracy = clf.score(x, y)
y_pred = clf.predict(x_test)

In [158]:
# Write the random-forest predictions as a single integer 'label' column whose
# row index starts at 1.
feature_list = ['label']
RandomForest1 = pd.DataFrame(
    y_pred.astype(int),
    index=range(1, len(x_test) + 1),
    columns=feature_list,
)
RandomForest1.to_csv("RandomForest1.csv", sep=',')