## Import Libraries

In [2]:
import numpy as np
from scipy import stats
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier

Using TensorFlow backend.


## Setup File Reading From Google Drive

In [0]:
!pip install -U -q PyDrive
# !gcloud config set project
import os
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# 1. Authenticate and create the PyDrive client.
# The order matters: the Colab user is authenticated first, then the resulting
# application-default credentials are attached to the GoogleAuth object before
# the GoogleDrive client is built from it.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the client used below to list and download files.
drive = GoogleDrive(gauth)

# Choose a local (Colab) directory to store the data.
local_download_path = os.path.expanduser('./data')
# exist_ok=True makes re-running this cell a no-op when the directory is
# already there, while still surfacing real failures (e.g. permission errors)
# that a bare `except: pass` would have hidden.
os.makedirs(local_download_path, exist_ok=True)

# 2. Auto-iterate using the query syntax
#    https://developers.google.com/drive/v2/web/search-parameters
# The query lists every file whose parent is the given Drive folder id.
file_list = drive.ListFile(
    {'q': "'1FjWwOPe1_f3tSHEAg4FlDlX0-D2YjJc4' in parents"}).GetList()

# 3. Create & download by id.
# Only the file with this exact title is fetched; everything else in the
# folder is ignored.
target_title = 'train_data_final_50k.csv'
fname = ''
for f in file_list:
  if f['title'] == target_title:
    # Local destination: <local_download_path>/<title>
    fname = os.path.join(local_download_path, f['title'])
    f_ = drive.CreateFile({'id': f['id']})
    f_.GetContentFile(fname)

# Fail here with a clear message instead of letting an empty `fname` reach
# the data-loading cell, where it would surface as an opaque loadtxt error.
if not fname:
  raise FileNotFoundError(
      "'%s' was not found in the queried Google Drive folder" % target_title)

## Get the Data

In [4]:
DATA_FILE = fname
DATA_SIZE = 50000
num_data_points = 50000
# Fixed seed so the train/test split (and therefore the reported scores) can
# be reproduced on a fresh kernel.
SEED = 42

# Number of leading rows to skip so that only the last `num_data_points`
# rows of the DATA_SIZE-row file are read (0 when the two are equal).
skip_rows = DATA_SIZE - num_data_points

# Parse the CSV once instead of twice: columns 4..621 are the features and
# column 622 is the label, so read 4..622 in one pass and slice.
# ndmin=2 keeps the result two-dimensional even if a single row is read.
raw = np.loadtxt(DATA_FILE, delimiter=',', skiprows=skip_rows,
                 usecols=range(4, 623), ndmin=2)
data = raw[:, :-1]
labels = raw[:, -1]

# 70/30 split, stratified on the label so both parts keep the class balance.
X_train, X_test, y_train, y_test = train_test_split(
    data, labels, train_size=0.7, stratify=labels, random_state=SEED)


# Standardise the features. The scaler's statistics come from the training
# split only; the same fitted transform is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)



## Reduce Dimensionality

In [0]:
from sklearn import random_projection
from sklearn.decomposition import TruncatedSVD

# Reduce the standardised features to 90 components.
# TruncatedSVD's default solver is randomized, so random_state is pinned to
# make the projection (and every downstream score) reproducible.
svd = TruncatedSVD(n_components=90, random_state=42)
# Fit the projection on the training split only, then reuse it for the test split.
X_train_svd = svd.fit_transform(X_train)
X_test_svd = svd.transform(X_test)

## Create the model

In [22]:
def create_model(input_dim=None, hidden_units=(100, 50), dropout_rate=0.02):
  """Build and compile a feed-forward binary classifier.

  The network is a stack of ReLU Dense layers, each followed by Dropout,
  ending in a single sigmoid unit. It is compiled with binary cross-entropy
  loss, the 'rmsprop' optimizer and the accuracy metric.

  Called with no arguments it builds the same 100 -> 50 -> 1 network with
  0.02 dropout as before.

  Args:
    input_dim: Number of input features. Defaults to the column count of the
      notebook-level `X_train_svd`, which must then already be defined.
    hidden_units: Widths of the hidden Dense layers, in order.
      NOTE(review): the original inline comments (85, 34) look like
      previously tried widths -- confirm before relying on them.
    dropout_rate: Dropout rate applied after every hidden layer.
      NOTE(review): the original inline comment (0.025) looks like a
      previously tried rate.

  Returns:
    The compiled Sequential model.
  """
  if input_dim is None:
    input_dim = X_train_svd.shape[1]

  model = Sequential()
  # Only the first layer added to the model is given the input size.
  first_layer_kwargs = {'input_dim': input_dim}
  for units in hidden_units:
    model.add(Dense(units=units, activation='relu', **first_layer_kwargs))
    model.add(Dropout(dropout_rate))
    first_layer_kwargs = {}
  # With no hidden layers the output layer is the first layer and takes input_dim.
  model.add(Dense(units=1, activation='sigmoid', **first_layer_kwargs))

  model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
  return model

CPU times: user 128 ms, sys: 3 ms, total: 131 ms
Wall time: 135 ms


## Train and Evaluate the Model

In [0]:
# Instantiate the compiled network returned by create_model(); the training
# cell below fits and evaluates this instance.
model = create_model()

In [23]:
%%time 

# Train on the SVD-projected training split, then measure on the held-out split.
model.fit(X_train_svd, y_train, epochs=13, batch_size=200, verbose=0)
score = model.evaluate(X_test_svd, y_test, batch_size=200)

# evaluate() yields the loss first, followed by the compiled accuracy metric.
test_loss, test_accuracy = score[0], score[1]
print("score: %f" % test_accuracy)
print("loss: %f" % test_loss)

score: 0.952333
loss: 0.133403
CPU times: user 16.5 s, sys: 2.79 s, total: 19.3 s
Wall time: 14.4 s


## Logistic Regression

In [0]:
# A single sigmoid unit on the SVD features, i.e. logistic regression
# expressed as a one-layer Keras model.
log = Sequential([
    Dense(units=1, input_dim=X_train_svd.shape[1], activation='sigmoid'),
])
log.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

In [25]:
%%time
# Train the logistic model on the training split and score it on the held-out split.
log.fit(X_train_svd, y_train, epochs=15, batch_size=200, verbose=0)
score = log.evaluate(X_test_svd, y_test, batch_size=200)

# evaluate() yields the loss first, followed by the compiled accuracy metric.
test_loss, test_accuracy = score[0], score[1]
print("score: %f" % test_accuracy)
print("loss: %f" % test_loss)

score: 0.823733
loss: 0.388636
CPU times: user 15 s, sys: 2.32 s, total: 17.4 s
Wall time: 12.4 s
