## Import Libraries

In [0]:
import numpy as np
from scipy import stats
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

## Setup File Reading From Google Drive

In [0]:
!pip install -U -q PyDrive
# !gcloud config set project
import os
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# choose a local (colab) directory to store the data.
local_download_path = os.path.expanduser('./data')
try:
  os.makedirs(local_download_path)
except: pass

# 2. Auto-iterate using the query syntax
#    https://developers.google.com/drive/v2/web/search-parameters
file_list = drive.ListFile(
    {'q': "'1FjWwOPe1_f3tSHEAg4FlDlX0-D2YjJc4' in parents"}).GetList()

fname = ''
for f in file_list:
  # 3. Create & download by id.
  if f['title'] == 'train_data_final_50k.csv':
    #print('title: %s, id: %s' % (f['title'], f['id']))
    fname = os.path.join(local_download_path, f['title'])
    #print('downloading to {}'.format(fname))
    f_ = drive.CreateFile({'id': f['id']})
    f_.GetContentFile(fname)

## Get the Data

In [4]:
DATA_FILE = fname
DATA_SIZE = 50000
num_data_points = 50000

skip_rows = DATA_SIZE - num_data_points

data = np.loadtxt(DATA_FILE, delimiter=',', skiprows=skip_rows, usecols=range(4,622))
labels = np.loadtxt(DATA_FILE, delimiter=',', skiprows=skip_rows, usecols=622)
X_train, X_test, y_train, y_test = train_test_split(data, labels, train_size=0.7, stratify=labels)

scaler = StandardScaler()
scaler.fit(X_train) # data = preprocessing.scale(data)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)



## Reduce Dimensionality

In [0]:
from sklearn import random_projection
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=90)
X_train_svd = svd.fit_transform(X_train)
X_test_svd = svd.transform(X_test)

## Use Grid Search to find the best parameters

In [11]:
%%time

parameters = {
    # 'activation': ['identity', 'logistic', 'tanh', 'relu'],
    #'solver': ['lbfgs', 'sgd', 'adam'],
    'alpha': 10.0 ** -np.arange(1, 7),
    'batch_size': [500, 1000]
    #'learning_rate': ['constant', 'invscaling', 'adaptive']
}


# hidden_layer_sizes=(100,100,100) --> 3 hidden layers with 100 units each. default (100,)
clf = GridSearchCV(MLPClassifier(solver='adam'), param_grid=parameters, cv=5)
clf.fit(X_train, y_train)

print("best parameters:\t")
print(clf.best_params_)
print("Neural Network Score: %f" % clf.score(X_test, y_test))
# print( "Best score: %0.3f" % (clf.best_score_) )



KeyboardInterrupt: ignored

## Test Neural Network

In [0]:
#clf = MLPClassifier(solver='adam', alpha=0.0001, activation='relu', batch_size=5000)
clf = MLPClassifier(solver='adam') # default

In [15]:
%%time
clf.fit(X_train_svd, y_train)
print("score: %f" % clf.score(X_test_svd, y_test))

score: 0.956733
CPU times: user 35 s, sys: 15.9 s, total: 50.9 s
Wall time: 25.6 s


## Save the classifier

In [7]:
from sklearn.externals import joblib
from google.colab import files

'''
f = open('sk_neuralnet.txt', 'a')
f.write(str(clf.best_params_))
files.download('sk_neural_net.txt')
f.close()

joblib.dump(clf, 'sk_neural_net.pkl')
files.download('sk_neural_net.pkl')
'''

"\nf = open('sk_neuralnet.txt', 'a')\nf.write(str(clf.best_params_))\nfiles.download('sk_neural_net.txt')\nf.close()\n\njoblib.dump(clf, 'sk_neural_net.pkl')\nfiles.download('sk_neural_net.pkl')\n"