## Import Libraries

In [0]:
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler

## Setup File Reading From Google Drive

In [0]:
!pip install -U -q PyDrive
# !gcloud config set project
import os
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Choose a local (Colab) directory to store the data.
local_download_path = os.path.expanduser('./data')
# exist_ok=True keeps the cell re-runnable when the directory already exists,
# while real failures (permissions, a regular file in the way) still surface
# instead of being swallowed by a bare `except: pass`.
os.makedirs(local_download_path, exist_ok=True)

# 2. List the contents of the Drive folder using the query syntax
#    https://developers.google.com/drive/v2/web/search-parameters
file_list = drive.ListFile(
    {'q': "'1FjWwOPe1_f3tSHEAg4FlDlX0-D2YjJc4' in parents"}).GetList()

# Title of the CSV to fetch from that folder.
DATA_FILE_TITLE = 'train_data_final_50k.csv'

# 3. Download every entry whose title matches into the local data directory.
#    `fname` ends up holding the local path of the downloaded file.
fname = ''
for drive_entry in file_list:
  if drive_entry['title'] == DATA_FILE_TITLE:
    fname = os.path.join(local_download_path, drive_entry['title'])
    # Re-create the file handle by id, then pull its content to disk.
    remote_file = drive.CreateFile({'id': drive_entry['id']})
    remote_file.GetContentFile(fname)

# Fail here with a clear message rather than later, when an empty path would
# be handed to the CSV loader.
if not fname:
  raise FileNotFoundError(
      "'%s' was not found in the Drive folder" % DATA_FILE_TITLE)

## Get the Data

In [3]:
DATA_FILE = fname
DATA_SIZE = 50000          # number of rows in the CSV
num_data_points = 50000    # how many of the trailing rows to actually use
TRAIN_FRACTION = 0.7       # share of the rows that goes into the training split
RANDOM_STATE = 42          # fixed seed so the split is reproducible across runs

# Skip the leading rows so that only the last `num_data_points` rows are read.
skip_rows = DATA_SIZE - num_data_points

# Read columns 4..622 in a single pass (the file used to be parsed twice):
# columns 4..621 are the model inputs and column 622 is the label.
# ndmin=2 keeps the array two-dimensional even for a single-row file.
table = np.loadtxt(DATA_FILE, delimiter=',', skiprows=skip_rows,
                   usecols=range(4, 623), ndmin=2)
data, labels = table[:, :-1], table[:, -1]

X_train, X_test, y_train, y_test = train_test_split(
    data, labels, train_size=TRAIN_FRACTION, stratify=labels,
    random_state=RANDOM_STATE)

# Fit the scaler on the training split only, then apply it to both splits so
# no statistics from the test rows leak into training.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Size of the training set, taken from the actual split instead of a
# hard-coded 50000 * 0.8 that disagreed with the 0.7 train fraction above.
N = X_train.shape[0]



## Reduce Dimensionality

In [0]:
from sklearn import random_projection
from sklearn.decomposition import TruncatedSVD

# Number of components kept by the truncated SVD.
N_SVD_COMPONENTS = 90

# Learn the projection on the (already standardized) training split and apply
# the same projection to the test split. The solver is randomized, so it is
# seeded to make the reduced features reproducible between runs.
# NOTE(review): the cells visible below fit on X_train / X_test, not on
# X_train_svd / X_test_svd -- confirm whether the reduced features are meant
# to be used.
svd = TruncatedSVD(n_components=N_SVD_COMPONENTS, random_state=0)
X_train_svd = svd.fit_transform(X_train)
X_test_svd = svd.transform(X_test)

## Use Grid Search to find the best parameters

In [27]:
# Regularization strengths to try: 1e-1, 1e-2, ..., 1e-6.
alpha_range = 10.0 ** -np.arange(1,7)
# Budget of roughly 10**6 sample visits in total, expressed as passes over
# the N training rows. np.ceil returns a float, but max_iter is an iteration
# count, so convert it to a plain int before handing it to the estimator.
n_iter = int(np.ceil(10**6 / N))
# NOTE(review): scikit-learn >= 1.1 spells the 'log' loss 'log_loss'; adjust
# this entry if the runtime's scikit-learn is upgraded.
loss_functions = ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']
penalties = ['l1', 'l2', 'elasticnet']

parameters = {
    'loss': loss_functions,
    'penalty': penalties,
    'alpha': alpha_range
}

# Exhaustive 5-fold cross-validated search over every loss/penalty/alpha
# combination listed above.
clf = GridSearchCV(SGDClassifier(max_iter=n_iter), param_grid=parameters, cv=5)
clf.fit(X_train, y_train)
print("best parameters:\t")
print(clf.best_params_)
print( "Best score: %0.3f" % (clf.best_score_) )

CPU times: user 1.32 s, sys: 1 ms, total: 1.32 s
Wall time: 1.34 s


## Evaluate the Classifier

In [0]:
# Classifier with fixed hyper-parameters (presumably the grid-search winners --
# confirm against the search output). int() guards against n_iter being a
# float, which is what np.ceil returns; max_iter is an iteration count.
clf = SGDClassifier(max_iter=int(n_iter), alpha=0.01, loss='hinge', penalty='l2')

In [28]:
%%time
clf.fit(X_train, y_train)
print("score: %f" % clf.score(X_test, y_test))

score: 0.824400
CPU times: user 1.34 s, sys: 8 ms, total: 1.34 s
Wall time: 1.34 s


## Save the classifier

In [0]:
from sklearn.externals import joblib
from google.colab import files

# `clf` is the plain SGDClassifier by the time this cell runs top-to-bottom
# (the grid-search object was overwritten), and that estimator has no
# `best_params_`. Use the search result when it is available and fall back to
# the estimator's own parameters otherwise.
saved_params = getattr(clf, 'best_params_', None)
if saved_params is None:
  saved_params = clf.get_params()

# Close (and thereby flush) the file *before* triggering the browser
# download; downloading while the handle was still open could ship an empty
# or truncated file. Mode 'a' is kept, so repeated runs append to the file.
with open('sgd.txt', 'a') as params_file:
  params_file.write(str(saved_params))
files.download('sgd.txt')

# Persist the fitted classifier itself and offer it for download as well.
# NOTE(review): `sklearn.externals.joblib` (imported above) was removed in
# scikit-learn 0.23; switch to `import joblib` if the runtime is upgraded.
joblib.dump(clf, 'sgd.pkl')
files.download('sgd.pkl')