## Import Libraries

In [0]:
import numpy as np
import scipy.stats

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC

## Setup File Reading From Google Drive

In [0]:
!pip install -U -q PyDrive
# !gcloud config set project
import os
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# choose a local (colab) directory to store the data.
local_download_path = os.path.expanduser('./data')
try:
  os.makedirs(local_download_path)
except: pass

# 2. Auto-iterate using the query syntax
#    https://developers.google.com/drive/v2/web/search-parameters
file_list = drive.ListFile(
    {'q': "'1FjWwOPe1_f3tSHEAg4FlDlX0-D2YjJc4' in parents"}).GetList()

fname = ''
for f in file_list:
  # 3. Create & download by id.
  if f['title'] == 'train_data_final_50k.csv':
    #print('title: %s, id: %s' % (f['title'], f['id']))
    fname = os.path.join(local_download_path, f['title'])
    #print('downloading to {}'.format(fname))
    f_ = drive.CreateFile({'id': f['id']})
    f_.GetContentFile(fname)

## Get the Data

In [3]:
DATA_FILE = fname
DATA_SIZE = 50000       # presumably the number of rows in DATA_FILE -- TODO confirm
num_data_points = 5000  # how many rows from the end of the file are actually used
RANDOM_SEED = 42        # fixed so the train/test split is the same on every run

# Skipping the first (DATA_SIZE - num_data_points) lines leaves only the tail
# of the file to be parsed.
skip_rows = DATA_SIZE - num_data_points

# Parse the CSV once: columns 4..621 are the features and column 622 is the
# label, so read 4..622 together and split the array afterwards instead of
# reading the file a second time.
table = np.loadtxt(DATA_FILE, delimiter=',', skiprows=skip_rows,
                   usecols=range(4, 623), ndmin=2)
data, labels = table[:, :-1], table[:, -1]

# 70/30 stratified split so both parts keep the label proportions.
X_train, X_test, y_train, y_test = train_test_split(
    data, labels, train_size=0.7, stratify=labels, random_state=RANDOM_SEED)

# Standardize using statistics of the training split only, then apply the
# same transform to the test split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)




## Reduce Dimensionality

In [0]:
from sklearn import random_projection
from sklearn.decomposition import TruncatedSVD

# Project the standardized features onto 90 components. TruncatedSVD uses a
# randomized solver by default, so the seed is pinned to make the projection
# (and everything trained on it) reproducible across runs.
svd = TruncatedSVD(n_components=90, random_state=42)
X_train_svd = svd.fit_transform(X_train)  # fit on the training split only
X_test_svd = svd.transform(X_test)        # reuse the fitted projection

## Randomized Search to find the optimal parameters

In [19]:
# Distributions the randomized search samples candidate values from.
random_parameters = {
    #'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
    'C': scipy.stats.expon(scale=10),
    'gamma': scipy.stats.expon(scale=0.01)
}

# 5-fold cross-validated search over an RBF-kernel SVM, scored by accuracy.
# random_state pins the sampled candidates so the reported best parameters
# can be reproduced on a re-run.
clf = RandomizedSearchCV(SVC(kernel='rbf'), random_parameters, cv=5,
                         scoring='accuracy', random_state=42)
clf.fit(X_train_svd, y_train)
print("Best parameters:\t")
print(clf.best_params_)

CPU times: user 1e+03 µs, sys: 0 ns, total: 1e+03 µs
Wall time: 1.45 ms


## Evaluate the classifier

In [0]:
# RBF-kernel SVM with fixed hyperparameters (presumably taken from an earlier
# randomized-search run -- TODO confirm). Note this rebinds `clf`.
svm_settings = {'kernel': 'rbf', 'C': 14.24, 'gamma': 0.015}
clf = SVC(**svm_settings)

In [20]:
%%time
clf.fit(X_train_svd, y_train)
print("SVM score: %f" % clf.score(X_test_svd, y_test))

SVM score: 0.912000
CPU times: user 1.31 s, sys: 1e+03 µs, total: 1.32 s
Wall time: 1.32 s


## Save the classifier

In [0]:
# joblib is a standalone package; the sklearn.externals alias was removed in
# newer scikit-learn releases, so only fall back to it on old installs.
try:
  import joblib
except ImportError:
  from sklearn.externals import joblib
from google.colab import files

# `clf` is a search object (with best_params_) right after the randomized
# search, but a plain SVC once the evaluation cell has rebound it; a plain
# SVC has no best_params_, so fall back to its constructor parameters.
params = getattr(clf, 'best_params_', None)
if params is None:
  params = clf.get_params()

# Close the file before offering it for download so the content is flushed
# to disk. Append mode is kept; a newline separates records from repeated runs.
with open('svm.txt', 'a') as f:
  f.write(str(params) + '\n')
files.download('svm.txt')

# Persist the fitted classifier itself and offer it for download.
joblib.dump(clf, 'svm.pkl')
files.download('svm.pkl')