## Import Libraries

In [0]:
import numpy as np
from scipy import stats

from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV

from sklearn.externals import joblib
from google.colab import files

## Setup File Reading From Google Drive

In [0]:
!pip install -U -q PyDrive
# !gcloud config set project
import os
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# choose a local (colab) directory to store the data.
local_download_path = os.path.expanduser('./data')
try:
  os.makedirs(local_download_path)
except: pass

# 2. Auto-iterate using the query syntax
#    https://developers.google.com/drive/v2/web/search-parameters
file_list = drive.ListFile(
    {'q': "'1FjWwOPe1_f3tSHEAg4FlDlX0-D2YjJc4' in parents"}).GetList()

fname = ''
for f in file_list:
  # 3. Create & download by id.
  if f['title'] == 'train_data_final_50k.csv':
    #print('title: %s, id: %s' % (f['title'], f['id']))
    fname = os.path.join(local_download_path, f['title'])
    #print('downloading to {}'.format(fname))
    f_ = drive.CreateFile({'id': f['id']})
    f_.GetContentFile(fname)

## Get the Data

In [4]:
DATA_FILE = fname
DATA_SIZE = 50000
num_data_points = 50000

skip_rows = DATA_SIZE - num_data_points

data = np.loadtxt(DATA_FILE, delimiter=',', skiprows=skip_rows, usecols=range(4,622))
labels = np.loadtxt(DATA_FILE, delimiter=',', skiprows=skip_rows, usecols=622)
X_train, X_test, y_train, y_test = train_test_split(data, labels, train_size=0.7, stratify=labels)

scaler = StandardScaler()
scaler.fit(X_train) # data = preprocessing.scale(data)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)



## Reduce Dimensionality

In [0]:
from sklearn import random_projection
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=90)
X_train_svd = svd.fit_transform(X_train)
X_test_svd = svd.transform(X_test)

## Mini-Batch K-Nearest Neighbors

In [0]:
parameters = {
    'n_neighbors': stats.randint(1, 50),
    'algorithm': ['ball_tree', 'kd_tree']
}

#knn = RandomizedSearchCV(KNeighborsClassifier(), parameters, cv=5)
#knn = KNeighborsClassifier(n_neighbors=2, algorithm='ball_tree')

### Evaluate

In [0]:
knn = KNeighborsClassifier()

In [9]:
%%time

knn.fit(X_train_svd, y_train)
print("KNN score: %f" % knn.score(X_test_svd, y_test))

KNN score: 0.938400
CPU times: user 12.6 s, sys: 0 ns, total: 12.6 s
Wall time: 12.6 s


## Nearest Centroids

In [0]:
centroid = NearestCentroid()

In [14]:
%%time
centroid.fit(X_train_svd, y_train)
print("Nearest Centroids score: %f" % centroid.score(X_test_svd, y_test))

Nearest Centroids score: 0.694867
CPU times: user 24 ms, sys: 1e+03 µs, total: 25 ms
Wall time: 26.3 ms


## Save the KNN classifier

In [9]:
'''
f = open('knn.txt', 'a')
f.write(str(knn.best_params_))
files.download('knn.txt')
f.close()

joblib.dump(clf, 'knn.pkl')
files.download('knn.pkl')
'''

"\nf = open('knn.txt', 'a')\nf.write(str(knn.best_params_))\nfiles.download('knn.txt')\nf.close()\n\njoblib.dump(clf, 'knn.pkl')\nfiles.download('knn.pkl')\n"

## Save the Nearest Centroids classifier

In [10]:
f = open('nearest_centroids.txt', 'a')
f.write(str(centroid.best_params_))
files.download('nearest_centroids.txt')
f.close()

joblib.dump(clf, 'nearest_centroids.pkl')
files.download('nearest_centroids.pkl')

AttributeError: ignored