## Import Libraries

In [0]:
import numpy as np
from scipy import stats

from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV

from sklearn.externals import joblib
from google.colab import files

## Setup File Reading From Google Drive

In [0]:
!pip install -U -q PyDrive
# !gcloud config set project
import os
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# choose a local (colab) directory to store the data.
local_download_path = os.path.expanduser('./data')
try:
  os.makedirs(local_download_path)
except: pass

# 2. Auto-iterate using the query syntax
#    https://developers.google.com/drive/v2/web/search-parameters
file_list = drive.ListFile(
    {'q': "'1FjWwOPe1_f3tSHEAg4FlDlX0-D2YjJc4' in parents"}).GetList()

fname = ''
for f in file_list:
  # 3. Create & download by id.
  if f['title'] == 'train_data_final_50k.csv':
    #print('title: %s, id: %s' % (f['title'], f['id']))
    fname = os.path.join(local_download_path, f['title'])
    #print('downloading to {}'.format(fname))
    f_ = drive.CreateFile({'id': f['id']})
    f_.GetContentFile(fname)

## Get the Data

In [3]:
DATA_FILE = fname
DATA_SIZE = 50000
num_data_points = 50000
RANDOM_STATE = 42  # fixed seed so the train/test split is identical on every run

# Number of leading lines of the file to skip before reading.
skip_rows = DATA_SIZE - num_data_points

# Parse the CSV a single time (the file was previously read twice): columns
# 4..621 become `data` and column 622 becomes `labels`.
table = np.loadtxt(DATA_FILE, delimiter=',', skiprows=skip_rows,
                   usecols=range(4, 623), ndmin=2)
data, labels = table[:, :-1], table[:, -1]

# 70% of the rows go to training; stratifying keeps the label proportions
# the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    data, labels, train_size=0.7, stratify=labels, random_state=RANDOM_STATE)

# Fit the scaler on the training split only, then apply the same transform to
# both splits so no test-set statistics leak into training.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)



## Reduce Dimensionality

In [0]:
from sklearn import random_projection
from sklearn.decomposition import TruncatedSVD

# Reduce the scaled features to 90 components. The decomposition is fitted on
# the training split only and then applied unchanged to the test split.
# TruncatedSVD's default solver is randomized, so the seed is pinned to keep
# the projected features identical between runs.
svd = TruncatedSVD(n_components=90, random_state=42)
X_train_svd = svd.fit_transform(X_train)
X_test_svd = svd.transform(X_test)

## AdaBoost

In [0]:
# Base learner for boosting: a depth-limited tree that considers only
# sqrt(n_features) candidate features per split.
DTC = DecisionTreeClassifier(min_samples_split=250, min_samples_leaf=50, max_depth=5, max_features='sqrt')

# Search space for the (currently disabled) randomized search below.
# `stats.randint(low, high)` draws integers from [low, high).
learning_rate_range = stats.expon(scale=0.1)
estimator_range = stats.randint(50,150)

parameters = {
    'n_estimators': estimator_range,
    'learning_rate': learning_rate_range
}

# ada = RandomizedSearchCV(AdaBoostClassifier(DTC), parameters, cv=5)
# Fixed hyper-parameters (presumably taken from an earlier search run -- TODO
# confirm). `random_state` is pinned so the reported score is reproducible.
ada = AdaBoostClassifier(DTC, learning_rate=0.114, n_estimators=91, random_state=42)

### Evaluate AdaBoost


In [48]:
%%time

ada.fit(X_train_svd, y_train)
#print("Best parameters:\t")
#print(ada.best_params_)
print("Adaboost score: %f" % ada.score(X_test_svd, y_test))

Adaboost score: 0.917133
CPU times: user 25.6 s, sys: 6 ms, total: 25.6 s
Wall time: 25.6 s


## Bagging

In [0]:
# Search space for the (currently disabled) randomized search below:
# integer ensemble sizes drawn from [20, 100).
estimator_range = stats.randint(20, 100)
parameters = {
    'n_estimators': estimator_range
}

# bag = RandomizedSearchCV(BaggingClassifier(), parameters, cv=5)
# Default bagging ensemble; the seed is pinned so the random resampling, and
# therefore the reported score, is reproducible.
bag = BaggingClassifier(random_state=42)

### Evaluate Bagging

In [69]:
%%time

bag.fit(X_train_svd, y_train)
#print("Best parameters:\t")
#print(bag.best_params_)
print('Bagging score: %f' % bag.score(X_test_svd, y_test))

Bagging score: 0.929600
CPU times: user 48.4 s, sys: 8 ms, total: 48.4 s
Wall time: 48.4 s


## Gradient Tree Boosting

In [0]:
# Search space for the (currently disabled) randomized search below.
learning_rate_range = stats.expon(scale=0.1)
estimator_range = stats.randint(20, 150)  # integers in [20, 150)
max_depth_range = range(1, 4)             # depths 1, 2 and 3

parameters = {
    'n_estimators': estimator_range,
    'learning_rate': learning_rate_range,
    'max_depth': max_depth_range
}

# gbc = RandomizedSearchCV(GradientBoostingClassifier(), parameters, cv=5)
# Fixed hyper-parameters with a pinned seed so the reported score is
# reproducible.
gbc = GradientBoostingClassifier(learning_rate=0.27, max_depth=2, n_estimators=60, random_state=42)
# NOTE(review): `gbd` is never used in the visible cells; it looks like a typo
# for `gbc` (a default-parameter variant). Kept to avoid changing which model
# is evaluated -- confirm and delete.
gbd = GradientBoostingClassifier()

### Evaluate Gradient Tree Boosting

In [67]:
%%time

gbc.fit(X_train_svd, y_train)
# print("Best parameters:\t")
# print(gbc.best_params_)
print('Gradient Tree Boosting score: %f' % gbc.score(X_test_svd, y_test))

Gradient Tree Boosting score: 0.872733
CPU times: user 17.6 s, sys: 5 ms, total: 17.6 s
Wall time: 17.6 s


## Random Forests

In [0]:
# Search space for the (currently disabled) randomized search below:
# integer forest sizes drawn from [20, 100).
parameters = { 'n_estimators': stats.randint(20, 100) }

# rfc = RandomizedSearchCV(RandomForestClassifier(), parameters, cv=5)
# rfc = RandomForestClassifier(n_estimators=46)
# Default forest; the seed is pinned so the reported score is reproducible.
rfc = RandomForestClassifier(random_state=42)

### Evaluate Random Forests

In [53]:
%%time

rfc.fit(X_train_svd, y_train)
#print("Best parameters:")
#print(rfc.best_params_)
print('Random Forests score: %f' % rfc.score(X_test_svd, y_test))

Random Forests score: 0.920667
CPU times: user 3.94 s, sys: 2 ms, total: 3.94 s
Wall time: 3.95 s


## Extremely Randomized Trees

In [0]:
# Search space for the (currently disabled) randomized search below.
parameters = {
    'n_estimators': stats.randint(20, 100)
#    'max_depth': range(1, 6),
#    'min_samples_split': stats.randint(100, 500),
#    'min_samples_leaf': stats.randint(1, 100)
}

# ert = RandomizedSearchCV(ExtraTreesClassifier(max_features='sqrt'), parameters, cv=5)
# ert = ExtraTreesClassifier(max_features='sqrt', n_estimators=45)
# The tuned variant above used to be assigned and then immediately overwritten
# by the default model; only the default model was ever fitted, so the dead
# assignment is kept as a comment. The seed is pinned for reproducibility.
ert = ExtraTreesClassifier(random_state=42)

### Evaluate Extremely Randomized Trees

In [63]:
%%time

ert.fit(X_train_svd, y_train)
#print("Best parameters:")
#print(ert.best_params_)
print('Random Forests score: %f' % ert.score(X_test_svd, y_test))

Random Forests score: 0.928067
CPU times: user 876 ms, sys: 2 ms, total: 878 ms
Wall time: 882 ms


### Save the AdaBoost classifier

In [10]:
# `best_params_` only exists when `ada` is a RandomizedSearchCV; for a plain
# estimator fall back to its constructor parameters instead of raising
# AttributeError.
ada_params = getattr(ada, 'best_params_', None)
if ada_params is None:
  ada_params = ada.get_params()

# Close (and thereby flush) the file before triggering the download so the
# complete contents are on disk. The file is opened in append mode, so each
# run adds one line.
with open('adaboost.txt', 'a') as params_file:
  params_file.write(str(ada_params) + '\n')
files.download('adaboost.txt')

# Persist the fitted model itself and offer it for download.
joblib.dump(ada, 'adaboost.pkl')
files.download('adaboost.pkl')

KeyboardInterrupt: ignored

### Save the Bagging classifier

In [0]:
# `best_params_` only exists when `bag` is a RandomizedSearchCV; for a plain
# estimator fall back to its constructor parameters instead of raising
# AttributeError.
bag_params = getattr(bag, 'best_params_', None)
if bag_params is None:
  bag_params = bag.get_params()

# Close (and thereby flush) the file before triggering the download so the
# complete contents are on disk. The file is opened in append mode, so each
# run adds one line.
with open('bagging.txt', 'a') as params_file:
  params_file.write(str(bag_params) + '\n')
files.download('bagging.txt')

# Persist the fitted model itself and offer it for download.
joblib.dump(bag, 'bagging.pkl')
files.download('bagging.pkl')

### Save the Gradient Tree Boosting classifier

In [0]:
# `best_params_` only exists when `gbc` is a RandomizedSearchCV; for a plain
# estimator fall back to its constructor parameters instead of raising
# AttributeError.
gbc_params = getattr(gbc, 'best_params_', None)
if gbc_params is None:
  gbc_params = gbc.get_params()

# Close (and thereby flush) the file before triggering the download so the
# complete contents are on disk. The file is opened in append mode, so each
# run adds one line.
with open('gradient_boosting.txt', 'a') as params_file:
  params_file.write(str(gbc_params) + '\n')
files.download('gradient_boosting.txt')

# Persist the fitted model itself and offer it for download.
joblib.dump(gbc, 'gradient_boosting.pkl')
files.download('gradient_boosting.pkl')

### Save the Extremely Randomized Trees classifier

In [0]:
# `best_params_` only exists when `ert` is a RandomizedSearchCV; for a plain
# estimator fall back to its constructor parameters instead of raising
# AttributeError.
ert_params = getattr(ert, 'best_params_', None)
if ert_params is None:
  ert_params = ert.get_params()

# Close (and thereby flush) the file before triggering the download so the
# complete contents are on disk. The file is opened in append mode, so each
# run adds one line.
with open('extra_trees.txt', 'a') as params_file:
  params_file.write(str(ert_params) + '\n')
files.download('extra_trees.txt')

# Persist the fitted model itself and offer it for download.
joblib.dump(ert, 'extra_trees.pkl')
files.download('extra_trees.pkl')