In [1]:
!pip install keras



In [2]:
# Populates the interactive namespace from numpy and matplotlib; later cells
# call `scatter`, `plot` and `cm` unqualified and depend on this line.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [3]:
import pandas as pd
import numpy as np
import datetime as dt
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import confusion_matrix, make_scorer, f1_score
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import seaborn

In [None]:
# Notebook-wide seaborn appearance: 'talk' plotting context and 'white' style.
seaborn.set_context('talk')
seaborn.set_style('white')

# Load Dataset

In [4]:
# Read the worked-example classification table directly from its GitHub URL.
data = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/danybol/gft_ml_example/master/worked_example/classification.csv"
)

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Feature matrix: every column except the label column 'y'.
X = data.drop(columns=['y']).values
# Label vector: the 'y' column as a plain array.
y = data.loc[:, 'y'].values

In [None]:
# Plot the first two feature columns against each other, coloured by label.
scatter(
    x=X[:, 0],
    y=X[:, 1],
    c=y,
    cmap=cm.Accent,
)

## Split into train, validation and test

In [None]:
def train_test_split(X, y):
    """Partition X and y, in row order and without shuffling, into three sets.

    The last two fifths of the rows (each fifth rounded down) become the
    validation and test sets respectively; everything before them is the
    training set.

    Parameters
    ----------
    X : array with a ``shape`` attribute, sliced along its first axis.
    y : sequence sliced with the same row boundaries as ``X``.

    Returns
    -------
    tuple
        ``(train_X, train_y, val_X, val_y, test_X, test_y)``.
    """
    n_rows = X.shape[0]
    fold = n_rows // 5
    val_start = n_rows - 2 * fold
    test_start = val_start + fold

    # One slice per partition: train, validation, test.
    windows = (
        slice(None, val_start),
        slice(val_start, test_start),
        slice(test_start, None),
    )

    parts = []
    for window in windows:
        parts.extend((X[window], y[window]))
    return tuple(parts)

In [None]:
# Unpack the three partitions (features and labels for each).
(
    train_X, train_y,
    val_X, val_y,
    test_X, test_y,
) = train_test_split(X, y)

### Set up framework for testing models

In [None]:
def test_model(model, params, train_X, train_y, test_X, test_y):
    """Grid-search ``model`` over ``params`` and report F1 on two datasets.

    Parameters
    ----------
    model : estimator handed to ``GridSearchCV`` (this notebook passes a Pipeline).
    params : parameter grid handed to ``GridSearchCV``.
    train_X, train_y : data used for the cross-validated search and the refit.
    test_X, test_y : held-out data scored once the search has finished.

    Returns
    -------
    The best estimator found by the search, fitted on ``train_X``/``train_y``.
    """
    # Model selection is driven by F1 (higher is better), not by an error metric.
    scorer = make_scorer(f1_score, greater_is_better=True)
    # refit=True makes GridSearchCV fit the winning configuration on the whole
    # training set, so no separate fit of best_estimator_ is needed afterwards.
    gs = GridSearchCV(model, params, scoring=scorer, refit=True)
    gs.fit(train_X, train_y)
    best = gs.best_estimator_
    train_pred = best.predict(train_X)  # predictions on the training set
    test_pred = best.predict(test_X)  # predictions on the held-out set
    # The reported numbers are F1 scores; label them as such.
    print("Training F1: ", f1_score(train_y, train_pred))
    print("Test F1: ", f1_score(test_y, test_pred))
    return best

#### Scikit-learn pipelines are a good way of keeping code clean. They let you easily swap out models and preprocessing steps

In [None]:
# Three-stage pipeline skeleton; every stage starts as None and is filled in
# through the parameter grid given to test_model.
model = Pipeline(
    steps=[(stage, None) for stage in ('features', 'preprocess', 'model')]
)



### Try out different models

#### Baseline: dummy classifier with no extra features

In [None]:
from sklearn.dummy import DummyClassifier

In [None]:
# Grid with a single candidate: a DummyClassifier and no feature/preprocess step.
params = {
    'model': [DummyClassifier()],
    'features': [None],
    'preprocess': [None],
}
# The validation split is passed in the held-out slot of test_model.
simple_model = test_model(
    model, params,
    train_X, train_y,
    val_X, val_y,
)
val_pred = simple_model.predict(val_X)

In [None]:
# Large red crosses: validation points predicted as class 1.
scatter(
    x=val_X[val_pred==1, 0],
    y=val_X[val_pred==1, 1],
    marker='x', c='r', s=200,
)
# Overlay: every validation point, coloured by its true label.
scatter(
    x=val_X[:, 0],
    y=val_X[:, 1],
    c=val_y, cmap=cm.Accent,
)


In [None]:
# Class-balanced linear SVM; C is searched over 10 log-spaced values in [1e-2, 1e3].
params = {
    'model': [LinearSVC(class_weight='balanced')],
    'model__C': np.logspace(-2, 3, num=10),
    'features': [None],
    'preprocess': [None],
}
# The validation split is passed in the held-out slot of test_model.
simple_model = test_model(
    model, params,
    train_X, train_y,
    val_X, val_y,
)
val_pred = simple_model.predict(val_X)

In [None]:
# Large red crosses: validation points whose true label is 0.
scatter(
    x=val_X[val_y==0, 0],
    y=val_X[val_y==0, 1],
    marker='x', c='r', s=200,
)
# Overlay: every validation point, coloured by the current val_pred.
scatter(
    x=val_X[:, 0],
    y=val_X[:, 1],
    c=val_pred, cmap=cm.Accent,
)


In [None]:
# Class-balanced RBF-kernel SVM; C is searched over np.logspace(-2, 3)
# (numpy's default number of samples).
params = {
    'model': [SVC(kernel='rbf', class_weight='balanced')],
    'model__C': np.logspace(-2, 3),
    'features': [None],
    'preprocess': [None],
}
# The validation split is passed in the held-out slot of test_model.
simple_model = test_model(
    model, params,
    train_X, train_y,
    val_X, val_y,
)
val_pred = simple_model.predict(val_X)

In [None]:
# Large red crosses: validation points whose true label is 0.
scatter(
    x=val_X[val_y==0, 0],
    y=val_X[val_y==0, 1],
    marker='x', c='r', s=200,
)
# Overlay: every validation point, coloured by the current val_pred.
scatter(
    x=val_X[:, 0],
    y=val_X[:, 1],
    c=val_pred, cmap=cm.Accent,
)


In [None]:
# Random forest with its default settings; no feature or preprocessing step.
params = {
    'model': [RandomForestClassifier()],
    'features': [None],
    'preprocess': [None],
}
# The validation split is passed in the held-out slot of test_model.
simple_model = test_model(
    model, params,
    train_X, train_y,
    val_X, val_y,
)
val_pred = simple_model.predict(val_X)

In [None]:
# Large red crosses: validation points whose true label is 0.
scatter(
    x=val_X[val_y==0, 0],
    y=val_X[val_y==0, 1],
    marker='x', c='r', s=200,
)
# Overlay: every validation point, coloured by the current val_pred.
scatter(
    x=val_X[:, 0],
    y=val_X[:, 1],
    c=val_pred, cmap=cm.Accent,
)


### Try deep learning to avoid manual feature engineering

In [None]:

# Keras building blocks for the feed-forward network defined in the next cell.
# NOTE(review): Dropout and TimeseriesGenerator are not used in the cells
# visible here — confirm whether they are still needed.
from keras import Sequential
from keras.layers import Dense, Dropout
from keras.preprocessing.sequence import TimeseriesGenerator

# NOTE(review): no autocorrelation plot appears earlier in the visible notebook;
# this value looks left over from another example — confirm.
seq_len = 100 # Length of autocorrelation from plot above

# Hold out the last quarter of the training labels.
# NOTE(review): split, new_train_y and sub_val_y are not read by any cell
# visible here (the fit below uses validation_split instead) — confirm.
N = train_y.shape[0]
split = int(N - N/4)
new_train_y = train_y[:split]
sub_val_y = train_y[split:]


In [None]:
neurons = 10
n_input = seq_len
n_features = 1

# Fully connected binary classifier: three 30-unit ReLU layers and one sigmoid
# output unit. Note that this rebinds `model`, which held the sklearn Pipeline.
model = Sequential([
    Dense(30, input_dim=train_X.shape[1], activation='relu'),
    Dense(30, activation='relu'),
    Dense(30, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
# validation_split=0.4 has Keras hold out 40% of the training data for validation.
history = model.fit(
    train_X, train_y,
    epochs=40,
    batch_size=10,
    validation_split=0.4,
    verbose=0,
)


In [None]:
# Validation loss recorded by model.fit, one value per epoch.
plot(history.history['val_loss'])

In [None]:
# Sequential.predict_classes is no longer available in current Keras releases.
# For a single sigmoid output the equivalent is to threshold the predicted
# probability at 0.5; [:, 0] flattens the (n, 1) column to a 1-D label vector.
model_pred = (model.predict(val_X) > 0.5).astype("int32")[:, 0]

In [None]:
# F1 score of the network's predictions on the validation split.
f1_score(val_y, model_pred)

In [None]:
# Large red crosses: validation points whose true label is 0.
scatter(val_X[val_y==0, 0], val_X[val_y==0, 1], marker='x', c='r', s=200)
# Colour by the network's own predictions (model_pred); val_pred still holds
# the output of the last scikit-learn model and would show the wrong classifier.
scatter(val_X[:, 0], val_X[:, 1], c=model_pred, cmap=cm.Accent)


# Test the best model on the test dataset

#### Finally, test the model we have chosen on an unseen set of data. Hopefully the model will generalise well

In [None]:
# Sequential.predict_classes is no longer available in current Keras releases;
# thresholding the sigmoid output at 0.5 yields the same (n, 1) integer labels.
test_pred = (model.predict(test_X) > 0.5).astype("int32")
# F1 score on the held-out test split.
f1_score(test_y, test_pred)