In [None]:
!pip install keras

In [None]:
# %pylab pulls numpy and matplotlib.pyplot names into the notebook namespace and turns on
# inline figures; the bare scatter(), plot() and cm used by later cells come from here.
%pylab inline

In [None]:
import pandas as pd
import numpy as np
import datetime as dt
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import confusion_matrix, make_scorer, f1_score
from sklearn.dummy import DummyClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import seaborn
# Seed NumPy's global random generator so NumPy-driven randomness repeats between runs.
# NOTE(review): this does not obviously seed the Keras backend used later — confirm if exact reproducibility matters.
np.random.seed(12345)

In [None]:
# Global plot styling: the 'talk' context preset and the 'white' background style.
seaborn.set_context('talk')
seaborn.set_style('white')

# Load Dataset

#### Load some example data and take a look at it

In [None]:
# Read the example classification CSV straight from GitHub (needs network access).
data = pd.read_csv("https://raw.githubusercontent.com/danybol/gft_ml_example/master/worked_example/classification.csv")

In [None]:
# Preview the first rows to check what columns were loaded.
data.head()

In [None]:
# Features are every column except the label 'y'; both are pulled out as NumPy arrays.
X = data.drop(columns=['y']).values
y = data['y'].values

In [None]:
# Plot the first two feature columns against each other, coloured by the label y.
scatter(X[:, 0], X[:, 1], c=y, cmap=cm.Accent)

#### There seems to be a bit of a pattern, but it is quite noisy

## Split into train, validation and test

In [None]:
def train_test_split(X, y):
    """Cut X and y, in their existing row order, into train / validation / test parts.

    The validation and test parts each hold ``floor(N / 5)`` rows taken from the
    end of the data; the training part is everything before them. No shuffling
    is performed.

    Parameters
    ----------
    X : array with a ``shape`` attribute, sliceable along its first axis
        Feature rows.
    y : sliceable sequence
        Labels aligned with the rows of ``X``.

    Returns
    -------
    tuple
        ``(train_X, train_y, val_X, val_y, test_X, test_y)``.
    """
    n_rows = X.shape[0]
    holdout = n_rows // 5                 # rows in each of the validation and test parts
    val_start = n_rows - 2 * holdout      # training rows end where validation begins
    test_start = val_start + holdout

    train_rows = slice(None, val_start)
    val_rows = slice(val_start, test_start)
    test_rows = slice(test_start, None)

    return (X[train_rows], y[train_rows],
            X[val_rows], y[val_rows],
            X[test_rows], y[test_rows])

In [None]:
# Unpack the three partitions: the cells below fit on train and score on val,
# while test is only used by the final evaluation cell.
train_X, train_y, val_X, val_y, test_X, test_y = train_test_split(X, y)

### Set up framework for testing models

In [None]:
def test_model(model, params, train_X, train_y, test_X, test_y):
    """Grid-search ``model`` over ``params`` and print F1 on train and held-out data.

    Hyperparameters are selected by 5-fold stratified cross-validation on the
    training data, with F1 as the selection score.

    Parameters
    ----------
    model : estimator
        Scikit-learn estimator or pipeline to tune.
    params : dict or list of dict
        Parameter grid handed unchanged to ``GridSearchCV``.
    train_X, train_y : array-like
        Data used for the cross-validated search and for the final fit.
    test_X, test_y : array-like
        Held-out data used only for the printed "Test F1 Score".

    Returns
    -------
    estimator
        The best estimator found, fitted on all of ``train_X`` / ``train_y``.
    """
    scorer = make_scorer(f1_score, greater_is_better=True)  # Use f1 as score
    # Cross-validation to pick the best hyperparameters. refit=True (the default, spelled
    # out here) makes GridSearchCV retrain the winning configuration on the full training
    # set, so best_estimator_ is ready to use without a second explicit fit.
    gs = GridSearchCV(model, params, scoring=scorer,
                      cv=StratifiedKFold(n_splits=5), refit=True)
    gs.fit(train_X, train_y)
    best_model = gs.best_estimator_
    train_pred = best_model.predict(train_X)  # Make prediction on training set
    test_pred = best_model.predict(test_X)  # Make prediction on test set
    print("Training F1 Score: ", f1_score(train_y, train_pred))
    print("Test F1 Score: ", f1_score(test_y, test_pred))
    return best_model

#### Scikit-learn pipelines are a good way of keeping code clean. They let you easily swap out models and preprocessing steps

In [None]:
# Three-step pipeline skeleton. Every step starts as None; the parameter grids in the
# cells below fill the steps in by name ('features', 'preprocess', 'model').
model = Pipeline([('features', None),
                  ('preprocess', None),
                  ('model', None)])



### Try out different models

#### Dummy classifier as a baseline

In [None]:
# Baseline: only the 'model' step is populated, with a DummyClassifier.
params = {
    'model': [DummyClassifier()],
    'features': [None],
    'preprocess': [None],
}
simple_model = test_model(model, params, train_X, train_y, val_X, val_y)
val_pred = simple_model.predict(val_X)

In [None]:
# Red crosses mark the validation points predicted as class 1; dots are coloured by the true label.
scatter(val_X[val_pred==1, 0], val_X[val_pred==1, 1], marker='x', c='r', s=200)
scatter(val_X[:, 0], val_X[:, 1], c=val_y, cmap=cm.Accent)


#### Points are randomly labelled, so this is what we have to beat

#### Let's see how a linear model does

In [None]:
# Linear SVM with balanced class weights; C is searched over 10 log-spaced values from 1 to 1000.
params = {
    'model': [LinearSVC(class_weight='balanced')],
    'model__C': np.logspace(0, 3, num=10),
    'features': [None],
    'preprocess': [None],
}
simple_model = test_model(model, params, train_X, train_y, val_X, val_y)
val_pred = simple_model.predict(val_X)

In [None]:
# Red crosses mark the validation points predicted as class 1; dots are coloured by the true label.
scatter(val_X[val_pred==1, 0], val_X[val_pred==1, 1], marker='x', c='r', s=200)
scatter(val_X[:, 0], val_X[:, 1], c=val_y, cmap=cm.Accent)


#### A lot better than random, but still making mistakes. We can see the clear linear boundary

#### Let's see if a nonlinear model is better

In [None]:
# RBF-kernel SVM with balanced class weights; C is searched over 10 log-spaced values from 0.1 to 1000.
params = {
    'model': [SVC(kernel='rbf', class_weight='balanced', gamma='auto')],
    'model__C': np.logspace(-1, 3, num=10),
    'features': [None],
    'preprocess': [None],
}
simple_model = test_model(model, params, train_X, train_y, val_X, val_y)
val_pred = simple_model.predict(val_X)


In [None]:
# Red crosses mark the validation points predicted as class 1; dots are coloured by the true label.
scatter(val_X[val_pred==1, 0], val_X[val_pred==1, 1], marker='x', c='r', s=200)
scatter(val_X[:, 0], val_X[:, 1], c=val_y, cmap=cm.Accent)


#### Better, and we can see a nonlinear boundary

#### Let's compare to a random forest

In [None]:
# Random forest of 100 trees; the grid has a single candidate, so the search only cross-validates it.
params = {
    'model': [RandomForestClassifier(n_estimators=100)],
    'features': [None],
    'preprocess': [None],
}
simple_model = test_model(model, params, train_X, train_y, val_X, val_y)
val_pred = simple_model.predict(val_X)

In [None]:
# Red crosses mark the validation points predicted as class 1; dots are coloured by the true label.
scatter(val_X[val_pred==1, 0], val_X[val_pred==1, 1], marker='x', c='r', s=200)
scatter(val_X[:, 0], val_X[:, 1], c=val_y, cmap=cm.Accent)


#### Boundary is less clear

### Try deep learning to avoid manual feature engineering

In [None]:

# NOTE(review): Dropout and TimeseriesGenerator are not used by any visible cell.
from keras import Sequential
from keras.layers import Dense, Dropout
from keras.preprocessing.sequence import TimeseriesGenerator

seq_len = 100 # NOTE(review): no autocorrelation plot exists in the visible cells; value is only copied into n_input later — confirm it is still needed

# Manual hold-out: first three quarters of the training labels vs. the last quarter.
# NOTE(review): new_train_y / sub_val_y are not read by any visible cell (the fit() call
# relies on validation_split instead) — confirm whether this split is still needed.
N = train_y.shape[0]
split = int(N - N/4)
new_train_y = train_y[:split]
sub_val_y = train_y[split:]


#### Split out a smaller validation set from training data

In [None]:
# NOTE(review): neurons, n_input and n_features are not read by any visible cell.
neurons = 10
n_input=seq_len
n_features = 1
# Fully connected binary classifier: three hidden ReLU layers of 30 units each,
# followed by a single sigmoid output unit.
dl_model = Sequential()
dl_model.add(Dense(30, input_dim=train_X.shape[1], activation='relu'))  # input width = number of feature columns
dl_model.add(Dense(30, activation='relu'))
dl_model.add(Dense(30, activation='relu'))
dl_model.add(Dense(1, activation='sigmoid'))

dl_model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
# validation_split=0.4 asks Keras to hold back 40% of the training data for validation;
# the per-epoch results are kept in `history` (val_loss is plotted in the next cell).
history = dl_model.fit(train_X, train_y, epochs=30, batch_size=10, validation_split=0.4, verbose=0)


#### Show the validation loss history. It should be going down

In [None]:
# Validation loss recorded for each epoch of the fit above.
plot(history.history['val_loss'])

In [None]:
# Take the single output column, then threshold the sigmoid output at 0.5 to get 0/1 labels.
model_pred = (dl_model.predict(val_X)[:, 0] > 0.5).astype("int32")

In [None]:
# F1 on the validation set, comparable with the "Test F1 Score" lines printed for the scikit-learn models.
f1_score(val_y, model_pred)

In [None]:
# Red crosses mark the validation points the neural network predicts as class 1
# (model_pred, not the val_pred left over from the random-forest cell);
# dots are coloured by the true label.
scatter(val_X[model_pred==1, 0], val_X[model_pred==1, 1], marker='x', c='r', s=200)
scatter(val_X[:, 0], val_X[:, 1], c=val_y, cmap=cm.Accent)


#### It has captured the nonlinear boundary

# Test the best model on the test dataset

#### Finally, test the model we have chosen on an unseen set of data. Hopefully the model will generalise well

In [None]:
# Threshold the sigmoid output at 0.5 and flatten the (n, 1) result to a 1-D label vector,
# matching how the validation predictions are prepared, before scoring on the untouched test set.
test_pred = (dl_model.predict(test_X) > 0.5).astype("int32")[:, 0]
f1_score(test_y, test_pred)