### Set up scikit-learn and the testing framework

In [None]:
%pylab inline

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, make_scorer
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC

# Seed NumPy's global random number generator so repeated runs draw the same random numbers.
np.random.seed(12345)

In [None]:
def train_test_split(X, y):
    """Split X and y, in order, into training, validation and test parts.

    Nothing is shuffled: rows keep their existing order. The validation
    and test parts each receive a fifth of the rows (rounded down) taken
    from the end of the data; the training part is everything before them.

    Parameters
    ----------
    X : array-like
        Feature rows; must expose ``.shape`` and support slicing.
    y : sequence
        Labels, sliced at the same row boundaries as ``X``.

    Returns
    -------
    tuple
        ``(train_X, train_y, val_X, val_y, test_X, test_y)``.
    """
    n_rows = X.shape[0]
    holdout = n_rows // 5                  # size of the validation and of the test part
    val_start = n_rows - 2 * holdout       # first row after the training part
    test_start = val_start + holdout       # first row after the validation part

    train_rows = slice(None, val_start)
    val_rows = slice(val_start, test_start)
    test_rows = slice(test_start, None)

    return (X[train_rows], y[train_rows],
            X[val_rows], y[val_rows],
            X[test_rows], y[test_rows])


def test_model(model, params, train_X, train_y, test_X, test_y):
    """Tune ``model`` with a cross-validated grid search and report its scores.

    The grid search runs on the training data only, using F1 as the
    selection metric. The winning estimator is then scored on both the
    training data and the held-out data, and the scores are printed.

    Parameters
    ----------
    model : estimator
        Estimator (or pipeline) handed to ``GridSearchCV``.
    params : dict or list of dict
        Parameter grid handed to ``GridSearchCV``.
    train_X, train_y
        Data used for the grid search and for fitting the winner.
    test_X, test_y
        Held-out data used only for scoring.

    Returns
    -------
    estimator
        The best estimator found, fitted on ``train_X`` / ``train_y``.
    """
    # Candidates are ranked by F1 score (higher is better).
    scorer = make_scorer(f1_score, greater_is_better=True)
    # Shuffled, stratified 5-fold cross-validation picks the best hyperparameters.
    folds = StratifiedKFold(n_splits=5, shuffle=True)
    gs = GridSearchCV(model, params, scoring=scorer, cv=folds)
    gs.fit(train_X, train_y)
    # GridSearchCV's default refit=True has already refitted the winner on
    # all of the training data, so no second explicit fit is needed here.
    best = gs.best_estimator_
    train_pred = best.predict(train_X)  # Predictions on the training set
    test_pred = best.predict(test_X)    # Predictions on the held-out set
    print("Training score: ", f1_score(train_y, train_pred))
    print("Test score: ", f1_score(test_y, test_pred))
    print("Test accuracy: ", accuracy_score(test_y, test_pred))
    return best

### Tasks

1. Find the best model for classifying whether text is about the UK
2. Find the best model for classifying whether text is about earnings
3. Advanced: find which words are the most important for determining whether text is about the UK or about earnings

### Load the data

#### uk.csv is a dataset of Reuters news reports, labelled as to whether or not they are about the UK
#### earnings.csv is a dataset of Reuters news reports, labelled as to whether or not they are about earnings

In [None]:
# Load the UK-labelled dataset straight from its URL into a DataFrame.
data = pd.read_csv("https://raw.githubusercontent.com/danybol/gft_ml_example/master/worked_example/uk.csv")
# Alternative for the earnings task: uncomment this line (and comment out the one above) to load earnings.csv instead.
# data = pd.read_csv("https://raw.githubusercontent.com/danybol/gft_ml_example/master/worked_example/earnings.csv")

In [None]:
# Show the first few rows to check what was loaded.
data.head()

### Very simple feature extraction

In [None]:
# Single binary feature: 1 when the text contains 'united kingdom' or 'uk', else 0.
# NOTE: both patterns are matched as substrings, so 'uk' also matches inside longer words.
report_text = data.text
has_uk_phrase = report_text.str.contains('united kingdom')
has_uk_token = report_text.str.contains('uk')
uk_flag = (has_uk_phrase | has_uk_token).astype(int)
X = uk_flag.values[:, np.newaxis]  # column vector with one feature per row
y = data['is_uk'].values

In [None]:
# Split features and labels, in order, into training, validation and test parts.
train_X, train_y, val_X, val_y, test_X, test_y = train_test_split(X, y)

In [None]:
# Three-stage pipeline skeleton. Every stage starts out as None; the 'model'
# stage is supplied later through the grid-search parameters.
model = Pipeline([(stage, None) for stage in ('features', 'preprocess', 'model')])


In [None]:
# Candidate settings: a class-balanced linear SVM, with C swept over a
# log-spaced grid between 10**-1 and 10**1.
params = {
    'model': [LinearSVC(class_weight='balanced')],
    'model__C': np.logspace(-1, 1),
}
# The validation split is passed as the held-out ("test") data for this run.
best_model = test_model(model, params, train_X, train_y, val_X, val_y)

In [None]:
# Refit the chosen model on the training and validation data combined,
# then report its F1 score on the untouched test split.
train_val_X = np.vstack((train_X, val_X))
train_val_y = np.hstack((train_y, val_y))
best_model.fit(train_val_X, train_val_y)
f1_score(test_y, best_model.predict(test_X))