In [1]:
import pandas as pd
import numpy as np
import re
from datetime import datetime
from time import time

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score, fbeta_score, average_precision_score

import nlp_utils as utils
from nlp_utils import get_vectorizer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Embedding, Bidirectional
from keras.utils.np_utils import to_categorical
from keras.callbacks import Callback, ModelCheckpoint, TensorBoard
from keras.optimizers import Adam
from keras import metrics

# import tensorflow.python.util.deprecation as deprecation
# deprecation._PRINT_DEPRECATION_WARNINGS = False

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
plt.style.use('seaborn-ticks')
sns.set_style('ticks')
plt.rcParams['figure.figsize'] = (6, 4)
plt.rcParams['axes.titlesize'] = 22
plt.rcParams['axes.labelsize'] = 20
plt.rcParams['xtick.labelsize'] = 16
plt.rcParams['ytick.labelsize'] = 16

pd.options.display.max_colwidth = 100

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/vrozova/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Set up: name of the text column to model and the class labels.
text = "entities"
class_names = ("Controls", "Self harm")

# Averaging mode passed to the sklearn precision/recall/F1 scorers:
# "binary" for a two-class problem, "macro" otherwise.
average = "binary" if len(class_names) == 2 else "macro"

# Undersampling
n_controls = 100000
undersample = False

In [3]:
# Load the training set and show how many rows carry each SH label.
df_train = pd.read_csv("../../data/rmh_train.csv")
print(df_train.SH.value_counts())
print()

# Longest document measured in whitespace-separated tokens; used later as
# the padded sequence length.
triage_length = (
    df_train[text]
    .map(lambda doc: len(doc.split()))
    .max()
)
print("Max triage length:", triage_length)

# Vocabulary size handed to the tokenizer and the embedding layer.
num_words = 5000
print("Using %d words" % num_words)

0    307875
1      4302
Name: SH, dtype: int64

Max triage length: 67
Using 5000 words


In [4]:
def create_model(embed_dim=50, lstm_out=25, dropout=0.2, recurrent_dropout=0.2):
    """Build and compile the LSTM text classifier.

    The network is Embedding -> LSTM -> Dense(1, sigmoid), compiled with
    binary cross-entropy loss and the Adam optimizer. The vocabulary size
    and the input sequence length are read from the notebook-level
    variables ``num_words`` and ``triage_length``.

    Parameters
    ----------
    embed_dim : int, default 50
        Size of each word embedding vector.
    lstm_out : int, default 25
        Number of units in the LSTM layer.
    dropout : float, default 0.2
        Dropout rate applied to the LSTM inputs.
    recurrent_dropout : float, default 0.2
        Dropout rate applied to the LSTM recurrent state.

    Returns
    -------
    keras.models.Sequential
        A new, compiled and untrained model.
    """
    model = Sequential()
    model.add(Embedding(num_words, embed_dim, input_length=triage_length))
    # Alternative recurrent layer: model.add(Bidirectional(LSTM(lstm_out)))
    model.add(LSTM(lstm_out, dropout=dropout, recurrent_dropout=recurrent_dropout))
    # Single sigmoid unit: the model outputs one probability per sample.
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model

# Model saving callback: writes a checkpoint only when val_loss improves.
# NOTE(review): neither callback defined in this cell is passed to the
# model.fit calls visible in this notebook.
ckpt_callback = ModelCheckpoint(
    filepath='models/keras_model',
    monitor='val_loss',
    mode='auto',
    save_best_only=True,
    verbose=1,
)

# TensorBoard logs: one sub-directory per run, named after the current
# timestamp.
logdir = f"logs/{datetime.now().strftime('%d%m%Y-%H%M%S')}"
tensorboard_callback = TensorBoard(log_dir=logdir)

# Custom metrics
class CustomMetrics(Callback):
    """Keras callback that adds sklearn validation metrics to the epoch logs.

    At the end of every epoch the model is run on the supplied validation
    data and ``val_precision``, ``val_recall``, ``val_f1`` and ``val_ap``
    are written into ``logs``, so they end up in ``History.history``.

    Parameters
    ----------
    val_data : tuple
        ``(X_val, y_val)``; element 0 is fed to ``model.predict`` and
        element 1 is used as the ground truth.
    threshold : float, default 0.5
        Probability cut-off used to derive class labels when the model
        has a single output column.
    """

    def __init__(self, val_data, threshold=0.5):
        super().__init__()
        self.validation_data = val_data
        self.threshold = threshold

    def on_epoch_end(self, epoch, logs=None):
        """Compute the validation metrics and store them in ``logs``."""
        # Avoid a mutable default argument: create the dict per call.
        if logs is None:
            logs = {}

        y = self.validation_data[1]
        y_proba = self.model.predict(self.validation_data[0])
        if y_proba.shape[1] == 1:
            # Single probability column: flatten to 1-D so sklearn receives
            # label/score vectors rather than (n, 1) column arrays.
            y_proba = y_proba.ravel()
            y_pred = (y_proba > self.threshold).astype(int)
        else:
            # NOTE(review): with several output columns,
            # average_precision_score may need y in indicator format - confirm.
            y_pred = np.argmax(y_proba, axis=1)

        # `average` is the notebook-level averaging mode ("binary"/"macro").
        logs['val_precision'] = precision_score(y, y_pred, average=average)
        logs['val_recall'] = recall_score(y, y_pred, average=average)
        logs['val_f1'] = f1_score(y, y_pred, average=average)
#         logs['val_f2'] = fbeta_score(y, y_pred, average=average, beta=2)
        logs['val_ap'] = average_precision_score(y, y_proba)

In [5]:
# Build the network and print its layers and parameter counts.
model = create_model()
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 67, 50)            250000    
_________________________________________________________________
lstm (LSTM)                  (None, 25)                7600      
_________________________________________________________________
dense (Dense)                (None, 1)                 26        
Total params: 257,626
Trainable params: 257,626
Non-trainable params: 0
_________________________________________________________________


**No CV**

In [6]:
# Hold out 10% of the rows as a validation set, stratified on SH, fixed seed.
# NOTE(review): df_train is rebound to the remaining 90%, so re-running this
# cell splits the already-reduced frame again.
df_train, df_val = train_test_split(df_train, test_size=0.1, random_state=42, stratify=df_train.SH)

In [7]:
# Fit the vocabulary on df_train; filters="" disables character filtering
# and split=" " tokenises on single spaces.
tokenizer = Tokenizer(num_words=num_words, filters="", split=" ")
tokenizer.fit_on_texts(df_train[text].values)

# Texts -> integer sequences padded/truncated to triage_length.
X_train = tokenizer.texts_to_sequences(df_train[text].values)
X_train = pad_sequences(X_train, maxlen=triage_length)

y_train = df_train.SH.values

# "Balanced" class weights: n_samples / (n_classes * samples_in_class).
# Keys come from enumerate so that every class id present in y_train gets a
# weight, rather than only the hard-coded ids 0 and 1.
class_weight = dict(enumerate(y_train.shape[0] / (len(class_names) * np.bincount(y_train))))

# The validation texts are encoded with the tokenizer fitted above.
X_val = tokenizer.texts_to_sequences(df_val[text].values)
X_val = pad_sequences(X_val, maxlen=triage_length)

y_val = df_val.SH.values

X_train.shape

(280959, 67)

In [8]:
%%time
n_epochs = 10
batch_size = 64

metrics = CustomMetrics((X_val, y_val))

history = model.fit(X_train, y_train, 
                    epochs=n_epochs, 
                    batch_size=batch_size, 
                    validation_data=(X_val, y_val), 
                    callbacks=[metrics], 
                    class_weight=class_weight,
                    workers=2,
                    verbose=1,
                   )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 1h 2min 36s, sys: 2min 39s, total: 1h 5min 15s
Wall time: 40min 58s


In [9]:
# Highest validation AP reached in any epoch (not necessarily the last one).
print(" AP score: %0.3f" % np.max(history.history["val_ap"]))

 AP score: 0.856


In [10]:
# Per-epoch training loss plus all validation metrics recorded during fit.
history.history

{'loss': [0.16567039489746094,
  0.08669167011976242,
  0.06305905431509018,
  0.05109117552638054,
  0.04311368986964226,
  0.03767296299338341,
  0.03355661407113075,
  0.032605115324258804,
  0.028774309903383255,
  0.027943145483732224],
 'val_loss': [0.10312473773956299,
  0.06511218100786209,
  0.0922049731016159,
  0.05740988627076149,
  0.08994237333536148,
  0.04582354053854942,
  0.039464615285396576,
  0.055877696722745895,
  0.059398118406534195,
  0.04582565277814865],
 'val_precision': [0.2982078853046595,
  0.3856332703213611,
  0.30307467057101023,
  0.4091816367265469,
  0.30354505169867063,
  0.45601851851851855,
  0.5104438642297651,
  0.4091836734693878,
  0.4064711830131446,
  0.5006451612903225],
 'val_recall': [0.9674418604651163,
  0.9488372093023256,
  0.9627906976744186,
  0.9534883720930233,
  0.9558139534883721,
  0.9162790697674419,
  0.9093023255813953,
  0.9325581395348838,
  0.9348837209302325,
  0.9023255813953488],
 'val_f1': [0.4558904109589041,
  0.5

In [None]:
# Training vs. validation loss per epoch on a single set of axes.
plt.rcParams['figure.figsize'] = (10, 8)
fig, ax = plt.subplots()
for history_key, curve_label in (("loss", "Train loss"), ("val_loss", "Val loss")):
    ax.plot(range(n_epochs), history.history[history_key], label=curve_label)
ax.set_xlabel("Epochs")
ax.legend();

In [None]:
# Validation average precision per epoch.
plt.rcParams['figure.figsize'] = (10, 8)
fig, ax = plt.subplots()
ax.plot(range(n_epochs), history.history["val_ap"], label="Val AP")
ax.set_xlabel("Epochs")
ax.legend();

In [None]:
# Validation loss for each epoch of the last fit.
history.history["val_loss"]

In [None]:
# Optionally restore a saved model instead of using the one in memory:
# model = load_model('models/keras_model_1000')
# Predicted probabilities for the validation set, and its true SH labels.
y_proba = model.predict(X_val)
y_val = df_val.SH.values

In [None]:
# Print which model and settings are being evaluated, then hand the
# validation labels and probabilities to the project helper.
print("Final model: %s" % model)
print("Using %d features." % num_words)
if undersample:
    print("Trained on %d controls.\n" % n_controls)
# NOTE(review): evaluate_model lives in nlp_utils and is not shown here;
# what it reports should be confirmed there.
utils.evaluate_model(y_val, y_proba, class_names, "validation", digits=3)

**CV**

In [6]:
# Fit the vocabulary on df_train; filters="" disables character filtering
# and split=" " tokenises on single spaces.
# NOTE(review): the tokenizer is fitted before the folds are created, so the
# vocabulary is built from rows that later serve as validation folds.
tokenizer = Tokenizer(num_words=num_words, filters="", split=" ")
tokenizer.fit_on_texts(df_train[text].values)

# Texts -> integer sequences padded/truncated to triage_length.
X = tokenizer.texts_to_sequences(df_train[text].values)
X = pad_sequences(X, maxlen=triage_length)

y = df_train.SH.values

# "Balanced" class weights: n_samples / (n_classes * samples_in_class).
# Keys come from enumerate so that every class id present in y gets a
# weight, rather than only the hard-coded ids 0 and 1.
class_weight = dict(enumerate(y.shape[0] / (len(class_names) * np.bincount(y))))

X.shape

(312177, 67)

In [7]:
%%time
n_splits = 3
n_epochs = 10
batch_size = 64

cv = StratifiedKFold(n_splits=n_splits)

cv_history = []

for train_index, val_index in cv.split(X, y):
    model=create_model()
    
    metrics = CustomMetrics((X[val_index], y[val_index]))
    
    history = model.fit(X[train_index], y[train_index], 
                    epochs=n_epochs, 
                    batch_size=batch_size, 
                    validation_data=(X[val_index], y[val_index]), 
                    callbacks=[metrics], 
                    class_weight=class_weight,
                    workers=2,
                    verbose=2,
                   )
    
    cv_history.append(history.history)

Epoch 1/10
3252/3252 - 340s - loss: 0.1822 - val_loss: 0.1252
Epoch 2/10
3252/3252 - 340s - loss: 0.0794 - val_loss: 0.0989
Epoch 3/10
3252/3252 - 338s - loss: 0.0541 - val_loss: 0.0845
Epoch 4/10
3252/3252 - 333s - loss: 0.0424 - val_loss: 0.0660
Epoch 5/10
3252/3252 - 312s - loss: 0.0353 - val_loss: 0.0499
Epoch 6/10
3252/3252 - 306s - loss: 0.0314 - val_loss: 0.0741
Epoch 7/10
3252/3252 - 308s - loss: 0.0281 - val_loss: 0.0459
Epoch 8/10
3252/3252 - 307s - loss: 0.0226 - val_loss: 0.0426
Epoch 9/10
3252/3252 - 304s - loss: 0.0224 - val_loss: 0.0613
Epoch 10/10
3252/3252 - 304s - loss: 0.0183 - val_loss: 0.0447
Epoch 1/10
3252/3252 - 313s - loss: 0.1750 - val_loss: 0.1352
Epoch 2/10
3252/3252 - 309s - loss: 0.0858 - val_loss: 0.1023
Epoch 3/10
3252/3252 - 308s - loss: 0.0595 - val_loss: 0.0533
Epoch 4/10
3252/3252 - 303s - loss: 0.0437 - val_loss: 0.0571
Epoch 5/10
3252/3252 - 297s - loss: 0.0394 - val_loss: 0.0427
Epoch 6/10
3252/3252 - 298s - loss: 0.0323 - val_loss: 0.0417
Epoch 7

In [None]:
# Validation recall per epoch, one line per cross-validation fold.
epoch_axis = list(range(1, n_epochs + 1))
for i in range(n_splits):
    sns.lineplot(x=epoch_axis, y=cv_history[i]['val_recall']);

plt.ylabel("Recall");
plt.xlabel("Epochs");
# One tick per epoch so the axis matches the plotted range.
plt.xticks(epoch_axis);
# The fold count in the title follows n_splits instead of a fixed word.
plt.title("%d-fold cross-validation" % n_splits);
plt.ylim([0.8, 1]);
plt.savefig("cv_rec.png", bbox_inches='tight', dpi=300, transparent=False, pad_inches=0);

In [10]:
def get_final_score(metric):
    """Return the last-epoch value of ``metric`` for each CV fold.

    Reads the notebook-level ``cv_history`` (one history dict per fold) and
    ``n_splits`` (how many folds to read).

    Parameters
    ----------
    metric : str
        Key in each fold's history dict, e.g. ``"val_ap"`` or ``"loss"``.

    Returns
    -------
    numpy.ndarray
        One value per fold, in fold order.
    """
    return np.array([cv_history[fold][metric][-1] for fold in range(n_splits)])

# For each validation metric: mean of the last-epoch scores across folds,
# followed by twice their standard deviation.
for template, metric in (
    ("Average Precision: %0.3f (+/- %0.2f)", "val_precision"),
    ("Average Recall: %0.3f (+/- %0.2f)", "val_recall"),
    ("Average F1 score: %0.3f (+/- %0.2f)", "val_f1"),
    ("Average AP score: %0.3f (+/- %0.2f)", "val_ap"),
):
    fold_scores = get_final_score(metric)
    print(template % (fold_scores.mean(), fold_scores.std() * 2))

Average Precision: 0.560 (+/- 0.08)
Average Recall: 0.874 (+/- 0.02)
Average F1 score: 0.682 (+/- 0.05)
Average AP score: 0.801 (+/- 0.04)


In [None]:
# Last-epoch training loss per fold, with its mean and standard deviation.
get_final_score("loss"), get_final_score("loss").mean(), get_final_score("loss").std()

In [None]:
# Last-epoch validation loss per fold, with its mean and standard deviation.
get_final_score("val_loss"), get_final_score("val_loss").mean(), get_final_score("val_loss").std()

In [None]:
# Full per-fold history dicts collected by the CV loop.
cv_history

In [None]:
# Load the TensorBoard notebook extension (provides the %tensorboard magic).
%load_ext tensorboard

> **Open question:** I don't know if this is an established approach, but could I train a second model to reclassify the presentations predicted as either SI or SH? If so, which dataset should it be trained on: training or validation?

In [None]:
# Open TensorBoard on the logs/ directory.
# NOTE(review): the TensorBoard callback is not passed to the model.fit calls
# visible in this notebook, so logs/ may hold no runs from it - confirm.
%tensorboard --logdir logs/

In [None]:
# Turn predicted probabilities into class labels.
# A single-column (sigmoid) output is thresholded at 0.5, because argmax over
# one column is always 0; a multi-column output keeps the argmax rule.
if y_proba.ndim == 1 or y_proba.shape[1] == 1:
    y_pred = (np.ravel(y_proba) > 0.5).astype(int)
else:
    y_pred = np.argmax(y_proba, axis=1)
# Number of predictions, and how many of them are a non-zero class.
y_pred.shape, (y_pred > 0).sum()

In [None]:
# Keep only the validation rows predicted as a non-zero class and attach the
# predicted label as column "y_pred_1".
predicted_case = y_pred > 0
df_pred_cases = df_val[predicted_case].copy()
df_pred_cases["y_pred_1"] = y_pred[predicted_case]
df_pred_cases.head()

In [None]:
# Second-stage experiment: labels and features for the predicted cases only.
# NOTE(review): this rebinds the name `y` that the CV section also uses.
# NOTE(review): FeatureSelector, params and data are not defined or imported
# in the cells visible in this notebook, and a `y` column on df_pred_cases is
# assumed - confirm before running.
y = df_pred_cases.y.values

vectorizer = FeatureSelector(params)
X = vectorizer.fit_transform(df_pred_cases[data], y)

In [None]:
# Class-balanced one-vs-rest logistic regression, scored with the project's
# CV helper.
# NOTE(review): LogisticRegression is not imported in the visible import cell,
# and benchmark_cv_score comes from nlp_utils (not shown here).
clf = LogisticRegression(solver='lbfgs', max_iter=1000, multi_class="ovr", class_weight="balanced")
utils.benchmark_cv_score(clf, X, y, class_names)

In [None]:
# Fit vectorizer + classifier as one pipeline on the predicted cases and
# evaluate on the same rows it was fitted on (hence the "training" label).
# NOTE(review): make_pipeline, FeatureSelector, params and data are not
# defined or imported in the cells visible in this notebook.
vectorizer = FeatureSelector(params)
pipe = make_pipeline(vectorizer, clf)
pipe.fit(df_pred_cases[data], y)
# This rebinds y_proba, which previously held the LSTM validation output.
y_proba = pipe.predict_proba(df_pred_cases[data])
utils.evaluate_model(y, y_proba, class_names, "training")