In [1]:
import nltk
import numpy as np
import pandas as pd
from pandas import DataFrame
from keras.models import Model, Sequential, load_model
from keras.preprocessing.sequence import TimeseriesGenerator, pad_sequences
from keras.layers.recurrent import LSTM
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers.core import Activation, Dropout, Dense, Flatten
from keras.layers import TimeDistributed, Bidirectional, InputLayer, GlobalMaxPooling1D
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.callbacks import TensorBoard
from keras.metrics import categorical_accuracy
from IPython.display import clear_output
from more_itertools import flatten, intersperse
import random

import os
import urllib.request

Using TensorFlow backend.


In [2]:
%load_ext autoreload
%autoreload 2
from batcher import batch_from_generator
from train_data import load_conll2003, create_conll_encoded_shifted_generator
from mappings import get_all_mappings, gen_input_feature_to_class_map, gen_input_feature_to_int_map
from corpus import corpus_training_data_generator, create_all_corpus_train_pipeline, pad, encode_each_sentence
from model import create_model, compile_model
from metrics import F1Score

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\Todd\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\Todd\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\Todd\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [17]:
# --- Network shape (both are handed to create_model) -------------------------
NUM_OF_UNITS = 300
DROPOUT = 0.5

# --- Data slicing -------------------------------------------------------------
# Slice length given to the corpus pipeline; also the row width used when the
# sample weights are reshaped.
TIME_SLICE_SIZE = 100

# --- Training loop ------------------------------------------------------------
BATCH_SIZE = 512  # used by both the TensorBoard callback and model.fit
EPOCHS = 100

# --- Persistence --------------------------------------------------------------
MODEL_SAVE_PATH = 'tc_model.h5'  # weights are loaded from here when the file exists

# --- Not referenced by any visible cell ---------------------------------------
# NOTE(review): presumably consumed by the helper modules or left over from an
# earlier iteration -- confirm before relying on (or removing) them.
SAMPLING_RATE = 1
OUTPUT_CLASSES = 2
PADDING = 0   # presumably the integer code for padding -- TODO confirm
UNKNOWN = 1   # presumably the integer code for unknown symbols -- TODO confirm
LSTM_MODEL_SAVE_PATH = 'lstm_model.h5'

In [4]:
# Build the lookup tables provided by the project-local `mappings` module.
# get_all_mappings() returns four objects, unpacked positionally below.
# NOTE(review): of these six names, only input_feature_to_int_map is used by a
# later visible cell (to encode the demo sentence); the others may be unused
# here -- confirm before pruning.
mapping, reverse_mapping, lower_mapping, lower_reverse_mapping = get_all_mappings()
input_feature_to_class_map = gen_input_feature_to_class_map()
input_feature_to_int_map = gen_input_feature_to_int_map()

In [5]:
# Build the network through the project-local factory.
# NOTE(review): the positional 1 and 2 are magic numbers; their meaning depends
# on create_model's signature, which is not visible here. The 2 may correspond
# to OUTPUT_CLASSES -- confirm, and pass the constant instead if so.
model = create_model(1, 2, NUM_OF_UNITS, DROPOUT)

In [6]:
# Compile via the project helper; `model` is rebound to whatever it returns.
# The optimizer, loss and metrics are chosen inside compile_model (not visible here).
model = compile_model(model)

In [7]:
# Warm start: load previously saved weights when the file exists; otherwise the
# model is left exactly as built and compiled above.
# NOTE(review): weights are read from MODEL_SAVE_PATH ('tc_model.h5'), but the
# final cell saves to "big_lstm.h5", so this notebook never refreshes the file
# it loads from -- confirm that is intended.
if os.path.isfile(MODEL_SAVE_PATH): 
    model.load_weights(MODEL_SAVE_PATH)

In [8]:
# TensorBoard logging callback: graph writing is disabled and the callback is
# given the same batch size that training uses.
tensor_board = TensorBoard(write_graph=False, batch_size=BATCH_SIZE)

In [9]:
# Load the 'train' and 'validation' splits through the project pipeline, both
# cut with the same TIME_SLICE_SIZE.
# Presumably X holds encoded input slices and Y the matching per-position
# labels -- TODO confirm against create_all_corpus_train_pipeline.
X, Y = create_all_corpus_train_pipeline(TIME_SLICE_SIZE, 'train')
X_val, Y_val = create_all_corpus_train_pipeline(TIME_SLICE_SIZE, 'validation')

In [10]:
def get_sample_weights(Ys, positive_weight=50, time_slice_size=None):
    """Turn a label array into a 2-D matrix of per-position sample weights.

    Every label value ``y`` is mapped to ``y * (positive_weight - 1) + 1``, so a
    label of 0 gets weight 1 and a label of 1 gets ``positive_weight``. With the
    default of 50 this reproduces the original hard-coded ``Ys * 49 + 1``.

    Args:
        Ys: Array-like of labels. Assumed to contain 0/1 values with a total
            element count divisible by the slice length -- TODO confirm against
            the data pipeline.
        positive_weight: Weight assigned to positions labelled 1.
        time_slice_size: Number of columns in the returned matrix. When left as
            ``None`` the notebook-level ``TIME_SLICE_SIZE`` constant is used,
            which keeps existing one-argument calls working unchanged.

    Returns:
        A NumPy array of shape ``(-1, time_slice_size)`` holding the weights.

    Raises:
        ValueError: If the number of elements cannot be reshaped into rows of
            ``time_slice_size`` (raised by ``reshape``).
    """
    if time_slice_size is None:
        # Resolved at call time so that edits to the config cell are picked up.
        time_slice_size = TIME_SLICE_SIZE
    # asarray lets plain lists through; ndarrays are passed along without a copy.
    labels = np.asarray(Ys)
    weights = labels * (positive_weight - 1) + 1
    return weights.reshape((-1, time_slice_size))


In [11]:
# Weight matrices for the training and validation labels; both are passed to
# model.fit (W as sample_weight, W_val inside validation_data).
W = get_sample_weights(Y)
W_val = get_sample_weights(Y_val)

In [12]:
# Project-local callback, passed to model.fit below. Presumably the source of
# the per-epoch "precision / recall / f1 score" lines in the training log --
# confirm in metrics.F1Score.
f1_score = F1Score()

In [None]:

# Generator-based alternative, kept for reference only (not executed):
#g = corpus_training_data_generator('gutenberg',TIME_SLICE_SIZE, BATCH_SIZE, shift=False)

# Train on the in-memory arrays. Training samples are weighted by W, and the
# validation tuple carries its own weights W_val.
# NOTE(review): 2-D weights normally require the model to be compiled for
# temporal sample weighting -- confirm in compile_model.
model.fit(
    X,
    Y,
    sample_weight=W,
    validation_data=(X_val, Y_val, W_val),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=[tensor_board, f1_score],
)

Train on 221877 samples, validate on 22367 samples
Epoch 1/100
- precision: 0.07540607876559209 - recall: 0.7941121075583502 - f1 score: 0.1377334737170096
Epoch 2/100
- precision: 0.08015631840128327 - recall: 0.7268591628877331 - f1 score: 0.14438968233982086
Epoch 3/100
- precision: 0.09050161614154581 - recall: 0.567506483346131 - f1 score: 0.1561082726903181
Epoch 4/100
- precision: 0.11985678229735282 - recall: 0.4733494632224041 - f1 score: 0.19127965692372992
Epoch 5/100
- precision: 0.19118839078648145 - recall: 0.49208234936180695 - f1 score: 0.2753825884260667
Epoch 6/100
- precision: 0.19619870156575894 - recall: 0.48380974457789433 - f1 score: 0.27918136672964705
Epoch 7/100
- precision: 0.27702536111748755 - recall: 0.4720745258826768 - f1 score: 0.349156683311545
Epoch 8/100
- precision: 0.21739313185658443 - recall: 0.5245208119032786 - f1 score: 0.30738665308201735
Epoch 9/100
- precision: 0.30118570295246466 - recall: 0.4070237457079524 - f1 score: 0.3461962649644795


In [14]:
# Demo input, lower-cased so that all casing has to come from the model.
original_sentence = "Tim Smith works for Google in California. Towards summer the weather in London gets really warm".lower()

# Split into characters, pad to the sentence's own length, then integer-encode.
test_sentence = encode_each_sentence(
    pad([list(original_sentence)], len(original_sentence)),
    input_feature_to_int_map,
)
mapped_sentence = np.asarray(test_sentence)

# One sentence in the batch, so take row 0 of the predicted classes.
predicted_result = model.predict_classes(mapped_sentence)[0]
print(predicted_result)

# Pair each (padded) character with its predicted label.
predicted_result = list(
    zip(
        pad([list(original_sentence)], len(original_sentence))[0],
        predicted_result.tolist(),
    )
)

def true_case(letter, label):
    """Apply a predicted casing label to one character.

    Args:
        letter: A single-character string, or the integer 0 (presumably the
            padding filler inserted by ``pad`` -- confirm).
        label: Predicted class: 0 means lower-case, 1 means upper-case.

    Returns:
        ``''`` when ``letter == 0``; the lower-cased letter for label 0; the
        upper-cased letter for label 1; the letter unchanged for any other label.
    """
    if letter == 0:
        return ''
    if label == 1:
        return letter.upper()
    return letter.lower() if label == 0 else letter

# Re-case every (letter, label) pair, then show the joined sentence as the cell output.
predicted_result = [true_case(*pair) for pair in predicted_result]
''.join(predicted_result)



[1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0]


'Tim Smith Works For Google in california. Towards Summer The Weather in London Gets Really Warm'

In [15]:
# Persist the full model to a file in the current working directory.
# NOTE(review): this path is hard-coded and differs from both MODEL_SAVE_PATH
# ('tc_model.h5', the file weights are loaded from) and LSTM_MODEL_SAVE_PATH
# ('lstm_model.h5') in the config cell -- confirm which file is intended.
model.save("big_lstm.h5")