In [1]:
import nltk
import numpy as np
import pandas as pd
from pandas import DataFrame
from keras.models import Model, Sequential, load_model
from keras.preprocessing.sequence import TimeseriesGenerator, pad_sequences
from keras.layers.recurrent import LSTM
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers.core import Activation, Dropout, Dense, Flatten
from keras.layers import TimeDistributed, Bidirectional, InputLayer, GlobalMaxPooling1D
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.callbacks import TensorBoard
from keras.metrics import categorical_accuracy
from IPython.display import clear_output
from more_itertools import flatten, intersperse
import random

import os
import urllib.request

Using TensorFlow backend.


In [2]:
%load_ext autoreload
%autoreload 2
from batcher import batch_from_generator
from train_data import load_conll2003, create_conll_encoded_shifted_generator
from mappings import get_all_mappings, gen_input_feature_to_class_map, gen_input_feature_to_int_map
from corpus import corpus_training_data_generator, create_all_corpus_train_pipeline, pad, encode_each_sentence
from model import create_model, compile_model
from metrics import F1Score

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\Todd\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\Todd\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\Todd\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [54]:
# ---- Run configuration -------------------------------------------------

# Network shape: NUM_OF_UNITS and DROPOUT are handed to create_model().
NUM_OF_UNITS = 1000
DROPOUT = 0.25
OUTPUT_CLASSES = 2  # not referenced by the visible cells

# Data windowing: TIME_SLICE_SIZE is handed to create_all_corpus_train_pipeline().
TIME_SLICE_SIZE = 100
SAMPLING_RATE = 1  # not referenced by the visible cells

# Reserved integer codes (not referenced by the visible cells).
PADDING = 0
UNKNOWN = 1

# Training loop: both values are forwarded to model.fit(); BATCH_SIZE also
# goes to the TensorBoard callback.
BATCH_SIZE = 512
EPOCHS = 10000

# Weight files: MODEL_SAVE_PATH is the file the notebook tries to restore
# weights from; LSTM_MODEL_SAVE_PATH is not referenced by the visible cells.
MODEL_SAVE_PATH = 'tc_model.h5'
LSTM_MODEL_SAVE_PATH = 'lstm_model.h5'

In [4]:
# Load the lookup tables provided by the local `mappings` module.
# Of these, only input_feature_to_int_map is used by the later visible cells
# (it is passed to encode_each_sentence in the inference demo).
mapping, reverse_mapping, lower_mapping, lower_reverse_mapping = get_all_mappings()
input_feature_to_class_map = gen_input_feature_to_class_map()
input_feature_to_int_map = gen_input_feature_to_int_map()

In [51]:
# Build the network with the configured unit count and dropout.
# NOTE(review): the leading literals 1 and 2 are passed positionally and their
# meaning is not visible here. The second one equals OUTPUT_CLASSES (2); if that
# is what the parameter means, pass the constant instead - confirm against
# model.create_model.
model = create_model(1, 2, NUM_OF_UNITS, DROPOUT)

In [52]:
# Compile via the project helper; its return value replaces `model`.
model = compile_model(model)

In [7]:
# Resume from previously saved weights when the file exists; otherwise the
# model keeps whatever weights create_model/compile_model produced.
# NOTE(review): load_weights expects the file to match the current
# architecture - confirm the file was produced with the same create_model args.
if os.path.isfile(MODEL_SAVE_PATH): 
    model.load_weights(MODEL_SAVE_PATH)

In [8]:
# TensorBoard logging callback (graph export disabled); passed to model.fit().
tensor_board = TensorBoard(write_graph=False, batch_size=BATCH_SIZE)

In [9]:
# Training split: X and Y are the fit() inputs/targets, W is passed to fit()
# as sample_weight.
X, Y, W = create_all_corpus_train_pipeline(TIME_SLICE_SIZE, 'train')

In [10]:
# Validation split: the three values are passed together to fit() as the
# validation_data tuple.
X_val, Y_val, W_val = create_all_corpus_train_pipeline(TIME_SLICE_SIZE, 'validation')

In [11]:
# def get_sample_weights(Ys):
#     W = Ys * 49
#     W = W + 1
#     W = W.reshape((-1, TIME_SLICE_SIZE))
#     return W


In [12]:
# W = get_sample_weights(Y)
# W_val = get_sample_weights(Y_val)

In [55]:
# F1Score comes from the local `metrics` module; the instance is passed to
# fit() in the callbacks list.
# NOTE(review): presumably the source of the per-epoch
# "precision / recall / f1 score" lines in the training log - confirm.
f1_score = F1Score()

In [56]:

#g = corpus_training_data_generator('gutenberg',TIME_SLICE_SIZE, BATCH_SIZE, shift=False)
# Fit on the training split with per-sample weights, validating on the
# held-out split after each epoch. Both callbacks run during training.
model.fit(
    X,
    Y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    sample_weight=W,
    validation_data=(X_val, Y_val, W_val),
    callbacks=[tensor_board, f1_score],
)

Train on 221877 samples, validate on 22367 samples
Epoch 1/10000
- precision: 0.06791037095745708 - recall: 0.9273140837112267 - f1 score: 0.12655284568920347
Epoch 2/10000
- precision: 0.06415110144043884 - recall: 0.9542326470886516 - f1 score: 0.12022005541541482
Epoch 3/10000
  7168/221877 [..............................] - ETA: 8:31 - loss: 0.0283 - acc: 0.5963

KeyboardInterrupt: 

In [48]:
# Inference demo: lowercase a sentence, encode it character by character and
# ask the model for a per-character class.
original_sentence = "Tim Smith works for Google in California. Towards summer the weather in London gets really warm".lower()
test_sentence = pad([list(original_sentence)], len(original_sentence))
test_sentence = encode_each_sentence(test_sentence, input_feature_to_int_map)

mapped_sentence = np.asarray(test_sentence)
# `predict_classes` exists only on Sequential models and was removed from later
# Keras/TensorFlow releases. Taking the argmax over the last axis of `predict`
# yields the same class ids for a multi-unit output layer and works on any
# Keras model type.
predicted_result = np.argmax(model.predict(mapped_sentence), axis=-1)[0]
print(predicted_result)
# pad() is called again on purpose rather than reusing the earlier result:
# it is not visible here whether encode_each_sentence mutates its input.
predicted_result = list(zip(pad([list(original_sentence)], len(original_sentence))[0], predicted_result.tolist()))

def true_case(letter, label):
    """Apply the predicted casing label to a single character.

    Args:
        letter: the character to transform, or the value 0 (presumably the
            padding marker - confirm against `pad`).
        label: predicted class; 0 selects lowercase, 1 selects uppercase.

    Returns:
        '' when `letter` equals 0, the lower/upper-cased character for
        labels 0/1, and `letter` unchanged for any other label.
    """
    if letter == 0:
        cased = ''
    elif label == 0:
        cased = letter.lower()
    elif label == 1:
        cased = letter.upper()
    else:
        cased = letter
    return cased

# Turn each (character, label) pair into its cased character, then show the
# reassembled sentence as the cell output.
predicted_result = [true_case(*pair) for pair in predicted_result]
''.join(predicted_result)



[1 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0
 1 0 0 0 1 1 1 1 0 0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0
 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0]


'Tim SMith Works FoR Google In CaliforNia. TOWards SUmmer The Weather In London Gets Really Warm'

In [None]:
# Persist the full model to disk.
# NOTE(review): this literal path differs from MODEL_SAVE_PATH ('tc_model.h5'),
# which is the file the load_weights cell looks for, so a fresh run will not
# pick up what is saved here. Confirm whether the two are meant to match.
model.save("big_lstm.h5")

In [None]:
# Sanity check: print the target shape and the sample-weight shape, one per line.
print(Y.shape, W.shape, sep='\n')

In [None]:
# Display the training sample weights (the value passed to fit() as
# sample_weight) as the cell output.
W

In [53]:
# Print the layer-by-layer summary (output shapes and parameter counts) of
# the current model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_11 (Bidirectio (None, None, 2000)        8016000   
_________________________________________________________________
time_distributed_9 (TimeDist (None, None, 2)           4002      
_________________________________________________________________
activation_19 (Activation)   (None, None, 2)           0         
Total params: 8,020,002
Trainable params: 8,020,002
Non-trainable params: 0
_________________________________________________________________
