In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn import metrics
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import LSTM, GRU
from keras.preprocessing import text
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.callbacks import EarlyStopping
from __future__ import print_function
from keras.layers.core import Activation, TimeDistributedDense, RepeatVector
from keras.layers import recurrent
import numpy as np

Using Theano backend.
Using gpu device 0: GeForce GTX 980 Ti (CNMeM is disabled, CuDNN 4007)


In [2]:
from gensim.models.word2vec import Word2Vec

# Location of the binary word2vec file to load.
# NOTE(review): absolute, machine-specific path -- adjust for your environment.
WORD2VEC_PATH = "/home/tong/Documents/python/GoogleNews-vectors-negative300.bin.gz"

wv = Word2Vec.load_word2vec_format(WORD2VEC_PATH, binary=True)
print("done" + " loading")

done loading


In [3]:
class CharacterTable(object):
    '''
    Vocabulary table converting token sequences to and from model arrays.

    Despite the class name, the entries of `vocab` are arbitrary tokens
    (whole words in this notebook), not single characters:
    + Encode a token sequence as one-hot rows
    + Encode a token sequence as dense embedding rows
    + Decode one-hot rows / rows of probabilities back to a token string
    '''
    def __init__(self, vocab, maxlen, wv, embedding_dim=300):
        '''
        vocab: sequence of tokens; a token's position is its one-hot index.
        maxlen: default number of rows produced by the encoders.
        wv: embedding lookup supporting `token in wv` and `wv[token]`.
        embedding_dim: length of every embedding vector (default 300).
        '''
        self.vocab = vocab
        self.char_indices = dict((c, i) for i, c in enumerate(self.vocab))
        self.indices_char = dict((i, c) for i, c in enumerate(self.vocab))
        self.maxlen = maxlen
        self.wv = wv
        self.embedding_dim = embedding_dim
        # token -> vector cache; pre-filled for the vocabulary and extended
        # lazily by encode() for any other token it meets.
        self.embedding = {}
        for c in self.vocab:
            self.embedding[c] = self._lookup(c)

    def _lookup(self, token):
        '''Return the vector of `token` from `wv`, or a new uniform random vector.'''
        if token in self.wv:
            return self.wv[token]
        return np.random.rand(self.embedding_dim)

    def encode_onehot(self, C, maxlen=None):
        '''
        Return a (maxlen, len(vocab)) array with one 1 per encoded token.

        Tokens missing from the vocabulary are encoded as the ' ' token, so
        ' ' must be part of `vocab` (KeyError otherwise). Tokens beyond
        `maxlen` are ignored; rows past the end of `C` stay all-zero.
        '''
        maxlen = maxlen if maxlen else self.maxlen
        X = np.zeros((maxlen, len(self.vocab)))
        for i, c in enumerate(C):
            if i >= maxlen:
                # Extra tokens would index past the last row.
                break
            try:
                X[i, self.char_indices[c]] = 1
            except KeyError:
                X[i, self.char_indices[' ']] = 1
        return X

    def encode(self, C, maxlen=None):
        '''
        Return a (maxlen, embedding_dim) array of embedding rows for `C`.

        Tokens outside the vocabulary are looked up in `wv`; tokens unknown
        to `wv` get a random vector that is cached, so the same token always
        maps to the same row. Tokens beyond `maxlen` are ignored; rows past
        the end of `C` stay all-zero.
        '''
        maxlen = maxlen if maxlen else self.maxlen
        X = np.zeros((maxlen, self.embedding_dim))
        for i, c in enumerate(C):
            if i >= maxlen:
                break
            if c not in self.embedding:
                self.embedding[c] = self._lookup(c)
            X[i] = self.embedding[c]
        return X

    def decode(self, X, calc_argmax=True):
        '''
        Return the tokens for `X` joined by single spaces.

        With calc_argmax=True, `X` holds one row of scores per position and
        the highest-scoring index is used; otherwise `X` is already a
        sequence of vocabulary indices.
        '''
        if calc_argmax:
            X = X.argmax(axis=-1)
        return ' '.join(self.indices_char[x] for x in X)

In [4]:
import re

# A token is a run of two or more word characters; single-character tokens
# and punctuation are not matched.
token_pattern = r"(?u)\b\w\w+\b"


def build_tokenizer():
    """Build and return a callable that maps a document string to its list of tokens."""
    compiled = re.compile(token_pattern)

    def tokenize_document(doc):
        return compiled.findall(doc)

    return tokenize_document


def readData(src, column=2):
    """Read one tab-separated field from every line of a text file.

    Parameters
    ----------
    src : str
        Path of the file to read.
    column : int, optional
        Zero-based index of the tab-separated field to keep (default 2).

    Returns
    -------
    (list of str, int)
        The whitespace-stripped field of each line, in file order, and the
        number of lines read (0 for an empty file).

    Raises
    ------
    IndexError
        If a line has fewer than ``column + 1`` tab-separated fields.
    """
    b1 = []
    with open(src) as p:
        for line in p:
            b1.append(line.split('\t')[column].strip())
    # Exactly one entry is appended per line, so the list length is the line
    # count; unlike a loop counter it is also defined for an empty file.
    return b1, len(b1)

In [32]:
# b1 holds the sentences of the "normal" file, b2 those of the "simple" file.
# Presumably line i of one file pairs with line i of the other -- verify
# against the dataset description.
normal_path = './dataset/normal.aligned'
simple_path = './dataset/simple.aligned'
b1, lines = readData(normal_path)
b2, lines = readData(simple_path)  # `lines` ends up as the count for the simple file

In [33]:
# Sanity check: both sentence lists should contain the same number of entries.
same_size = len(b1) == len(b2)
print(same_size)
print(lines)

True
167689


In [34]:
tokenize = build_tokenizer()
# Keep only the pairs whose b1 sentence has 16 to 19 tokens. The chained
# comparison tokenizes each sentence once instead of twice.
indices = [index for index, s in enumerate(b1) if 15 < len(tokenize(s)) < 20]
print(b1[0])
print(len(indices))
# Apply the same index selection to both lists so they stay parallel.
b1 = [b1[i] for i in indices]
b2 = [b2[i] for i in indices]
len(b1)

It is the county seat of Alfalfa County .
27849


27849

In [35]:
# Build the vocabulary from both sentence lists together.
# NOTE(review): stop_words='english' and min_df=3 keep English stop words and
# rare words out of `vocab`; CharacterTable.encode_onehot then encodes every
# such word as the ' ' padding token -- confirm this is intended.
corpus = b1 + b2
vectorizer = CountVectorizer(stop_words='english', min_df=3)
vectors = vectorizer.fit_transform(corpus)
print(vectors.shape)
vocab = vectorizer.get_feature_names()
vocab.append(' ')  # extra entry used as the padding token

(55698, 20271)


In [36]:
MAXLEN = 19


def _tokenize_padded(sentence):
    """Tokenize `sentence`, keep at most MAXLEN tokens and right-pad with ' '."""
    tokens = tokenize(sentence)[:MAXLEN]
    return tokens + [' '] * (MAXLEN - len(tokens))


# Lower-case the raw sentences, then turn each into exactly MAXLEN tokens.
b1 = [x.lower() for x in b1]
b2 = [x.lower() for x in b2]
b1_tokens = [_tokenize_padded(x) for x in b1]
b2_tokens = [_tokenize_padded(x) for x in b2]

In [37]:
ctable = CharacterTable(vocab, MAXLEN, wv)
# Builtin float/bool are used as dtypes: the np.float / np.bool aliases were
# deprecated in NumPy 1.20 and removed in 1.24, and were plain aliases of
# these builtins before that.
# X: one embedding row per token of each b1 sentence (model input).
X = np.zeros((len(b1), MAXLEN, 300), dtype=float)
# y: one one-hot row per token of each b2 sentence (model target).
y = np.zeros((len(b1), MAXLEN, len(vocab)), dtype=bool)
for i, sentence in enumerate(b1_tokens):
    X[i] = ctable.encode(sentence, maxlen=MAXLEN)

for i, sentence in enumerate(b2_tokens):
    y[i] = ctable.encode_onehot(sentence, maxlen=MAXLEN)

In [38]:
HIDDEN_SIZE = 256  # output size of every LSTM layer in the model
BATCH_SIZE = 100  # batch_size passed to model.fit
LAYERS = 4  # number of stacked LSTM layers in the encoder, and again in the decoder

In [61]:
print('Build model...')
model = Sequential()
# Encoder: a stack of LAYERS LSTM layers reading the (MAXLEN, 300) input
# sequence of word embeddings.
# note: in a situation where your input sequences have a variable length,
# use input_shape=(None, nb_feature).
model.add(LSTM(HIDDEN_SIZE, dropout_W=0.1, dropout_U=0.1, input_shape=(MAXLEN, 300), return_sequences=True))
for _ in range(LAYERS - 2):
    model.add(LSTM(HIDDEN_SIZE, dropout_W=0.1, dropout_U=0.1, return_sequences=True))
# The last encoder layer does not set return_sequences, so (assuming the Keras
# default) it emits only its final output rather than one output per time step.
model.add(LSTM(HIDDEN_SIZE, dropout_W=0.1, dropout_U=0.1))
# For the decoder's input, we repeat the encoded input for each time step
# We use RepeatVector here because we only need the last state, not the whole sequence
model.add(RepeatVector(MAXLEN))
# Decoder: another stack of LAYERS LSTM layers, each returning the full sequence
for _ in range(LAYERS):
    model.add(LSTM(HIDDEN_SIZE, dropout_W=0.1, dropout_U=0.1, return_sequences=True))

# For each step of the output sequence, decide which vocabulary entry (word)
# should be chosen: a 300-unit tanh layer followed by a softmax over the vocabulary
model.add(TimeDistributedDense(300))
model.add(Activation('tanh'))
model.add(TimeDistributedDense(len(vocab)))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adagrad')

Build model...


In [62]:
# Train for 50 epochs with 10% of the samples held out for validation; the
# returned history object is kept in `hist` for plotting.
hist = model.fit(X, y, batch_size=BATCH_SIZE, nb_epoch=50,
          show_accuracy=True,validation_split = 0.1, shuffle=True)

Train on 25064 samples, validate on 2785 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [63]:
import matplotlib.pyplot as plt


def _plot_history(history, curves, ylabel, title):
    """Draw per-epoch training curves on the current matplotlib figure.

    history: object exposing `.epoch` (sequence of epoch indices) and
        `.history` (dict mapping a metric name to its per-epoch values).
    curves: iterable of (metric_key, color, legend_label) tuples, drawn in order.
    ylabel: label of the y axis.
    title: figure title.
    """
    axes = plt.gca()
    axes.set_xlim([history.epoch[0], history.epoch[-1] + 1])
    for key, color, label in curves:
        values = history.history[key]
        # Markers at each epoch plus a connecting line carrying the legend entry.
        plt.scatter(history.epoch, values, color=color)
        plt.plot(values, color=color, label=label)
    plt.xlabel('epochs')
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()


_plot_history(hist,
              [('loss', 'g', 'Training Loss'),
               ('val_loss', 'b', 'Validation Loss')],
              'Loss',
              'Training Loss & Validation Loss vs Epochs')

plt.figure(2)

_plot_history(hist,
              [('acc', 'r', 'Training Accuracy'),
               ('val_acc', 'c', 'Validation Accuracy')],
              'Accuracy',
              'Training Accuracy & Validation Accuracy vs Epochs')

plt.show()

In [55]:
# Predicted vocabulary index at each of the MAXLEN positions for the first 10 inputs.
res = model.predict_classes(X[:10])
# Map every predicted index back to its vocabulary token.
res_sentences = [
    [ctable.indices_char[r[i]] for i in range(MAXLEN)]
    for r in res
]

print(len(res_sentences[0]))
res_sentences


19


[[' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' '],
 [' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' '],
 [' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' '],
 [' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' '],
 [' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' '],
 [' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' '],
 [' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' '],
 [' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',

In [51]:
# Persist the model: architecture as JSON, weights as HDF5.
json_string = model.to_json()
# The context manager closes (and flushes) the file even if the write fails.
with open('my_model_architecture.json', 'w') as f:
    f.write(json_string)
model.save_weights('my_model_weights.h5')