In [1]:
import os

import nltk
import numpy as np
from keras import layers
from keras.initializers import Constant
from keras.models import Sequential
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.utils import get_file
from keras.utils import to_categorical
from nltk.corpus import reuters

# Vocabulary cap: passed to the Tokenizer as num_words and reused as the
# embedding input size, the softmax width and the one-hot target depth.
MAX_NUM_WORDS = 10000
# Length every encoded document is padded/truncated to; also the number of
# time steps the encoded vector is repeated for before decoding.
MAX_SEQUENCE_LEN = 500

Using TensorFlow backend.


In [2]:
# Make sure the Reuters corpus is available locally.
nltk.download('reuters')

# Split the document ids by their prefix ("train..." / "test...").
fileids = reuters.fileids()
fileids_train, fileids_test = (
    np.array([doc_id for doc_id in fileids if doc_id.startswith(split)])
    for split in ("train", "test")
)

[nltk_data] Downloading package reuters to /root/nltk_data...


In [0]:
# Fit the vocabulary on every document in the corpus.
# NOTE(review): this includes the test split -- confirm that is intended.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(map(reuters.raw, fileids))

# Encode each split as lists of integer word indices.
X_train = tokenizer.texts_to_sequences(map(reuters.raw, fileids_train))
X_test = tokenizer.texts_to_sequences(map(reuters.raw, fileids_test))

In [0]:
# Bring every encoded document to exactly MAX_SEQUENCE_LEN tokens.
X_train, X_test = (
    sequence.pad_sequences(encoded_docs, maxlen=MAX_SEQUENCE_LEN)
    for encoded_docs in (X_train, X_test)
)

In [0]:
# get glove coeff matrix
# Cache the download under the archive's own name. Caching it as
# "glove.6B.100d.txt" makes extraction write that member on top of the zip
# it is still being read from, which breaks the first run and leaves a
# corrupt file behind for later runs.
glove_member = "glove.6B.100d.txt"
glove_path = get_file("glove.6B.zip", "http://nlp.stanford.edu/data/glove.6B.zip"
                      , extract=True)
# Keras 2.x returns the archive path and unpacks next to it; newer releases
# appear to return the extraction directory instead -- handle both.
glove_dir = glove_path if os.path.isdir(glove_path) else os.path.dirname(glove_path)
fname = os.path.join(glove_dir, glove_member)

# Each line of the GloVe file is "<word> <v1> <v2> ...".
embeddings_index = {}
with open(fname, encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if not values:
            # tolerate blank lines instead of failing on values[0]
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

# prepare pre-learned embedding matrix
# Row i holds the GloVe vector of the word whose tokenizer index is i;
# row 0 stays all-zero (the padding sequences are filled with 0).
embdedding_dim = 100
word_index = tokenizer.word_index
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, embdedding_dim))
for word, i in word_index.items():
    if i > MAX_NUM_WORDS:
        # index falls outside the matrix allocated above
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [0]:
def create_model(encoder_units=256, decoder_units=256, rnn_layer=layers.GRU,
                 embedding_dim=100):
    """Build and compile a sequence autoencoder over word indices.

    The network embeds the input tokens, compresses the sequence into a
    single vector with a bidirectional encoder, repeats that vector
    MAX_SEQUENCE_LEN times and decodes it with a second bidirectional RNN
    followed by a per-time-step softmax over MAX_NUM_WORDS classes.

    The embedding layer is created without an initializer, so it is learned
    from scratch; the pre-computed ``embedding_matrix`` is not used here.

    Args:
        encoder_units: Units per direction of the encoder RNN.
        decoder_units: Units per direction of the decoder RNN.
        rnn_layer: Recurrent layer class used for both encoder and decoder
            (e.g. ``layers.GRU``).
        embedding_dim: Width of the word embedding vectors.

    Returns:
        A compiled ``Sequential`` model. It is compiled with
        ``categorical_crossentropy``, so targets must be one-hot encoded.
    """
    model = Sequential()
    model.add(layers.Embedding(MAX_NUM_WORDS, embedding_dim))
    # Encoder: only the final state is kept (return_sequences=False).
    model.add(layers.Bidirectional(rnn_layer(encoder_units, return_sequences=False)))
    # Feed the same encoded vector to the decoder at every time step.
    model.add(layers.RepeatVector(MAX_SEQUENCE_LEN))
    model.add(layers.Bidirectional(rnn_layer(decoder_units, return_sequences=True)))
    # One softmax over the vocabulary per output position.
    model.add(layers.TimeDistributed(layers.Dense(MAX_NUM_WORDS, activation='softmax')))
    model.compile(optimizer='adam'
                  , loss='categorical_crossentropy'
                  , metrics=['accuracy'])
    return model

In [6]:
# Build the autoencoder with the CuDNN-backed GRU layer (needs a GPU runtime)
# and print the layer/parameter overview.
model = create_model(rnn_layer=layers.CuDNNGRU)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 100)         1000000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 512)               549888    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 500, 512)          0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 500, 512)          1182720   
_________________________________________________________________
time_distributed_1 (TimeDist (None, 500, 10000)        5130000   
Total params: 7,862,608
Trainable params: 7,862,608
Non-trainable params: 0
_________________________________________________________________


In [0]:
def data_generator(X_in, batch_size=32, shuffle=True, repeat=True):
    """Yield ``(inputs, one_hot_targets)`` batches for autoencoder training.

    Each batch is a slice of ``X_in`` paired with its own one-hot encoding
    (depth MAX_NUM_WORDS), i.e. the model is asked to reproduce its input.
    The last batch of a pass may be smaller than ``batch_size``.

    Args:
        X_in: 2-D integer array of token ids, one row per document.
        batch_size: Maximum number of rows per yielded batch.
        shuffle: If True, reshuffle the row order before every pass;
            if False, rows are served in their original order.
        repeat: If True, loop over the data forever; otherwise stop after
            a single pass.

    Yields:
        Tuples ``(chunk, to_categorical(chunk, MAX_NUM_WORDS))``.

    Raises:
        ValueError: If ``batch_size`` is not positive or ``X_in`` is empty.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    n_samples = X_in.shape[0]
    if n_samples == 0:
        # An empty input with repeat=True would otherwise spin forever.
        raise ValueError("X_in must contain at least one sample")

    index = np.arange(n_samples)
    while True:
        if shuffle:
            np.random.shuffle(index)
        # Stepping through the index also covers n_samples < batch_size and
        # the trailing partial batch without special-casing either.
        for start in range(0, n_samples, batch_size):
            chunk = X_in[index[start:start + batch_size]]
            yield chunk, to_categorical(chunk, MAX_NUM_WORDS)
        if not repeat:
            break

In [0]:
bs = 32
# data_generator also yields the trailing partial batch, so one full pass
# over X_train takes ceil(N / bs) steps. Floor division would leave one
# batch over per pass and let epoch boundaries drift across the data.
steps_per_epoch = -(-X_train.shape[0] // bs)
model.fit_generator(data_generator(X_train, batch_size=bs)
                    , steps_per_epoch=steps_per_epoch
                    , epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10

In [0]:
# Reconstruct the first 100 training documents: pick the most probable
# vocabulary index at every time step of the softmax output.
X_pred = model.predict(X_train[:100], verbose=1).argmax(axis=2)
X_pred

In [0]:
# The padded input token ids of the same 100 documents -- presumably shown
# for a side-by-side comparison with the reconstruction in X_pred.
X_train[:100]