In [1]:
# Mount Google Drive inside the Colab VM and make sure the folder that the
# notebook later uses as its output directory exists.
import os
from google.colab import drive

drive.mount('/content/drive')
# Pure-Python equivalent of `mkdir -p`: no shell quoting is needed for the
# space in "My Drive", and an already existing folder is not an error.
os.makedirs('/content/drive/My Drive/nn_output', exist_ok=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Folder on the mounted Google Drive where model checkpoints are written to
# and, when training is skipped, looked up again.
OUTPUTDIR='/content/drive/My Drive/nn_output'

In [36]:
# keras-TCN provides the `tcn` module imported below, so it has to be
# installed before the import statements run.
!pip install keras-TCN

from keras.layers import (Bidirectional, Dense, Embedding, Input, Lambda, InputLayer, Reshape
                          , LSTM, RepeatVector, TimeDistributed, Flatten)
from keras.models import Model, Sequential, load_model
from tcn import TCN
from keras.utils import to_categorical
from keras import optimizers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.initializers import Constant
import numpy as np
from nltk.corpus import reuters
from itertools import chain
import nltk
# Fetch the NLTK resources used by this notebook (Reuters corpus and the
# punkt sentence tokenizer data).
nltk.download('reuters')
nltk.download('punkt')
from keras.callbacks import ModelCheckpoint
import os.path
import glob

# True: initialise the embedding layer from pre-trained GloVe vectors;
# False: learn the embedding from scratch.
USE_GLOVE = True
# Length every sentence is padded/truncated to; also the model's input length.
MAX_SEQUENCE_LEN = 100
# Vocabulary cap handed to the tokenizer and size of the output softmax.
MAX_NUM_WORDS = 10000

[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
from itertools import zip_longest
def grouper(iterable, n, fillvalue=None):
    """Split `iterable` into consecutive tuples of exactly `n` items.

    The last tuple is padded with `fillvalue` when the input runs out early.
    Returns a lazy iterator of tuples.
    """
    # The *same* iterator is referenced n times, so zip_longest pulls n
    # successive items from it to assemble each output tuple.
    shared = iter(iterable)
    columns = (shared,) * n
    return zip_longest(*columns, fillvalue=fillvalue)

class ReutersGenerator():
    """Turn sentences of the NLTK Reuters corpus into autoencoder batches.

    A Keras ``Tokenizer`` limited to ``num_words`` is fitted on the sentences
    of the selected files; ``generate_pairs`` then yields ``(X, Y)`` batches
    where ``X`` holds padded word-index sequences and ``Y`` is the one-hot
    encoding of the very same sequences.
    """

    def __init__(self, max_seq_length=250, num_words=5000):
        """Store the padding length and vocabulary cap and create the tokenizer.

        max_seq_length: length every sequence is padded/truncated to.
        num_words: vocabulary cap for the tokenizer and number of one-hot classes.
        """
        self.tok = Tokenizer(num_words=num_words)
        self.max_seq_length = max_seq_length
        self.num_words = num_words

    @staticmethod
    def _fileids(fid_startswith):
        """Return the corpus file ids that start with `fid_startswith`."""
        return [fid for fid in reuters.fileids() if fid.startswith(fid_startswith)]

    def _gen_sents(self, fids):
        """Lazily yield every sentence of the given files as one space-joined string."""
        return (' '.join(sent) for fid in fids for sent in reuters.sents(fid))

    def fit(self, fid_startswith='train'):
        """Fit the tokenizer on all sentences of the matching files; returns self."""
        self.tok.fit_on_texts(self._gen_sents(self._fileids(fid_startswith)))
        return self

    def count(self, fid_startswith='train'):
        """Return the number of sentences contained in the matching files."""
        return sum(1 for _ in self._gen_sents(self._fileids(fid_startswith)))

    def inverse_transform(self, X):
        """Map sequences of word indices back to text via the fitted tokenizer."""
        return self.tok.sequences_to_texts(X)

    def generate_pairs(self, fid_startswith='train', bs=32,
                         max_seq_len=250, forever=True, shuffle=True):
        """Yield ``(X, Y)`` batches built from the sentences of the matching files.

        fid_startswith: prefix selecting the corpus files to read.
        bs: number of sentences per batch (the last batch of a pass may be smaller).
        max_seq_len: accepted for backward compatibility but not used; padding
            always uses ``self.max_seq_length`` set in the constructor.
        forever: when True, restart after every full pass (as needed by
            ``fit_generator``); when False, stop after one pass.
        shuffle: when True, visit the files in a new random order on every
            pass; when False, keep the corpus order. Sentences inside a file
            are never reordered.

        X is the padded index matrix returned by ``pad_sequences`` and Y is
        ``to_categorical(X, self.num_words)``.
        """
        fids_in = np.array(self._fileids(fid_startswith))
        index = np.arange(fids_in.shape[0])
        while True:
            # Previously the order was shuffled unconditionally, which made
            # the `shuffle` argument a no-op.
            if shuffle:
                np.random.shuffle(index)
            fids = fids_in[index]
            sents = self._gen_sents(fids)
            for batch in grouper(sents, bs):
                # grouper pads the final chunk with None; drop those fillers
                # (and empty sentences) before tokenising.
                seqs = self.tok.texts_to_sequences_generator(text for text in batch if text)
                X = pad_sequences(list(seqs), self.max_seq_length)
                yield X, to_categorical(X, self.num_words)
            if not forever:
                break

In [0]:
# Build the batch generator with the notebook-wide sequence length and
# vocabulary cap, then learn the vocabulary from the training files
# (fit() uses its default 'train' prefix).
reuters_gen = ReutersGenerator(max_seq_length=MAX_SEQUENCE_LEN,
                               num_words=MAX_NUM_WORDS)
reuters_gen.fit()

# Number of training sentences; the training cell derives steps-per-epoch from it.
n_train = reuters_gen.count('train')
#n_test = reuters_gen.count('test')

In [6]:
if not os.path.isfile("glove.6B.100d.txt"):
  !wget "http://nlp.stanford.edu/data/glove.6B.zip"
  !unzip "glove.6B.zip"

# get glove coeff matrix
embeddings_index = {}
with open("glove.6B.100d.txt", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

# prepare pre-learned embedding matrix
embdedding_dim = 100
word_index = reuters_gen.tok.word_index
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, embdedding_dim))
for word, i in word_index.items():
    if i > MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

--2019-01-15 10:28:57--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2019-01-15 10:28:57--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2019-01-15 10:29:30 (25.2 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       
Found 400000 word vectors.


In [54]:
# Sequence autoencoder: embedding -> TCN encoder -> TCN "latent" (keeps only
# its final output, 100 filters) -> repeat that vector for every timestep ->
# TCN decoder -> per-timestep softmax over the vocabulary.
kernel_size = 3
n_dilations = 8
n_hidden = 256
embedding_size = 100
dropout=0.4


def _tcn_block(nb_filters, **kwargs):
    """Create a TCN layer with the kernel size, dilations and dropout shared by all blocks."""
    return TCN(kernel_size=kernel_size,
               dilations=[2**n for n in range(n_dilations)],  # 1, 2, 4, ...
               nb_filters=nb_filters,
               nb_stacks=1,
               dropout_rate=dropout,
               **kwargs)


input_layer = Input(shape=(MAX_SEQUENCE_LEN,))
if not USE_GLOVE:
  # Embedding learned from scratch.
  encoder = Embedding(MAX_NUM_WORDS, embedding_size)(input_layer)
else:
  # Embedding initialised from the GloVe matrix and fine-tuned during training.
  encoder = Embedding(num_words, embdedding_dim,
                      input_length=MAX_SEQUENCE_LEN,
                      embeddings_initializer=Constant(embedding_matrix),
                      trainable=True)(input_layer)
encoder = _tcn_block(n_hidden, return_sequences=True)(encoder)
# return_sequences=False: the whole sentence is compressed into one vector.
latent = _tcn_block(100, name='latent', return_sequences=False)(encoder)
decoder = RepeatVector(MAX_SEQUENCE_LEN)(latent)
decoder = _tcn_block(n_hidden, name='dec', return_sequences=True)(decoder)
output_layer = TimeDistributed(Dense(MAX_NUM_WORDS, activation='softmax'))(decoder)
model = Model(input_layer, output_layer)
optimizer = optimizers.Adam(lr=0.002, clipnorm=0.4)
model.compile(optimizer=optimizer, metrics=['accuracy'], loss='categorical_crossentropy')
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_14 (InputLayer)           (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_14 (Embedding)        (None, 100, 100)     1000100     input_14[0][0]                   
__________________________________________________________________________________________________
tcn_initial_conv (Conv1D)       (None, 100, 256)     25856       embedding_14[0][0]               
__________________________________________________________________________________________________
tcn_d_causal_conv_1_tanh_s0 (Co (None, 100, 256)     196864      tcn_initial_conv[0][0]           
__________________________________________________________________________________________________
activation

In [0]:
# Checkpoints are named "<basename>-ep<epoch>.hdf5" and stored in OUTPUTDIR;
# the {epoch:02d} placeholder is filled in by the callback.
basename = 'seq2seq-TCN-model-small-nolatent'
outfname = os.path.join(OUTPUTDIR, basename + '-ep{epoch:02d}.hdf5')

# Keep every checkpoint (not only the best one) and store the full model
# rather than just its weights.
checkpoint_options = dict(save_best_only=False, save_weights_only=False)
cp = ModelCheckpoint(outfname, **checkpoint_options)

In [0]:
# Set TRAIN_MODEL to False to skip training and reload the most recently
# created checkpoint from OUTPUTDIR instead.
TRAIN_MODEL = True
BATCH_SIZE = 32
EPOCHS = 20

if TRAIN_MODEL:
  # The generator runs forever, so the epoch length has to be given explicitly.
  history = model.fit_generator(reuters_gen.generate_pairs('train', bs=BATCH_SIZE),
      #validation_data=reuters_gen.generate_pairs('test', bs=BATCH_SIZE),
      steps_per_epoch=n_train//BATCH_SIZE,
      #validation_steps=n_test//BATCH_SIZE,
      epochs=EPOCHS, shuffle=True, callbacks=[cp])
else:
  checkpoint_glob = os.path.join(OUTPUTDIR, basename + '*.hdf5')
  # Oldest first, so the last entry is the most recently created checkpoint.
  list_of_files = sorted(glob.glob(checkpoint_glob), key=os.path.getctime)
  if not list_of_files:
    # Explicit error instead of a bare assert: it says what was searched for.
    raise FileNotFoundError('No checkpoint found matching \'%s\'' % checkpoint_glob)
  model = load_model(list_of_files[-1])
  print('Loaded model from \'%s\'' % list_of_files[-1])

Epoch 1/20
 133/1258 [==>...........................] - ETA: 6:15 - loss: 1.3862 - acc: 0.7679

In [0]:
# Draw one batch (default batch size) from the test files. X_test holds the
# padded word-index sequences; despite its name, X_test_hat is the one-hot
# encoding of those same sequences (the target), not a model prediction.
# NOTE(review): generate_pairs shuffles the file order by default, so the
# batch drawn here differs from run to run.
X_test, X_test_hat = next(reuters_gen.generate_pairs('test'))

In [62]:
# Reconstruct the first ten test sentences: pick the most probable word at
# every timestep and turn the resulting indices back into text.
reconstructed_probs = model.predict(X_test[:10], verbose=1)
reconstructed_ids = np.argmax(reconstructed_probs, axis=2)
reuters_gen.inverse_transform(reconstructed_ids)



['s s in in in in in in in in in in in in in the in the rose rose to to to pct pct pct pct in in in in in pct pct pct in in in pct pct pct pct pct in the the the figures show',
 'exports rose rose rose to to billion billion billion in in in in in to billion billion in in in the the of 1 billion billion',
 'the february discount fell rose rose to marks marks 1 billion billion',
 'the the of rose to to mln mln mln billion in in in the the and and and mln mln mln mln mln mln mln in in 1986',
 'in in pct pct pct pct pct pct pct pct pct pct pct the pct pct pct pct pct pct pct pct pct in the the in to to to 2 pct pct pct pct pct pct pct pct pct pct pct pct pct pct the bureau show',
 'exports exports rose rose to billion billion billion billion pct in in and and and billion billion billion billion billion pct in january',
 'bank bank said bank s had rate to to to 0 billion pct',
 'bank bank said bank s had rate to to to 0 billion pct',
 'the said the the the the the the the the the said said 

In [63]:
# Original text of the same ten test sentences, for comparison with the
# model's reconstructions.
reuters_gen.inverse_transform(X_test[:10])

['south african m 3 april growth revised upward south african year on year broadly defined m 3 money supply growth was revised upward to 10 37 pct for april from a preliminary 10 08 pct but was down from a revised 10 69 pct in march reserve bank figures show',
 "m 3 rose to a revised 82 38 billion rand in april from a preliminary 82 17 billion and march ' s revised 81 39 billion",
 'in april last year m 3 stood at 74 64 billion rand',
 'preliminary figures for may show m 3 at 83 24 billion rand for a year on year rise of 10 97 pct from 75 01 billion in may 1986',
 'april m rose a year on year 15 12 pct to 14 22 billion rand after rising 14 72 pct to 13 97 billion in march while m1 rose 24 49 pct to 27 92 billion after a 20 69 pct increase to 26 97 billion the figures showed',
 'm 2 rose 8 28 pct to 58 71 billion in april after rising 6 47 pct to 57 52 billion in march',
 'bank of france says it leaves intervention rate unchanged at 7 1 2 pct',
 'bank of france says it leaves interventi