In [4]:
# Mount Google Drive at /content/drive (asks for an authorization code interactively).
from google.colab import drive
drive.mount('/content/drive')
# Create the nn_output folder on the mounted drive; -p keeps this from failing
# when the folder already exists.
!mkdir -p /content/drive/My\ Drive/nn_output

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Folder on the mounted Drive that model checkpoints are written to and read from.
OUTPUTDIR = '/content/drive/My Drive/nn_output'

In [6]:
!pip install keras-TCN

from keras.layers import (Bidirectional, Dense, Embedding, Input, Lambda, InputLayer, Reshape
                          , LSTM, RepeatVector, TimeDistributed)
from keras.models import Model, Sequential, load_model
from tcn import TCN
from keras.utils import to_categorical
from keras import optimizers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from nltk.corpus import reuters
from itertools import chain
import nltk
# Fetch the Reuters corpus that is read through nltk.corpus.reuters in later cells.
nltk.download('reuters')
# Fetch the 'punkt' tokenizer models.
# NOTE(review): presumably required by reuters.sents() for sentence splitting - confirm.
nltk.download('punkt')
from keras.callbacks import ModelCheckpoint
import os.path
import glob

Collecting keras-TCN
  Downloading https://files.pythonhosted.org/packages/f2/bc/dcbdc24d80229022333150f42ff88ddf4c6793568f711a0d6fc1e83b102e/keras_tcn-2.3.5-py2.py3-none-any.whl
Installing collected packages: keras-TCN
Successfully installed keras-TCN-2.3.5


Using TensorFlow backend.


[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [7]:
# --- Hyper-parameters --------------------------------------------------------
MAX_SEQUENCE_LEN = 250   # length of every padded token-id sequence fed to the model
MAX_NUM_WORDS = 10000    # vocabulary size: Embedding input dim and softmax width

kernel_size = 3
n_dilations = 8
n_hidden = 256
embedding_size = 100
dropout = 0.4

# Dilation rates 1, 2, 4, ..., 2**(n_dilations - 1). Built once so that the
# encoder and decoder stacks cannot drift apart; each TCN gets its own copy.
dilations = [2**n for n in range(n_dilations)]

# --- Encoder: token ids -> embeddings -> TCN -> one latent vector per sequence
input_layer = Input(shape=(MAX_SEQUENCE_LEN,))
encoder = Embedding(MAX_NUM_WORDS, embedding_size)(input_layer)
encoder = TCN(name='latent', return_sequences=False,
              kernel_size=kernel_size,
              dilations=list(dilations),
              nb_filters=n_hidden,
              nb_stacks=1,
              dropout_rate=dropout)(encoder)

# --- Decoder: repeat the latent vector for every time step, run a second TCN,
# then predict a distribution over the vocabulary at each position.
decoder = RepeatVector(MAX_SEQUENCE_LEN, name='decoder')(encoder)
decoder = TCN(return_sequences=True,
              kernel_size=kernel_size,
              dilations=list(dilations),
              nb_filters=n_hidden,
              nb_stacks=1,
              dropout_rate=dropout)(decoder)
output_layer = TimeDistributed(Dense(MAX_NUM_WORDS, activation='softmax'))(decoder)

model = Model(input_layer, output_layer)
optimizer = optimizers.Adam(lr=0.002, clipnorm=0.4)
model.compile(optimizer=optimizer, metrics=['accuracy'], loss='categorical_crossentropy')

# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 250)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 250, 100)     1000000     input_1[0][0]                    
__________________________________________________________________________________________________
latent_initial_conv (Conv1D)    (None, 250, 256)     25856       embedding_1[0][0]                
__________________________________________________________________________________________________
latent_d_causal_conv_1_tanh_s0  (None, 250, 256)     196864      latent_initial_conv[0][0]        
__________________________________________________________________________________________________
activation

In [0]:
from itertools import zip_longest
def grouper(iterable, n, fillvalue=None):
    """Split ``iterable`` into consecutive tuples of exactly ``n`` items.

    The final tuple is padded with ``fillvalue`` when the input runs out
    before the tuple is full. The result is a lazy iterator.
    """
    # Every slot of each output tuple pulls from the *same* iterator, so
    # zip_longest ends up consuming the input n items at a time.
    shared = iter(iterable)
    slots = tuple(shared for _ in range(n))
    return zip_longest(*slots, fillvalue=fillvalue)

class ReutersGenerator():
    """Stream sentences of the NLTK Reuters corpus as autoencoder batches.

    Sentences are produced by joining the tokens of ``reuters.sents(fid)``
    with single spaces, converted to token ids by a Keras ``Tokenizer`` and
    padded with ``pad_sequences``. Each batch is yielded as
    ``(X, to_categorical(X, num_words))`` so the target is the one-hot
    encoding of the input itself.
    """

    def __init__(self, max_seq_length=250, num_words=5000):
        """Create an unfitted generator.

        Args:
            max_seq_length: length passed to ``pad_sequences`` for every batch.
            num_words: vocabulary size given to the ``Tokenizer`` and used as
                the number of classes of the one-hot targets.
        """
        self.tok = Tokenizer(num_words=num_words)
        self.max_seq_length = max_seq_length
        self.num_words = num_words

    @staticmethod
    def _select_fids(fid_startswith):
        """Return the corpus file ids whose name starts with ``fid_startswith``."""
        return [fid for fid in reuters.fileids() if fid.startswith(fid_startswith)]

    def _gen_sents(self, fids):
        """Lazily yield every sentence of the given files as one space-joined string."""
        return (' '.join(sent) for fid in fids for sent in reuters.sents(fid))

    def fit(self, fid_startswith='train'):
        """Fit the tokenizer on all sentences of the selected files; returns ``self``."""
        self.tok.fit_on_texts(self._gen_sents(self._select_fids(fid_startswith)))
        return self

    def count(self, fid_startswith='train'):
        """Return the number of sentences in the selected files (walks the corpus once)."""
        return sum(1 for _ in self._gen_sents(self._select_fids(fid_startswith)))

    def inverse_transform(self, X):
        """Map sequences of token ids back to text via the fitted tokenizer."""
        return self.tok.sequences_to_texts(X)

    def generate_pairs(self, fid_startswith='train', bs=32,
                         max_seq_len=250, forever=True, shuffle=True):
        """Yield ``(X, Y)`` batches built from the selected files.

        Args:
            fid_startswith: prefix selecting the corpus files to read.
            bs: number of sentences grouped into one batch; the last batch of
                a pass may be smaller.
            max_seq_len: accepted for backward compatibility but not used;
                the padding length is always ``self.max_seq_length``.
            forever: when true, start another pass over the files after each
                pass instead of stopping.
            shuffle: when true, visit the files in a new random order on every
                pass. Sentences inside one file always keep their order.

        Yields:
            Tuples ``(X, Y)`` where ``X`` is the padded id matrix returned by
            ``pad_sequences`` and ``Y`` is ``to_categorical(X, self.num_words)``.
        """
        fids_in = np.array(self._select_fids(fid_startswith))
        index = np.arange(fids_in.shape[0])
        while True:
            # Previously the order was reshuffled unconditionally, so
            # shuffle=False had no effect.
            if shuffle:
                np.random.shuffle(index)
            fids = fids_in[index]
            sents = self._gen_sents(fids)
            for batch in grouper(sents, bs):
                # grouper pads the final short batch with None; drop those
                # fill values (and empty strings) before tokenising.
                texts = (text for text in batch if text)
                seqs = self.tok.texts_to_sequences_generator(texts)
                X = pad_sequences(list(seqs), self.max_seq_length)
                yield X, to_categorical(X, self.num_words)
            if not forever:
                break

In [0]:
# Build the batch generator with the same vocabulary size and sequence length
# as the model, then fit its tokenizer (fit() defaults to the 'train' files).
reuters_gen = ReutersGenerator(max_seq_length=MAX_SEQUENCE_LEN,
                               num_words=MAX_NUM_WORDS)
reuters_gen = reuters_gen.fit()

# Number of training sentences; used below to derive steps_per_epoch.
n_train = reuters_gen.count('train')
#n_test = reuters_gen.count('test')

In [0]:
# Checkpoints are named <basename>-ep<NN>.hdf5 inside OUTPUTDIR; the {epoch}
# placeholder is left in the path for ModelCheckpoint to fill in.
basename = 'seq2seq-TCN-model-small'
checkpoint_name = basename + '-ep{epoch:02d}.hdf5'
outfname = os.path.join(OUTPUTDIR, checkpoint_name)

# Save the complete model (not only the weights) on every save, regardless of
# whether it improved.
cp = ModelCheckpoint(outfname, save_best_only=False, save_weights_only=False)

In [11]:
# Set TRAIN_MODEL to False to skip training and restore the newest checkpoint.
TRAIN_MODEL = True
BATCH_SIZE = 16
EPOCHS = 20

if TRAIN_MODEL:
  # One epoch covers n_train//BATCH_SIZE batches of the endless generator;
  # the checkpoint callback writes a model file as training progresses.
  history = model.fit_generator(reuters_gen.generate_pairs('train', bs=BATCH_SIZE),
      #validation_data=reuters_gen.generate_pairs('test', bs=BATCH_SIZE),
      steps_per_epoch=n_train//BATCH_SIZE,
      #validation_steps=n_test//BATCH_SIZE,
      epochs=EPOCHS, shuffle=True, callbacks=[cp])
else:
  # Pick the checkpoint with the latest creation time among this run's files.
  checkpoint_pattern = os.path.join(OUTPUTDIR, basename + '*.hdf5')
  list_of_files = glob.glob(checkpoint_pattern)
  list_of_files = sorted(list_of_files, key=os.path.getctime)
  # An assert would be stripped under `python -O` and gives no hint about what
  # is missing, so fail explicitly with the pattern that was searched.
  if not list_of_files:
    raise FileNotFoundError(
        'No checkpoint matching %r; set TRAIN_MODEL = True to train first'
        % checkpoint_pattern)
  model = load_model(list_of_files[-1])
  print('Loaded model from \'%s\'' % list_of_files[-1])

Loaded model '/content/drive/My Drive/nn_output/seq2seq-TCN-model-small-ep20.hdf5'


In [0]:
# Draw a single batch (default batch size) from a fresh training generator:
# X_train holds the padded token ids, X_train_hat their one-hot encoding.
sample_batches = reuters_gen.generate_pairs('train')
X_train, X_train_hat = next(sample_batches)

In [13]:
# Reconstruct the first 20 sentences: take the most probable word id at every
# position of the model output and turn the ids back into text.
predicted_probs = model.predict(X_train[:20], verbose=1)
predicted_ids = np.argmax(predicted_probs, axis=2)
reuters_gen.inverse_transform(predicted_ids)



['lt year net net net net net net net net net net vs vs vs vs mln mln mln mln vs vs vs mln vs vs vs mln mln in mths',
 'company company 1986 1986 net net vs mln mln mln mln mln mln mln 1986 1986 1986 1986 and mln mln vs vs vs mln mln',
 'says the the the the the the the the the the the the the the to to to to to to to the the the the the to the the the the the the dealers said',
 'the the the the the the the the the to to to to to the the the the the the to to the the the the the the the the to to pct they said',
 'the the the the the the the the the the the to to to to to to to to to to the the the to pct pct pct pct the dealers said',
 'the the the the the the to to billion billion billion billion a the the the the the the the the the of below yen',
 'lt lt lt lt lt lt lt lt lt lt lt lt to to to to to to lt s s s s the the s s s s the to to about 500 mln stg',
 'the said the to the the the and and and and and the texas',
 'he said said said it to to to the the the the the to to to th

In [0]:
# Show the same 20 input sequences decoded back to text, for comparison with
# the model's reconstructions.
reuters_gen.inverse_transform(X_train[:20])

['lt to charge corp said it to charges of to the social security administration and agreed to pay 1 2 mln dlrs in and costs to the u s government',
 'the company also reached agreements in principle for an 8 1 mln dlr settlement of class action law',
 "about 2 9 mln dlrs of the class action settlement will be provided by ' s insurance carrier",
 'the settlement is contingent on court approval after notice to class members it said',
 'the case settlement all charges including and statement except for to which',
 "the settlement includes the lifting of the government ' s suspension the of the federal civil claims suit and all charges against the individuals",
 'of the 2 9 mln dlrs the insurance carrier will provide for the civil settlement 750 000 dlrs will go to settle a lawsuit',
 'for the year ended december 31 reported a net loss of 38 5 mln dlrs',
 'the year end results include an 8 0 mln dlrs provision for future legal and or settlement costs to cover the civil and announced today'

In [0]:
def iter_labels(selection='train'):
    """Yield the category list of a Reuters file once for each of its sentences.

    Only files whose id starts with ``selection`` are visited, in the order
    returned by ``reuters.fileids()``.
    """
    chosen = (fid for fid in reuters.fileids() if fid.startswith(selection))
    for fid in chosen:
        # The sentence itself is irrelevant here; it only determines how many
        # times the file's categories are repeated.
        for _ in reuters.sents(fid):
            yield reuters.categories(fid)
def _as_object_array(rows):
    """Pack ``rows`` into a 1-D object array holding one Python object per row.

    ``np.array`` on a list of category lists is unreliable: rows of differing
    length raise a ValueError on current NumPy, and rows that all happen to
    have the same length silently produce a 2-D array instead. Filling a
    pre-allocated object array element by element always gives one list per
    entry.
    """
    rows = list(rows)
    packed = np.empty(len(rows), dtype=object)
    for pos, row in enumerate(rows):
        packed[pos] = row
    return packed

# One category list per sentence, for the training and the test files.
labels_train = _as_object_array(iter_labels('train'))
labels_test = _as_object_array(iter_labels('test'))

from sklearn.preprocessing import MultiLabelBinarizer
# Learn the label set from the training labels only, then encode both splits
# as binary indicator matrices.
mlb = MultiLabelBinarizer().fit(labels_train)
y_train = mlb.transform(labels_train)
y_test = mlb.transform(labels_test)

def iter_sents(selection='train'):
    """Yield every sentence of the selected Reuters files as a space-joined string.

    Only files whose id starts with ``selection`` are visited, in the order
    returned by ``reuters.fileids()``.
    """
    chosen = (fid for fid in reuters.fileids() if fid.startswith(selection))
    for fid in chosen:
        yield from (" ".join(tokens) for tokens in reuters.sents(fid))
# Materialise all sentences of each split as arrays of strings.
data_train = np.array([text for text in iter_sents('train')])
data_test = np.array([text for text in iter_sents('test')])

# The vocabulary is learned from the training sentences only.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(data_train)

# Sentences -> token-id lists -> fixed-width matrices of MAX_SEQUENCE_LEN columns.
# Note that X_train is rebound here to the whole training split.
X_train = pad_sequences(tokenizer.texts_to_sequences(data_train),
                        maxlen=MAX_SEQUENCE_LEN)
X_test = pad_sequences(tokenizer.texts_to_sequences(data_test),
                       maxlen=MAX_SEQUENCE_LEN)

def data_generator(X_in, batch_size=32, shuffle=True, repeat=True):
    """Yield ``(chunk, to_categorical(chunk, MAX_NUM_WORDS))`` batches of ``X_in``.

    Args:
        X_in: array whose first axis indexes the samples.
        batch_size: number of rows per batch; the last batch of a pass holds
            the remaining rows and may be smaller.
        shuffle: when true, rows are put in a new random order before every
            pass; when false, the order of ``X_in`` is kept.
        repeat: when true, start another pass after each pass instead of
            stopping.

    Raises:
        ValueError: if ``X_in`` has no rows.
    """
    n_rows = X_in.shape[0]
    if n_rows == 0:
        # Without this guard an empty input with repeat=True would loop forever.
        raise ValueError('X_in must contain at least one row')
    index = np.arange(n_rows)
    while True:
        # Previously the rows were reshuffled unconditionally, so shuffle=False
        # had no effect.
        if shuffle:
            np.random.shuffle(index)
        X = X_in[index]
        # Plain slicing yields the same batches as the former np.split plus
        # remainder handling, and also works when n_rows < batch_size (where
        # np.split was asked for zero sections and raised).
        for start in range(0, n_rows, batch_size):
            chunk = X[start:start + batch_size]
            yield chunk, to_categorical(chunk, MAX_NUM_WORDS)
        if not repeat:
            break

In [0]:
# Encoder-only model: maps token ids to the latent vector. The latent tensor is
# looked up on the current `model` as the input of the RepeatVector layer named
# 'decoder', so this also works when `model` was restored with load_model
# (the original `input_layer`/`encoder` tensors belong to the freshly built,
# untrained graph in that case).
model_enc = Model(model.input, model.get_layer('decoder').input)
# `X_hat` was never defined; the padded training matrix X_train is used so the
# rows line up with labels_train[:1000] in the plot below.
vecs = model_enc.predict(X_train[:1000], verbose=True)

In [0]:
from sklearn.manifold import TSNE
# Project the latent vectors to 2-D for plotting. t-SNE is stochastic, so the
# seed is fixed to make the figure reproducible across runs.
vecs_reduced = TSNE(random_state=0).fit_transform(vecs)

In [0]:
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt

# Rank the Reuters categories by their number of files and keep the ten largest.
categories = [(cat, len(reuters.fileids(categories=cat))) for cat in reuters.categories()]
ranked = sorted(categories, key=lambda pair: pair[1], reverse=True)
topn = [cat for cat, _ in ranked[:10]]

# For every top category, collect the positions (within the first 1000 label
# rows) of the sentences that carry it.
indexes = [
    (cat, [pos for pos, cats in enumerate(labels_train[:1000]) if cat in cats])
    for cat in topn
]

# One scatter series per category over the 2-D projection of the latent vectors.
for cat, index in indexes:
    xs = vecs_reduced[index, 0]
    ys = vecs_reduced[index, 1]
    plt.scatter(xs, ys, label=cat)
plt.legend(bbox_to_anchor=(1, 1.01))