In [2]:
# Mount Google Drive into the Colab VM so files written under
# /content/drive persist outside this runtime.
from google.colab import drive
drive.mount('/content/drive')
# Create the output folder (no error if it already exists, thanks to -p).
# The backslash escapes the space in "My Drive" for the shell.
!mkdir -p /content/drive/My\ Drive/nn_output

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Notebook-wide configuration.
# Directory (on the mounted Drive) that receives the model checkpoints.
OUTPUTDIR = '/content/drive/My Drive/nn_output'

# Sequence length and vocabulary size. The sentence-generator cell and the
# GloVe embedding cell read these names before the model-definition cell
# assigns them, so they must exist here for a top-to-bottom run to work.
# Keep the values identical to the ones in the model-definition cell.
MAX_SEQUENCE_LEN = 250
MAX_NUM_WORDS = 10000

In [13]:
!pip install keras-TCN

from keras.layers import (Bidirectional, Dense, Embedding, Input, Lambda, InputLayer, Reshape
                          , LSTM, RepeatVector, TimeDistributed)
from keras.models import Model, Sequential, load_model
from tcn import TCN
from keras.utils import to_categorical
from keras import optimizers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.initializers import Constant
import numpy as np
from nltk.corpus import reuters
from itertools import chain
import nltk
nltk.download('reuters')
nltk.download('punkt')
from keras.callbacks import ModelCheckpoint
import os.path
import glob

[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
from itertools import zip_longest
def grouper(iterable, n, fillvalue=None):
    """Chunk ``iterable`` into tuples of length ``n``.

    The final tuple is padded with ``fillvalue`` when the input length is
    not a multiple of ``n``. Returns a lazy ``zip_longest`` iterator.
    """
    # Every slot of a tuple pulls from the SAME iterator, so consecutive
    # items end up side by side in one chunk.
    shared = iter(iterable)
    slots = (shared for _ in range(n))
    return zip_longest(*slots, fillvalue=fillvalue)

class ReutersGenerator():
    """Tokenise Reuters sentences and stream them as autoencoder batches.

    Sentences are read from the NLTK ``reuters`` corpus, restricted to file
    ids that start with a given prefix (the notebook uses ``'train'`` and
    ``'test'``). Each batch is a pair ``(X, Y)`` where ``X`` holds the padded
    integer sequences and ``Y`` is ``to_categorical(X, num_words)``.
    """

    def __init__(self, max_seq_length=250, num_words=5000):
        """Create the tokenizer.

        Args:
            max_seq_length: length passed to ``pad_sequences`` for every batch.
            num_words: vocabulary size handed to the ``Tokenizer`` and used as
                the number of classes for the one-hot targets.
        """
        self.tok = Tokenizer(num_words=num_words)
        self.max_seq_length = max_seq_length
        self.num_words = num_words

    @staticmethod
    def _fileids(fid_startswith):
        """Return the corpus file ids whose name starts with ``fid_startswith``."""
        return [fid for fid in reuters.fileids() if fid.startswith(fid_startswith)]

    def _gen_sents(self, fids):
        """Lazily yield every sentence of ``fids`` as a space-joined string."""
        return (' '.join(sent) for fid in fids for sent in reuters.sents(fid))

    def fit(self, fid_startswith='train'):
        """Fit the tokenizer on the selected sentences and return ``self``."""
        self.tok.fit_on_texts(self._gen_sents(self._fileids(fid_startswith)))
        return self

    def count(self, fid_startswith='train'):
        """Return the number of sentences in the selected files."""
        return sum(1 for _ in self._gen_sents(self._fileids(fid_startswith)))

    def inverse_transform(self, X):
        """Map integer sequences back to text via the fitted tokenizer."""
        return self.tok.sequences_to_texts(X)

    def generate_pairs(self, fid_startswith='train', bs=32,
                         max_seq_len=250, forever=True, shuffle=True):
        """Yield ``(X, to_categorical(X, num_words))`` batches.

        Args:
            fid_startswith: prefix selecting the corpus files to read.
            bs: number of sentences per batch; the last batch of a pass may
                be smaller.
            max_seq_len: unused, kept only so existing calls keep working;
                the padding length is always ``self.max_seq_length``.
            forever: when true, start a new pass over the files after each
                one finishes; when false, stop after a single pass.
            shuffle: when true, the file order is reshuffled before every
                pass; when false, files are read in corpus order.

        Raises:
            ValueError: if ``forever`` is true and a complete pass produced
                no batch (for example no file id matches the prefix), which
                would otherwise loop endlessly without yielding.
        """
        fids_in = np.array(self._fileids(fid_startswith))
        index = np.arange(fids_in.shape[0])
        while True:
            if shuffle:
                np.random.shuffle(index)
            sents = self._gen_sents(fids_in[index])
            produced = False
            for batch in grouper(sents, bs):
                # grouper pads the last chunk with None; drop the padding
                # (and any empty sentence) before tokenising.
                texts = [text for text in batch if text]
                if not texts:
                    continue
                seqs = self.tok.texts_to_sequences_generator(texts)
                X = pad_sequences(list(seqs), self.max_seq_length)
                produced = True
                yield X, to_categorical(X, self.num_words)
            if not forever:
                break
            if not produced:
                raise ValueError(
                    'no sentences found for file ids starting with %r'
                    % (fid_startswith,))

In [0]:
# Build the sentence generator and fit its tokenizer on the training files
# (fit() defaults to the 'train' prefix and returns the generator itself).
reuters_gen = ReutersGenerator(max_seq_length=MAX_SEQUENCE_LEN,
                               num_words=MAX_NUM_WORDS)
reuters_gen.fit()

# Number of training sentences; used to size an epoch during training.
n_train = reuters_gen.count('train')
# Only needed when validation is enabled in the training cell:
#n_test = reuters_gen.count('test')

In [15]:
if not os.path.isfile("glove.6B.100d.txt"):
  !wget "http://nlp.stanford.edu/data/glove.6B.zip"
  !unzip "glove.6B.zip"

# get glove coeff matrix
embeddings_index = {}
with open("glove.6B.100d.txt", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

# Build the initial embedding matrix: row i holds the GloVe vector of the
# word whose tokenizer index is i. Rows without a GloVe entry stay all-zero.
embdedding_dim = 100
word_index = reuters_gen.tok.word_index
num_words = 1 + min(MAX_NUM_WORDS, len(word_index))
embedding_matrix = np.zeros((num_words, embdedding_dim))
for token, row in word_index.items():
    # Indices above the vocabulary cap have no row in the matrix.
    if row <= MAX_NUM_WORDS:
        pretrained = embeddings_index.get(token)
        if pretrained is not None:
            embedding_matrix[row] = pretrained

--2019-01-15 10:00:38--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2019-01-15 10:00:38--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2019-01-15 10:02:52 (6.16 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       
Found 400000 word vectors.


In [16]:
# --- Hyper-parameters -------------------------------------------------------
USE_GLOVE = True          # initialise the embedding from embedding_matrix
MAX_SEQUENCE_LEN = 250    # length of every padded input sequence
MAX_NUM_WORDS = 10000     # vocabulary size / width of the softmax output

kernel_size = 3
n_dilations = 8           # dilation rates are 2**0 .. 2**(n_dilations - 1)
n_hidden = 128            # number of TCN filters
embedding_size = 100      # embedding width when GloVe is not used
dropout=0.4

# --- Network ----------------------------------------------------------------
input_layer = Input(shape=(MAX_SEQUENCE_LEN,))
if USE_GLOVE:
  # num_words, embdedding_dim and embedding_matrix come from the GloVe cell;
  # the weights start at the GloVe values and are fine-tuned (trainable=True).
  encoder = Embedding(num_words, embdedding_dim,
                      input_length=MAX_SEQUENCE_LEN,
                      embeddings_initializer=Constant(embedding_matrix),
                      trainable=True)(input_layer)
else:
  encoder = Embedding(MAX_NUM_WORDS, embedding_size)(input_layer)

# `encoder` is reused further down to build a sub-model, so it must keep
# pointing at the TCN output.
encoder = TCN(return_sequences=True,
              kernel_size=kernel_size,
              dilations=[2**n for n in range(n_dilations)],
              nb_filters=n_hidden,
              nb_stacks=1,
              dropout_rate=dropout)(encoder)

# One softmax over the vocabulary per time step.
output_layer = TimeDistributed(Dense(MAX_NUM_WORDS, activation='softmax'))(encoder)
model = Model(input_layer, output_layer)
optimizer = optimizers.Adam(lr=0.002, clipnorm=0.4)
model.compile(optimizer=optimizer, metrics=['accuracy'], loss='categorical_crossentropy')

# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that appends a stray "None" line).
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 250)          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 250, 100)     1000100     input_2[0][0]                    
__________________________________________________________________________________________________
tcn_initial_conv (Conv1D)       (None, 250, 128)     12928       embedding_2[0][0]                
__________________________________________________________________________________________________
tcn_d_causal_conv_1_tanh_s0 (Co (None, 250, 128)     49280       tcn_initial_conv[0][0]           
__________________________________________________________________________________________________
activation

In [0]:
# Checkpoint callback: writes the full model (architecture + weights) after
# every epoch, one file per epoch, into OUTPUTDIR.
basename = 'seq2seq-TCN-model-small-nolatent'
# "{epoch:02d}" is filled in by ModelCheckpoint when it saves.
outfname = os.path.join(OUTPUTDIR, basename + '-ep{epoch:02d}.hdf5')
cp = ModelCheckpoint(outfname, save_best_only=False, save_weights_only=False)

In [0]:
# Train a fresh model, or (TRAIN_MODEL = False) reload the newest checkpoint.
TRAIN_MODEL = True
BATCH_SIZE = 32
EPOCHS = 1

if TRAIN_MODEL:
  # The generator emits ceil(n_train / BATCH_SIZE) batches per pass over the
  # corpus (the last one is smaller). Rounding up keeps one Keras epoch equal
  # to one pass; floor division would drop the final batch and give 0 steps
  # when there are fewer sentences than BATCH_SIZE.
  steps_per_epoch = (n_train + BATCH_SIZE - 1) // BATCH_SIZE
  history = model.fit_generator(reuters_gen.generate_pairs('train', bs=BATCH_SIZE),
      #validation_data=reuters_gen.generate_pairs('test', bs=BATCH_SIZE),
      steps_per_epoch=steps_per_epoch,
      #validation_steps=n_test//BATCH_SIZE,
      epochs=EPOCHS, shuffle=True, callbacks=[cp])
else:
  # Pick the most recently created checkpoint of this model family.
  list_of_files = glob.glob(os.path.join(OUTPUTDIR, basename + '*.hdf5'))
  list_of_files = sorted(list_of_files, key=os.path.getctime)
  assert list_of_files, 'no checkpoint matching %r found in %r' % (basename + '*.hdf5', OUTPUTDIR)
  model = load_model(list_of_files[-1])
  print('Loaded model from \'%s\'' % list_of_files[-1])

Epoch 1/1

In [0]:
# Draw one batch from the 'test' files: X_test holds the padded integer
# sequences, X_test_hat their one-hot encoding (the reconstruction target).
# NOTE(review): a later cell assigns a different array to the name X_test.
X_test, X_test_hat = next(reuters_gen.generate_pairs('test'))

In [11]:
# Reconstruct the first 20 test sentences: take the most likely word per time
# step and map the indices back to text. The input is X_test (drawn in the
# previous cell) so the result can be compared with the decoded inputs shown
# next; X_train is not defined yet at this point of the notebook.
reuters_gen.inverse_transform(np.argmax(model.predict(X_test[:20], verbose=1), axis=2))



['national comparison bank says it cutting base lending rate to 10 5 pct from 11 pct',
 'national comparison bank says it cutting base lending rate to 10 5 pct from 11 pct',
 'borg warner to sell industrial products business for about 240 mln dlrs',
 'borg warner to sell industrial products business for about 240 mln dlrs',
 'louisiana pacific lt to sell forces louisiana pacific corp said it plans to sell its bushel in embassy and 18 000 acres of to construction co',
 'the company said the be embassy in early april',
 'terms were not disclosed',
 "approached ship in oil row heads for consortium a approached research ship by warships and air force km left for the 460 to press 577 ' s case in an freight row with greece over oil rights the semi official resigned news agency said",
 'the ship set off this morning from the port of with sum bushel and embassy by the agency said',
 'prime minister said last night the ship would not go into international waters unless greece did the same',
 'w

In [12]:
# Decode the first 20 input sequences of X_test back to text, for a
# side-by-side comparison with the model's reconstructions.
reuters_gen.inverse_transform(X_test[:20])

['national westminster bank says it cutting base lending rate to 10 5 pct from 11 pct',
 'national westminster bank says it cutting base lending rate to 10 5 pct from 11 pct',
 'borg warner to sell industrial products business for about 240 mln dlrs',
 'borg warner to sell industrial products business for about 240 mln dlrs',
 'louisiana pacific lt to sell sawmill louisiana pacific corp said it plans to sell its sawmill in oregon and 18 000 acres of to construction co',
 'the company said the be finalized in early april',
 'terms were not disclosed',
 "turkish ship in oil row heads for aegean a turkish research ship by warships and air force planes left for the aegean to press ankara ' s case in an escalating row with greece over oil rights the semi official anatolian news agency said",
 'the ship set off this morning from the port of with flags flying and watched by the agency said',
 'prime minister said last night the ship would not go into international waters unless greece did the

In [0]:
def iter_labels(selection='train'):
    """Yield the owning document's category list once per sentence.

    Args:
        selection: prefix that a corpus file id must start with to be used.

    Yields:
        The value of ``reuters.categories(fid)`` for each sentence of each
        selected file. It is looked up once per file, so all sentences of a
        file share the same list object.
    """
    for fid in reuters.fileids():
        if not fid.startswith(selection):
            continue
        # The categories depend only on the file, not on the sentence.
        cats = reuters.categories(fid)
        for _ in reuters.sents(fid):
            yield cats
def _ragged_to_object_array(rows):
    """Pack a list of variable-length lists into a 1-D object array.

    Calling np.array() directly on ragged nested lists raises ValueError on
    recent NumPy releases; filling a pre-allocated object array element by
    element works on every version and keeps one list per entry.
    """
    packed = np.empty(len(rows), dtype=object)
    for pos, row in enumerate(rows):
        packed[pos] = row
    return packed

# One category list per sentence, in corpus order.
labels_train = _ragged_to_object_array(list(iter_labels('train')))
labels_test = _ragged_to_object_array(list(iter_labels('test')))

# Multi-hot encode the category lists; the binarizer is fitted on the
# training labels only.
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer().fit(labels_train)
y_train = mlb.transform(labels_train)
y_test = mlb.transform(labels_test)

def iter_sents(selection='train'):
    """Yield each sentence of the selected files as a space-joined string.

    Only corpus file ids that start with ``selection`` are read.
    """
    for fid in reuters.fileids():
        if not fid.startswith(selection):
            continue
        yield from (" ".join(tokens) for tokens in reuters.sents(fid))
# Materialise the sentences of both splits as string arrays.
data_train, data_test = (np.array(list(iter_sents(split)))
                         for split in ('train', 'test'))

# Vocabulary is learned from the training sentences only.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(data_train)

# Integer-encode, then pad every sequence to MAX_SEQUENCE_LEN.
X_train = pad_sequences(tokenizer.texts_to_sequences(data_train), MAX_SEQUENCE_LEN)
X_test = pad_sequences(tokenizer.texts_to_sequences(data_test), MAX_SEQUENCE_LEN)

def data_generator(X_in, batch_size=32, shuffle=True, repeat=True):
    """Yield ``(chunk, to_categorical(chunk, MAX_NUM_WORDS))`` batches.

    Args:
        X_in: array whose first axis indexes the samples.
        batch_size: rows per batch; the last batch of a pass may be smaller.
        shuffle: when true, rows are reshuffled before every pass; when
            false, rows are emitted in their original order.
        repeat: when true, start over after each pass; when false, stop
            after one pass.

    Raises:
        ValueError: if ``X_in`` has no rows (nothing could ever be yielded).
    """
    n_rows = X_in.shape[0]
    if n_rows == 0:
        raise ValueError('X_in must contain at least one row')
    index = np.arange(n_rows)
    while True:
        if shuffle:
            np.random.shuffle(index)
        X = X_in[index]
        # Plain slicing also covers inputs shorter than one batch, where
        # np.split(..., 0) would raise.
        for start in range(0, n_rows, batch_size):
            chunk = X[start:start + batch_size]
            yield chunk, to_categorical(chunk, MAX_NUM_WORDS)
        if not repeat:
            break

In [0]:
# Sub-model that stops at the TCN output, sharing layers (and weights) with
# the trained model.
model_enc = Model(input_layer, encoder)
# Encode the first 1000 training sentences. X_train rows are in the same
# order as labels_train, which the scatter plot below uses for colouring
# (the previously referenced X_hat is never defined in this notebook).
vecs = model_enc.predict(X_train[:1000], verbose=True)

In [0]:
from sklearn.manifold import TSNE
# TSNE expects a 2-D (n_samples, n_features) matrix, while the encoder is
# built with return_sequences=True and so returns one vector per time step.
# Flatten everything after the sample axis (a no-op for input that is
# already 2-D).
# NOTE(review): pooling over time or keeping only the last step would be a
# cheaper alternative - confirm which representation is intended.
vecs_reduced = TSNE().fit_transform(vecs.reshape(len(vecs), -1))

In [0]:
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt

# (category, number of files) for every Reuters category.
categories = [(name, len(reuters.fileids(categories=name)))
              for name in reuters.categories()]
# The ten categories with the most files, largest first.
ranked = sorted(categories, key=lambda pair: -pair[1])
topn = [name for name, _ in ranked[:10]]

# For each top category, the positions (among the first 1000 training
# sentences) whose label list contains it.
indexes = [
    (name, [row for row, row_cats in enumerate(labels_train[:1000])
            if name in row_cats])
    for name in topn
]

# One scatter series per category; a sentence with several of the top
# categories is drawn once for each of them.
for name, rows in indexes:
    plt.scatter(vecs_reduced[rows, 0], vecs_reduced[rows, 1], label=name)
plt.legend(bbox_to_anchor=(1, 1.01))