<a href="https://colab.research.google.com/github/tdiggelm/nn-experiments/blob/master/wikitext.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [40]:
!pip install keras-TCN
import os
import numpy as np
from keras import layers, initializers, models, optimizers

from tcn import TCN

if not os.path.isfile('wikitext-103-v1.zip'):
  !wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip
  !unzip wikitext-103-v1.zip



In [0]:
MAX_SEQ_LENGTH = 10
MAX_NUM_WORDS = 10000

In [0]:
tok2id = {'<unk>': 0}
id2tok = {0: '<unk>'}

In [0]:
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from itertools import islice

def fit_dict(tokens):
  for tok in tokens:
    n = len(tok2id)
    if not tok in tok2id:
      tok2id[tok] = n
      id2tok[n] = tok

def wikitext_tokens(fname):
  with open(fname) as f:
    for line in f:
      line = line.strip()
      if line and not line.startswith('='):
        line = line.lower()
        tokens = line.split(' ')
        for token in tokens:
          yield token

def transform_token(tok):
  return tok2id[tok] if tok in tok2id else tok2id['<unk>']

def transform_token_seq(toks):
  return [transform_token(tok) for tok in toks]

def one_hot(tok):
  return to_categorical(tok, len(tok2id))
          
def gen_pairs(tokens, min_ngram=2, max_ngram=10):
  tokgen = iter(tokens)
  prev_toks = []
  for _ in range(min_ngram):
      prev_toks.append(next(tokgen))
  while True:
    curr_tok = next(tokgen)
    ngramlen = min(np.random.randint(min_ngram, max_ngram), len(prev_toks))
    yield prev_toks[-ngramlen:] ,curr_tok
    if len(prev_toks) > max_ngram-1:
      prev_toks.pop(0)
    prev_toks.append(curr_tok)
    
def gen_batches(dataset='train', bs=32):
  tokens = wikitext_tokens('wikitext-103/wiki.%s.tokens' % dataset)
  tokens = (transform_token(tok) for tok in tokens)
  pairgen = gen_pairs(tokens)
  while True:
    Xs = []
    ys = []
    for x, y in islice(pairgen, bs):
      Xs.append(x)
      ys.append(y)
    yield (pad_sequences(Xs, MAX_SEQ_LENGTH),
      to_categorical(np.array(ys), MAX_NUM_WORDS))
    

In [0]:
fit_dict(wikitext_tokens('wikitext-103/wiki.train.tokens'))

In [5]:
if not os.path.isfile("glove.6B.100d.txt"):
  !wget "http://nlp.stanford.edu/data/glove.6B.zip"
  !unzip "glove.6B.zip"

# get glove coeff matrix
embeddings_index = {}
with open("glove.6B.100d.txt", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

# prepare pre-learned embedding matrix
embdedding_dim = 100
word_index = tok2id
num_words = len(word_index)+1 if not MAX_NUM_WORDS else min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, embdedding_dim))
for word, i in word_index.items():
    if MAX_NUM_WORDS and i > MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

--2019-01-18 14:28:43--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2019-01-18 14:28:43--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2019-01-18 14:29:23 (20.8 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       
Found 400000 word vectors.


In [52]:
embedding = layers.Embedding(num_words, embdedding_dim, 
            input_length=MAX_SEQ_LENGTH, 
            embeddings_initializer=initializers.Constant(embedding_matrix),
            trainable=True)
model = models.Sequential()
model.add(embedding)
model.add(layers.Bidirectional(layers.LSTM(100, return_sequences=False)))
#model.add(layers.Bidirectional(layers.GRU(300, return_sequences=True)))
#model.add(layers.Bidirectional(layers.GRU(300, return_sequences=True)))
#model.add(layers.Lambda(lambda x: x[:, -1, :]))
model.add(layers.Dense(MAX_NUM_WORDS, activation='softmax'))
optimizer = optimizers.SGD(lr=0.001, clipnorm=0.1)
model.compile(optimizer=optimizer,
              metrics=['accuracy'],
              loss='categorical_crossentropy')
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 10, 100)           200       
_________________________________________________________________
bidirectional_10 (Bidirectio (None, 200)               160800    
_________________________________________________________________
dense_8 (Dense)              (None, 10000)             2010000   
Total params: 2,171,000
Trainable params: 2,171,000
Non-trainable params: 0
_________________________________________________________________


In [48]:
model.fit_generator(gen_batches('train'),
                   steps_per_epoch=100, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f97f6a2feb8>

In [49]:
X_test, y_test = next(gen_batches('test'))
X_test

array([[     0,      0,      0, ...,      0,   1141, 115695],
       [     0,      0,      0, ...,   1141, 115695,     24],
       [     0,      0,      0, ..., 115695,     24,    144],
       ...,
       [     0,      0,      0, ...,    472,    129,   3697],
       [     0,      0,      0, ...,    129,   3697,     33],
       [     0,      0,      0, ...,   3697,     33,     25]], dtype=int32)

In [50]:
model.predict_classes(X_test)

array([14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
       14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14])