<a href="https://colab.research.google.com/github/zhaimobile/100-pandas-puzzles/blob/master/LSTM_LanguageModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DATA

In [0]:
!wget http://mattmahoney.net/dc/text8.zip -O text8.gz
!gzip -d text8.gz -f

!head text8 -c 99000000 > text8.train.txt
!tail text8 -c 1000000 > text8.valid.txt

!rm text8

!ls -la .

--2019-12-30 14:25:26--  http://mattmahoney.net/dc/text8.zip
Resolving mattmahoney.net (mattmahoney.net)... 67.195.197.75
Connecting to mattmahoney.net (mattmahoney.net)|67.195.197.75|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 31344016 (30M) [application/zip]
Saving to: ‘text8.gz’


2019-12-30 14:26:10 (696 KB/s) - ‘text8.gz’ saved [31344016/31344016]

total 97676
drwxr-xr-x 1 root root     4096 Dec 30 14:26 .
drwxr-xr-x 1 root root     4096 Dec 30 14:23 ..
drwxr-xr-x 1 root root     4096 Dec 18 16:52 .config
drwxr-xr-x 1 root root     4096 Dec 18 16:52 sample_data
-rw-r--r-- 1 root root 99000000 Dec 30 14:26 text8.train.txt
-rw-r--r-- 1 root root  1000000 Dec 30 14:26 text8.valid.txt


# IMPORTS

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchtext

import spacy

In [0]:
MAX_VOCAB_SIZE = 50000
BATCH_SIZE = 128
EMBEDDING_SIZE = 500

device = torch.device('cuda' if torch.cudnn_is_acceptable else 'cpu')
device

device(type='cuda')

In [0]:
def spacy_tokenize(x):
    return [tok.text for tok in spacy.load('en').tokenizer(x)]
 
TEXT_SPACY = torchtext.data.Field(lower=True, tokenize=spacy_tokenize)

In [0]:
TEXT = torchtext.data.Field(lower=True)

train, val = torchtext.datasets.LanguageModelingDataset.splits(
  path=".", 
  train="text8.train.txt", 
  validation="text8.valid.txt", 
  text_field=TEXT
)

TEXT.build_vocab(train, max_size=MAX_VOCAB_SIZE)
VOCAB_SIZE = len(TEXT.vocab)
VOCAB_SIZE

50002

In [0]:
train_iter, val_iter = torchtext.data.BPTTIterator.splits(
  (train, val), 
  batch_size=BATCH_SIZE, 
  device=device, 
  bptt_len=32, 
  repeat=False, 
  shuffle=True
)

In [0]:
print(len(train), len(train_iter))
print("------------------------------")
print(type(train_iter))
print(train_iter.__dict__)
print("------------------------------")
batch = next(iter(train_iter))
print(" ".join([TEXT.vocab.itos[i] for i in batch.text[:,2].data]))
print("------------------------------")
print(" ".join([TEXT.vocab.itos[i] for i in batch.target[:,2].data]))


1 4111
------------------------------
<class 'torchtext.data.iterator.BPTTIterator'>
{'bptt_len': 32, 'batch_size': 128, 'train': True, 'dataset': <torchtext.datasets.language_modeling.LanguageModelingDataset object at 0x7fc1cdb5ac50>, 'batch_size_fn': None, 'iterations': 0, 'repeat': False, 'shuffle': True, 'sort': False, 'sort_within_batch': False, 'sort_key': None, 'device': device(type='cuda'), 'random_shuffler': <torchtext.data.utils.RandomShuffler object at 0x7fc1cdb5a898>, '_iterations_this_epoch': 0, '_random_state_this_epoch': None, '_restored_from_state': False}
------------------------------
settled in the cities which offered jobs education and other opportunities that enabled newcomers to enter the middle class since the one nine three zero s many rural workers have moved to
------------------------------
in the cities which offered jobs education and other opportunities that enabled newcomers to enter the middle class since the one nine three zero s many rural workers hav

# LanguageModel

In [0]:
class LSTM_LM(nn.Module):

  def __init__(self, token_size, emb_size, hidden_size, num_layers, dropout=0.5):
    super(LSTM_LM, self).__init__()

    self.encoder = nn.Embedding(token_size, emb_size) #an Embedding module containing n_token tensors of size n_emb
    self.lstm = nn.LSTM(input_size=emb_size, hidden_size=hidden_size, num_layers=num_layers, dropout=dropout)
    self.decoder = nn.Linear(hidden_size, token_size)

    self.init_weights()

    self.drop = nn.Dropout(dropout)

    self.hidden_size = hidden_size
    self.num_layers = num_layers

  def forward(self, input, hidden):
    emb = self.drop(self.encoder(input))
    output, hidden = self.lstm(emb, hidden)
    output = self.drop(output)
    decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))

    return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden

  def init_weights(self):
    initrange = 0.1
    self.encoder.weight.data.uniform_(-initrange, initrange)
    self.decoder.bias.data.zero_()
    self.decoder.weight.data.uniform_(-initrange, initrange)

  
  def init_hidden(self, bsz, requires_grad=True):
    weight = next(self.parameters())
    return( weight.new_zeros((self.num_layers, bsz, self.hidden_size), requires_grad=requires_grad),
            weight.new_zeros((self.num_layers, bsz, self.hidden_size), requires_grad=requires_grad))


In [0]:
model = LSTM_LM(token_size=VOCAB_SIZE, 
                emb_size=EMBEDDING_SIZE, 
                hidden_size=EMBEDDING_SIZE, 
                num_layers=2, 
                dropout=0.5)

model = model.cuda()

In [0]:
loss_fn = nn.CrossEntropyLoss()

learning_rate = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [0]:
def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)

In [0]:
GRAD_CLIP = 1.
NUM_EPOCHS = 2

val_losses = []
for epoch in range(NUM_EPOCHS):
  model.train()

  hidden = model.init_hidden(BATCH_SIZE)

  for i, batch in enumerate(iter(train_iter)):
    data, target = batch.text, batch.target
    data, target = data.cuda(), target.cuda()

    hidden = repackage_hidden(hidden)
    model.zero_grad()
    output, hidden = model(data, hidden)
    
    loss = loss_fn(output.view(-1, VOCAB_SIZE), target.view(-1))
    loss.backward()

    torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
    optimizer.step()
    
    if i % 1000 == 0:
      print("epoch", epoch, "iter", i, "loss", loss.item())

epoch 0 iter 0 loss 10.826722145080566


# USE LM

In [0]:
hidden_x = model.init_hidden(1)
input_x = torch.randint(VOCAB_SIZE, (1, 1), dtype=torch.long).to(device)
print(TEXT.vocab.itos[input_x])

In [0]:
words = []
for i in range(100):
    output, hidden = model(input_x, hidden_x)
    word_weights = output.squeeze().exp().cpu()
    word_idx = torch.multinomial(word_weights, 1)[0]
    input_x.fill_(word_idx)
    word = TEXT.vocab.itos[word_idx]
    words.append(word)
print(" ".join(words))

# Display Embedding

In [0]:
model.encoder

In [0]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import seaborn as sns

In [0]:
embdata = model.encoder.weight.data.cpu().numpy()

In [0]:
pca = PCA(n_components=3)
pca_result = pca.fit_transform(embdata)

In [0]:
pca_result

In [0]:
print('Explained variation per principal component: {}'.format(pca.explained_variance_ratio_))

In [0]:
vocab = 
plt.figure(figsize=(12, 12), dpi=80)
plt.scatter(pca_result[:, 0], pca_result[:, 1])

for i, word in enumerate(vocab):
	plt.annotate(word, xy=(pca_result[i, 0], pca_result[i, 1]))
 
plt.show()

SyntaxError: ignored

In [0]:
plt.scatter(pca_result[:300, 0], pca_result[:300, 1])

words = list(model.wv.vocab)
for i, word in enumerate(words):
	pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))