<a href="https://colab.research.google.com/github/vvvu/potential-chainsaw/blob/main/pytorch-tutorial/%5BIntermediate%5D_Language_Model_RNN_LM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
'''
Data utilities
1. Basic preprocessing of the raw text
2. Building the corpus (Corpus)
'''

import torch
import os

class Dictionary(object):
  """Two-way vocabulary that maps words to integer ids and ids back to words.

  Ids are assigned consecutively, starting at 0, in the order in which new
  words are first added.
  """

  def __init__(self):
    self.word2idx = {}  # word -> id
    self.idx2word = {}  # id -> word
    self.idx = 0        # id that the next unseen word will receive

  def add_word(self, word):
    """Register `word` under the next free id; already-known words are ignored."""
    if word in self.word2idx:
      return
    new_id = self.idx
    self.word2idx[word] = new_id
    self.idx2word[new_id] = word
    self.idx = new_id + 1

  def __len__(self):
    """Return the number of distinct words registered so far."""
    return len(self.word2idx)

class Corpus(object):
  """Turns a whitespace-tokenized text file into a batched tensor of word ids.

  The vocabulary is accumulated in `self.dictionary`, so ids stay consistent
  across several `get_data` calls on the same instance.
  """

  def __init__(self):
    self.dictionary = Dictionary()

  def get_data(self, path, batch_size = 20):
    """Tokenize the file at `path` and lay the ids out as `batch_size` rows.

    Every line is split on whitespace and terminated with an '<eos>' token.
    Trailing tokens that do not fill a complete column are dropped.

    Args:
      path: path of the text file to read (decoded as UTF-8).
      batch_size: number of rows of the returned tensor; must be positive.

    Returns:
      A LongTensor of shape (batch_size, num_tokens // batch_size). For a file
      with fewer than `batch_size` tokens the second dimension is 0.

    Raises:
      ValueError: if `batch_size` is not positive.
    """
    if batch_size <= 0:
      raise ValueError('batch_size must be positive, got {}'.format(batch_size))

    # Single pass over the file: a word is added to the dictionary right before
    # its id is looked up, which yields the same first-seen id order as a
    # separate vocabulary pass would.
    word2idx = self.dictionary.word2idx
    token_ids = []
    with open(path, 'r', encoding='utf-8') as f:
      for line in f:
        for word in line.split() + ['<eos>']:
          self.dictionary.add_word(word)
          token_ids.append(word2idx[word])

    # Build the tensor in one go instead of assigning element by element.
    ids = torch.tensor(token_ids, dtype=torch.long)

    num_batches = ids.size(0) // batch_size
    ids = ids[:num_batches * batch_size]
    # Spell out both dimensions: view(batch_size, -1) is ambiguous (and raises)
    # when there are zero elements.
    return ids.view(batch_size, num_batches)

In [3]:
'''
main.py
'''
import torch
import torch.nn as nn
import numpy as np
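'''
clip_grad_norm_
- Gradient clipping: backpropagation (in particular through time, in RNNs) can
  suffer from exploding gradients. The simplest remedy is to choose a threshold
  (max_norm): whenever the total norm of the gradients exceeds the threshold,
  the gradients are scaled down so that their norm equals the threshold.
  Gradients whose norm is already below the threshold are left unchanged, so
  clipping does not help with vanishing gradients.
'''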
from torch.nn.utils import clip_grad_norm_

In [4]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed PyTorch's random number generator so that weight initialisation and
# text sampling can be repeated after a kernel restart (as far as the
# backend's kernels are deterministic).
seed = 1
torch.manual_seed(seed)

# Hyper Parameters
embed_size = 128      # dimension of each word embedding vector
hidden_size = 1024    # dimension of the LSTM hidden / cell state
num_layers = 1        # number of stacked LSTM layers
num_epochs = 5        # passes over the training data
num_samples = 1000 # number of words to be sampled
batch_size = 20       # number of parallel token streams
seq_length = 30       # tokens per truncated-backpropagation window
learning_rate = 0.002

In [7]:
# Load "Penn Treebank" dataset
# Tokenize the training text into a matrix of word ids with batch_size rows
corpus = Corpus()
ids = corpus.get_data('sample_data/train.txt', batch_size)

# Number of distinct tokens seen in the file (including '<eos>')
vocab_size = len(corpus.dictionary)

# Number of complete seq_length-token windows in each row
num_batches = ids.size(1) // seq_length

In [8]:
# RNN based language model
class RNNLM(nn.Module):
  """LSTM language model: embedding -> LSTM -> linear projection onto the vocabulary."""

  def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
    """Create the three layers.

    Args:
      vocab_size: number of distinct word ids (rows of the embedding table and
        width of the output layer).
      embed_size: size of each word embedding vector.
      hidden_size: size of the LSTM hidden state.
      num_layers: number of stacked LSTM layers.
    """
    super().__init__()

    # torch.nn.Embedding is a simple lookup table that stores embeddings of a
    # fixed dictionary and size. It is often used to store word embeddings and
    # retrieve them by index: the input is a list of indices, the output the
    # corresponding word embeddings.
    #   - num_embeddings: size of the dictionary of embeddings
    #   - embedding_dim: the size of each embedding vector
    # Usage in NLP: for a word we only fix the number of attributes (e.g. 100)
    # and let the network learn the value of every attribute; we do not need to
    # care what an individual attribute stands for. All we need to know is that
    # the smaller the angle between two word vectors, the closer their meaning.
    self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)

    # batch_first=True: the LSTM consumes and produces (batch, seq, feature) tensors
    self.lstm = nn.LSTM(input_size=embed_size, hidden_size=hidden_size,
                        num_layers=num_layers, batch_first=True)
    self.linear = nn.Linear(in_features=hidden_size, out_features=vocab_size)

  def forward(self, x, h):
    """Run the model over a batch of id sequences.

    Args:
      x: tensor of word ids, one sequence per row.
      h: tuple (hidden state, cell state) fed to the LSTM as its initial state.

    Returns:
      A pair (out, (h, c)): `out` holds one row of vocabulary scores per input
      position, shape (batch_size * sequence_length, vocab_size); (h, c) are
      the LSTM's final hidden and cell states.
    """
    # Word ids -> embedding vectors
    embedded = self.embed(x)

    # LSTM over the whole sequence
    hidden_seq, (h_n, c_n) = self.lstm(embedded, h)

    # Merge batch and time axes: (batch_size * sequence_length, hidden_size)
    flat = hidden_seq.reshape(-1, hidden_seq.size(2))

    # Project the hidden state of every time step onto the vocabulary
    scores = self.linear(flat)
    return scores, (h_n, c_n)

# Instantiate the language model and move its parameters to `device`
model = RNNLM(vocab_size, embed_size, hidden_size, num_layers).to(device)

In [9]:
# Loss and optimizer
# Cross-entropy between the model's per-token vocabulary scores and the target ids
criterion = nn.CrossEntropyLoss()

# Adam over all model parameters
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

In [10]:
# Truncated backpropagation
def detach(states):
  """Return the given state tensors as a list, each cut off from the autograd graph.

  Detaching the carried-over recurrent states keeps backpropagation from
  reaching into earlier windows.
  """
  detached = []
  for state in states:
    detached.append(state.detach())
  return detached

In [11]:
# Train the model
# Make sure the model is in training mode even if another cell changed it
model.train()

for epoch in range(num_epochs):
  # Set initial hidden and cell states (all zeros), allocated directly on the
  # target device instead of on the CPU followed by a copy
  states = (torch.zeros(num_layers, batch_size, hidden_size, device=device),
            torch.zeros(num_layers, batch_size, hidden_size, device=device))

  for i in range(0, ids.size(1) - seq_length, seq_length):
    # Get mini-batch inputs and targets; the targets are the inputs shifted
    # one token ahead
    inputs = ids[:, i:i+seq_length].to(device)
    targets = ids[:, (i+1):(i+1)+seq_length].to(device)

    # Forward pass. The states carried over from the previous window are
    # detached first so that gradients stop at the window boundary.
    states = detach(states)
    outputs, states = model(inputs, states)
    loss = criterion(outputs, targets.reshape(-1))

    # Backward and optimize
    optimizer.zero_grad()
    loss.backward()
    # clip_grad_norm_ clips the gradient norm of an iterable of parameters.
    # The norm is computed over all gradients together, as if they were
    # concatenated into a single vector; gradients are modified in-place.
    #   - parameters: an iterable of Tensors or a single Tensor that will have
    #     gradients normalized
    #   - max_norm: max norm of the gradients (0.5 here)
    #   - norm_type: type of the used p-norm; can be 'inf' for infinity norm
    clip_grad_norm_(model.parameters(), 0.5)
    optimizer.step()

    # Report every 100 steps; perplexity is exp(cross-entropy loss)
    step = (i + 1) // seq_length
    if step % 100 == 0:
      print('Epoch [{}/{}], Step[{}/{}], Loss: {:.4f}, Perplexity: {:5.2f}'
            .format(epoch + 1, num_epochs, step, num_batches, loss.item(),
                    np.exp(loss.item())))

Epoch [1/5], Step[0/1549], Loss: 9.2085, Perplexity: 9981.33
Epoch [1/5], Step[100/1549], Loss: 6.0440, Perplexity: 421.56
Epoch [1/5], Step[200/1549], Loss: 5.9201, Perplexity: 372.44
Epoch [1/5], Step[300/1549], Loss: 5.7692, Perplexity: 320.30
Epoch [1/5], Step[400/1549], Loss: 5.6709, Perplexity: 290.30
Epoch [1/5], Step[500/1549], Loss: 5.1149, Perplexity: 166.48
Epoch [1/5], Step[600/1549], Loss: 5.1955, Perplexity: 180.46
Epoch [1/5], Step[700/1549], Loss: 5.3493, Perplexity: 210.45
Epoch [1/5], Step[800/1549], Loss: 5.1861, Perplexity: 178.78
Epoch [1/5], Step[900/1549], Loss: 5.0831, Perplexity: 161.28
Epoch [1/5], Step[1000/1549], Loss: 5.1177, Perplexity: 166.95
Epoch [1/5], Step[1100/1549], Loss: 5.3704, Perplexity: 214.95
Epoch [1/5], Step[1200/1549], Loss: 5.1897, Perplexity: 179.41
Epoch [1/5], Step[1300/1549], Loss: 5.1027, Perplexity: 164.47
Epoch [1/5], Step[1400/1549], Loss: 4.8364, Perplexity: 126.02
Epoch [1/5], Step[1500/1549], Loss: 5.1366, Perplexity: 170.14
Epo

In [12]:
# Test the model
# Sampling only needs forward passes, so gradient tracking is switched off
with torch.no_grad():
    with open('sample.txt', 'w', encoding='utf-8') as f:
        # Set initial hidden and cell states (batch of one sequence)
        state = (torch.zeros(num_layers, 1, hidden_size, device=device),
                 torch.zeros(num_layers, 1, hidden_size, device=device))

        # Select one word id uniformly at random as the first input, shape (1, 1).
        # Named `input_ids` so that the builtin `input` is not shadowed.
        prob = torch.ones(vocab_size)
        input_ids = torch.multinomial(prob, num_samples=1).unsqueeze(1).to(device)

        for i in range(num_samples):
            # Forward propagate RNN
            output, state = model(input_ids, state)

            # Sample a word id. softmax gives the same distribution as
            # normalising exp(output), but cannot overflow to inf for large scores.
            prob = torch.softmax(output, dim=-1)
            word_id = torch.multinomial(prob, num_samples=1).item()

            # Fill input with sampled word id for the next time step
            input_ids.fill_(word_id)

            # File write: '<eos>' ends the current line, other words are
            # separated by a space
            word = corpus.dictionary.idx2word[word_id]
            word = '\n' if word == '<eos>' else word + ' '
            f.write(word)

            if (i+1) % 100 == 0:
                print('Sampled [{}/{}] words and save to {}'.format(i+1, num_samples, 'sample.txt'))

Sampled [100/1000] words and save to sample.txt
Sampled [200/1000] words and save to sample.txt
Sampled [300/1000] words and save to sample.txt
Sampled [400/1000] words and save to sample.txt
Sampled [500/1000] words and save to sample.txt
Sampled [600/1000] words and save to sample.txt
Sampled [700/1000] words and save to sample.txt
Sampled [800/1000] words and save to sample.txt
Sampled [900/1000] words and save to sample.txt
Sampled [1000/1000] words and save to sample.txt


In [13]:
# Save the model checkpoints
# Only the state_dict (parameter tensors) is written, not the module object itself
torch.save(model.state_dict(), 'model.ckpt')