Basic RNN implementation. 

note: focus is to lean to use pack_padded_sequence and pad_packed_sequence to fit variant lengths of input into RNN.

followed refererences : 
https://gist.github.com/HarshTrivedi/f4e7293e941b17d19058f6fb90ab0fec


## Data Processing

In [1]:
# For reproducibility

import random
import torch

SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
random_state = random.seed(SEED)
torch.cuda.empty_cache()

device = torch.device("cuda:1" if 
    torch.cuda.is_available() else "cpu")

from torchtext.data.utils import get_tokenizer
tokenizer = get_tokenizer('spacy', language='en_core_web_sm') # <1>


In [2]:
print(torch.cuda.current_device())

torch.cuda.set_device(1) 

0


In [3]:
from torchtext.datasets import IMDB
from torch.utils.data.dataset import random_split

train_iter, test_iter = IMDB(
    split=('train', 'test')) #<1>

train_dataset = list(train_iter) #<2>
test_dataset  = list(test_iter)

num_train = int(len(train_dataset) * 0.70)
train_data, valid_data = \
    random_split(train_dataset, 
        [num_train, 
         len(train_dataset) - num_train]) # <3>

check data 

In [4]:
print(len(train_data), len(valid_data), len(test_dataset))
# out:17500 7500 25000

data_index = 21
print(train_data[data_index][0])
# out: (your results may vary)
#   pos

print(train_data[data_index][1])
# out: (your results may vary)
# ['This', 'film', 'moved', 'me', 'beyond', 'comprehension', ...



17500 7500 25000
neg
I missed the beginning but I did see most of it. A friend got it on DVD in the cheap room at FYE.<br /><br />The skits are all very short, and yet most of them are still too long. The majority of them, they seem to have forgotten to have something funny! Quite a lot of racist/sexist/"homophobic" humor in it, skits based on stereotypes, or skits which use racist terms for people.<br /><br />I'm trying to remember anything I thought was funny in it, and I'm having trouble.... The logo for the Tunnel Vision network is a lipsticked mouth with an eyeball in it. The mouth opens and closes over the eye like eyelids. Kind of creepy.<br /><br />What a disappointment. Most of the actors went on to better things, and it's lucky this bomb didn't hold them back.


Use pre-trained embedding

In [5]:
from torchtext.vocab import GloVe
embed_len = 200
global_vectors = GloVe(name='6B', dim=embed_len)

In [6]:
embeddings = global_vectors.get_vecs_by_tokens(tokenizer("Hello, How are you?"), lower_case_backup=True)

embeddings

tensor([[ 0.2661,  0.2182, -0.1100,  ..., -0.1198, -0.1916, -0.1352],
        [ 0.1765,  0.2921, -0.0021,  ..., -0.2077, -0.2319, -0.1081],
        [ 0.1815,  0.2663,  0.0550,  ...,  0.5375,  0.3151,  0.0162],
        [ 0.0367,  0.1989, -0.0930,  ..., -0.0133, -0.0039,  0.7128],
        [ 0.8540,  0.5715, -0.0237,  ...,  0.3108, -0.2230,  0.2037],
        [ 0.3911,  0.4019, -0.1505,  ..., -0.0348,  0.0798,  0.5031]])

In [7]:
global_vectors.get_vecs_by_tokens("<BOS>")

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.])

In [16]:
# preprocess text 
# 1.lemmonize
# 2.remove stop words 
# 3.remove punctuations 

import spacy
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
import nltk
nltk.download("punkt")
import string
import re

lemmatizer = WordNetLemmatizer()
# tokenizer = spacy.load("en_core_web_sm")
stemmer = SnowballStemmer('english')


def remove_punctuation(s):
    # need to remove "br"
    # how to split racist/sexist/"homophobic" ? 
    s = re.sub(r'<br />', ' ', s)
    s = s.translate(str.maketrans('', '', string.punctuation))
    s = re.sub(r'\n', ' ', s)    
    return s 
 
def stem_words(words):
    stemmed_words = [stemmer.stem(word) for word in words]
    return stemmed_words

def lemmatize(words):
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return lemmatized_words

def remove_stopwords(words):
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.append(" ")
    words = [word for word in words if word.lower() not in stopwords]
    return words

print(train_data[data_index][1])

words = tokenizer(remove_punctuation(train_data[data_index][1]))
# words = [token for token in doc] # no need
print(words)
words = remove_stopwords(stem_words(lemmatize(words)))
print(words)

def pre_process(text):
    words = tokenizer(remove_punctuation(text))
    # words = [token for token in doc] # no need
    # print(words)
    words = remove_stopwords(stem_words(lemmatize(words)))
    # print(words)
    return words

print(pre_process(train_data[data_index][1]))
#print(stem_words(words))# t1[:40],
#print(lemmatize(stem_words(words)))
#print(remove_stopwords(lemmatize(stem_words(words))))

# exiting issues 
# 1. how to keep wasn't
# 2. how to split racist/sexist/"homophobic" ?

I missed the beginning but I did see most of it. A friend got it on DVD in the cheap room at FYE.<br /><br />The skits are all very short, and yet most of them are still too long. The majority of them, they seem to have forgotten to have something funny! Quite a lot of racist/sexist/"homophobic" humor in it, skits based on stereotypes, or skits which use racist terms for people.<br /><br />I'm trying to remember anything I thought was funny in it, and I'm having trouble.... The logo for the Tunnel Vision network is a lipsticked mouth with an eyeball in it. The mouth opens and closes over the eye like eyelids. Kind of creepy.<br /><br />What a disappointment. Most of the actors went on to better things, and it's lucky this bomb didn't hold them back.
['I', 'missed', 'the', 'beginning', 'but', 'I', 'did', 'see', 'most', 'of', 'it', 'A', 'friend', 'got', 'it', 'on', 'DVD', 'in', 'the', 'cheap', 'room', 'at', 'FYE', ' ', 'The', 'skits', 'are', 'all', 'very', 'short', 'and', 'yet', 'most', 

[nltk_data] Downloading package punkt to
[nltk_data]     /home/lixiaochuan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [17]:
# this is using the frequencies in the data 

# https://coderzcolumn.com/tutorials/artificial-intelligence/how-to-use-glove-embeddings-with-pytorch#Load-Glove-'42B'-Embeddings

from collections import Counter
from torchtext.vocab import vocab
from itertools import chain

# tokenizer(next(train_iter)[1])


# create vocabulatory   
counter = Counter()
for (label, line) in chain(train_iter,valid_data):
    counter.update(pre_process(line))

# be careful, lower case vocab to call function   
vocab = vocab(counter, min_freq=10, specials=('<unk>', '<BOS>', '<EOS>', '<PAD>'))
vocab.set_default_index(vocab['<unk>'])  # default index for oov words


In [19]:
len(vocab), vocab.lookup_token(21), vocab['were']

(16620, 'enter', 0)

set up embedding for the vocabulatory

In [20]:
# create embedding for dat

# embed_len = 100

glove_embedding_tensor = torch.zeros(len(vocab),  embed_len).to(device)
 
glove_embedding_tensor[0]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.], device='cuda:1')

In [21]:

for i in range(len(vocab)):
    try: 
        glove_embedding_tensor[i] = global_vectors.get_vecs_by_tokens(vocab.lookup_token(i))
    #except KeyError as e:
    #    glove_embedding_tensor[i] = torch.normal(0, 1, size=(embed_len,))
    except KeyError:
        glove_embedding_tensor[i] = torch.normal(0, 1, size=(embed_len,))
   

In [22]:
#text_transform = lambda x: [global_vectors.get_vecs_by_tokens('<BOS>')] + [global_vectors.get_vecs_by_tokens(token) for token in tokenizer(x)] + [global_vectors.get_vecs_by_tokens('<EOS>')]

#t1 = text_transform("how are you?")
 
glove_embedding_tensor.shape,glove_embedding_tensor[1232:1234,]

(torch.Size([16620, 200]),
 tensor([[-0.0492, -0.1199,  0.0567,  0.4816,  0.5375, -0.4658, -0.3448,  0.0903,
          -0.1198,  0.3473, -0.3397,  0.1079,  0.1423, -0.0274,  0.1822,  0.1699,
           0.4808,  0.5828, -0.1633,  0.6667,  0.2206,  2.4306, -0.6263,  0.0826,
          -0.3050, -0.0593,  0.0486,  0.4304, -0.0060,  0.0616,  0.4506, -0.1367,
          -0.4818,  0.3655,  0.0679,  0.2511, -0.0102, -0.4010, -0.2609,  0.0096,
           0.3771, -0.0950, -0.5507,  0.2726, -0.1458, -0.1442,  0.1495,  0.0570,
          -0.2200, -0.1429, -0.4358, -0.0153, -0.4280, -0.0278,  0.4599, -0.2551,
           0.4442,  0.2590, -0.1597,  0.1803,  0.4232,  0.4250, -0.2827,  0.3910,
           0.1220,  0.5496,  0.1232,  0.8240,  0.0724,  0.3928,  0.5629,  0.4256,
           0.3198,  0.2332, -0.2711,  0.2348,  0.0541, -0.2120,  0.2543, -0.2494,
          -0.0307,  0.2610,  0.2042,  0.2186, -0.0366,  0.2053, -0.4181, -0.6570,
           1.3602, -0.5905,  0.1831,  0.0769, -0.1622,  0.0415, -0.2920

Setup data loader

In [23]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

# text_transform = lambda x: [vocab['<BOS>']] + [vocab[token] for token in tokenizer(x)] + [vocab['<EOS>']]
def text_transform(x):
     words = tokenizer(remove_punctuation(x))

     #words = [token.text for token in doc]
     # print(words)
     words = remove_stopwords(stem_words(lemmatize(words)))
     # print(words)
     words =  [vocab['<BOS>']] + [vocab[word] for word in words] + [vocab['<EOS>']]

     return words

label_transform = lambda x: 1 if x == 'pos' else 0
    
def collate_batch(batch):
   label_list, text_list, sent_len = [], [], []
   for (_label, _text) in batch: # each represents one text
        label_list.append(label_transform(_label))
        processed_text = torch.tensor(text_transform(_text))
        text_list.append(processed_text)
        sent_len.append(len(text_transform(_text)))
   
   sent_len = torch.tensor(sent_len, dtype=torch.long)
   sent_len, perm_idx = sent_len.sort(0, descending=True)
   text_list  = [text_list[i] for i in perm_idx]
   label_list = [label_list[i] for i in perm_idx]
   return torch.tensor(label_list, dtype=torch.long).to(device), pad_sequence(text_list,  padding_value=1.0).to(device), sent_len.to(device) # not using batch_first=True,
 

In [24]:

BATCH_SIZE = 64
train_dataloader = DataLoader(dataset=train_data, 
                              batch_size=BATCH_SIZE, 
                              shuffle=True , 
                              collate_fn=collate_batch)
 
                  # collate_fn=collate_batch)
valid_dataloader = DataLoader(valid_data, 
                  batch_size=BATCH_SIZE,
                  shuffle=True, 
                  collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, 
                  batch_size=BATCH_SIZE,
                  shuffle=True, 
                  collate_fn=collate_batch)

In [25]:
trainlabel, trainfeature,sent_len = next(iter(train_dataloader)) 
trainfeature,trainlabel, len(trainfeature), len(trainlabel), trainfeature.shape, sent_len  # len is by row


# 1182 is the longest length of the batch 


(tensor([[   1,    1,    1,  ...,    1,    1,    1],
         [ 121, 1097,   24,  ..., 4266,  686,  160],
         [ 657,   40,  336,  ...,  160,    4,  121],
         ...,
         [   0,    1,    1,  ...,    1,    1,    1],
         [4237,    1,    1,  ...,    1,    1,    1],
         [   2,    1,    1,  ...,    1,    1,    1]], device='cuda:1'),
 tensor([0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1,
         1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0,
         1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0], device='cuda:1'),
 499,
 64,
 torch.Size([499, 64]),
 tensor([499, 350, 350, 294, 270, 258, 238, 234, 232, 232, 196, 194, 191, 150,
         146, 143, 137, 126, 121, 120, 114, 110, 108, 105, 105, 104, 103,  99,
          99,  97,  97,  96,  93,  93,  90,  90,  89,  87,  85,  83,  80,  78,
          77,  76,  76,  74,  72,  71,  70,  68,  67,  67,  63,  62,  61,  52,
          49,  42,  39,  38,  31,  29,  29,  28], device='cu

## Model Design

In [26]:
# referene : https://coderzcolumn.com/tutorials/artificial-intelligence/pytorch-rnn-for-text-classification-tasks#2
 
 # simple RNN ,model
from torch import nn
from torch.nn import functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class RNNClassifier(nn.Module):
    def __init__(self, vocab_size, embed_len, hidden_dim, output_dim, n_layers, pad_idx):
        super(RNNClassifier, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embed_len, padding_idx = pad_idx)
        
        
        self.rnn = nn.RNN(input_size=embed_len, 
                       hidden_size=hidden_dim, 
                       num_layers=n_layers, 
                       batch_first=True)            

 
        self.fc = nn.Linear(hidden_dim, output_dim)
         

    def forward(self, text, sent_len):
        embedded = self.embedding(text)

        embedded = embedded.permute(1, 0, 2)# need to put the batch first 
        
        packed_input = pack_padded_sequence(embedded, sent_len.cpu().numpy(), batch_first=True)
        # packed_input.batch_sizes.cpu().numpy().sum()
        # packed_input.data.shape
        packed_output, ht = self.rnn(packed_input)

        output, input_sizes = pad_packed_sequence(packed_output, batch_first=True)

        return self.fc(ht[-1]) # predictions

In [27]:
vocab_size = len(vocab)
EMBEDDING_DIM = 200
hidden_dim = 50
OUTPUT_DIM = 1
# DROPOUT = 0.5

n_layers=1
PAD_IDX = vocab.lookup_indices(["<PAD>"])[0]

rnn_model = RNNClassifier(vocab_size, EMBEDDING_DIM, hidden_dim, OUTPUT_DIM, n_layers, PAD_IDX)

## Train & Validation

In [28]:
# rnn_model.embedding = torch.nn.Embedding.from_pretrained(glove_embedding_tensor,freeze=False)
rnn_model.embedding.weight.data.copy_(glove_embedding_tensor)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [-0.0088, -0.5700,  0.1249,  ..., -0.5161,  0.2882, -0.6986],
        [ 0.0399,  0.2983,  0.2843,  ...,  0.1468,  0.0138, -0.4792],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [30]:
import torch.optim as optim

optimizer = optim.Adam(rnn_model.parameters())
criterion = torch.nn.BCEWithLogitsLoss()

rnn_model = rnn_model.to(device)
criterion = criterion.to(device)

In [32]:
for epoch in range(5):
  epoch_loss = 0
  epoch_acc = 0
  
  rnn_model.train()
  for label, text, sent_len in train_dataloader:
      #print(label.dtype)
      #print(text)
      optimizer.zero_grad()
      predictions = rnn_model(text, sent_len).squeeze(1)
      #print(predictions.dtype)
      loss = criterion(predictions, label.float())
      
      rounded_preds = torch.round(
          torch.sigmoid(predictions))
      correct = \
        (rounded_preds == label).float()
      acc = correct.sum() / len(correct)
      
      loss.backward()
      optimizer.step()
      epoch_loss += loss.item()
      epoch_acc += acc.item()

  print("Epoch %d Train: Loss: %.4f Acc: %.4f" %
          (epoch,
          epoch_loss / len(train_dataloader), 
          epoch_acc / len(train_dataloader)))

  epoch_loss = 0
  epoch_acc = 0
  rnn_model.eval()
  # with torch.no_grad():
  #   for label, text  , sent_len in valid_dataloader:
  #     predictions = rnn_model(text, sent_len).squeeze(1)
  #     loss = criterion(predictions, label.float())
      
  #     rounded_preds = torch.round(
  #         torch.sigmoid(predictions))
  #     correct = \
  #       (rounded_preds == label).float()
  #     acc = correct.sum() / len(correct)
      
  #     epoch_loss += loss.item()
  #     epoch_acc += acc.item()

  with torch.no_grad():
      for label, text , sent_len in valid_dataloader:
          predictions = rnn_model(text, sent_len).squeeze(1)
          loss = criterion(predictions, label.float())
          
          rounded_preds = torch.round(
              torch.sigmoid(predictions))
          correct = \
            (rounded_preds == label).float()
          acc = correct.sum() / len(correct)
          
          epoch_loss += loss.item()
          epoch_acc += acc.item()


  print("Epoch %d Valid: Loss: %.4f Acc: %.4f" %
          (epoch,
          epoch_loss / len(valid_dataloader), 
          epoch_acc / len(valid_dataloader)))
  
# out: (your results may vary)
# Epoch 0 Train: Loss: 0.6523 Acc: 0.7165
# Epoch 0 Valid: Loss: 0.5259 Acc: 0.7474
# Epoch 1 Train: Loss: 0.5935 Acc: 0.7765
# Epoch 1 Valid: Loss: 0.4571 Acc: 0.7933
# Epoch 2 Train: Loss: 0.5230 Acc: 0.8257
# Epoch 2 Valid: Loss: 0.4103 Acc: 0.8245
# Epoch 3 Train: Loss: 0.4559 Acc: 0.8598
# Epoch 3 Valid: Loss: 0.3828 Acc: 0.8549
# Epoch 4 Train: Loss: 0.4004 Acc: 0.8813
# Epoch 4 Valid: Loss: 0.3781 Acc: 0.8675

Epoch 0 Train: Loss: 0.4742 Acc: 0.7774
Epoch 0 Valid: Loss: 0.4867 Acc: 0.7968
Epoch 1 Train: Loss: 0.4932 Acc: 0.7638
Epoch 1 Valid: Loss: 0.5382 Acc: 0.7506
Epoch 2 Train: Loss: 0.4133 Acc: 0.8127
Epoch 2 Valid: Loss: 0.5598 Acc: 0.7296
Epoch 3 Train: Loss: 0.3381 Acc: 0.8570
Epoch 3 Valid: Loss: 0.5187 Acc: 0.7713
Epoch 4 Train: Loss: 0.2754 Acc: 0.8909
Epoch 4 Valid: Loss: 0.5480 Acc: 0.7716


# Testing & Deployment

In [33]:
test_loss = 0
test_acc = 0
rnn_model.eval() # <1>
with torch.no_grad(): # <1>
  for label, text, sent_len in test_dataloader:
    predictions = rnn_model(text, sent_len).squeeze(1)
    loss = criterion(predictions, label.float())
    
    rounded_preds = torch.round(
        torch.sigmoid(predictions))
    correct = \
      (rounded_preds == label).float()
    acc = correct.sum() / len(correct)

    test_loss += loss.item()
    test_acc += acc.item()

print("Test: Loss: %.4f Acc: %.4f" %
        (test_loss / len(test_dataloader), 
        test_acc / len(test_dataloader)))
# out: (your results will vary)
#   Test: Loss: 0.3821 Acc: 0.8599

Test: Loss: 0.5590 Acc: 0.7672


In [35]:
import spacy
nlp = spacy.load('en_core_web_sm')


text_pipeline = lambda x: [vocab[token] 
      for token in pre_process(x)] # significantly change results


def predict_sentiment(model, sentence):
    model.eval()
    sent_encoded = text_pipeline(sentence) 
    text = torch.tensor(sent_encoded).unsqueeze(1).to(device)
    prediction = torch.sigmoid(model(text, torch.tensor([len(sent_encoded)])).to(device))
    return prediction.item()

sentiment = predict_sentiment(rnn_model, 
                  "Don't waste your time")
print(sentiment)
# out: 4.763594888613835e-34

sentiment = predict_sentiment(rnn_model, 
                  "You gotta see this movie!")
print(sentiment)
# out: 0.941755473613739

0.23295678198337555
0.8881576061248779


In [None]:

torch.save(rnn_model.state_dict(), 'basic_rnn-model.pt')

In [None]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()