# text classification step 2: RNN

In [1]:
import pandas as pd

data = pd.read_csv('IMDB Dataset.csv')

In [2]:
from tqdm import tqdm
import nltk

from nltk.stem import WordNetLemmatizer
# nltk.download()
lemmatizer = WordNetLemmatizer()

import re, spacy
nlp = spacy.load('en_core_web_md')

In [3]:
processed_review = []
sentiment = []
word2id = {'<PAD>':0}
vocab = set(['<PAD>'])
count = 1
SEQ_LEN = 100

for i in tqdm(range(len(data))):
    text = data.review[i].lower()
    text = re.sub('<.+?>', '', text)
    text = re.sub('[<>]', '', text)
    text = [lemmatizer.lemmatize(token.text) for token in nlp.tokenizer(text)][:SEQ_LEN]

    tmp = [0] * (SEQ_LEN - len(text)) if len(text) < SEQ_LEN else []
        
    for word in text:
        if word not in vocab:
            vocab.add(word)
            word2id[word] = count
            tmp.append(count)
            count += 1
        else:
            tmp.append(word2id[word])

    processed_review.append(tmp)
    
    if data.sentiment[i] == 'positive':
        sentiment.append(1)
    elif data.sentiment[i] == 'negative':
        sentiment.append(0)
    
print(processed_review[0], sentiment[0])

100%|██████████| 50000/50000 [01:32<00:00, 542.19it/s]

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 8, 34, 35, 36, 13, 37, 38, 39, 40, 41, 42, 2, 43, 23, 44, 45, 46, 22, 47, 3, 48, 49, 19, 50, 35, 23, 25, 26, 51, 24, 52, 53, 3, 54, 55, 56, 57, 19, 25, 52, 58, 59, 60, 30, 61, 62, 63, 23, 64, 56, 43, 19, 38, 26, 65, 23, 46, 3, 66, 67, 2, 3, 68, 26, 69, 13, 24] 1





In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(processed_review, sentiment, train_size=0.8, random_state=1988)

In [5]:
from torch.utils.data import TensorDataset, DataLoader
import torch

BATCH_SIZE = 64

train_ds = TensorDataset(torch.as_tensor(X_train), torch.as_tensor(y_train))
test_ds = TensorDataset(torch.as_tensor(X_test), torch.as_tensor(y_test))

train_iter = DataLoader(train_ds, batch_size=BATCH_SIZE, drop_last=True) # (BATCH_SIZE, SEQ_LEN)
test_iter = DataLoader(test_ds, batch_size=BATCH_SIZE, drop_last=True)

# 原始 RNN
https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/1%20-%20Simple%20Sentiment%20Analysis.ipynb

In [6]:
from torch import nn, optim
from torch.nn import functional as F

class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super(RNN, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embed_dim) # (BATCH_SIZE, SEQ_LEN, EMBED_DIM)
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        x = self.embedding(x)

        output, hidden = self.rnn(x)
        # output: (BATCH_SIZE, SEQ_LENGTH, HIDDEN_DIM)
        # hidden: (1, BATCH_SIZE, HIDDEN_DIM)
    
        return self.fc(hidden.squeeze(0))

In [7]:
EMBED_DIM = 128
HIDDEN_DIM = 256
rnn = RNN(len(vocab), EMBED_DIM, HIDDEN_DIM)

In [8]:
rnn

RNN(
  (embedding): Embedding(90171, 128)
  (rnn): RNN(128, 256, batch_first=True)
  (fc): Linear(in_features=256, out_features=1, bias=True)
)

In [9]:
from torch import optim

optimizer = optim.Adam(rnn.parameters())
criterion = nn.BCEWithLogitsLoss()

In [10]:
def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc

In [11]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for text, label in iterator:
        optimizer.zero_grad()
        preds = model(text)
        loss = criterion(preds.squeeze(), label.float())
        acc = binary_accuracy(preds.squeeze(), label)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [12]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
        for text, label in iterator:
            preds = model(text)
            loss = criterion(preds.squeeze(), label.float())
            acc = binary_accuracy(preds.squeeze(), label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [13]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - elapsed_mins * 60)
    return elapsed_mins, elapsed_secs

In [14]:
N_EPOCHS = 10

best_test_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    
    train_loss, train_acc = train(rnn, train_iter, optimizer, criterion)
    test_loss, test_acc = evaluate(rnn, test_iter, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if test_loss < best_test_loss:
        best_test_loss = test_loss
        torch.save(rnn.state_dict(), 'RNN-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {test_loss:.3f} |  Val. Acc: {test_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 1m 32s
	Train Loss: 0.696 | Train Acc: 52.09%
	 Val. Loss: 0.695 |  Val. Acc: 49.73%
Epoch: 02 | Epoch Time: 1m 26s
	Train Loss: 0.686 | Train Acc: 54.62%
	 Val. Loss: 0.693 |  Val. Acc: 52.33%
Epoch: 03 | Epoch Time: 1m 49s
	Train Loss: 0.680 | Train Acc: 56.83%
	 Val. Loss: 0.743 |  Val. Acc: 51.34%
Epoch: 04 | Epoch Time: 2m 23s
	Train Loss: 0.665 | Train Acc: 59.50%
	 Val. Loss: 0.671 |  Val. Acc: 58.48%
Epoch: 05 | Epoch Time: 1m 55s
	Train Loss: 0.630 | Train Acc: 64.92%
	 Val. Loss: 0.644 |  Val. Acc: 64.52%
Epoch: 06 | Epoch Time: 1m 39s
	Train Loss: 0.603 | Train Acc: 67.90%
	 Val. Loss: 0.653 |  Val. Acc: 59.77%
Epoch: 07 | Epoch Time: 2m 14s
	Train Loss: 0.600 | Train Acc: 67.66%
	 Val. Loss: 0.620 |  Val. Acc: 68.79%
Epoch: 08 | Epoch Time: 1m 14s
	Train Loss: 0.517 | Train Acc: 74.98%
	 Val. Loss: 0.590 |  Val. Acc: 70.31%
Epoch: 09 | Epoch Time: 1m 12s
	Train Loss: 0.556 | Train Acc: 71.42%
	 Val. Loss: 0.667 |  Val. Acc: 60.61%
Epoch: 10 | Epoch T

# 改进 RNN
https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/2%20-%20Upgraded%20Sentiment%20Analysis.ipynb

In [15]:
class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers, 
                 bidirectional, dropout):
        super(LSTM, self).__init__()
        
        self.embed = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        
        self.lstm = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout,
                           batch_first=True)
        
        self.dropout = nn.Dropout(dropout)
        self.num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * self.num_directions, 1)
        
    def forward(self, x):
        embedded = self.dropout(self.embed(x)) # (BATCH_SIZE, SEQ_LEN, EMBED_DIM)

        output, (hidden, cell) = self.lstm(embedded)
        # output: (BATCH_SIZE, SEQ_LENGTH, HIDDEN_DIM)
        # hidden: (n_layers * num_directions, BATCH_SIZE, HIDDEN_DIM)
        # cell: (n_layers * num_directions, BATCH_SIZE, HIDDEN_DIM)
        
        hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        # hidden: (BATCH_SIZE, HIDDEN_DIM * 2)
        return self.fc(hidden)

In [16]:
EMBED_DIM = 128
HIDDEN_DIM = 256
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

lstm = LSTM(len(vocab), EMBED_DIM, HIDDEN_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)

embeddings = rnn.embedding.weight.data
lstm.embed.weight.data.copy_(embeddings)
lstm.embed.weight.data[0] = torch.zeros(EMBED_DIM)

In [17]:
optimizer = optim.Adam(lstm.parameters())
criterion = nn.BCEWithLogitsLoss()

In [18]:
def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc

In [19]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for text, label in iterator:
        optimizer.zero_grad()
        preds = model(text).squeeze(1)
        loss = criterion(preds.squeeze(), label.float())
        acc = binary_accuracy(preds.squeeze(), label)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [20]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
        for text, label in iterator:
            preds = model(text).squeeze(1)
            loss = criterion(preds.squeeze(), label.float())
            acc = binary_accuracy(preds.squeeze(), label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [21]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - elapsed_mins * 60)
    return elapsed_mins, elapsed_secs

In [22]:
N_EPOCHS = 10

best_test_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    
    train_loss, train_acc = train(lstm, train_iter, optimizer, criterion)
    test_loss, test_acc = evaluate(lstm, test_iter, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if test_loss < best_test_loss:
        best_test_loss = test_loss
        torch.save(lstm.state_dict(), 'LSTM-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {test_loss:.3f} |  Val. Acc: {test_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 6m 57s
	Train Loss: 0.689 | Train Acc: 53.19%
	 Val. Loss: 0.672 |  Val. Acc: 58.28%
Epoch: 02 | Epoch Time: 7m 23s
	Train Loss: 0.687 | Train Acc: 54.54%
	 Val. Loss: 0.689 |  Val. Acc: 53.26%
Epoch: 03 | Epoch Time: 7m 46s
	Train Loss: 0.671 | Train Acc: 57.88%
	 Val. Loss: 0.593 |  Val. Acc: 69.59%
Epoch: 04 | Epoch Time: 7m 39s
	Train Loss: 0.557 | Train Acc: 71.70%
	 Val. Loss: 0.493 |  Val. Acc: 75.67%
Epoch: 05 | Epoch Time: 7m 33s
	Train Loss: 0.470 | Train Acc: 78.16%
	 Val. Loss: 0.440 |  Val. Acc: 79.49%
Epoch: 06 | Epoch Time: 7m 40s
	Train Loss: 0.415 | Train Acc: 81.23%
	 Val. Loss: 0.443 |  Val. Acc: 80.81%
Epoch: 07 | Epoch Time: 7m 35s
	Train Loss: 0.381 | Train Acc: 83.26%
	 Val. Loss: 0.400 |  Val. Acc: 82.31%
Epoch: 08 | Epoch Time: 7m 38s
	Train Loss: 0.349 | Train Acc: 84.81%
	 Val. Loss: 0.404 |  Val. Acc: 82.85%
Epoch: 09 | Epoch Time: 7m 38s
	Train Loss: 0.327 | Train Acc: 85.92%
	 Val. Loss: 0.385 |  Val. Acc: 83.49%
Epoch: 10 | Epoch T