In [1]:
import random
import numpy

def set_seed(seed: int, n_gpu: int):
    random.seed(seed)
    numpy.random.seed(seed)
    torch.manual_seed(seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed)

In [2]:
reviews_train = []
for line in open("../data/imdb/full_train.txt", "r"):
    reviews_train.append(line.strip())
    
reviews_test = []
for line in open("../data/imdb/full_test.txt", "r"):
    reviews_test.append(line.strip())

In [3]:
train_target = [1 if i < 12500 else 0 for i in range(25000)]
test_target = [1 if i < 12500 else 0 for i in range(25000)]

In [4]:
from itertools import chain

train_tokens = list(chain(*[[token for token in sample.lower().split()] for sample in reviews_train]))

In [5]:
from collections import Counter

train_vocabulary = Counter(train_tokens)

In [6]:
UNKNOWN_TOKEN = "unknown"
PAD_TOKEN = "PAD"

index_to_token = [UNKNOWN_TOKEN] + [PAD_TOKEN] + list(train_vocabulary.keys())
token_to_index = {token: index + 2 for index, token in enumerate(train_vocabulary.keys())}
token_to_index[UNKNOWN_TOKEN] = 0
token_to_index[PAD_TOKEN] = 1

In [7]:
EMBEDDING_DIM = 100
BATCH_SIZE = 50
MAX_INPUT_LENGTH = 100

In [8]:
def generate_batch(input_data, max_length = MAX_INPUT_LENGTH):
    
    texts = torch.tensor([padding(sample["text"], max_length) for sample in input_data], dtype=torch.long)
    labels = torch.tensor([sample["label"] for sample in input_data], dtype=torch.long)
    return texts, torch.tensor([]), labels

def padding(text_tokens, max_length, padding_token=1):
    if len(text_tokens) >= max_length:
        return text_tokens[:max_length]
    return text_tokens + [padding_token]*(max_length - len(text_tokens))

In [9]:
prepared_data = []

for label, text in zip(train_target, reviews_train):
    text_tokens = [token_to_index[token.lower()] for token in text.split()]
    prepared_data.append({"label": label, "text": text_tokens, "sequence_length": len(text_tokens)})
    
sorted_prepared_data = sorted(prepared_data, key=lambda x: x["sequence_length"], reverse=True)

In [10]:
from torch.utils.data import DataLoader

def train(input_data, collate_fn, shuffle=True):

    # Train the model
    train_loss = 0
    train_acc = 0
    data = DataLoader(input_data, batch_size=BATCH_SIZE, shuffle=shuffle, collate_fn=collate_fn)
    for i, (text, sequence_length, label) in enumerate(data): # 3
        optimizer.zero_grad()
        text, sequence_length, label = text.to(device), sequence_length.to(device), label.to(device)
        output = model(text, sequence_length)
        loss = criterion(output, label)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        train_acc += (output.argmax(1) == label).sum().item()

    return train_loss / len(input_data), train_acc / len(input_data)

def test(input_data, collate_fn):
    test_loss = 0
    acc = 0
    data = DataLoader(input_data, batch_size=BATCH_SIZE, collate_fn=collate_fn)
    for i, (text, sequence_length, label) in enumerate(data):
        text, sequence_length, label = text.to(device),  sequence_length.to(device), label.to(device)
        with torch.no_grad():
            output = model(text, sequence_length)
            loss = criterion(output, label)
            test_loss += loss.item()
            acc += (output.argmax(1) == label).sum().item()

    return test_loss / len(input_data), acc / len(input_data)

## Simple RNN

In [11]:
RNN_HIDDEN_SIZE = 100
IS_BIDIRECTIONAL = False
NUM_LAYERS = 1

In [12]:
import torch
import torch.nn as nn

class SentimentClassificationSimpleRNNModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class, hidden_size, is_bidirectional, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.RNN(input_size=embed_dim, hidden_size=hidden_size, num_layers=num_layers, bidirectional=is_bidirectional)
        self.fc = nn.Linear(hidden_size*(1 + int(is_bidirectional)), num_class)
        self.init_weights()

    def init_weights(self):
        init_range = 0.5
        self.embedding.weight.data.uniform_(-init_range, init_range)
        self.fc.weight.data.uniform_(-init_range, init_range)
        self.fc.bias.data.zero_()

    def forward(self, text, sequence_length=None):
        embedded = self.embedding(text)
        output, _ = self.rnn(embedded)
        pooled_output, _ = output.max(dim=1)
        return self.fc(pooled_output)

In [13]:
set_seed(42, 1)

In [14]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = SentimentClassificationSimpleRNNModel(
    vocab_size=len(index_to_token), 
    embed_dim=EMBEDDING_DIM, 
    num_class=len(set(train_target)), 
    hidden_size=RNN_HIDDEN_SIZE,
    is_bidirectional=IS_BIDIRECTIONAL,
    num_layers=NUM_LAYERS
).to(device)

In [15]:
import time
from torch.utils.data.dataset import random_split

N_EPOCHS = 10
min_valid_loss = float('inf')

criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

train_len = int(len(prepared_data) * 0.95)
train_data, validation_data = \
    random_split(prepared_data, [train_len, len(prepared_data) - train_len])

for epoch in range(N_EPOCHS):

    start_time = time.time()
    train_loss, train_acc = train(train_data, generate_batch)
    valid_loss, valid_acc = test(validation_data, generate_batch)

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

Epoch: 1  | time in 0 minutes, 2 seconds
	Loss: 0.0130(train)	|	Acc: 61.7%(train)
	Loss: 0.0121(valid)	|	Acc: 65.6%(valid)
Epoch: 2  | time in 0 minutes, 2 seconds
	Loss: 0.0113(train)	|	Acc: 70.0%(train)
	Loss: 0.0116(valid)	|	Acc: 69.2%(valid)
Epoch: 3  | time in 0 minutes, 3 seconds
	Loss: 0.0105(train)	|	Acc: 73.5%(train)
	Loss: 0.0101(valid)	|	Acc: 74.2%(valid)
Epoch: 4  | time in 0 minutes, 3 seconds
	Loss: 0.0099(train)	|	Acc: 75.4%(train)
	Loss: 0.0099(valid)	|	Acc: 75.3%(valid)
Epoch: 5  | time in 0 minutes, 3 seconds
	Loss: 0.0095(train)	|	Acc: 77.0%(train)
	Loss: 0.0108(valid)	|	Acc: 72.8%(valid)
Epoch: 6  | time in 0 minutes, 3 seconds
	Loss: 0.0091(train)	|	Acc: 78.3%(train)
	Loss: 0.0096(valid)	|	Acc: 77.2%(valid)
Epoch: 7  | time in 0 minutes, 3 seconds
	Loss: 0.0088(train)	|	Acc: 79.2%(train)
	Loss: 0.0096(valid)	|	Acc: 77.8%(valid)
Epoch: 8  | time in 0 minutes, 3 seconds
	Loss: 0.0085(train)	|	Acc: 80.0%(train)
	Loss: 0.0099(valid)	|	Acc: 77.4%(valid)
Epoch: 9  | time

## Masked Padding

У нас есть два предложения: "I like watching movies" and "I don't like watching movies". После токенизации мы получим: `["I", "like", "watching", "movies"]` (4 токена) и `["I", "don't", "like", "watching", "movies"]` (5 токенов).

Чтобы сформировать батч, оба предложения должны иметь одинаковое количество токенов, поэтому мы добавляем к первому предложению «padding токен» и получаем
`["I", "like", "watching", "movies", "PAD"]` (5 токенов) и `["I", "don't", "like", "watching", "movies"]` (5 токенов)

Теперь наша модель будет знать, что мы использовали padding для первого предложения.
Должна ли наша модель знать об этом?

Мы можем маскировать токены, и наша модель их проигнорирует. Использовать `torch.nn.utils.rnn.pack_padded_sequence`

In [16]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class SentimentClassificationMaskedRNNModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class, hidden_size, is_bidirectional, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.RNN(input_size=embed_dim, hidden_size=hidden_size, num_layers=num_layers, bidirectional=is_bidirectional)
        self.fc = nn.Linear(hidden_size*(1 + int(is_bidirectional)), num_class)
        self.init_weights()

    def init_weights(self):
        init_range = 0.5
        self.embedding.weight.data.uniform_(-init_range, init_range)
        self.fc.weight.data.uniform_(-init_range, init_range)
        self.fc.bias.data.zero_()

    def forward(self, text, sequence_length):
        embedded = self.embedding(text)
        packed_output, _ = self.rnn(pack_padded_sequence(embedded, sequence_length, batch_first=True))
        output, _ = pad_packed_sequence(packed_output)
        pooled_output, _ = output.max(dim=0)
        return self.fc(pooled_output)

In [17]:
from typing import Dict

def padding(text_tokens, max_length, padding_token=1):
    if len(text_tokens) >= max_length:
        return text_tokens[:max_length]
    return text_tokens + [padding_token]*(max_length - len(text_tokens))

def generate_batch_with_masking(input_data: Dict[str, str], max_length = MAX_INPUT_LENGTH):
    # return (text, sequence_length), labels
    max_batch_sequence_length = max(sequence["sequence_length"] for sequence in input_data)
    prepared_data = []
    sequence_length = []
    for sample in input_data:
        prepared_data.append(padding(sample["text"], max_length))
        sequence_length.append(min(max_batch_sequence_length, max_length))
        
    prepared_data = torch.tensor(prepared_data, dtype=torch.long)
    sequence_length = torch.tensor(sequence_length, dtype=torch.long)
    sequence_length, perm_idx = sequence_length.sort(0, descending=True)
    prepared_data = prepared_data[perm_idx]
    
    labels = torch.tensor([sample["label"] for sample in input_data], dtype=torch.long)[perm_idx]
    return prepared_data, sequence_length, labels

In [18]:
generate_batch_with_masking(
    [
        {"text": [1, 2, 3, 4], "label": 1, "sequence_length": 4}, 
        {"text": [1, 5, 2, 3, 4], "label": 0, "sequence_length": 5}, 
    ]
)

(tensor([[1, 2, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1],
         [1, 5, 2, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1]]),
 tensor([5, 5]),
 tensor([1, 0]))

In [19]:
set_seed(42, 1)

In [20]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = SentimentClassificationMaskedRNNModel(
    vocab_size=len(index_to_token), 
    embed_dim=EMBEDDING_DIM, 
    num_class=len(set(train_target)), 
    hidden_size=RNN_HIDDEN_SIZE,
    is_bidirectional=IS_BIDIRECTIONAL,
    num_layers=NUM_LAYERS
).to(device)

In [21]:
import time
from torch.utils.data.dataset import random_split

N_EPOCHS = 10
min_valid_loss = float('inf')

criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

train_len = int(len(prepared_data) * 0.95)
train_data, validation_data = \
    random_split(prepared_data, [train_len, len(prepared_data) - train_len])

for epoch in range(N_EPOCHS):

    start_time = time.time()
    train_loss, train_acc = train(train_data, generate_batch_with_masking, True)
    valid_loss, valid_acc = test(validation_data, generate_batch_with_masking)

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

Epoch: 1  | time in 0 minutes, 5 seconds
	Loss: 0.0129(train)	|	Acc: 62.9%(train)
	Loss: 0.0120(valid)	|	Acc: 65.5%(valid)
Epoch: 2  | time in 0 minutes, 5 seconds
	Loss: 0.0108(train)	|	Acc: 72.3%(train)
	Loss: 0.0117(valid)	|	Acc: 69.5%(valid)
Epoch: 3  | time in 0 minutes, 4 seconds
	Loss: 0.0098(train)	|	Acc: 75.9%(train)
	Loss: 0.0099(valid)	|	Acc: 75.4%(valid)
Epoch: 4  | time in 0 minutes, 5 seconds
	Loss: 0.0090(train)	|	Acc: 78.7%(train)
	Loss: 0.0106(valid)	|	Acc: 73.6%(valid)
Epoch: 5  | time in 0 minutes, 5 seconds
	Loss: 0.0084(train)	|	Acc: 80.4%(train)
	Loss: 0.0122(valid)	|	Acc: 70.9%(valid)
Epoch: 6  | time in 0 minutes, 5 seconds
	Loss: 0.0078(train)	|	Acc: 82.1%(train)
	Loss: 0.0096(valid)	|	Acc: 76.2%(valid)
Epoch: 7  | time in 0 minutes, 5 seconds
	Loss: 0.0074(train)	|	Acc: 83.1%(train)
	Loss: 0.0094(valid)	|	Acc: 77.4%(valid)
Epoch: 8  | time in 0 minutes, 5 seconds
	Loss: 0.0068(train)	|	Acc: 85.0%(train)
	Loss: 0.0127(valid)	|	Acc: 69.3%(valid)
Epoch: 9  | time