In [1]:
import torch
from torch import nn
from torchtext.datasets import IMDB

In [23]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_sequence

# Toy batch of variable-length index sequences.
raw_seqs = [[58, 5, 89], [2, 1, 9, 78, 6, 456, 7], [2]]

# Convert to tensors and order them longest-first (stable sort, in place).
seqs = [torch.tensor(indices) for indices in raw_seqs]
seqs.sort(key=len, reverse=True)

# Right-pad every sequence with zeros up to the longest one: (batch, max_len).
padded_seqs = pad_sequence(seqs, batch_first=True)
print(padded_seqs)

tensor([[  2,   1,   9,  78,   6, 456,   7],
        [ 58,   5,  89,   0,   0,   0,   0],
        [  2,   0,   0,   0,   0,   0,   0]])


In [20]:
# True (unpadded) length of every sequence, in the same order as `seqs`.
lengths = [len(sequence) for sequence in seqs]
print(lengths, padded_seqs.shape)

[7, 3, 1] torch.Size([3, 7])


In [25]:
# Pack the padded batch so that only the real (non-padding) steps are kept.
packed_seqs = pack_padded_sequence(padded_seqs, lengths, batch_first=True)
print(packed_seqs)

PackedSequence(data=tensor([  2,  58,   2,   1,   5,   9,  89,  78,   6, 456,   7]), batch_sizes=tensor([3, 2, 2, 1, 1, 1, 1]), sorted_indices=tensor([0, 1, 2]), unsorted_indices=tensor([0, 1, 2]))


In [67]:
data_dir = "../data/"

# Load the train and test splits of IMDB from the same root directory.
train_data, test_set = (
    IMDB(root=data_dir, split=split_name) for split_name in ("train", "test")
)

In [68]:
# Materialise both splits as in-memory lists of samples.
train_data, test_set = [*train_data], [*test_set]

In [69]:
from torch.utils.data import random_split

# Seed torch's global RNG before splitting so the partition is reproducible.
torch.manual_seed(0)

# Randomly split the training samples: 20,000 for training, 5,000 for validation.
train_set, valid_set = random_split(train_data, [20_000, 5_000])

In [70]:
import re


def tokenizer(text: str):
    text = re.sub("<[^>]*>", "", text)
    emoticons = re.findall("(?::|;|=)(?:-)?(?:\)|\(|D|P)", text.lower())
    text = re.sub("[\W]+", " ", text.lower()) + \
        " ".join(emoticons).replace("-", "")
    tokenized = text.split()

    return tokenized

In [71]:
import torchtext
from collections import Counter, OrderedDict

# Count every token produced by the tokenizer over the training reviews.
token_counter = Counter(
    token for _, review in train_set for token in tokenizer(review)
)

print(f"Vocab size is {len(token_counter)}")

# Building the vocabulary: most frequent tokens first, ties kept in
# first-seen order, so frequent tokens receive the smallest indices.
sorted_by_freq_tuple = token_counter.most_common()
ordered_tokens = OrderedDict(sorted_by_freq_tuple)
vocabulary = torchtext.vocab.vocab(ordered_tokens)

Vocab size is 69241


### Adding special tokens indices

In [72]:
# Reserve index 0 for the padding token and index 1 for unknown tokens.
# insert_token raises when the token is already present, so the membership
# guard keeps this cell safe to re-run on a live kernel.
for special_token, special_index in (("<pad>", 0), ("<unk>", 1)):
    if special_token not in vocabulary:
        vocabulary.insert_token(special_token, special_index)

# Any token missing from the vocabulary is looked up as index 1 ("<unk>").
vocabulary.set_default_index(1)

### Testing vocabulary

In [73]:
# Sanity check: encode one sentence and show the index of each token.
eval_sentence = "I'm the leader of project Gutenberg"
# The placeholder belongs between "for" and "are"; the previous message
# printed "Indices for  are: '...'".
print(f"Indices for '{eval_sentence}' are:")
print([vocabulary[token] for token in tokenizer(eval_sentence)])

Indices for  are: 'I'm the leader of project Gutenberg'
[10, 143, 2, 2178, 5, 1156, 1]


`plt.quiver` prend en entrée quatre tableaux : les deux premiers (`x`, `y`) donnent les origines des vecteurs et les deux derniers (`u`, `v`) leurs composantes (déplacements selon x et selon y), et non leurs points terminaux.
Ainsi pour :
```py
# Create some example data for vectors
x = [0, 1, 2, 3]
y = [0, 1, 0, 1]
u = [1, 2, -1, -2]  # x-components of vectors
v = [1, -1, 2, -2]  # y-components of vectors

# Create a figure and axis
fig, ax = plt.subplots()

# Draw arrow vectors
ax.quiver(x, y, u, v, angles='xy', scale_units='xy',
          scale=1, color='blue', label='Vectors')

# Set axis limits
ax.set_xlim(-1, 4)
ax.set_ylim(-2, 3)

# Add labels and legend
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.legend()

# Show the plot
plt.grid()
plt.show()
```

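Le premier vecteur part du point $X_1 = (x[0], y[0])$ et, avec `angles='xy'`, `scale_units='xy'` et `scale=1`, se termine au point $X_2 = (x[0] + u[0], y[0] + v[0])$ ; ici, il va de $(0, 0)$ à $(1, 1)$.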

### Utility functions

In [74]:
from typing import Literal, Tuple


def to_seq(text: str):
    r"""
    Converts a text into the list of vocabulary indices of its tokens.

    The text is split with ``tokenizer`` and every token is looked up in the
    module-level ``vocabulary``, preserving token order.
    """
    indices = []
    for token in tokenizer(text):
        indices.append(vocabulary[token])
    return indices


def label_pipeline(x: Literal["pos", "neg", 1, 2]):
    r"""
    Returns the numerical representation of an input sentence's label.

    Both label encodings used by torchtext's IMDB dataset are accepted: the
    strings 'pos' / 'neg' and the integers 2 (positive) / 1 (negative).
    Without the integer case, every sample of an integer-labelled dataset
    would silently be mapped to 0.

    Returns:
        1.0 for a positive label ('pos' or 2), otherwise 0.0.
    """
    return 1.0 if x in ("pos", 2) else 0.0


def collate_batch(batch: list[Tuple[str, str]]):
    r"""
    Turns a list of ``(label, sentence)`` samples into batched tensors.

    Returns:
        A tuple ``(padded_text_list, label_list, lengths)``:
        the index-encoded sentences zero-padded to a common length with the
        batch as first dimension, the numeric labels, and the unpadded length
        of every encoded sentence.
    """
    encoded = [
        torch.tensor(to_seq(sentence), dtype=torch.int64) for _, sentence in batch
    ]
    targets = torch.tensor([label_pipeline(label) for label, _ in batch])
    sizes = torch.tensor([sequence.size(0) for sequence in encoded])
    padded = nn.utils.rnn.pad_sequence(encoded, batch_first=True)

    return padded, targets, sizes

In [75]:
from torch.utils.data import DataLoader

# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

BATCH_SIZE = 32

# Options shared by the three loaders; only the training loader shuffles.
_loader_options = dict(batch_size=BATCH_SIZE, collate_fn=collate_batch)

train_dl = DataLoader(train_set, shuffle=True, **_loader_options)
valid_dl = DataLoader(valid_set, shuffle=False, **_loader_options)
test_dl = DataLoader(test_set, shuffle=False, **_loader_options)

In [76]:
class CustomRnn(nn.Module):
    r"""
    Two-layer RNN followed by a linear layer producing one value per sequence.

    Args:
        input_size: Number of features of each element of the input sequence.
        hidden_size: Number of features of the RNN hidden state.
    """

    def __init__(
        self,
        input_size: int,
        hidden_size: int,
    ) -> None:
        super().__init__()

        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=2,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, x: torch.Tensor):
        r"""
        Runs the RNN over ``x`` (batch first) and feeds the final hidden state
        of the last layer to the linear layer.
        """
        _, hidden = self.rnn(x)
        # hidden has one entry per layer; keep the last layer's final state.
        last_layer_state = hidden[-1]
        return self.fc(last_layer_state)

In [77]:
model = CustomRnn(input_size=64, hidden_size=32).to(device)
# Create the dummy batch on the same device as the model: feeding a CPU
# tensor to a model that lives on CUDA raises a device-mismatch error.
print(model, model(torch.randn(5, 3, 64, device=device)))

CustomRnn(
  (rnn): RNN(64, 32, num_layers=2, batch_first=True)
  (fc): Linear(in_features=32, out_features=1, bias=True)
) tensor([[ 0.1398],
        [-0.4366],
        [ 0.1169],
        [ 0.2668],
        [-0.1811]], grad_fn=<AddmmBackward0>)


In [78]:
class SentimentRnn(nn.Module):
    r"""
    Binary sentiment classifier: embedding -> LSTM -> two-layer MLP -> sigmoid.

    Args:
        vocab_size: Number of rows of the embedding table.
        embed_dim: Size of each embedding vector.
        rnn_hidden_size: Size of the LSTM hidden state.
        fc_hidden_size: Size of the hidden fully connected layer.
    """

    def __init__(
        self,
        vocab_size: int,
        embed_dim: int,
        rnn_hidden_size: int,
        fc_hidden_size: int,
    ) -> None:
        super().__init__()

        # Index 0 is declared as the padding index of the embedding table.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, batch_first=True)

        hidden_layer = nn.Linear(rnn_hidden_size, fc_hidden_size)
        output_layer = nn.Linear(fc_hidden_size, 1)
        self.fc = nn.Sequential(
            hidden_layer,
            nn.ReLU(),
            output_layer,
            nn.Sigmoid(),
        )

    def forward(self, text: torch.Tensor, lengths: torch.Tensor):
        r"""
        Computes one sigmoid output per sequence.

        Args:
            text: Padded token indices with the batch as first dimension.
            lengths: Unpadded length of every sequence in ``text``.

        Returns:
            A tensor with one column holding a value in (0, 1) per sequence.
        """
        # Packing lets the LSTM stop at each sequence's true length instead of
        # running over the padding; sequences need not be pre-sorted.
        packed = nn.utils.rnn.pack_padded_sequence(
            self.embedding(text),
            lengths.cpu(),
            batch_first=True,
            enforce_sorted=False,
        )

        _, (hidden, _) = self.rnn(packed)
        return self.fc(hidden[-1])

In [79]:
from tqdm import tqdm


def train(
    net: SentimentRnn,
    loader: DataLoader,
    optimizer: torch.optim.Optimizer,
    loss_fn: nn.Module,
):
    r"""
    Trains ``net`` for one pass over ``loader``.

    Args:
        net: The model to optimise; called as ``net(texts, lengths)``.
        loader: Yields ``(texts, labels, lengths)`` batches.
        optimizer: Optimizer updating the parameters of ``net``.
        loss_fn: Loss applied to ``(predictions, labels)``.

    Returns:
        A tuple ``(accuracy, loss)`` averaged over ``len(loader.dataset)``.
    """
    net.train()
    total_acc, total_loss = 0.0, 0.0
    loader_items_count = len(loader.dataset)

    for texts, labels, lengths in tqdm(loader):
        optimizer.zero_grad()
        texts, labels, lengths = texts.to(device), labels.to(device), lengths.to(device)

        # Use the network passed as argument, not a module-level model.
        preds = net(texts, lengths)[:, 0]
        loss: torch.Tensor = loss_fn(preds, labels)
        loss.backward()
        optimizer.step()

        # preds >= .5 yields booleans; .float() turns them into 1. and 0. so
        # they can be compared with the float labels.
        current_acc = ((preds >= 0.5).float() == labels).float().sum().item()
        total_acc += current_acc

        # Undo the batch averaging of the loss so the final division by the
        # dataset size gives a per-sample mean.
        current_loss = loss.item() * labels.size(0)
        total_loss += current_loss

        print(
            f"Accuracy: {current_acc / len(labels)}, Loss: {current_loss/ len(labels)}"
        )

    return total_acc / loader_items_count, total_loss / loader_items_count

In [80]:
def evaluate(
    net: SentimentRnn,
    loader: DataLoader,
    loss_fn: nn.Module,
):
    r"""
    Evaluates ``net`` over ``loader`` without updating its parameters.

    Args:
        net: The model to evaluate; called as ``net(texts, lengths)``.
        loader: Yields ``(texts, labels, lengths)`` batches.
        loss_fn: Loss applied to ``(predictions, labels)``.

    Returns:
        A tuple ``(accuracy, loss)`` averaged over ``len(loader.dataset)``.
    """
    net.eval()
    total_acc, total_loss = 0.0, 0.0
    loader_items_count = len(loader.dataset)

    # No gradients are needed for evaluation. Calling backward() here would
    # raise, because tensors created under no_grad do not track gradients.
    with torch.no_grad():
        for texts, labels, lengths in tqdm(loader):
            texts, labels, lengths = texts.to(device), labels.to(device), lengths.to(device)
            # Use the network passed as argument, not a module-level model.
            preds = net(texts, lengths)[:, 0]
            loss: torch.Tensor = loss_fn(preds, labels)

            current_acc = ((preds >= 0.5).float() == labels).float().sum().item()
            total_acc += current_acc

            # Undo the batch averaging of the loss so the final division by
            # the dataset size gives a per-sample mean.
            current_loss = loss.item() * labels.size(0)
            total_loss += current_loss

            print(
                f"Accuracy: {current_acc / len(labels)}, Loss: {current_loss/ len(labels)}"
            )

    return total_acc / loader_items_count, total_loss / loader_items_count

In [81]:
# Seed before building the model so its initial weights are reproducible.
torch.manual_seed(1)

vocabulary_size = len(vocabulary)
embedding_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64

# The batches are moved to `device` inside train/evaluate, so the model must
# live on the same device; otherwise the forward pass fails on CUDA machines.
model = SentimentRnn(
    vocab_size=vocabulary_size,
    embed_dim=embedding_dim,
    rnn_hidden_size=rnn_hidden_size,
    fc_hidden_size=fc_hidden_size,
).to(device)
# BCELoss expects probabilities, which the model's final Sigmoid provides.
criterion = nn.BCELoss()
# The optimizer is created after the move so it tracks the on-device parameters.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=1e-3,
)
num_epochs = 1

In [None]:
# Each epoch: one optimisation pass over the training loader, then one
# evaluation pass over the validation loader.
for epoch in range(num_epochs):
    acc_train, loss_train = train(model, train_dl, optimizer, criterion)
    acc_valid, loss_valid = evaluate(model, valid_dl, criterion)
    print(
        f"Epoch {epoch} accuracy: {acc_train:.4f}",
        f" val_accuracy: {acc_valid:.4f}",
    )