# Imports


In [1]:
import torchtext
import os
import collections
from tqdm import tqdm

# Setup


In [2]:
data_dir = "../data/"
batch_size = 32
os.makedirs(data_dir, exist_ok=True)
train_dataset = torchtext.datasets.AG_NEWS(
    root=data_dir,
    split="train",
)

test_dataset = torchtext.datasets.AG_NEWS(
    root=data_dir,
    split="test",
)

classes = ["World", "Sports", "Business", "Sci/Tech"]

In [3]:
train_dataset = list(train_dataset)
test_dataset = list(test_dataset)

In [4]:
tokenizer = torchtext.data.utils.get_tokenizer("basic_english")

first_sentence = train_dataset[0][1]
second_sentence = train_dataset[1][1]

f_tokens = tokenizer(first_sentence)
s_tokens = tokenizer(second_sentence)

print(f"First list of tokens:\n{f_tokens}")
print(f"Second list of tokens:\n{s_tokens}")

First list of tokens:
['wall', 'st', '.', 'bears', 'claw', 'back', 'into', 'the', 'black', '(', 'reuters', ')', 'reuters', '-', 'short-sellers', ',', 'wall', 'street', "'", 's', 'dwindling\\band', 'of', 'ultra-cynics', ',', 'are', 'seeing', 'green', 'again', '.']
Second list of tokens:
['carlyle', 'looks', 'toward', 'commercial', 'aerospace', '(', 'reuters', ')', 'reuters', '-', 'private', 'investment', 'firm', 'carlyle', 'group', ',', '\\which', 'has', 'a', 'reputation', 'for', 'making', 'well-timed', 'and', 'occasionally\\controversial', 'plays', 'in', 'the', 'defense', 'industry', ',', 'has', 'quietly', 'placed\\its', 'bets', 'on', 'another', 'part', 'of', 'the', 'market', '.']


# Création du vocabulaire


In [5]:
counter = collections.Counter()

for label, sentence in tqdm(train_dataset):
    counter.update(tokenizer(sentence))

vocabulary = torchtext.vocab.vocab(counter, min_freq=1)

100%|██████████| 120000/120000 [00:03<00:00, 32512.34it/s]


In [6]:
vocab_size = len(vocabulary)
print(f"Vocabulary contains {vocab_size} tokens")

Vocabulary contains 95810 tokens


In [7]:
itos = vocabulary.get_itos()
stoi = vocabulary.get_stoi()


def encode(x: str):
    return [stoi[s] for s in tokenizer(x)]


def decode(x: list[int]):
    return [itos[s] for s in x]

In [8]:
vec = encode(first_sentence)
print(vec)
print(decode(vec))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 10, 12, 13, 14, 0, 15, 16, 17, 18, 19, 20, 14, 21, 22, 23, 24, 2]
['wall', 'st', '.', 'bears', 'claw', 'back', 'into', 'the', 'black', '(', 'reuters', ')', 'reuters', '-', 'short-sellers', ',', 'wall', 'street', "'", 's', 'dwindling\\band', 'of', 'ultra-cynics', ',', 'are', 'seeing', 'green', 'again', '.']


In [9]:
from torchtext.data.utils import ngrams_iterator

bi_counter = collections.Counter()

for label, sentence in tqdm(train_dataset):
    bi_counter.update(
        ngrams_iterator(
            tokenizer(sentence),
            ngrams=2,
        )
    )

bi_vocab = torchtext.vocab.vocab(
    bi_counter,
    min_freq=2,
)

print(f"Bi-gram vocab size {len(bi_vocab)}")

  1%|          | 1287/120000 [00:00<00:09, 11876.57it/s]

100%|██████████| 120000/120000 [00:13<00:00, 8907.92it/s]


Bi-gram vocab size 481969


In [10]:
bi_itos = bi_vocab.get_itos()
bi_stoi = bi_vocab.get_stoi()


def bi_encode(phrase: str):
    return [bi_stoi[s] for s in tokenizer(phrase)]


def bi_decode(tokens: list[int]):
    return [bi_itos[s] for s in tokens]

In [11]:
bi_vec = encode(first_sentence)
print(bi_vec)
print(bi_decode(bi_vec))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 10, 12, 13, 14, 0, 15, 16, 17, 18, 19, 20, 14, 21, 22, 23, 24, 2]
['wall', 'st', '.', 'bears', 'claw', 'back', 'into', 'the', 'black', '(', 'reuters', ')', 'reuters', '-', 'short-sellers', ',', 'wall', 'street', "'", 's', 'of', 'ultra-cynics', 'are', ',', 'seeing', 'green', 'again', 'wall st', '.']


# Bag of Words


In [12]:
import torch


def to_bow(text: str, bow_vocab_size=vocab_size):
    result = torch.zeros(bow_vocab_size, dtype=torch.float32)

    # retourne une série d'indices [0, 1, 0, ..., 52]
    for idx in encode(text):
        if idx < bow_vocab_size:
            result[idx] += 1  # on compte le nombre d'occurence de chaque indice

    return result

In [13]:
print(encode(first_sentence))
print(to_bow(first_sentence))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 10, 12, 13, 14, 0, 15, 16, 17, 18, 19, 20, 14, 21, 22, 23, 24, 2]
tensor([2., 1., 2.,  ..., 0., 0., 0.])


In PyTorch's DataLoader, the collate_fn is a function that is used to customize the way individual samples are collated (grouped together) into batches when loading data from a dataset. The purpose of the collate_fn is to handle the differences in sizes and shapes of individual samples so that they can be efficiently batched and processed by the neural network.

Here's how it works:

    Batching Process: When you use a DataLoader to load data from a dataset, the DataLoader groups a specified number of individual samples into a batch. Each batch is then passed to your model for processing. Since samples in a batch need to have consistent shapes (e.g., the same sequence length for text data), you might need to perform some preprocessing to ensure that all samples in a batch have the same shape.

    Collate Function (collate_fn): The collate_fn parameter in the DataLoader allows you to specify a custom function that determines how individual samples should be combined into a batch. This function takes a list of individual samples and returns a batch tensor. The collate_fn is called by the DataLoader for each batch creation, and its role is to handle any necessary preprocessing to ensure consistent shapes within a batch.

It can translated as:

1. "Fonction de Regroupement" - This translates to "Grouping Function," which roughly captures the idea that the function is responsible for grouping individual samples into batches.

2. "Fonction de Mise en Lot" - This translates to "Batching Function," which highlights the function's role in creating batches of data.

3. "Fonction de Prétraitement pour Lots" - This translates to "Preprocessing Function for Batches," which emphasizes the preprocessing aspect of the function.


In [14]:
from torch.utils.data import DataLoader


def bowify(batch):
    return (
        torch.LongTensor([t[0] - 1 for t in batch]),  # labels
        torch.stack([to_bow(t[1]) for t in batch]),  # BoW representation
    )


train_loader = DataLoader(
    train_dataset,
    batch_size=16,
    collate_fn=bowify,
    shuffle=True,
)
test_loader = DataLoader(
    train_dataset,
    batch_size=16,
    collate_fn=bowify,
    shuffle=False,
)

In [15]:
from torch import nn

model = nn.Sequential(
    nn.Linear(vocab_size, 4),
    nn.LogSoftmax(dim=1),
)

In [16]:
def train_epoch(
    net: nn.Module,
    loader: DataLoader,
    lr: float = 1e-3,
    optimizer: torch.optim.Optimizer | None = None,
    loss_fn=nn.NLLLoss(),
    epoch_size=None,
    report_freq=200,
):
    r"""
    Args:
        epoch_size: permet d'arrêter le processus au cours de l'epoch actuel si on entrainer le modèle sur un certain nombre d'échantillon
    """
    optimizer = (
        optimizer if optimizer != None else torch.optim.Adam(net.parameters(), lr=lr)
    )
    net.train()
    total_loss, accuracy, count, idx = 0.0, 0.0, 0, 0

    for labels, features in tqdm(loader):
        optimizer.zero_grad()
        output = net(features)
        loss: torch.Tensor = loss_fn(output, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        _, predicted = torch.max(output, 1)
        accuracy += (labels == predicted).sum()
        count += len(labels)
        idx += 1

        if idx % report_freq == 0:
            print(f"{count}: accuracy = {round(accuracy.item()/count * 100)}%")

        if epoch_size and count > epoch_size:
            break

    return total_loss, accuracy.item() / count

In [17]:
train_epoch(net=model, loader=train_loader, epoch_size=2000)

  2%|▏         | 125/7500 [00:09<09:17, 13.23it/s]


(151.4331693649292, 0.7023809523809523)

In [18]:
N = 1_000
df = torch.zeros(vocab_size)

for _, doc in tqdm(train_dataset[:N]):
    for idx in set(encode(doc)):
        df[idx] += 1

  6%|▋         | 63/1000 [00:00<00:01, 622.25it/s]

100%|██████████| 1000/1000 [00:01<00:00, 617.70it/s]


In [19]:
def tf_idf(s: str):
    bow = to_bow(s)
    # Laplace smoothing (lissage de Laplace) pour éviter la division par zéro
    return bow * torch.log((N + 1) / (df + 1))

In [20]:
_, max_idx = tf_idf(first_sentence).max(dim=0)
print(vocabulary.get_itos()[encode(first_sentence)[max_idx]]), first_sentence

wall


(None,
 "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")

# Embeddings


In [21]:
from torch.nn import functional as F


def padify(batch):
    r"""
    Args:
        batch: est une liste de tuples de la forme (label, feature = text sequence)
    """

    vec = [encode(sample[1]) for sample in batch]
    max_length = max(map(len, vec))  # max length sequence in the mini-batch

    return (
        torch.LongTensor([elt[0] - 1 for elt in batch]),
        torch.stack(
            [
                F.pad(
                    torch.tensor(elt),
                    # padding à droite (à la fin du vecteur)
                    # (0, max_length - len(elt)) => ce tuple indique la taille du padding
                    # On a pour un tenseur 1D, (padding_left, padding_right) le nombre de cases qu'on va ajouter après le padding
                    # dans ce cas 0 à gauche et max_length - len(elt) à droite
                    (0, max_length - len(elt)),
                    mode="constant",
                    value=0,
                )
                for elt in vec
            ]
        ),
    )

In [22]:
train_labels, train_features = padify(train_dataset)
print(train_features, train_features.size())

tensor([[    0,     1,     2,  ...,     0,     0,     0],
        [   25,    26,    27,  ...,     0,     0,     0],
        [   54,    41,    55,  ...,     0,     0,     0],
        ...,
        [36336,   809,  2615,  ...,     0,     0,     0],
        [  924,    16,    17,  ...,     0,     0,     0],
        [ 7836,   892,  6107,  ...,     0,     0,     0]]) torch.Size([120000, 207])


In [23]:
class EmbeddingClassifier(nn.Module):
    def __init__(
        self,
        vocabulary_size: int,
        embed_dim: int,
        num_class: int,
    ) -> None:
        super().__init__()

        self.embedding = nn.Embedding(vocabulary_size, embed_dim)
        self.fc = nn.Linear(embed_dim, num_class)

    def forward(self, x: torch.Tensor):
        x = self.embedding(x)
        x = torch.mean(x, dim=1)
        return self.fc(x)

In [24]:
train_loader = DataLoader(
    train_dataset,
    batch_size=16,
    collate_fn=padify,
    shuffle=True,
)
net = EmbeddingClassifier(vocab_size, 32, len(classes))

In [25]:
train_epoch(net, train_loader, lr=1e-3, epoch_size=25000)

  3%|▎         | 205/7500 [00:05<03:18, 36.67it/s]

3200: accuracy = 25%


  5%|▌         | 404/7500 [00:10<03:24, 34.74it/s]

6400: accuracy = 26%


  8%|▊         | 605/7500 [00:15<02:36, 43.95it/s]

9600: accuracy = 25%


 11%|█         | 805/7500 [00:20<02:34, 43.39it/s]

12800: accuracy = 25%


 13%|█▎        | 1006/7500 [00:27<03:11, 33.97it/s]

16000: accuracy = 25%


 16%|█▌        | 1203/7500 [00:35<05:24, 19.41it/s]

19200: accuracy = 25%


 19%|█▊        | 1402/7500 [00:46<05:50, 17.41it/s]

22400: accuracy = 25%


 21%|██        | 1562/7500 [00:56<03:34, 27.74it/s]


(-62843.588962043636, 0.25059980806142035)

In [26]:
class EmbeddingBagClassifier(nn.Module):
    def __init__(self, vocabulary_size: int, embed_dim: int, num_class: int) -> None:
        super().__init__()

        self.embedding = nn.EmbeddingBag(vocabulary_size, embed_dim)
        self.fc = nn.Linear(embed_dim, num_class)

    def forward(self, text, offset):
        x = self.embedding(text, offset)
        return self.fc(x)

In [27]:
def offsetify(batch: tuple[int, str]):
    x = [torch.tensor(encode(sample[1])) for sample in batch]  # encodage des séquences
    # calcule de la taille de chaque vecteur
    offset = [0] + [len(elt) for elt in x]
    # somme cumulée croissante des tailles de vecteur
    offset = torch.tensor(offset[:-1]).cumsum(dim=0)

    return (
        torch.LongTensor(
            [sample[0] - 1 for sample in batch]
        ),  # ajustement des labels du batch
        torch.cat(x),  # concatenation pour avoir un vecteur de taille (1, N)
        offset,  # vecteur des offsets
    )

In [28]:
res = offsetify(
    [
        (1, first_sentence),
        (2, second_sentence),
        (1, first_sentence),
    ]
)

print(res)

(tensor([0, 1, 0]), tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 10, 12, 13, 14,  0, 15,
        16, 17, 18, 19, 20, 14, 21, 22, 23, 24,  2, 25, 26, 27, 28, 29,  9, 10,
        11, 10, 12, 30, 31, 32, 25, 33, 14, 34, 35, 36, 37, 38, 39, 40, 41, 42,
        43, 44,  7, 45, 46, 14, 35, 47, 48, 49, 50, 51, 52, 19,  7, 53,  2,  0,
         1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 10, 12, 13, 14,  0, 15, 16,
        17, 18, 19, 20, 14, 21, 22, 23, 24,  2]), tensor([ 0, 29, 71]))


In [29]:
decode(res[1][res[2][1] : res[2][2]])

['carlyle',
 'looks',
 'toward',
 'commercial',
 'aerospace',
 '(',
 'reuters',
 ')',
 'reuters',
 '-',
 'private',
 'investment',
 'firm',
 'carlyle',
 'group',
 ',',
 '\\which',
 'has',
 'a',
 'reputation',
 'for',
 'making',
 'well-timed',
 'and',
 'occasionally\\controversial',
 'plays',
 'in',
 'the',
 'defense',
 'industry',
 ',',
 'has',
 'quietly',
 'placed\\its',
 'bets',
 'on',
 'another',
 'part',
 'of',
 'the',
 'market',
 '.']

In [30]:
res = torch.randn((2, 2))

print(res, res.mean(), res.mean(dim=1))

tensor([[-1.4289,  1.8165],
        [-0.8550, -1.1369]]) tensor(-0.4011) tensor([ 0.1938, -0.9959])


# RNN & LSTM


In [31]:
class RnnClassifier(nn.Module):
    def __init__(
        self,
        vocabulary_size: int,
        embed_dim: int,
        hidden_dim: int,
        num_classes: int,
    ) -> None:
        super().__init__()

        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocabulary_size, embed_dim)
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x: torch.Tensor):
        x = self.embedding(x)
        x, h = self.rnn(x)

        return self.fc(x.mean(dim=1))

In [32]:
train_loader = DataLoader(
    train_dataset,
    batch_size=16,
    collate_fn=padify,
    shuffle=True,
)

rnn_net = RnnClassifier(
    vocab_size,
    embed_dim=64,
    hidden_dim=32,
    num_classes=len(classes),
)
train_epoch(
    rnn_net,
    train_loader,
    lr=1e-3,
    epoch_size=5000,
)

  0%|          | 2/7500 [00:00<22:20,  5.59it/s]

  3%|▎         | 201/7500 [00:21<14:23,  8.45it/s]

3200: accuracy = 24%


  4%|▍         | 312/7500 [00:33<12:49,  9.34it/s]


(-1882.4174821265042, 0.2452076677316294)

In [None]:
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=16,
    collate_fn=padify,
    shuffle=False,
)

rnn_net.eval()

with torch.no_grad():
    for batch_idx, (labels, sentences) in enumerate(test_loader):
        word_lookup = [itos[word] for word in sentences[batch_idx]]
        unknown_vals = set("<unk>")
        word_lookup = [elt for elt in word_lookup if elt not in unknown_vals]

        print(f"Input text: {word_lookup}")

        predictions: torch.Tensor = rnn_net(sentences)
        print(torch.argmax(predictions[batch_idx]))

        print(
            f"Actual: value = {labels[batch_idx]}, class = {classes[labels[batch_idx]]}"
        )

        print(
            f"Predicted: value = {predictions[0].argmax(0)}, class = {classes[predictions[0].argmax(0)]}"
        )
        break

In [None]:
import torch
from torch import nn

torch.manual_seed(0)
rnn_layer = nn.RNN(
    input_size=5,
    hidden_size=2,
    num_layers=1,
    batch_first=True,
)

rnn_layer.state_dict()

OrderedDict([('weight_ih_l0',
              tensor([[-0.0053,  0.3793, -0.5820, -0.5204, -0.2723],
                      [ 0.1896, -0.0140,  0.5607, -0.0628,  0.1871]])),
             ('weight_hh_l0',
              tensor([[-0.2137, -0.1390],
                      [-0.6755, -0.4683]])),
             ('bias_ih_l0', tensor([-0.2915,  0.0262])),
             ('bias_hh_l0', tensor([0.2795, 0.4243]))])

In [None]:
w_xh = rnn_layer.weight_ih_l0
b_xh = rnn_layer.bias_ih_l0

w_hh = rnn_layer.weight_hh_l0
b_hh = rnn_layer.bias_hh_l0

print(
    f"""
W_xh shape: {w_xh.shape} \n
B_xh shape: {b_xh.shape} \n
W_hh shape: {w_hh.shape} \n
B_hh shape: {b_hh.shape} \n
"""
)


W_xh shape: torch.Size([2, 5]) 

B_xh shape: torch.Size([2]) 

W_hh shape: torch.Size([2, 2]) 

B_hh shape: torch.Size([2]) 




In [None]:
# input of shape (batch_size, seq_length, num_features=5)
x_seq = torch.tensor([[1.0] * 5, [2.0] * 5, [3.0] * 5]).float()
x_seq.size(), x_seq

(torch.Size([3, 5]),
 tensor([[1., 1., 1., 1., 1.],
         [2., 2., 2., 2., 2.],
         [3., 3., 3., 3., 3.]]))

In [None]:
# c'est comme créer un Dataset ayant un seul batch de 3 séquences)
output, hn = rnn_layer(x_seq.reshape(1, 3, 5))
manual_output = []

for t in range(3):
    xt = x_seq[t].reshape(1, 5)
    print(f"Time step {t} => Input:\t{xt.numpy()}")

    ht = torch.matmul(xt, torch.transpose(w_xh, 0, 1)) + b_xh
    print(f"Hidden: {ht.detach().numpy()}")

    if t > 0:
        prev_h = manual_output[t - 1]
    else:
        prev_h = torch.zeros_like(ht)

    ot = ht + torch.matmul(prev_h, torch.transpose(w_hh, 0, 1)) + b_hh
    ot = torch.tanh(ot)
    manual_output.append(ot)

    print(
        f"""
    Output (manual): {ot.detach().numpy()}\n
    RNN output: {output[:, t].detach().numpy()}\n
    """
    )

Time step 0 => Input:	[[1. 1. 1. 1. 1.]]
Hidden: [[-1.2921703  0.886815 ]]

    Output (manual): [[-0.76684606  0.86455226]]

    RNN output: [[-0.76684606  0.86455226]]

    
Time step 1 => Input:	[[2. 2. 2. 2. 2.]]
Hidden: [[-2.2928548  1.7474363]]

    Output (manual): [[-0.961816   0.9794913]]

    RNN output: [[-0.961816   0.9794913]]

    
Time step 2 => Input:	[[3. 3. 3. 3. 3.]]
Hidden: [[-3.2935395  2.6080575]]

    Output (manual): [[-0.9944769  0.9968337]]

    RNN output: [[-0.9944769  0.9968337]]

    


## Text generation

In [35]:
def char_tokenizer(word: str):
    return list(word)  # equals to [letter for letter in word]

In [36]:
counter = collections.Counter()

for _, sentence in tqdm(train_dataset):
    counter.update(char_tokenizer(sentence))

100%|██████████| 120000/120000 [00:01<00:00, 71780.00it/s]


In [40]:
vocabulary = torchtext.vocab.vocab(counter)
vocabulary_size = len(vocabulary)

print(
    f"Vocab size: {vocabulary_size}\nEncoding of 'a' is {vocabulary['a']}\nCharacter with code 13 is {vocabulary.get_itos()[13]}"
)

Vocab size: 82
Encoding of 'a' is 1
Character with code 13 is c


In [None]:
torch.zeros()