# Imports

In [1]:
import torch
import numpy as np
import torch.nn as nn

import matplotlib.pyplot as plt

# Model

## Sizes



- x : (batch_size, max_length) 

- tokens_id in x between 0 and vocab_size

- Embedd(x) : (batch_size, max_length, model_dim)

- K : (model_dim, dk)
- Kx : (batch_size, max_length, dk)

- Q : (model_dim, dk)
- Qx : (batch_size, max_length, dk)

- Qx*Kx^T : (batch_size, max_length, max_length)
- V : (model_dim, dv)
- Vx : (batch_size, max_length, dv)





## Implementation

In [120]:
import numpy as np
import torch
import torch.nn as nn


class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_length):
        super().__init__()
        self.d_model = d_model
        self.max_length = max_length
        self.compute()

    def SinPos(self, i: int, pos: int):
        if i % 2 == 0:
            return np.sin(pos / 10000 ** (2 * i / self.d_model))
        else:
            return np.cos(pos / 10000 ** (2 * i / self.d_model))

    def compute(self):
        self.Mat = torch.Tensor([[self.SinPos(i, pos) for i in range(self.d_model)] for pos in range(self.max_length)])


    def forward(self, x):
        return self.Mat[:x.shape[-1], :]


class Embedding(nn.Module):
    def __init__(self, d_model, max_length, n_embedding, dropout):
        super().__init__()
        self.max_length = max_length
        self.d_model = d_model
        self.n_embedding = n_embedding
        self.embedding = nn.Embedding(num_embeddings=n_embedding, embedding_dim=d_model)
        self.pos_encoding = PositionalEncoding(d_model=d_model, max_length=max_length)
        self.dropout = nn.Dropout(p=dropout)
        pass

    def forward(self, x):
        x = self.embedding(x) + self.pos_encoding(x)
        return self.dropout(x)


class SingleHeadAttention(nn.Module):
    def __init__(self, dk: int, dv: int, d_model: int):
        super().__init__()
        self.dk = dk
        self.dv = dv
        self.d_model = d_model
        self.K = nn.Linear(in_features=d_model, out_features=dk)
        self.Q = nn.Linear(in_features=d_model, out_features=dk)
        self.V = nn.Linear(in_features=d_model, out_features=dv)

    def forward(self, x: torch.Tensor, x_encoder: torch.Tensor = None, mask=None):
        Kx = self.K(x_encoder) if x_encoder is not None else self.K(x)
        Vx = self.V(x_encoder) if x_encoder is not None else self.V(x)
        Qx = self.Q(x)
        QK = torch.matmul(Qx, Kx.transpose(-2, -1)) / np.sqrt(self.dk)
        if mask is not None:
            QK = QK + mask
        QK = torch.softmax(QK, dim=-1)
        return torch.matmul(QK, Vx)


class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads: int, dk: int, dv: int, d_model: int):
        super().__init__()
        assert num_heads * dv == d_model, "num_heads * dv should be equal to the model dim"
        self.attention_heads = nn.ModuleList([SingleHeadAttention(dk=dk, dv=dv, d_model=d_model) for _ in range(num_heads)])
        self.WO = nn.Linear(in_features=num_heads * dv, out_features=d_model)

    def forward(self, x: torch.Tensor, x_encoder: torch.Tensor = None, mask=None):
        outputs = [head(x, x_encoder, mask) for head in self.attention_heads]
        x = torch.cat(outputs, dim=-1)
        x = self.WO(x)
        return x
    

class SublayerConnection(nn.Module): #Dropout, Add and Norm
    def __init__(self, features, dropout, eps=1e-6):
        super().__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.dropout = nn.Dropout(p=dropout)

        self.eps = eps

    def forward(self, x, sublayer_output):
        x = x + self.dropout(sublayer_output) #dropout and add
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        x = self.a_2 * (x - mean) / (std + self.eps) + self.b_2 #norm
        return x



class EncoderBlock(nn.Module):
    def __init__(self, num_heads, dk, dv, d_ff, d_model, dropout):
        super().__init__()
        self.attention = MultiHeadAttention(num_heads=num_heads, dk=dk, dv=dv, d_model=d_model)
        self.sublayer1 = SublayerConnection(features=d_model, dropout=dropout)
        self.sublayer2 = SublayerConnection(features=d_model, dropout=dropout)
        self.ff = nn.Sequential(
            nn.Linear(in_features=d_model, out_features=d_ff),
            nn.ReLU(),
            nn.Dropout(p=dropout),
            nn.Linear(in_features=d_ff, out_features=d_model),
        )

    def forward(self, x, mask=None):
        attention = self.attention(x=x, mask=mask)
        x = self.sublayer1(x, attention)
        feedforward = self.ff(x)
        x = self.sublayer2(x, feedforward)
        return x


class Encoder(nn.Module):
    def __init__(self, num_heads, dk, dv, d_ff, d_model, dropout, num_encoders):
        super().__init__()
        self.encoders_list = [EncoderBlock(num_heads=num_heads, dk=dk, dv=dv, d_ff=d_ff, d_model=d_model, dropout=dropout) for _ in range(num_encoders)]
        self.encoders = nn.ModuleList(self.encoders_list)

    def forward(self, x, mask=None):
        for encoder in self.encoders_list:
            x = encoder(x, mask)
        return x


class DecoderBlock(nn.Module):
    def __init__(self, num_heads, dk, dv, d_ff, d_model, dropout):
        super().__init__()
        self.masked_attention = MultiHeadAttention(num_heads=num_heads, dk=dk, dv=dv, d_model=d_model)
        self.mixed_attention = MultiHeadAttention(num_heads=num_heads, dk=dk, dv=dv, d_model=d_model)
        self.sublayer1 = SublayerConnection(features=d_model, dropout=dropout)
        self.sublayer2 = SublayerConnection(features=d_model, dropout=dropout)
        self.sublayer3 = SublayerConnection(features=d_model, dropout=dropout)
        self.ff = nn.Sequential(
            nn.Linear(in_features=d_model, out_features=d_ff),
            nn.ReLU(),
            nn.Dropout(p=dropout),
            nn.Linear(in_features=d_ff, out_features=d_model),
        )

    def forward(self, x, x_encoder, causal_mask=None, mixed_mask=None):
        attention = self.masked_attention(x, mask=causal_mask)
        x = self.sublayer1(x, attention)
        attention = self.mixed_attention(x, x_encoder=x_encoder, mask=mixed_mask)
        x = self.sublayer2(x, attention)
        feedforward = self.ff(x)
        x = self.sublayer3(x, feedforward)
        return x

class Decoder(nn.Module):
    def __init__(self, num_heads, dk, dv, d_ff, d_model, dropout, num_decoders):
        super().__init__()
        decoders_list = [DecoderBlock(num_heads=num_heads, dk=dk, dv=dv, d_ff=d_ff, d_model=d_model, dropout=dropout) for _ in range(num_decoders)]
        self.decoders = nn.ModuleList(decoders_list)

    def forward(self, x, x_encoder, causal_mask=None, mixed_mask=None):
        for decoder in self.decoders:
            x = decoder(x, x_encoder, causal_mask, mixed_mask)
        return x


class Transformer(nn.Module):
    def __init__(self, d_model, max_length, vocab_size, num_out, num_heads, dv, dk, d_ff, dropout, num_encoders, num_decoders):
        super().__init__()
        self.embedding = Embedding(d_model, max_length, n_embedding=vocab_size, dropout=dropout)
        self.encoder = Encoder(num_heads, dk, dv, d_ff, d_model, dropout, num_encoders)
        self.decoder = Decoder(num_heads, dk, dv, d_ff, d_model, dropout, num_decoders)
        self.linear = nn.Linear(in_features=d_model, out_features=num_out)
        self.ff_mask = torch.zeros(max_length, max_length) + torch.triu(torch.full((max_length, max_length), float("-inf")), diagonal=1)

    def forward(self, input, output):
        input_embed = self.embedding(input)
        output_embed = self.embedding(output)
        x_encoder = self.encoder(input_embed)
        x = self.decoder(output_embed, x_encoder, causal_mask=self.ff_mask)
        x = self.linear(x)
        return x
    

    def generate(self, input, max_gen_length, start_token, end_token): #greed decoding
        self.eval()
        input_embed = self.embedding(input)
        x_encoder = self.encoder(input_embed)

        generated_tokens = [start_token]
        generated_tokens_probas = [1]

        for _ in range(max_gen_length):
            output = torch.tensor(generated_tokens).unsqueeze(0) # size [1, sequence_length]
            out_embed = self.embedding(output)
            causal_mask = self.ff_mask[:out_embed.size(1), :out_embed.size(1)]
            x = self.decoder(out_embed, x_encoder, causal_mask=causal_mask)
            x = self.linear(x)
            probas = torch.softmax(x, dim=-1)
            max_proba, next_token = torch.max(probas[:, -1, :], dim=-1) #greedy decoding : only max_proba
            generated_tokens.append(next_token.item())
            generated_tokens_probas.append(max_proba.item())
            if next_token == end_token:
                break
            
        return generated_tokens, generated_tokens_probas

# Tests

In [130]:
#Parameters
batch_size = 2
d_model = 128
max_length = 100
vocab_size = 20000
num_out = vocab_size
num_heads = 8
dv = 16
dk = 16
d_ff = 512
dropout = 0.1
num_encoders = 2
num_decoders = 2
dropout = 0.1

In [131]:
x = torch.randint(0, vocab_size, (batch_size, max_length))
print(x.shape)
MyEmbedding = Embedding(d_model, max_length, n_embedding=vocab_size, dropout=dropout)
x= MyEmbedding(x)
print(x.shape)

MyEncoder = Encoder(num_heads, dk, dv, d_ff, d_model, dropout, num_encoders)
x_encoder = MyEncoder(x)
print(x.shape)

MyDecoder = Decoder(num_heads, dk, dv, d_ff, d_model, dropout, num_decoders)
x = MyDecoder(x, x_encoder) 
print(x.shape)


torch.Size([2, 100])
torch.Size([2, 100, 128])
torch.Size([2, 100, 128])
torch.Size([2, 100, 128])


In [None]:
def init_transformer():
    model = Transformer(
        d_model=d_model,
        max_length=max_length,
        vocab_size=vocab_size,
        num_out=num_out,
        num_heads=num_heads,
        dv=dv,
        dk=dk,
        d_ff=d_ff,
        dropout=dropout,
        num_encoders=num_encoders,
        num_decoders=num_decoders,
    )



#######  Forward Test  #############
input = torch.randint(0, vocab_size, (batch_size, max_length))
output = torch.randint(0, vocab_size, (batch_size, max_length))

MyTransformer = init_transformer()
out = MyTransformer(input, output)
out.shape


####### Generation Test ######
input_seq = torch.tensor([[0, 2, 3, 4, 5, 1]])
start_token = 0 
end_token = 1

generated_seq, corresponding_probas = MyTransformer.generate(input_seq, max_gen_length=10, start_token=start_token, end_token=end_token)
print(generated_seq)


[0, 2719, 2719, 2719, 2719, 2719, 2719, 2719, 2719, 2719, 2719]


## Character Level Tokenizer

In [194]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np

class MyTokenizer:
    def __init__(self, text):
        self.bos_token = "<"
        self.eos_token = ">"
        self.chars = sorted(list(set(text)))
        self.chars = [self.bos_token, self.eos_token] + self.chars
        self.char_to_idx = {char: idx for idx, char in enumerate(self.chars)}
        self.idx_to_char = {idx: char for idx, char in enumerate(self.chars)}
        self.bos_token_id = self.char_to_idx[self.bos_token]
        self.eos_token_id = self.char_to_idx[self.eos_token]

    def encode(self, text):
        return [self.char_to_idx[char] for char in text]

    def decode(self, indices):
        return ''.join([self.idx_to_char[idx] for idx in indices if idx not in {self.bos_token_id, self.eos_token_id}])


In [185]:
class TextDataset(Dataset):
    def __init__(self, text, seq_length, tokenizer):
        self.text = text
        self.seq_length = seq_length
        self.tokenizer = tokenizer
        self.data = self.preprocess(text)

    def preprocess(self, text):
        data = []
        for i in range(0, len(text) - self.seq_length):
            input_seq = text[i:i + self.seq_length]
            target_seq = text[i + 1:i + self.seq_length + 1]
            input_seq = self.tokenizer.bos_token + input_seq
            target_seq = target_seq + self.tokenizer.eos_token
            input_idx = self.tokenizer.encode(input_seq)
            target_idx = self.tokenizer.encode(target_seq)
            data.append((input_idx, target_idx))
        return data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_idx, target_idx = self.data[idx]
        return torch.tensor(input_idx), torch.tensor(target_idx)

In [197]:

# Example usage
text = "hello world. this is a simple toy text dataset for training a transformer."
seq_length = 10
tokenizer = MyTokenizer(text)
dataset = TextDataset(text, seq_length, tokenizer)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)



In [188]:

# Define the Transformer model (assuming the Transformer class is already defined)
d_model = 128
max_length = seq_length + 1  # Adjust for start token
vocab_size = len(tokenizer.chars)
num_out = vocab_size
num_heads = 8
dv = 16
dk = 16
d_ff = 512
dropout = 0.1
num_encoders = 2
num_decoders = 2
num_epochs = 60
learning_rate = 0.001

model = Transformer(d_model, max_length, vocab_size, num_out, num_heads, dv, dk, d_ff, dropout, num_encoders, num_decoders)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for input_seq, target_seq in dataloader:
        optimizer.zero_grad()
        output = model(input_seq, input_seq)
        loss = criterion(output.view(-1, vocab_size), target_seq.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(dataloader)}")

# Save the model
torch.save(model.state_dict(), "transformer_model.pth")


Epoch 1/60, Loss: 2.2776170186698437
Epoch 2/60, Loss: 1.3597143031656742
Epoch 3/60, Loss: 1.034982355311513
Epoch 4/60, Loss: 0.7859382070600986
Epoch 5/60, Loss: 0.6525621777400374
Epoch 6/60, Loss: 0.5793520892038941
Epoch 7/60, Loss: 0.48461533430963755
Epoch 8/60, Loss: 0.4736202759668231
Epoch 9/60, Loss: 0.3786097466945648
Epoch 10/60, Loss: 0.35829091630876064
Epoch 11/60, Loss: 0.341633356641978
Epoch 12/60, Loss: 0.3049099618801847
Epoch 13/60, Loss: 0.3518168574664742
Epoch 14/60, Loss: 0.35978076606988907
Epoch 15/60, Loss: 0.2855427381582558
Epoch 16/60, Loss: 0.23374111426528543
Epoch 17/60, Loss: 0.20292445935774595
Epoch 18/60, Loss: 0.22330564120784402
Epoch 19/60, Loss: 0.24253607168793678
Epoch 20/60, Loss: 0.22264130611438304
Epoch 21/60, Loss: 0.22580913186538965
Epoch 22/60, Loss: 0.20121996733359993
Epoch 23/60, Loss: 0.31154937646351755
Epoch 24/60, Loss: 0.2642110785818659
Epoch 25/60, Loss: 0.1764750420115888
Epoch 26/60, Loss: 0.20117109356215224
Epoch 27/60

In [199]:

# Load the model
model.load_state_dict(torch.load("transformer_model.pth", weights_only=True))
model.eval()

# Generate text

input_seq = tokenizer.encode(tokenizer.bos_token + "hello")


generated_text = model.generate(torch.tensor(input_seq), max_gen_length=50, start_token=tokenizer.bos_token_id, end_token=tokenizer.eos_token_id)
print("Input:", tokenizer.decode(input_seq))
print("Generated Text:", tokenizer.decode(generated_text[0]))

Input: hello
Generated Text: elowrdol.t


In [139]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np

class MyTokenizer:
    def __init__(self, text):
        self.chars = sorted(list(set(text)))
        self.char_to_idx = {char: idx for idx, char in enumerate(self.chars)}
        self.idx_to_char = {idx: char for idx, char in enumerate(self.chars)}

    def encode(self, text):
        return [self.char_to_idx[char] for char in text]

    def decode(self, indices):
        return ''.join([self.idx_to_char[idx] for idx in indices])

class TextDataset(Dataset):
    def __init__(self, text, seq_length, tokenizer):
        self.text = text
        self.seq_length = seq_length
        self.tokenizer = tokenizer
        self.data = self.preprocess(text)

    def preprocess(self, text):
        data = []
        for i in range(0, len(text) - self.seq_length):
            input_seq = text[i:i + self.seq_length]
            target_seq = text[i + 1:i + self.seq_length + 1]
            input_idx = self.tokenizer.encode(input_seq)
            target_idx = self.tokenizer.encode(target_seq)
            data.append((input_idx, target_idx))
        return data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_idx, target_idx = self.data[idx]
        return torch.tensor(input_idx), torch.tensor(target_idx)

# Example usage
text = "hello world. this is a simple text dataset for training a transformer."
seq_length = 10
tokenizer = MyTokenizer(text)
dataset = TextDataset(text, seq_length, tokenizer)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Print some examples
for input_seq, target_seq in dataloader:
    print("Input:", tokenizer.decode(input_seq[0].tolist()))
    print("Target:", tokenizer.decode(target_seq[0].tolist()))
    break

Input: for traini
Target: or trainin


In [142]:
import torch.optim as optim
import torch.nn.functional as F

# Hyperparameters
d_model = 128
max_length = seq_length
vocab_size = len(tokenizer.chars)
num_out = vocab_size
num_heads = 8
dv = 16
dk = 16
d_ff = 512
dropout = 0.1
num_encoders = 2
num_decoders = 2
num_epochs = 20
learning_rate = 0.001

# Initialize model, loss function, and optimizer
model = Transformer(d_model, max_length, vocab_size, num_out, num_heads, dv, dk, d_ff, dropout, num_encoders, num_decoders)
# for p in model.parameters():
#     if p.dim() > 1:
#         nn.init.xavier_uniform_(p)
        
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for input_seq, target_seq in dataloader:
        optimizer.zero_grad()
        output = model(input_seq, input_seq)
        loss = criterion(output.view(-1, vocab_size), target_seq.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(dataloader)}")

# Save the model
torch.save(model.state_dict(), "transformer_model.pth")

Epoch 1/20, Loss: 2.0158475716908772
Epoch 2/20, Loss: 0.9365063230196635
Epoch 3/20, Loss: 0.5599679509798686
Epoch 4/20, Loss: 0.4209852079550425
Epoch 5/20, Loss: 0.3388981262842814
Epoch 6/20, Loss: 0.23375614906350772
Epoch 7/20, Loss: 0.2032998559375604
Epoch 8/20, Loss: 0.2159905731678009
Epoch 9/20, Loss: 0.17053566339115303
Epoch 10/20, Loss: 0.1532356958836317
Epoch 11/20, Loss: 0.1635421346873045
Epoch 12/20, Loss: 0.1461115210627516
Epoch 13/20, Loss: 0.13610078835239012
Epoch 14/20, Loss: 0.1227427354703347
Epoch 15/20, Loss: 0.10143420770764351
Epoch 16/20, Loss: 0.0678825307947894
Epoch 17/20, Loss: 0.08694310352827112
Epoch 18/20, Loss: 0.09003200912848115
Epoch 19/20, Loss: 0.0856722991913557
Epoch 20/20, Loss: 0.12321084660167496


In [145]:
# Load the model
model.load_state_dict(torch.load("transformer_model.pth"))
model.eval()

# Generate text


start_text = "hello"
input_seq = tokenizer.encode("hello")
print(input_seq)


# generated_text = model.generate(input, start_token=0, 50, dataset)
# print("Generated Text:", generated_text)

[7, 4, 9, 9, 12]


  model.load_state_dict(torch.load("transformer_model.pth"))


## Tokenizer

In [128]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

tokenizer = Tokenizer(BPE())

tokenizer.pre_tokenizer = Whitespace()
special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
trainer = BpeTrainer(special_tokens=special_tokens)

tokenizer.train(["data.txt"], trainer)
tokenizer.save("bpe_tokenizer.json")

tokenizer = Tokenizer.from_file("bpe_tokenizer.json")

with open('/users/eleves-b/2021/valentin.dorseuil/Desktop/ScratchML/transformers/data.txt', 'r') as file:
    lines = file.readlines()


chunk_size = 10
overlap_size = 5
chunks = [''.join(lines[i:i+chunk_size]) for i in range(0, len(lines) - chunk_size + 1, chunk_size - overlap_size)]

print(f"Vocabulary size: {tokenizer.get_vocab_size()}")

encoded_text = tokenizer.encode_batch(chunks)
for token in special_tokens:
    token_id = tokenizer.token_to_id(token)
    print(f"Token: {token}, ID: {token_id}")

print(f"Number of chunks: {len(encoded_text)}")
print(f"Average chunk length (num tokens): {sum([len(chunk) for chunk in encoded_text])/len(encoded_text):.2f}")
print()

Exception: No such file or directory (os error 2)

## Data

In [99]:
import random

random.seed(123)
random.shuffle(chunks)

train_size = int(0.8 * len(chunks))
val_size = int(0.1 * len(chunks))
test_size = len(chunks) - train_size - val_size

train_chunks = chunks[:train_size]
val_chunks = chunks[train_size:train_size + val_size]
test_chunks = chunks[train_size + val_size:]

## Test

In [10]:


# Hyperparameters
batch_size = 16
model_dim = 512
max_length = 100
vocab_size = 32000
num_out = 4  # Four classes for AG News
num_heads = 8
dv = 64
dk = 64
d_ff = 2048
dropout = 0.1
num_encoders = 6
num_decoders = 6
num_epochs = 10
learning_rate = 0.001



# Prepare the AG News dataset
def prepare_data():
    TEXT = Field(tokenize='spacy', tokenizer_language='en_core_web_sm', include_lengths=True)
    LABEL = Field(sequential=False, use_vocab=False)
    
    train_data, test_data = AG_NEWS.splits(TEXT, LABEL)
    TEXT.build_vocab(train_data, max_size=vocab_size)
    
    train_iterator, test_iterator = BucketIterator.splits(
        (train_data, test_data),
        batch_size=batch_size,
        sort_within_batch=True,
        device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    )
    
    return train_iterator, test_iterator, TEXT.vocab

# Training loop
def train(model, train_iterator, num_epochs, learning_rate):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0
        
        for batch in train_iterator:
            input_data, input_lengths = batch.text
            target_data = batch.label
            
            optimizer.zero_grad()
            output = model(input_data, target_data)
            
            loss = criterion(output, target_data)
            loss.backward()
            optimizer.step()
            
            epoch_loss += loss.item()
        
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss/len(train_iterator):.4f}")

# if __name__ == "__main__":
#     MyTransformer = init_transformer()
#     train_iterator, test_iterator, vocab = prepare_data()
#     train(MyTransformer, train_iterator, num_epochs, learning_rate)

OSError: /users/eleves-b/2021/valentin.dorseuil/.local/lib/python3.11/site-packages/torchtext/lib/libtorchtext.so: undefined symbol: _ZN5torch3jit21setUTF8DecodingIgnoreEb