In [2]:
from datasets import load_dataset
from transformers import BertTokenizer, BertModel, EncoderDecoderModel, BertConfig, Trainer, TrainingArguments
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [4]:
device = 'mps' # or none or cuda

In [5]:
books = load_dataset("opus_books", "en-fr")

In [6]:
books = books["train"].train_test_split(test_size=0.2)

In [7]:
tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")

In [9]:
print("Length of vocab:", tokenizer.vocab_size)
tokens = tokenizer.encode("Boy do i love NLP")
print(tokens)
tokenizer.decode(tokens)

Length of vocab: 30522
[101, 2879, 2079, 1045, 2293, 17953, 2361, 102]


'[CLS] boy do i love nlp [SEP]'

In [10]:
class TranslationDataset(Dataset):
    def __init__(self, dataset, tokenizer, max_length=128):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        source_text = self.dataset[idx]["translation"]["en"]
        target_text = self.dataset[idx]["translation"]["fr"]

        source_encodings = self.tokenizer(source_text, truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt")
        target_encodings = self.tokenizer(target_text, truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt")

        return {
            'input_ids': source_encodings['input_ids'].flatten(),
            'attention_mask': source_encodings['attention_mask'].flatten(),
            'labels': target_encodings['input_ids'].flatten()
        }

# Create train and validation datasets
train_data = books['train']
valid_data = books['test']

train_dataset = TranslationDataset(train_data, tokenizer)
valid_dataset = TranslationDataset(valid_data, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=16)

In [11]:
batch = next(iter(train_loader))
print("Each batch has these keys:", batch.keys())
print("Each value is a tensor of shape batchxd:", batch['input_ids'].shape)
print("An example input, label pair:")
print("English:", tokenizer.decode(batch['input_ids'][0]))
print("French:", tokenizer.decode(batch['labels'][0]))

Each batch has these keys: dict_keys(['input_ids', 'attention_mask', 'labels'])
Each value is a tensor of shape batchxd: torch.Size([16, 128])
An example input, label pair:
English: [CLS] mais continuez donc, qu ’ avez - vous fait apres cela? » [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
French: [CLS] go on, though. what did you do next? " [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 

In [24]:
from transformers import EncoderDecoderModel

# Initialize the encoder and decoder as a single model
model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased")

# Configurations for the model (Optional but useful for fine-tuning)
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.eos_token_id = tokenizer.sep_token_id
model.config.max_length = 128
model.config.no_repeat_ngram_size = 3
model.config.pad_token_id = tokenizer.pad_token_id
model.config.vocab_size = model.config.encoder.vocab_size

# Freeze BERT encoder layers to fine-tune only the decoder
for param in model.encoder.parameters():
    param.requires_grad = False


Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.10.crossattention.output.dense.bias', 'bert.encoder.layer.4.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.10.crossattention.self.query.bias', 'bert.encoder.layer.7.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.9.crossattention.self.key.weight', 'bert.encoder.layer.4.crossattention.output.dense.weight', 'bert.encoder.layer.6.crossattention.self.value.weight', 'bert.encoder.layer.2.crossattention.self.key.weight', 'bert.encoder.layer.4.crossattention.self.query.weight', 'bert.encoder.layer.7.crossattention.self.key.bias', 'bert.encoder.layer.10.crossattention.self.query.weight', 'bert.encoder.layer.11.crossattention.self.value.weight', 'bert.encoder.layer.11.crossattention.output.dense.bias', 'bert.encoder.layer.3.crossattention.self.key.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.en

In [26]:
from transformers import TrainingArguments, Trainer

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./translation_output",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer
)


In [None]:
trainer.train()

In [None]:
# Evaluate the model
trainer.evaluate()

# Example translation
def translate(text):
    inputs = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
    outputs = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=128)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Translate an example sentence
print(translate("This is a test sentence for translation."))


## Moving beyond Huggingface

We need more control

In [12]:
class TransformerMT(nn.Module):
    def __init__(self, vocab_size, d_model=512, nhead=8, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=2048, dropout=0.1, max_length=128):
        super(TransformerMT, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = nn.Parameter(torch.zeros(1, max_length, d_model))
        
        self.transformer = nn.Transformer(d_model=d_model, nhead=nhead, num_encoder_layers=num_encoder_layers, 
                                          num_decoder_layers=num_decoder_layers, dim_feedforward=dim_feedforward, dropout=dropout)
        
        self.fc_out = nn.Linear(d_model, vocab_size)
        self.softmax = nn.LogSoftmax(dim=-1)
        
    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        src_emb = self.embedding(src) + self.pos_encoder[:, :src.size(1), :]
        tgt_emb = self.embedding(tgt) + self.pos_encoder[:, :tgt.size(1), :]
        
        output = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask)
        output = self.fc_out(output)
        return self.softmax(output)

# Initialize the model
vocab_size = tokenizer.vocab_size
model = TransformerMT(vocab_size).to(device)


In [14]:
for param in model.named_parameters():
    print(param[0], param[1].shape)

('pos_encoder', Parameter containing:
tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]], device='mps:0', requires_grad=True))
('embedding.weight', Parameter containing:
tensor([[ 0.2251, -0.9637, -0.8250,  ..., -0.5261, -0.9404,  0.7296],
        [-1.4779, -0.0222,  1.1354,  ..., -1.9905, -0.6249,  1.6038],
        [ 0.3983, -1.1049, -0.8065,  ...,  0.1593, -0.3275,  1.0760],
        ...,
        [-1.4491,  0.2941,  0.5592,  ...,  1.3600,  1.3489, -1.4876],
        [ 0.9246,  1.0259, -1.4992,  ...,  0.6567, -1.4609,  2.1453],
        [-0.1629, -0.3713,  1.0547,  ...,  0.5326, -0.1309, -0.0901]],
       device='mps:0', requires_grad=True))
('transformer.encoder.layers.0.self_attn.in_proj_weight', Parameter containing:
tensor([[-0.0049, -0.0493, -0.0422,  ...,  0.0065,  0.0521, -0