### Final

In [1]:
import re
import os
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import tensorflow as tf
import torch
import torch.nn as nn
import torch.optim as optim

from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from collections import Counter
from transformers import AutoTokenizer
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from torchinfo import summary

In [2]:
train_set = pd.read_csv('valid.csv')
test_set = pd.read_csv('test-curated.csv')

### Train Validation Split

In [3]:
SEED = 123
train_set, validation_set = train_test_split(train_set, test_size=0.2, random_state=SEED)

### Data Preprocessing - data cleaning and tokenization

In [4]:
DESC_COL = 'desc'
SLOGAN_COL = 'output'
def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s.,?!']", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [5]:
for df in [train_set, validation_set, test_set]:
    df[DESC_COL + '_cleaned'] = df[DESC_COL].apply(clean_text)
    df[SLOGAN_COL + '_cleaned'] = df[SLOGAN_COL].apply(clean_text)

In [6]:
tokenizer_name = 'facebook/bart-base'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

def tokenize_text_with_transformers(text_series, tokenizer_model, max_len=128):
    encoded_inputs = tokenizer_model(
        text_series.tolist(),
        padding='max_length',
        truncation=True,
        max_length=max_len,
        return_tensors=None,
        add_special_tokens=True
    )
    return encoded_inputs['input_ids']

for df in [train_set, validation_set, test_set]:
    df[DESC_COL + '_tokenized'] = tokenize_text_with_transformers(df[DESC_COL + '_cleaned'], tokenizer)
    df[SLOGAN_COL + '_tokenized'] = tokenize_text_with_transformers(df[SLOGAN_COL + '_cleaned'], tokenizer)

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

### Build TF datasets

In [7]:
BATCH_SIZE = 64
SHUFFLE_BUFFER_SIZE = 1000
desc_col = DESC_COL + '_tokenized'
slogan_col = SLOGAN_COL + '_tokenized'
# Build torch dataset
class SloganDataset(Dataset):
    def __init__(self, dataframe, desc_tokenizer, slogan_tokenizer, 
                 desc_col, slogan_col, 
                 max_desc_len, max_slogan_len,
                 bos_token_id, eos_token_id, pad_token_id):
        self.dataframe = dataframe
        self.desc_tokenizer = desc_tokenizer
        self.slogan_tokenizer = slogan_tokenizer
        self.desc_col = desc_col
        self.slogan_col = slogan_col
        self.max_desc_len = max_desc_len
        self.max_slogan_len = max_slogan_len
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.pad_token_id = pad_token_id

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        desc_text = str(row[self.desc_col])
        slogan_text = str(row[self.slogan_col])

        desc_encoded = self.desc_tokenizer.encode(desc_text, 
        add_special_tokens=False, 
        max_length=self.max_desc_len, 
        padding='max_length', 
        truncation=True)
        encoder_input_ids = torch.tensor(desc_encoded, dtype=torch.long)


        slogan_encoded = self.slogan_tokenizer.encode(slogan_text, add_special_tokens=False,
        max_length=self.max_slogan_len - 2,
        truncation=True) 
        slogan_ids = slogan_encoded

        decoder_input_ids = [self.bos_token_id] + slogan_ids
        decoder_target_ids = slogan_ids + [self.eos_token_id]

        return {
            'encoder_input_ids': torch.tensor(encoder_input_ids, dtype=torch.long),
            'decoder_input_ids': torch.tensor(decoder_input_ids, dtype=torch.long),
            'decoder_target_ids': torch.tensor(decoder_target_ids, dtype=torch.long)
        }

    def collate_fn(self, batch):
        encoder_inputs = pad_sequence([item['encoder_input_ids'] for item in batch], 
                                      batch_first=True, padding_value=self.pad_token_id)
        decoder_inputs = pad_sequence([item['decoder_input_ids'] for item in batch], 
                                      batch_first=True, padding_value=self.pad_token_id)

        padding_for_targets = self.pad_token_id if self.pad_token_id != -100 else -100

        decoder_targets = pad_sequence([item['decoder_target_ids'] for item in batch], 
                                       batch_first=True, padding_value=padding_for_targets)
        
        return {
            'encoder_input_ids': encoder_inputs,
            'decoder_input_ids': decoder_inputs,
            'decoder_target_ids': decoder_targets
        }


### Build Encoder-Decoder Model

In [8]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000, dropout=0.1):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
    
    def forward(self, x):
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)

In [9]:
def create_padding_mask_pt(seq, pad_token_id):
    return seq == pad_token_id

def create_src_padding_mask_pt(seq, pad_token_id):
    return seq == pad_token_id

def create_tgt_padding_mask_pt(seq, pad_token_id):
    return seq == pad_token_id

In [10]:
class Transformer(nn.Module):
    def __init__(self, input_vocab_size, target_vocab_size, d_model, nhead, num_encoder_layers,
                 num_decoder_layers, dim_feedforward, max_seq_length, dropout=0.1, batch_first=True):
        super(Transformer, self).__init__()
        self.d_model = d_model
        self.batch_first = batch_first

        self.source_embedding = nn.Embedding(input_vocab_size, d_model)
        self.target_embedding = nn.Embedding(target_vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, max_seq_length, dropout)

        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=self.batch_first
        )
        self.fc_out = nn.Linear(d_model, target_vocab_size)

    def _generate_square_subsequent_mask(self, sz, device):
        mask = (torch.triu(torch.ones(sz, sz, device=device)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask # shape (sz, sz)

    def forward(self, src, tgt, src_padding_mask=None, tgt_padding_mask=None, memory_key_padding_mask=None):
        src_emb = self.source_embedding(src) * math.sqrt(self.d_model)
        tgt_emb = self.target_embedding(tgt) * math.sqrt(self.d_model)

        src_emb = self.pos_encoder(src_emb)
        tgt_emb = self.pos_encoder(tgt_emb)

        tgt_seq_len = tgt.size(1) if self.batch_first else tgt.size(0)
        tgt_mask = self._generate_square_subsequent_mask(tgt_seq_len, src.device)

        output = self.transformer(
            src_emb,
            tgt_emb,
            src_mask=None,
            tgt_mask=tgt_mask,
            memory_mask=None,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask
        )
        
        return self.fc_out(output)

In [11]:
class CustomSchedulePT:
    def __init__(self, optimizer, d_model, warmup_steps=4000):
        self.optimizer = optimizer
        self.d_model = d_model
        self.warmup_steps = warmup_steps
        self.n_steps = 0

    def step(self):
        self.n_steps += 1
        lr = self._get_lr()
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr
        return lr

    def _get_lr(self):
        current_step = float(self.n_steps)
        if self.d_model == 0: return 0.0
        factor = self.d_model ** -0.5
        arg1 = current_step ** -0.5
        if self.warmup_steps > 0:
            arg2 = current_step * (self.warmup_steps ** -1.5)
            return factor * min(arg1, arg2)
        else:
            return factor * arg1

In [12]:
EPOCHS_TRAIN = 20
num_layers = 6
d_model = 512
dff = 2048
num_heads = 8
dropout_rate = 0.1
MAX_DESC_LEN = 128
BATCH_FIRST = True
PAD_TOKEN_ID = tokenizer.pad_token_id
BOS_TOKEN_ID = tokenizer.bos_token_id
EOS_TOKEN_ID = tokenizer.eos_token_id
INPUT_VOCAB_SIZE = tokenizer.vocab_size
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [33]:
class Trainer:
    def __init__(
        self,
        model: nn.Module,
        slogan_tokenizer,
        dataset: Dataset,
        val_dataset: Dataset = None,
        batch_size: int = 32,
        lr: float = 1e-4,
        weight_decay: float = 0.0001,
        warmup_steps: int = 0,
        d_model: int = 128,
        device: str = "cpu",
        pad_token_id: int = 0
    ):
        self.device = device
        self.model = model.to(device)
        self.slogan_tokenizer = slogan_tokenizer
        self.tokenizer = tokenizer
        self.dataset = dataset
        self.val_dataset = val_dataset
        self.pad_token_id = pad_token_id
        self.rouge_eval_scorer = rouge_scorer.RougeScorer(
            ['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        self.bleu_smoothing_function = SmoothingFunction()
        self.train_loader = DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=True,
            num_workers=0,
            collate_fn=getattr(dataset, 'collate_fn', None)
        )
        if self.val_dataset:
            self.val_loader = DataLoader(
                val_dataset,
                batch_size=batch_size,
                shuffle=False,
                num_workers=0,
                collate_fn=getattr(val_dataset, 'collate_fn', None)
            )

        self.optim = torch.optim.AdamW(self.model.parameters(), lr=lr, weight_decay=weight_decay, betas=(0.9, 0.98), eps=1e-9)
        self.lr_scheduler = None 
        self.criterion = nn.CrossEntropyLoss(ignore_index=self.pad_token_id if self.pad_token_id != -100 else -100)

    def train_epoch(self, epoch_num, total_epochs):
        self.model.train()
        total_loss = 0.0
        current_lr = self.optim.param_groups[0]['lr']
        # current_lr = 0
        progress_bar = tqdm(self.train_loader, desc=f"Epoch {epoch_num+1}/{total_epochs} [T]")
        
        for batch_data in progress_bar:
            src = batch_data['encoder_input_ids'].to(self.device)
            tgt_input = batch_data['decoder_input_ids'].to(self.device)
            tgt_real = batch_data['decoder_target_ids'].to(self.device)

            src_padding_mask = create_padding_mask_pt(src, self.pad_token_id)
            tgt_padding_mask = create_padding_mask_pt(tgt_input, self.pad_token_id)
            
            self.optim.zero_grad()

            logits = self.model(src, tgt_input, 
                                src_padding_mask=src_padding_mask, 
                                tgt_padding_mask=tgt_padding_mask,
                                memory_key_padding_mask=src_padding_mask)

            B, T, V = logits.shape
            loss = self.criterion(logits.reshape(B*T, V), tgt_real.reshape(B*T))
            
            loss.backward()
            self.optim.step()
            # current_lr = self.lr_scheduler.step()

            total_loss += loss.item()
            progress_bar.set_postfix(loss=f"{loss.item():.4f}", lr=f"{current_lr:.7f}")

        avg_loss = total_loss / len(self.train_loader)
        return avg_loss, current_lr

    def evaluate_epoch(self, epoch_num, total_epochs):
        if not self.val_loader:
            return None, None, None
            
        self.model.eval()
        total_loss = 0.0

        all_predictions_text = [] 
        all_references_text = []
        
        progress_bar = tqdm(self.val_loader, desc=f"Epoch {epoch_num+1}/{total_epochs} [V]")

        printed_samples_this_epoch = False
        
        with torch.no_grad():
            for batch_idx, batch_data in enumerate(progress_bar):
                src = batch_data['encoder_input_ids'].to(self.device)
                tgt_input = batch_data['decoder_input_ids'].to(self.device)
                tgt_real = batch_data['decoder_target_ids'].to(self.device)

                src_padding_mask = create_padding_mask_pt(src, self.pad_token_id)
                tgt_padding_mask = create_padding_mask_pt(tgt_input, self.pad_token_id)

                logits = self.model(src, tgt_input,
                                    src_padding_mask=src_padding_mask,
                                    tgt_padding_mask=tgt_padding_mask,
                                    memory_key_padding_mask=src_padding_mask)
                
                B, T, V = logits.shape
                loss = self.criterion(logits.reshape(B*T, V), tgt_real.reshape(B*T))
                total_loss += loss.item()
                progress_bar.set_postfix(loss=f"{loss.item():.4f}")

                predicted_ids_batch = torch.argmax(logits, dim=-1)

                current_batch_preds_for_metric = []
                current_batch_refs_for_metric = []
                
                for i in range(B):
                    raw_pred_ids = predicted_ids_batch[i].tolist()
                    
                    pred_ids_before_eos_truncation = raw_pred_ids[:]
                    
                    processed_pred_ids_for_decode = raw_pred_ids[:]
                    try:
                        eos_idx = raw_pred_ids.index(self.slogan_tokenizer.eos_token_id)
                        processed_pred_ids_for_decode = raw_pred_ids[:eos_idx]
                    except (ValueError, AttributeError): 
                        # ValueError: EOS token not found in the list
                        # AttributeError: if slogan_tokenizer or eos_token_id is missing
                        pass
                    final_pred_text = self.slogan_tokenizer.decode(processed_pred_ids_for_decode, skip_special_tokens=True)
                    current_batch_preds_for_metric.append(final_pred_text)

                    raw_ref_ids = tgt_real[i].tolist()
                    
                    filtered_ref_ids_for_decode = [
                        token_id for token_id in raw_ref_ids 
                        if token_id != self.pad_token_id and \
                           (not hasattr(self.slogan_tokenizer, 'eos_token_id') or token_id != self.slogan_tokenizer.eos_token_id)
                    ]
                    
                    final_ref_text = self.slogan_tokenizer.decode(filtered_ref_ids_for_decode, skip_special_tokens=True)
                    current_batch_refs_for_metric.append(final_ref_text)

                    if batch_idx == 0 and i < 3 and not printed_samples_this_epoch:
                        print(f"\n--- Epoch {epoch_num+1} Validation Sample {i} ---")
                        print(f"Pad Token ID: {self.pad_token_id}, EOS Token ID: {getattr(self.slogan_tokenizer, 'eos_token_id', 'N/A')}")
                        
                        print(f"  Raw Predicted IDs: {raw_pred_ids}")
                        
                        decoded_raw_pred_no_skip = self.slogan_tokenizer.decode(raw_pred_ids, skip_special_tokens=False)
                        print(f"  Decoded Raw Predicted (skip_special_tokens=False): '{decoded_raw_pred_no_skip}'")
                        
                        decoded_raw_pred_skip = self.slogan_tokenizer.decode(raw_pred_ids, skip_special_tokens=True)
                        print(f"  Decoded Raw Predicted (skip_special_tokens=True): '{decoded_raw_pred_skip}'")
                        
                        print(f"  Processed Predicted IDs (for final decode, after EOS cut): {processed_pred_ids_for_decode}")
                        print(f"  FINAL Decoded Prediction (for metric): '{final_pred_text}'")
                        
                        print(f"  Raw Reference IDs: {raw_ref_ids}")
                        decoded_raw_ref_no_skip = self.slogan_tokenizer.decode(raw_ref_ids, skip_special_tokens=False)
                        print(f"  Decoded Raw Reference (skip_special_tokens=False): '{decoded_raw_ref_no_skip}'")

                        print(f"  Processed Reference IDs (for final decode, after PAD/EOS filter): {filtered_ref_ids_for_decode}")
                        print(f"  FINAL Decoded Reference (for metric): '{final_ref_text}'")
                        print("--- End Sample ---")
                
                if batch_idx == 0:
                    printed_samples_this_epoch = True

                all_predictions_text.extend(current_batch_preds_for_metric)
                all_references_text.extend(current_batch_refs_for_metric)
        
        avg_loss = total_loss / len(self.val_loader)

        # ROUGE
        rouge_results = {'rouge1': [], 'rouge2': [], 'rougeL': []}
        if all_predictions_text and all_references_text:
            for pred_text, ref_text in zip(all_predictions_text, all_references_text):
                if not pred_text.strip():
                    rouge_results['rouge1'].append(0.0)
                    rouge_results['rouge2'].append(0.0)
                    rouge_results['rougeL'].append(0.0)
                else:
                    actual_scores = self.rouge_eval_scorer.score(ref_text, pred_text)
                    rouge_results['rouge1'].append(actual_scores['rouge1'].fmeasure)
                    rouge_results['rouge2'].append(actual_scores['rouge2'].fmeasure)
                    rouge_results['rougeL'].append(actual_scores['rougeL'].fmeasure)
            
            avg_rouge1 = np.mean(rouge_results['rouge1']) if rouge_results['rouge1'] else 0
            avg_rouge2 = np.mean(rouge_results['rouge2']) if rouge_results['rouge2'] else 0
            avg_rougeL = np.mean(rouge_results['rougeL']) if rouge_results['rougeL'] else 0
            avg_rouge_scores = {'rouge1': avg_rouge1, 'rouge2': avg_rouge2, 'rougeL': avg_rougeL}
        else:
            avg_rouge_scores = {'rouge1': 0, 'rouge2': 0, 'rougeL': 0}

        # BLEU
        bleu_scores_list = []
        if all_predictions_text and all_references_text:
            for pred_text, ref_text in zip(all_predictions_text, all_references_text):
                ref_tokens = [self.slogan_tokenizer.tokenize(ref_text)]
                pred_tokens = self.slogan_tokenizer.tokenize(pred_text)
                
                if not pred_tokens:
                    bleu_scores_list.append(0.0)
                    continue

                score = sentence_bleu(ref_tokens, pred_tokens, smoothing_function=self.bleu_smoothing_function.method1)
                bleu_scores_list.append(score)
            
            avg_bleu_score = np.mean(bleu_scores_list) if bleu_scores_list else 0
        else:
            avg_bleu_score = 0

        return avg_loss, avg_rouge_scores, avg_bleu_score

    def train(self, epochs: int = 5, model_save_path: str = "my_transformer_model.pt"):
        best_val_loss = float('inf')
        for epoch in range(epochs):
            print(f"\n--- Epoch {epoch+1}/{epochs} ---")
            avg_train_loss, current_lr = self.train_epoch(epoch, epochs)
            print(f"Epoch {epoch+1}/{epochs} Avg Train Loss: {avg_train_loss:.4f}, LR: {current_lr:.7f}")

            avg_val_loss, avg_rouge_scores, avg_bleu_score = self.evaluate_epoch(epoch, epochs)

            avg_rouge_scores_pct = {k: v * 100 for k, v in avg_rouge_scores.items()}
            avg_bleu_score_pct   = avg_bleu_score * 100
            print(f"Epoch {epoch+1}/{epochs} Avg Validation Loss: {avg_val_loss:.4f}")
            print(f"  Avg ROUGE-1: {avg_rouge_scores_pct['rouge1']:.4f}, ROUGE-2: {avg_rouge_scores_pct['rouge2']:.4f}, ROUGE-L: {avg_rouge_scores_pct['rougeL']:.4f}")
            print(f"  Avg BLEU: {avg_bleu_score_pct:.4f}")
            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                model_dir = os.path.dirname(model_save_path)
                if model_dir and not os.path.exists(model_dir):
                    os.makedirs(model_dir, exist_ok=True)
                torch.save(self.model.state_dict(), model_save_path)
                print(f"Model improved and saved to {model_save_path}")
        
        print("\nTraining finished.")
        print(f"Final model weights saved to {model_save_path} (if not overwritten by better validation scores).")

In [34]:
train_slogan_dataset = SloganDataset(
    train_set, tokenizer, tokenizer,
    DESC_COL, SLOGAN_COL,
    128, 128,
    BOS_TOKEN_ID, EOS_TOKEN_ID, PAD_TOKEN_ID
)
val_slogan_dataset = SloganDataset(
    validation_set, tokenizer, tokenizer,
    DESC_COL, SLOGAN_COL,
    128, 128,
    BOS_TOKEN_ID, EOS_TOKEN_ID, PAD_TOKEN_ID
)

In [35]:
model = Transformer(
    input_vocab_size=INPUT_VOCAB_SIZE,
    target_vocab_size=INPUT_VOCAB_SIZE,
    d_model=d_model,
    nhead=num_heads,
    num_encoder_layers=num_layers,
    num_decoder_layers=num_layers,
    dim_feedforward=dff,
    max_seq_length=MAX_DESC_LEN,
    dropout=dropout_rate,
    batch_first=BATCH_FIRST
).to(device)

In [36]:
trainer = Trainer(
    model=model,
    slogan_tokenizer=tokenizer,
    dataset=train_slogan_dataset,
    val_dataset=val_slogan_dataset,
    device=device,
    pad_token_id=PAD_TOKEN_ID
)

In [37]:
trainer.train(epochs=EPOCHS_TRAIN, model_save_path="final_ipynb_transformer.pt")


--- Epoch 1/20 ---


  'encoder_input_ids': torch.tensor(encoder_input_ids, dtype=torch.long),
Epoch 1/20 [T]: 100%|██████████| 134/134 [00:11<00:00, 12.03it/s, loss=7.3349, lr=0.0001000]


Epoch 1/20 Avg Train Loss: 8.3161, LR: 0.0001000


Epoch 1/20 [V]:  12%|█▏        | 4/34 [00:00<00:00, 38.21it/s, loss=7.3272]


--- Epoch 1 Validation Sample 0 ---
Pad Token ID: 1, EOS Token ID: 2
  Raw Predicted IDs: [8, 8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
  Decoded Raw Predicted (skip_special_tokens=False): ' and and and</s></s></s></s></s></s></s></s></s></s></s>'
  Decoded Raw Predicted (skip_special_tokens=True): ' and and and'
  Processed Predicted IDs (for final decode, after EOS cut): [8, 8, 8]
  FINAL Decoded Prediction (for metric): ' and and and'
  Raw Reference IDs: [32998, 52, 1119, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
  Decoded Raw Reference (skip_special_tokens=False): 'Together we build</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'
  Processed Reference IDs (for final decode, after PAD/EOS filter): [32998, 52, 1119]
  FINAL Decoded Reference (for metric): 'Together we build'
--- End Sample ---

--- Epoch 1 Validation Sample 1 ---
Pad Token ID: 1, EOS Token ID: 2
  Raw Predicted IDs: [8, 8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
  Decoded Raw Predicted (skip_special_tokens=False): ' and and

Epoch 1/20 [V]: 100%|██████████| 34/34 [00:00<00:00, 37.96it/s, loss=6.9007]


Epoch 1/20 Avg Validation Loss: 7.6138
  Avg ROUGE-1: 4.2712, ROUGE-2: 0.0000, ROUGE-L: 4.2712
  Avg BLEU: 0.3703
Model improved and saved to final_ipynb_transformer.pt

--- Epoch 2/20 ---


Epoch 2/20 [T]: 100%|██████████| 134/134 [00:11<00:00, 11.95it/s, loss=7.2602, lr=0.0001000]


Epoch 2/20 Avg Train Loss: 7.1499, LR: 0.0001000


Epoch 2/20 [V]:  12%|█▏        | 4/34 [00:00<00:00, 38.24it/s, loss=7.1081]


--- Epoch 2 Validation Sample 0 ---
Pad Token ID: 1, EOS Token ID: 2
  Raw Predicted IDs: [133, 6, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
  Decoded Raw Predicted (skip_special_tokens=False): 'The,</s></s></s></s></s></s></s></s></s></s></s></s>'
  Decoded Raw Predicted (skip_special_tokens=True): 'The,'
  Processed Predicted IDs (for final decode, after EOS cut): [133, 6]
  FINAL Decoded Prediction (for metric): 'The,'
  Raw Reference IDs: [32998, 52, 1119, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
  Decoded Raw Reference (skip_special_tokens=False): 'Together we build</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'
  Processed Reference IDs (for final decode, after PAD/EOS filter): [32998, 52, 1119]
  FINAL Decoded Reference (for metric): 'Together we build'
--- End Sample ---

--- Epoch 2 Validation Sample 1 ---
Pad Token ID: 1, EOS Token ID: 2
  Raw Predicted IDs: [133, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
  Decoded Raw Predicted (skip_special_tokens=False): 'The and</s></s></s></s></

Epoch 2/20 [V]: 100%|██████████| 34/34 [00:00<00:00, 38.15it/s, loss=6.6715]


Epoch 2/20 Avg Validation Loss: 7.4885
  Avg ROUGE-1: 3.4408, ROUGE-2: 0.0000, ROUGE-L: 3.4068
  Avg BLEU: 0.2254
Model improved and saved to final_ipynb_transformer.pt

--- Epoch 3/20 ---


Epoch 3/20 [T]: 100%|██████████| 134/134 [00:11<00:00, 11.94it/s, loss=6.1586, lr=0.0001000]


Epoch 3/20 Avg Train Loss: 6.8800, LR: 0.0001000


Epoch 3/20 [V]:  12%|█▏        | 4/34 [00:00<00:00, 38.42it/s, loss=7.0495]


--- Epoch 3 Validation Sample 0 ---
Pad Token ID: 1, EOS Token ID: 2
  Raw Predicted IDs: [133, 154, 6, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
  Decoded Raw Predicted (skip_special_tokens=False): 'Theing,</s></s></s></s></s></s></s></s></s></s></s>'
  Decoded Raw Predicted (skip_special_tokens=True): 'Theing,'
  Processed Predicted IDs (for final decode, after EOS cut): [133, 154, 6]
  FINAL Decoded Prediction (for metric): 'Theing,'
  Raw Reference IDs: [32998, 52, 1119, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
  Decoded Raw Reference (skip_special_tokens=False): 'Together we build</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'
  Processed Reference IDs (for final decode, after PAD/EOS filter): [32998, 52, 1119]
  FINAL Decoded Reference (for metric): 'Together we build'
--- End Sample ---

--- Epoch 3 Validation Sample 1 ---
Pad Token ID: 1, EOS Token ID: 2
  Raw Predicted IDs: [133, 12, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
  Decoded Raw Predicted (skip_special_tokens=False): 'The- and</s>

Epoch 3/20 [V]: 100%|██████████| 34/34 [00:00<00:00, 38.82it/s, loss=6.4899]


Epoch 3/20 Avg Validation Loss: 7.4818
  Avg ROUGE-1: 4.0171, ROUGE-2: 0.0267, ROUGE-L: 3.9831
  Avg BLEU: 0.5122
Model improved and saved to final_ipynb_transformer.pt

--- Epoch 4/20 ---


Epoch 4/20 [T]: 100%|██████████| 134/134 [00:11<00:00, 12.00it/s, loss=6.2307, lr=0.0001000]


Epoch 4/20 Avg Train Loss: 6.7119, LR: 0.0001000


Epoch 4/20 [V]:  12%|█▏        | 4/34 [00:00<00:00, 38.21it/s, loss=6.9806]


--- Epoch 4 Validation Sample 0 ---
Pad Token ID: 1, EOS Token ID: 2
  Raw Predicted IDs: [133, 12, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
  Decoded Raw Predicted (skip_special_tokens=False): 'The-,,,,,,,,,,,,'
  Decoded Raw Predicted (skip_special_tokens=True): 'The-,,,,,,,,,,,,'
  Processed Predicted IDs (for final decode, after EOS cut): [133, 12, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
  FINAL Decoded Prediction (for metric): 'The-,,,,,,,,,,,,'
  Raw Reference IDs: [32998, 52, 1119, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
  Decoded Raw Reference (skip_special_tokens=False): 'Together we build</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'
  Processed Reference IDs (for final decode, after PAD/EOS filter): [32998, 52, 1119]
  FINAL Decoded Reference (for metric): 'Together we build'
--- End Sample ---

--- Epoch 4 Validation Sample 1 ---
Pad Token ID: 1, EOS Token ID: 2
  Raw Predicted IDs: [133, 12, 6, 6, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
  Decoded Raw Predicted (skip_special_tokens=False):

Epoch 4/20 [V]: 100%|██████████| 34/34 [00:00<00:00, 38.61it/s, loss=6.5225]


Epoch 4/20 Avg Validation Loss: 7.4245
  Avg ROUGE-1: 2.4187, ROUGE-2: 0.0000, ROUGE-L: 2.4017
  Avg BLEU: 0.4804
Model improved and saved to final_ipynb_transformer.pt

--- Epoch 5/20 ---


Epoch 5/20 [T]: 100%|██████████| 134/134 [00:11<00:00, 11.98it/s, loss=6.6955, lr=0.0001000]


Epoch 5/20 Avg Train Loss: 6.5658, LR: 0.0001000


Epoch 5/20 [V]:  12%|█▏        | 4/34 [00:00<00:00, 38.67it/s, loss=6.9072]


--- Epoch 5 Validation Sample 0 ---
Pad Token ID: 1, EOS Token ID: 2
  Raw Predicted IDs: [133, 12, 6, 6, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
  Decoded Raw Predicted (skip_special_tokens=False): 'The-,,</s></s></s></s></s></s></s></s></s></s>'
  Decoded Raw Predicted (skip_special_tokens=True): 'The-,,'
  Processed Predicted IDs (for final decode, after EOS cut): [133, 12, 6, 6]
  FINAL Decoded Prediction (for metric): 'The-,,'
  Raw Reference IDs: [32998, 52, 1119, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
  Decoded Raw Reference (skip_special_tokens=False): 'Together we build</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'
  Processed Reference IDs (for final decode, after PAD/EOS filter): [32998, 52, 1119]
  FINAL Decoded Reference (for metric): 'Together we build'
--- End Sample ---

--- Epoch 5 Validation Sample 1 ---
Pad Token ID: 1, EOS Token ID: 2
  Raw Predicted IDs: [133, 7073, 8, 6, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2]
  Decoded Raw Predicted (skip_special_tokens=False): 'Theux and, and</

Epoch 5/20 [V]: 100%|██████████| 34/34 [00:00<00:00, 38.78it/s, loss=6.3886]


Epoch 5/20 Avg Validation Loss: 7.4110
  Avg ROUGE-1: 4.4001, ROUGE-2: 0.1029, ROUGE-L: 4.3768
  Avg BLEU: 0.7840
Model improved and saved to final_ipynb_transformer.pt

--- Epoch 6/20 ---


Epoch 6/20 [T]: 100%|██████████| 134/134 [00:11<00:00, 11.96it/s, loss=6.7732, lr=0.0001000]


Epoch 6/20 Avg Train Loss: 6.4040, LR: 0.0001000


Epoch 6/20 [V]:  12%|█▏        | 4/34 [00:00<00:00, 37.31it/s, loss=6.8269]


--- Epoch 6 Validation Sample 0 ---
Pad Token ID: 1, EOS Token ID: 2
  Raw Predicted IDs: [133, 18, 359, 359, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
  Decoded Raw Predicted (skip_special_tokens=False): 'The's & &</s></s></s></s></s></s></s></s></s></s>'
  Decoded Raw Predicted (skip_special_tokens=True): 'The's & &'
  Processed Predicted IDs (for final decode, after EOS cut): [133, 18, 359, 359]
  FINAL Decoded Prediction (for metric): 'The's & &'
  Raw Reference IDs: [32998, 52, 1119, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
  Decoded Raw Reference (skip_special_tokens=False): 'Together we build</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'
  Processed Reference IDs (for final decode, after PAD/EOS filter): [32998, 52, 1119]
  FINAL Decoded Reference (for metric): 'Together we build'
--- End Sample ---

--- Epoch 6 Validation Sample 1 ---
Pad Token ID: 1, EOS Token ID: 2
  Raw Predicted IDs: [133, 34044, 154, 359, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
  Decoded Raw Predicted (skip_special_tokens=Fal

Epoch 6/20 [V]: 100%|██████████| 34/34 [00:00<00:00, 37.78it/s, loss=6.1550]


Epoch 6/20 Avg Validation Loss: 7.3140
  Avg ROUGE-1: 4.8130, ROUGE-2: 0.6967, ROUGE-L: 4.7753
  Avg BLEU: 1.1912
Model improved and saved to final_ipynb_transformer.pt

--- Epoch 7/20 ---


Epoch 7/20 [T]: 100%|██████████| 134/134 [00:11<00:00, 11.98it/s, loss=6.3719, lr=0.0001000]


Epoch 7/20 Avg Train Loss: 6.2163, LR: 0.0001000


Epoch 7/20 [V]:  12%|█▏        | 4/34 [00:00<00:00, 37.74it/s, loss=6.7392]


--- Epoch 7 Validation Sample 0 ---
Pad Token ID: 1, EOS Token ID: 2
  Raw Predicted IDs: [133, 18, 13, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
  Decoded Raw Predicted (skip_special_tokens=False): 'The's for and</s></s></s></s></s></s></s></s></s></s>'
  Decoded Raw Predicted (skip_special_tokens=True): 'The's for and'
  Processed Predicted IDs (for final decode, after EOS cut): [133, 18, 13, 8]
  FINAL Decoded Prediction (for metric): 'The's for and'
  Raw Reference IDs: [32998, 52, 1119, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
  Decoded Raw Reference (skip_special_tokens=False): 'Together we build</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'
  Processed Reference IDs (for final decode, after PAD/EOS filter): [32998, 52, 1119]
  FINAL Decoded Reference (for metric): 'Together we build'
--- End Sample ---

--- Epoch 7 Validation Sample 1 ---
Pad Token ID: 1, EOS Token ID: 2
  Raw Predicted IDs: [133, 12, 154, 6, 6, 2, 2, 2, 2, 2, 2, 2, 2, 2]
  Decoded Raw Predicted (skip_special_tokens=Fa

Epoch 7/20 [V]: 100%|██████████| 34/34 [00:00<00:00, 38.37it/s, loss=6.0238]


Epoch 7/20 Avg Validation Loss: 7.2553
  Avg ROUGE-1: 6.1808, ROUGE-2: 0.7801, ROUGE-L: 6.0827
  Avg BLEU: 1.3788
Model improved and saved to final_ipynb_transformer.pt

--- Epoch 8/20 ---


Epoch 8/20 [T]: 100%|██████████| 134/134 [00:11<00:00, 11.94it/s, loss=6.3682, lr=0.0001000]


Epoch 8/20 Avg Train Loss: 6.0253, LR: 0.0001000


Epoch 8/20 [V]:  12%|█▏        | 4/34 [00:00<00:00, 38.40it/s, loss=6.6759]


--- Epoch 8 Validation Sample 0 ---
Pad Token ID: 1, EOS Token ID: 2
  Raw Predicted IDs: [133, 18, 359, 359, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
  Decoded Raw Predicted (skip_special_tokens=False): 'The's & &</s></s></s></s></s></s></s></s></s></s>'
  Decoded Raw Predicted (skip_special_tokens=True): 'The's & &'
  Processed Predicted IDs (for final decode, after EOS cut): [133, 18, 359, 359]
  FINAL Decoded Prediction (for metric): 'The's & &'
  Raw Reference IDs: [32998, 52, 1119, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
  Decoded Raw Reference (skip_special_tokens=False): 'Together we build</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'
  Processed Reference IDs (for final decode, after PAD/EOS filter): [32998, 52, 1119]
  FINAL Decoded Reference (for metric): 'Together we build'
--- End Sample ---

--- Epoch 8 Validation Sample 1 ---
Pad Token ID: 1, EOS Token ID: 2
  Raw Predicted IDs: [133, 12, 154, 359, 359, 2, 2, 2, 2, 2, 2, 2, 2, 2]
  Decoded Raw Predicted (skip_special_tokens=Fals

Epoch 8/20 [V]: 100%|██████████| 34/34 [00:00<00:00, 38.20it/s, loss=5.9460]


Epoch 8/20 Avg Validation Loss: 7.2225
  Avg ROUGE-1: 6.3472, ROUGE-2: 0.8921, ROUGE-L: 6.2862
  Avg BLEU: 1.6140
Model improved and saved to final_ipynb_transformer.pt

--- Epoch 9/20 ---


Epoch 9/20 [T]: 100%|██████████| 134/134 [00:11<00:00, 11.97it/s, loss=5.7503, lr=0.0001000]


Epoch 9/20 Avg Train Loss: 5.8235, LR: 0.0001000


Epoch 9/20 [V]:  12%|█▏        | 4/34 [00:00<00:00, 38.48it/s, loss=6.7549]


--- Epoch 9 Validation Sample 0 ---
Pad Token ID: 1, EOS Token ID: 2
  Raw Predicted IDs: [133, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13]
  Decoded Raw Predicted (skip_special_tokens=False): 'The- for for for for for for for for for for for for'
  Decoded Raw Predicted (skip_special_tokens=True): 'The- for for for for for for for for for for for for'
  Processed Predicted IDs (for final decode, after EOS cut): [133, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13]
  FINAL Decoded Prediction (for metric): 'The- for for for for for for for for for for for for'
  Raw Reference IDs: [32998, 52, 1119, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
  Decoded Raw Reference (skip_special_tokens=False): 'Together we build</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'
  Processed Reference IDs (for final decode, after PAD/EOS filter): [32998, 52, 1119]
  FINAL Decoded Reference (for metric): 'Together we build'
--- End Sample ---

--- Epoch 9 Validation Sample 1 ---
Pad Token ID: 1, EOS

Epoch 9/20 [V]: 100%|██████████| 34/34 [00:00<00:00, 38.84it/s, loss=5.8446]


Epoch 9/20 Avg Validation Loss: 7.3763
  Avg ROUGE-1: 7.8294, ROUGE-2: 1.1613, ROUGE-L: 7.6135
  Avg BLEU: 1.8743

--- Epoch 10/20 ---


Epoch 10/20 [T]: 100%|██████████| 134/134 [00:11<00:00, 11.95it/s, loss=5.3504, lr=0.0001000]


Epoch 10/20 Avg Train Loss: 5.6264, LR: 0.0001000


Epoch 10/20 [V]:  12%|█▏        | 4/34 [00:00<00:00, 38.62it/s, loss=6.6133]


--- Epoch 10 Validation Sample 0 ---
Pad Token ID: 1, EOS Token ID: 2
  Raw Predicted IDs: [133, 18, 9, 8, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13]
  Decoded Raw Predicted (skip_special_tokens=False): 'The's of and for for for for for for for for for for'
  Decoded Raw Predicted (skip_special_tokens=True): 'The's of and for for for for for for for for for for'
  Processed Predicted IDs (for final decode, after EOS cut): [133, 18, 9, 8, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13]
  FINAL Decoded Prediction (for metric): 'The's of and for for for for for for for for for for'
  Raw Reference IDs: [32998, 52, 1119, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
  Decoded Raw Reference (skip_special_tokens=False): 'Together we build</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'
  Processed Reference IDs (for final decode, after PAD/EOS filter): [32998, 52, 1119]
  FINAL Decoded Reference (for metric): 'Together we build'
--- End Sample ---

--- Epoch 10 Validation Sample 1 ---
Pad Token ID: 1, EOS T

Epoch 10/20 [V]: 100%|██████████| 34/34 [00:00<00:00, 38.32it/s, loss=5.6708]


Epoch 10/20 Avg Validation Loss: 7.2604
  Avg ROUGE-1: 8.7848, ROUGE-2: 1.2648, ROUGE-L: 8.5784
  Avg BLEU: 2.1682

--- Epoch 11/20 ---


Epoch 11/20 [T]: 100%|██████████| 134/134 [00:11<00:00, 11.95it/s, loss=5.0948, lr=0.0001000]


Epoch 11/20 Avg Train Loss: 5.4421, LR: 0.0001000


Epoch 11/20 [V]:  12%|█▏        | 4/34 [00:00<00:00, 37.87it/s, loss=6.6209]


--- Epoch 11 Validation Sample 0 ---
Pad Token ID: 1, EOS Token ID: 2
  Raw Predicted IDs: [133, 5, 7, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
  Decoded Raw Predicted (skip_special_tokens=False): 'The the to</s></s></s></s></s></s></s></s></s></s></s>'
  Decoded Raw Predicted (skip_special_tokens=True): 'The the to'
  Processed Predicted IDs (for final decode, after EOS cut): [133, 5, 7]
  FINAL Decoded Prediction (for metric): 'The the to'
  Raw Reference IDs: [32998, 52, 1119, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
  Decoded Raw Reference (skip_special_tokens=False): 'Together we build</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'
  Processed Reference IDs (for final decode, after PAD/EOS filter): [32998, 52, 1119]
  FINAL Decoded Reference (for metric): 'Together we build'
--- End Sample ---

--- Epoch 11 Validation Sample 1 ---
Pad Token ID: 1, EOS Token ID: 2
  Raw Predicted IDs: [133, 12, 154, 359, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
  Decoded Raw Predicted (skip_special_tokens=False): 'T

Epoch 11/20 [V]: 100%|██████████| 34/34 [00:00<00:00, 38.33it/s, loss=5.5501]


Epoch 11/20 Avg Validation Loss: 7.2854
  Avg ROUGE-1: 8.5160, ROUGE-2: 1.5708, ROUGE-L: 8.3887
  Avg BLEU: 2.1896

--- Epoch 12/20 ---


Epoch 12/20 [T]: 100%|██████████| 134/134 [00:11<00:00, 11.85it/s, loss=5.5003, lr=0.0001000]


Epoch 12/20 Avg Train Loss: 5.2566, LR: 0.0001000


Epoch 12/20 [V]:  12%|█▏        | 4/34 [00:00<00:00, 38.04it/s, loss=6.4271]


--- Epoch 12 Validation Sample 0 ---
Pad Token ID: 1, EOS Token ID: 2
  Raw Predicted IDs: [133, 18, 7003, 13, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
  Decoded Raw Predicted (skip_special_tokens=False): 'The'soles for</s></s></s></s></s></s></s></s></s></s>'
  Decoded Raw Predicted (skip_special_tokens=True): 'The'soles for'
  Processed Predicted IDs (for final decode, after EOS cut): [133, 18, 7003, 13]
  FINAL Decoded Prediction (for metric): 'The'soles for'
  Raw Reference IDs: [32998, 52, 1119, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
  Decoded Raw Reference (skip_special_tokens=False): 'Together we build</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'
  Processed Reference IDs (for final decode, after PAD/EOS filter): [32998, 52, 1119]
  FINAL Decoded Reference (for metric): 'Together we build'
--- End Sample ---

--- Epoch 12 Validation Sample 1 ---
Pad Token ID: 1, EOS Token ID: 2
  Raw Predicted IDs: [133, 12, 154, 1260, 359, 2, 2, 2, 2, 2, 2, 2, 2, 2]
  Decoded Raw Predicted (skip_spec

Epoch 12/20 [V]: 100%|██████████| 34/34 [00:00<00:00, 38.27it/s, loss=5.5113]


Epoch 12/20 Avg Validation Loss: 7.0856
  Avg ROUGE-1: 8.3987, ROUGE-2: 1.3110, ROUGE-L: 8.1548
  Avg BLEU: 2.2597
Model improved and saved to final_ipynb_transformer.pt

--- Epoch 13/20 ---


Epoch 13/20 [T]: 100%|██████████| 134/134 [00:11<00:00, 11.89it/s, loss=5.2900, lr=0.0001000]


Epoch 13/20 Avg Train Loss: 5.0847, LR: 0.0001000


Epoch 13/20 [V]:  12%|█▏        | 4/34 [00:00<00:00, 38.19it/s, loss=6.4974]


--- Epoch 13 Validation Sample 0 ---
Pad Token ID: 1, EOS Token ID: 2
  Raw Predicted IDs: [133, 18, 636, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13]
  Decoded Raw Predicted (skip_special_tokens=False): 'The'sic for for for for for for for for for for for'
  Decoded Raw Predicted (skip_special_tokens=True): 'The'sic for for for for for for for for for for for'
  Processed Predicted IDs (for final decode, after EOS cut): [133, 18, 636, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13]
  FINAL Decoded Prediction (for metric): 'The'sic for for for for for for for for for for for'
  Raw Reference IDs: [32998, 52, 1119, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
  Decoded Raw Reference (skip_special_tokens=False): 'Together we build</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'
  Processed Reference IDs (for final decode, after PAD/EOS filter): [32998, 52, 1119]
  FINAL Decoded Reference (for metric): 'Together we build'
--- End Sample ---

--- Epoch 13 Validation Sample 1 ---
Pad Token ID: 1, EO

Epoch 13/20 [V]: 100%|██████████| 34/34 [00:00<00:00, 38.25it/s, loss=5.4329]


Epoch 13/20 Avg Validation Loss: 7.1164
  Avg ROUGE-1: 9.4921, ROUGE-2: 1.8919, ROUGE-L: 9.1724
  Avg BLEU: 2.5145

--- Epoch 14/20 ---


Epoch 14/20 [T]: 100%|██████████| 134/134 [00:11<00:00, 11.92it/s, loss=5.1789, lr=0.0001000]


Epoch 14/20 Avg Train Loss: 4.9086, LR: 0.0001000


Epoch 14/20 [V]:  12%|█▏        | 4/34 [00:00<00:00, 38.48it/s, loss=6.4964]


--- Epoch 14 Validation Sample 0 ---
Pad Token ID: 1, EOS Token ID: 2
  Raw Predicted IDs: [133, 18, 13, 359, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13]
  Decoded Raw Predicted (skip_special_tokens=False): 'The's for & for for for for for for for for for for'
  Decoded Raw Predicted (skip_special_tokens=True): 'The's for & for for for for for for for for for for'
  Processed Predicted IDs (for final decode, after EOS cut): [133, 18, 13, 359, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13]
  FINAL Decoded Prediction (for metric): 'The's for & for for for for for for for for for for'
  Raw Reference IDs: [32998, 52, 1119, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
  Decoded Raw Reference (skip_special_tokens=False): 'Together we build</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'
  Processed Reference IDs (for final decode, after PAD/EOS filter): [32998, 52, 1119]
  FINAL Decoded Reference (for metric): 'Together we build'
--- End Sample ---

--- Epoch 14 Validation Sample 1 ---
Pad Token ID: 1, EO

Epoch 14/20 [V]: 100%|██████████| 34/34 [00:00<00:00, 38.62it/s, loss=5.4397]


Epoch 14/20 Avg Validation Loss: 7.1406
  Avg ROUGE-1: 9.0271, ROUGE-2: 1.5711, ROUGE-L: 8.8182
  Avg BLEU: 2.2735

--- Epoch 15/20 ---


Epoch 15/20 [T]: 100%|██████████| 134/134 [00:11<00:00, 11.95it/s, loss=4.7450, lr=0.0001000]


Epoch 15/20 Avg Train Loss: 4.7289, LR: 0.0001000


Epoch 15/20 [V]:  12%|█▏        | 4/34 [00:00<00:00, 37.36it/s, loss=6.4546]


--- Epoch 15 Validation Sample 0 ---
Pad Token ID: 1, EOS Token ID: 2
  Raw Predicted IDs: [133, 7, 7, 13, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
  Decoded Raw Predicted (skip_special_tokens=False): 'The to to for</s></s></s></s></s></s></s></s></s></s>'
  Decoded Raw Predicted (skip_special_tokens=True): 'The to to for'
  Processed Predicted IDs (for final decode, after EOS cut): [133, 7, 7, 13]
  FINAL Decoded Prediction (for metric): 'The to to for'
  Raw Reference IDs: [32998, 52, 1119, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
  Decoded Raw Reference (skip_special_tokens=False): 'Together we build</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'
  Processed Reference IDs (for final decode, after PAD/EOS filter): [32998, 52, 1119]
  FINAL Decoded Reference (for metric): 'Together we build'
--- End Sample ---

--- Epoch 15 Validation Sample 1 ---
Pad Token ID: 1, EOS Token ID: 2
  Raw Predicted IDs: [27521, 12, 154, 1260, 359, 154, 2, 2, 2, 2, 2, 2, 2, 2]
  Decoded Raw Predicted (skip_special_

Epoch 15/20 [V]: 100%|██████████| 34/34 [00:00<00:00, 37.91it/s, loss=5.3698]


Epoch 15/20 Avg Validation Loss: 7.0904
  Avg ROUGE-1: 9.8447, ROUGE-2: 1.9642, ROUGE-L: 9.5174
  Avg BLEU: 2.4527

--- Epoch 16/20 ---


Epoch 16/20 [T]: 100%|██████████| 134/134 [00:11<00:00, 11.90it/s, loss=4.8550, lr=0.0001000]


Epoch 16/20 Avg Train Loss: 4.5692, LR: 0.0001000


Epoch 16/20 [V]:  12%|█▏        | 4/34 [00:00<00:00, 37.69it/s, loss=6.4430]


--- Epoch 16 Validation Sample 0 ---
Pad Token ID: 1, EOS Token ID: 2
  Raw Predicted IDs: [133, 7, 7, 7874, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
  Decoded Raw Predicted (skip_special_tokens=False): 'The to to Solutions</s></s></s></s></s></s></s></s></s></s>'
  Decoded Raw Predicted (skip_special_tokens=True): 'The to to Solutions'
  Processed Predicted IDs (for final decode, after EOS cut): [133, 7, 7, 7874]
  FINAL Decoded Prediction (for metric): 'The to to Solutions'
  Raw Reference IDs: [32998, 52, 1119, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
  Decoded Raw Reference (skip_special_tokens=False): 'Together we build</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'
  Processed Reference IDs (for final decode, after PAD/EOS filter): [32998, 52, 1119]
  FINAL Decoded Reference (for metric): 'Together we build'
--- End Sample ---

--- Epoch 16 Validation Sample 1 ---
Pad Token ID: 1, EOS Token ID: 2
  Raw Predicted IDs: [133, 12, 154, 1260, 359, 2, 2, 2, 2, 2, 2, 2, 2, 2]
  Decoded Raw Predic

Epoch 16/20 [V]: 100%|██████████| 34/34 [00:00<00:00, 38.10it/s, loss=5.2740]


Epoch 16/20 Avg Validation Loss: 7.1790
  Avg ROUGE-1: 9.5192, ROUGE-2: 1.7801, ROUGE-L: 9.3129
  Avg BLEU: 2.4312

--- Epoch 17/20 ---


Epoch 17/20 [T]: 100%|██████████| 134/134 [00:11<00:00, 11.96it/s, loss=4.6434, lr=0.0001000]


Epoch 17/20 Avg Train Loss: 4.4021, LR: 0.0001000


Epoch 17/20 [V]:  12%|█▏        | 4/34 [00:00<00:00, 37.60it/s, loss=6.6024]


--- Epoch 17 Validation Sample 0 ---
Pad Token ID: 1, EOS Token ID: 2
  Raw Predicted IDs: [133, 7, 7, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
  Decoded Raw Predicted (skip_special_tokens=False): 'The to to and</s></s></s></s></s></s></s></s></s></s>'
  Decoded Raw Predicted (skip_special_tokens=True): 'The to to and'
  Processed Predicted IDs (for final decode, after EOS cut): [133, 7, 7, 8]
  FINAL Decoded Prediction (for metric): 'The to to and'
  Raw Reference IDs: [32998, 52, 1119, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
  Decoded Raw Reference (skip_special_tokens=False): 'Together we build</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'
  Processed Reference IDs (for final decode, after PAD/EOS filter): [32998, 52, 1119]
  FINAL Decoded Reference (for metric): 'Together we build'
--- End Sample ---

--- Epoch 17 Validation Sample 1 ---
Pad Token ID: 1, EOS Token ID: 2
  Raw Predicted IDs: [133, 12, 154, 359, 359, 154, 2, 2, 2, 2, 2, 2, 2, 2]
  Decoded Raw Predicted (skip_special_token

Epoch 17/20 [V]: 100%|██████████| 34/34 [00:00<00:00, 38.24it/s, loss=5.3514]


Epoch 17/20 Avg Validation Loss: 7.3394
  Avg ROUGE-1: 9.7942, ROUGE-2: 1.7792, ROUGE-L: 9.4230
  Avg BLEU: 2.5135

--- Epoch 18/20 ---


Epoch 18/20 [T]: 100%|██████████| 134/134 [00:11<00:00, 11.96it/s, loss=4.2437, lr=0.0001000]


Epoch 18/20 Avg Train Loss: 4.2317, LR: 0.0001000


Epoch 18/20 [V]:  12%|█▏        | 4/34 [00:00<00:00, 38.20it/s, loss=6.5205]


--- Epoch 18 Validation Sample 0 ---
Pad Token ID: 1, EOS Token ID: 2
  Raw Predicted IDs: [133, 7, 13, 13, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
  Decoded Raw Predicted (skip_special_tokens=False): 'The to for for</s></s></s></s></s></s></s></s></s></s>'
  Decoded Raw Predicted (skip_special_tokens=True): 'The to for for'
  Processed Predicted IDs (for final decode, after EOS cut): [133, 7, 13, 13]
  FINAL Decoded Prediction (for metric): 'The to for for'
  Raw Reference IDs: [32998, 52, 1119, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
  Decoded Raw Reference (skip_special_tokens=False): 'Together we build</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'
  Processed Reference IDs (for final decode, after PAD/EOS filter): [32998, 52, 1119]
  FINAL Decoded Reference (for metric): 'Together we build'
--- End Sample ---

--- Epoch 18 Validation Sample 1 ---
Pad Token ID: 1, EOS Token ID: 2
  Raw Predicted IDs: [19183, 12, 154, 359, 359, 359, 2, 2, 2, 2, 2, 2, 2, 2]
  Decoded Raw Predicted (skip_spec

Epoch 18/20 [V]: 100%|██████████| 34/34 [00:00<00:00, 37.94it/s, loss=5.1755]


Epoch 18/20 Avg Validation Loss: 7.2262
  Avg ROUGE-1: 9.9300, ROUGE-2: 1.9164, ROUGE-L: 9.5305
  Avg BLEU: 2.5762

--- Epoch 19/20 ---


Epoch 19/20 [T]: 100%|██████████| 134/134 [00:11<00:00, 11.96it/s, loss=4.1838, lr=0.0001000]


Epoch 19/20 Avg Train Loss: 4.0761, LR: 0.0001000


Epoch 19/20 [V]:  12%|█▏        | 4/34 [00:00<00:00, 38.43it/s, loss=6.4738]


--- Epoch 19 Validation Sample 0 ---
Pad Token ID: 1, EOS Token ID: 2
  Raw Predicted IDs: [133, 7, 7, 13, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
  Decoded Raw Predicted (skip_special_tokens=False): 'The to to for</s></s></s></s></s></s></s></s></s></s>'
  Decoded Raw Predicted (skip_special_tokens=True): 'The to to for'
  Processed Predicted IDs (for final decode, after EOS cut): [133, 7, 7, 13]
  FINAL Decoded Prediction (for metric): 'The to to for'
  Raw Reference IDs: [32998, 52, 1119, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
  Decoded Raw Reference (skip_special_tokens=False): 'Together we build</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'
  Processed Reference IDs (for final decode, after PAD/EOS filter): [32998, 52, 1119]
  FINAL Decoded Reference (for metric): 'Together we build'
--- End Sample ---

--- Epoch 19 Validation Sample 1 ---
Pad Token ID: 1, EOS Token ID: 2
  Raw Predicted IDs: [133, 3777, 154, 359, 359, 2, 2, 2, 2, 2, 2, 2, 2, 2]
  Decoded Raw Predicted (skip_special_tok

Epoch 19/20 [V]: 100%|██████████| 34/34 [00:00<00:00, 38.17it/s, loss=5.2371]


Epoch 19/20 Avg Validation Loss: 7.1845
  Avg ROUGE-1: 9.7997, ROUGE-2: 1.9737, ROUGE-L: 9.5088
  Avg BLEU: 2.6386

--- Epoch 20/20 ---


Epoch 20/20 [T]: 100%|██████████| 134/134 [00:11<00:00, 11.96it/s, loss=3.9000, lr=0.0001000]


Epoch 20/20 Avg Train Loss: 3.9109, LR: 0.0001000


Epoch 20/20 [V]:  12%|█▏        | 4/34 [00:00<00:00, 38.85it/s, loss=6.5054]


--- Epoch 20 Validation Sample 0 ---
Pad Token ID: 1, EOS Token ID: 2
  Raw Predicted IDs: [133, 7, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13]
  Decoded Raw Predicted (skip_special_tokens=False): 'The to for for for for for for for for for for for for'
  Decoded Raw Predicted (skip_special_tokens=True): 'The to for for for for for for for for for for for for'
  Processed Predicted IDs (for final decode, after EOS cut): [133, 7, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13]
  FINAL Decoded Prediction (for metric): 'The to for for for for for for for for for for for for'
  Raw Reference IDs: [32998, 52, 1119, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
  Decoded Raw Reference (skip_special_tokens=False): 'Together we build</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'
  Processed Reference IDs (for final decode, after PAD/EOS filter): [32998, 52, 1119]
  FINAL Decoded Reference (for metric): 'Together we build'
--- End Sample ---

--- Epoch 20 Validation Sample 1 ---
Pad Token ID: 

Epoch 20/20 [V]: 100%|██████████| 34/34 [00:00<00:00, 38.82it/s, loss=5.2467]


Epoch 20/20 Avg Validation Loss: 7.1819
  Avg ROUGE-1: 10.4006, ROUGE-2: 1.9492, ROUGE-L: 10.0961
  Avg BLEU: 2.6534

Training finished.
Final model weights saved to final_ipynb_transformer.pt (if not overwritten by better validation scores).


In [38]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_path = "final_ipynb_transformer.pt"

model.load_state_dict(torch.load(model_path, map_location=device))
model.eval()

Transformer(
  (source_embedding): Embedding(50265, 512)
  (target_embedding): Embedding(50265, 512)
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-5): 6 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
          )
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
          (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
      (norm): LayerNorm((512,), eps=1e-05, element

In [39]:
src = torch.randint(0, 50265, (64, 128)).to(device)
tgt = torch.randint(0, 50265, (64, 127)).to(device)

summary(model, 
        input_data=(src, tgt),
        depth=3,
        col_names=["input_size", "output_size", "num_params"],
        row_settings=["var_names"],
        col_width=18
        )

Layer (type (var_name))                                           Input Shape        Output Shape       Param #
Transformer (Transformer)                                         [64, 128]          [64, 127, 50265]   --
├─Embedding (source_embedding)                                    [64, 128]          [64, 128, 512]     25,735,680
├─Embedding (target_embedding)                                    [64, 127]          [64, 127, 512]     25,735,680
├─PositionalEncoding (pos_encoder)                                [64, 128, 512]     [64, 128, 512]     --
│    └─Dropout (dropout)                                          [64, 128, 512]     [64, 128, 512]     --
├─PositionalEncoding (pos_encoder)                                [64, 127, 512]     [64, 127, 512]     --
│    └─Dropout (dropout)                                          [64, 127, 512]     [64, 127, 512]     --
├─Transformer (transformer)                                       [64, 128, 512]     [64, 127, 512]     --
│    └─Transform

| Layer (name)                        | Output Shape      | # Params     |
|------------------------------------|-------------------|--------------|
| Embedding (source_embedding)       | [64, 128, 512]    | 25,735,680   |
| Embedding (target_embedding)       | [64, 127, 512]    | 25,735,680   |
| PositionalEncoding (src)           | [64, 128, 512]    | 0            |
| PositionalEncoding (tgt)           | [64, 127, 512]    | 0            |
| TransformerEncoder (6 layers)      | [64, 128, 512]    | 18,914,304   |
| TransformerDecoder (6 layers)      | [64, 127, 512]    | 25,224,192   |
| Linear (fc_out)                    | [64, 127, 50265]  | 25,785,945   |
| **Total**                          | —                 | **121,397,849** |


In [40]:
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

Total parameters: 121,397,849
Trainable parameters: 121,397,849


In [42]:
def generate_slogan(description: str,
                    model,
                    tokenizer,
                    max_len: int,
                    bos_id: int,
                    eos_id: int,
                    device="cuda" if torch.cuda.is_available() else "cpu"):
    """
    Greedy‑decode one slogan from a single description string.
    """
    enc = tokenizer(description,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=max_len)
    src_ids   = enc["input_ids"].to(device)
    src_mask  = enc["attention_mask"].to(device)
    src_key_padding_mask = (src_mask == 0)

    dec_ids = torch.tensor([[bos_id]], dtype=torch.long, device=device)
    generated = [bos_id]

    model.eval()
    with torch.no_grad():
        for _ in range(max_len):
            logits = model(src=src_ids,
                           tgt=dec_ids,
                           src_padding_mask=src_key_padding_mask,
                           memory_key_padding_mask=src_key_padding_mask)

            next_id = torch.argmax(logits[:, -1, :], dim=-1)
            dec_ids = torch.cat([dec_ids, next_id.unsqueeze(0)], dim=1)
            generated.append(next_id.item())

            if next_id.item() == eos_id:
                break

    return tokenizer.decode(generated, skip_special_tokens=True).strip()

In [43]:
examples = ['Easily deliver personalized activities that enrich the lives of residents in older adult communities. Save time and increase satisfaction.',
'Powerful lead generation software that converts abandoning visitors into subscribers with our dynamic marketing tools and Exit Intent® technology.',
"Twine matches companies to the best digital and creative freelancers from a network of over 260,000. It's free to post a job and you only pay when you hire.",
"Looking for fresh web design & development? Need new marketing materials or a smart campaign to drive business? How about a video or updated photos? Let's talk and tell the world your story.",
# --- test-curated.csv
'Our expert team of Analytical Chemists provide eLiquid analysis & manufacturing services, ensuring full regulatory compliance for the e-cigarette market.',
'From placing entire software engineering teams to integrating easily into your current team, we offer bespoke placements of the very best engineers.',
'Turning ideas into visual content since 1999. Content Creation Studio in Ghent. Branded content - corporate video - visuals for events - 360 video',
'World market leader for robotic vision systems, inline measurement technology & inspection technology. We are your partner at over 25 locations worldwide.',
# --- other examples
'People and projects for sustainable change. Experts in sustainability recruitment, we recruit exceptional people into roles working on sustainability projects or in ethical and responsible organisations.']

In [22]:
example = train_set.iloc[3]["desc"]

In [44]:
for idx, desc in enumerate(examples, 1):
    slogan = generate_slogan(desc,
                             model=model,
                             tokenizer=tokenizer,
                             max_len=MAX_DESC_LEN,
                             bos_id=BOS_TOKEN_ID,
                             eos_id=EOS_TOKEN_ID,
                             device=device)
    print(f"{idx:02d}. {slogan}")

01. The Best of the best of the UK
02. The Best of the best of the best of the UK
03. The Best of the best of the UK
04. The Best Digital Marketing Agency in India
05. The Best's Best Digital Marketing
06. The Best of the best of the UK
07. The Best's leading UK of the UK
08. The Best's Best of Sale
09. The Best's Best of Sale


In [36]:
encoded_input = tokenizer(example, return_tensors="pt", padding=True, truncation=True, max_length=MAX_DESC_LEN)
source_tensor = encoded_input['input_ids'].to(device)
source_attention_mask = encoded_input['attention_mask'].to(device)

src_key_padding_mask = (source_attention_mask == 0)

target_tensor_input = torch.tensor([[BOS_TOKEN_ID]], dtype=torch.long, device=device)
generated_ids = [BOS_TOKEN_ID]

In [39]:
with torch.no_grad():
    for _ in range(MAX_DESC_LEN):
        tgt_mask = model._generate_square_subsequent_mask(target_tensor_input.size(1), device)
        output_logits = model(src=source_tensor, 
                              tgt=target_tensor_input, 
                              src_padding_mask=src_key_padding_mask, 
                              memory_key_padding_mask=src_key_padding_mask
                            #   tgt_mask=tgt_mask
                              )
        next_token_logits = output_logits[:, -1, :]
        predicted_token_id = torch.argmax(next_token_logits, dim=-1)
        target_tensor_input = torch.cat((target_tensor_input, predicted_token_id.unsqueeze(1)), dim=1)
        generated_ids.append(predicted_token_id.item())

        if predicted_token_id.item() == EOS_TOKEN_ID:
            break

In [41]:
slogan = tokenizer.decode(generated_ids, skip_special_tokens=True)

In [42]:
slogan

"The Best Best's Most Trusted in the UK"