# Assignment 3
## Training Decoder Only Model on Joke Dataset

In [37]:
import torch
import warnings
warnings.filterwarnings('ignore')
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import random
import time
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from transformers import GPT2Tokenizer
import os
import gc
import json
import re
import html
import sentencepiece as spm

# Clean the joke strings

In [38]:
def clean_text(text):
    # 1. Normalize Unicode characters (Replace common Unicode characters)
    text = text.replace("\u201c", '"').replace("\u201d", '"').replace("\u2018", "'").replace("\u2019", "'")
    text = text.replace("\u2013", "-").replace("\u2026", "...").replace("\u2022", "*")  # Handle en dash, ellipsis, bullet points
    
    # 2. Decode HTML entities (e.g., "&amp;" becomes "&")
    text = html.unescape(text)
    
    # 3. Remove non-ASCII characters (if necessary, using a more aggressive approach)
    text = ''.join([c for c in text if ord(c) < 128])  # Keep only ASCII characters (lower-level cleaning)
    
    # 4. Remove unwanted symbols: Strip any non-letter, non-digit, non-punctuation characters
    text = re.sub(r'[^\w\s.,!?\'";:()-]', '', text)  # Keep common punctuation but remove others
    
    # 5. Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)
    
    # 6. Remove leading and trailing spaces
    text = text.strip()
    
    # 7. Ensure proper sentence ending (optional, can be customized)
    if text and text[-1] not in ['.', '!', '?']:
        text += '.'  # Add period if no punctuation at the end

    return text

# Data Pre-processing
Insert `end_of_joke` and  `sep` tokens in the jokes to help the model learn better.

In [None]:
import os
import json

def structure_jokes_from_json(file_path):
    """
    Structure jokes from the original JSON file containing 'title' and 'body'.
    Assumes title is the question/prompt and body is the punchline/response.
    
    Args:
    - file_path (str): Path to the original JSON file containing the jokes.
    
    Returns:
    - list of structured jokes in the form of [{"joke": "title <|sep|> body <|endofjoke|>"}]
    - Saves the processed jokes in a JSON file named 'processed_jokes.json'.
    """
    def clean_text(text):
        return text.strip().replace("\n", " ")  # Basic text cleaning (add more if needed)

    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    
    structured_jokes = []
    for entry in data:
        title = clean_text(entry["title"])
        body = clean_text(entry["body"])
        
        # Skip jokes that are too short
        if len(title.split()) < 3 or len(body.split()) < 3:
            continue
        
        # Structure joke as 'title <|sep|> body <|endofjoke|>'
        joke = f"{title} <|sep|> {body} <|endofjoke|>"
        structured_jokes.append({"joke": joke})
    
    # Save structured jokes to a JSON file
    output_file = os.path.join(os.path.dirname(file_path), "processed_jokes_separated.json")
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(structured_jokes, f, indent=4, ensure_ascii=False)
    
    print(f"Processed jokes saved to: {output_file}")
    return structured_jokes


In [44]:
structure_jokes_from_json("../data/reddit_jokes.json")

Processed jokes saved to: ../data/processed_jokes_separated.json


[{'joke': 'I hate how you cant even say black paint anymore <|sep|> Now I have to say "Leroy can you please paint the fence?" <|endofjoke|>'},
 {'joke': "What's the difference between a Jew in Nazi Germany and pizza ? <|sep|> Pizza doesn't scream when you put it in the oven .  I'm so sorry. <|endofjoke|>"},
 {'joke': 'I recently went to America.... <|sep|> ...and being there really helped me learn about American culture. So I visited a shop and as I was leaving, the Shopkeeper said "Have a nice day!" But I didn\'t so I sued him. <|endofjoke|>'},
 {'joke': "Brian raises his hand and says, “He’s in Heaven.” <|sep|> A Sunday school teacher is concerned that his students might be a little confused about Jesus, so he asks his class, “Where is Jesus today?” Brian raises his hand and says, “He’s in Heaven.”  Susan answers, “He’s in my heart.”  Little Johnny waves his hand furiously and blurts out, “He’s in our bathroom!”  The teacher is surprised by this answer and asks Little Johnny how he k

# Model Parameters

In [None]:
DIMENSIONS = 100
BATCH_SIZE = 8
NUM_HEADS = 10
NUM_LAYERS = 4
D_FF = 2048
DROPOUT = 0.2
LEARNING_RATE = 1e-3
NUM_OF_EPOCHS = 10 

# Tokenize and Vocabulary

Using the Sentence Piece tokenizer I am creating my own vocabulary to train the decoder only model

In [46]:
import sentencepiece as spm
import pandas as pd

# Load your joke dataset (JSON example)
df = pd.read_json('../data/processed_jokes_separated.json')  # Replace with the path to your JSON dataset

# Combine the title and body (adjust the column names based on your dataset)
texts = df['joke'] # Modify column names if they differ


In [47]:
# Check your data statistics
print(f"Total number of jokes: {len(df)}")
print(f"Average joke length: {df['joke'].str.len().mean()}")
print(f"Max joke length: {df['joke'].str.len().max()}")

Total number of jokes: 155633
Average joke length: 293.86058869262945
Max joke length: 39764


In [49]:

# Save the combined text to a file for SentencePiece training
text_file = 'jokes.txt'

with open(text_file, 'w', encoding='utf-8') as file:
    for text in texts:
        file.write(text + '\n')  # Write each joke text in a new line
# Adjusted parameters
VOCAB_SIZE = 10000 # Reduced to be under the maximum limit
MAX_SEQ_LEN = 256
INPUT_SENTENCE_SIZE = 200000

# Training the tokenizer
spm.SentencePieceTrainer.train(
    input=text_file,
    model_prefix='jokes_model_separated',
    vocab_size=VOCAB_SIZE,
    pad_id=3,
    unk_id=0,
    bos_id=1,
    eos_id=2,
    input_sentence_size=INPUT_SENTENCE_SIZE,
    character_coverage=0.9995,
    max_sentence_length=MAX_SEQ_LEN,
    shuffle_input_sentence=True
)

print("SentencePiece model trained successfully!")


SentencePiece model trained successfully!


sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: jokes.txt
  input_format: 
  model_prefix: jokes_model_separated
  model_type: UNIGRAM
  vocab_size: 10000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 200000
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 256
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: 3
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_priv

In [51]:
sp = spm.SentencePieceProcessor(model_file='jokes_model_separated.model')
sp.load('jokes_model_separated.model')

True

# Embeddings 

I am using the pretrained Glove.6B.100d embeddings in my decoder only model. The embedding dimension is 100 and I keep it unchanged. 

In [52]:
import torch
import numpy as np

def load_pretrained_embeddings(sp_tokenizer, glove_path, embedding_dim=100):
    """Load pre-trained GloVe embeddings"""
    try:
        # Initialize the embedding tensor with random values
        embeddings = torch.randn(len(sp_tokenizer), embedding_dim)
        found_tokens = 0
        total_tokens = len(sp_tokenizer)

        # Load GloVe embeddings into a dictionary
        glove_embeddings = {}
        with open(glove_path, 'r', encoding='utf-8') as f:
            for line in f:
                values = line.split()
                word = values[0]
                vector = np.asarray(values[1:], dtype='float32')
                glove_embeddings[word] = vector

        # Assign embeddings to the tokenizer
        for i in range(total_tokens):
            token = sp_tokenizer.id_to_piece(i)  # Get token from tokenizer
            token = token.replace("▁", "")  # Handle tokenization format (subword tokenization)

            # Assign the GloVe vector to the token if it exists in the GloVe embeddings
            if token in glove_embeddings:
                embeddings[i] = torch.tensor(glove_embeddings[token], dtype=torch.float32)
                found_tokens += 1
            else:
                # Random initialization for tokens not found in GloVe
                embeddings[i] = torch.randn(embedding_dim)

        print(f"Found embeddings for {found_tokens}/{total_tokens} tokens")
        return embeddings

    except Exception as e:
        print(f"Error loading embeddings: {e}")
        return None


In [53]:
# Your sentencepiece tokenizer
glove_path = '../other/glove.6B.100d.txt'  # Path to your GloVe file

glove_embeddings = load_pretrained_embeddings(sp, glove_path, embedding_dim=100)


Found embeddings for 6770/10000 tokens


# Class for Dataset

In [54]:
import torch
import sentencepiece as spm
import json
from torch.utils.data import Dataset, DataLoader

# Custom Dataset class for your joke dataset
class JokeDataset(torch.utils.data.Dataset):
    def __init__(self, dataset, max_seq_length, model_file):
        # dataset is a list of dictionaries now, not a file path
        self.dataset = dataset  # Now accepts a list of data (train, val, or test split)
        self.max_seq_length = max_seq_length
        self.sp = spm.SentencePieceProcessor(model_file=model_file)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        joke = self.dataset[idx]["joke"]
        # Encode the joke text into token IDs
        encoded = self.sp.encode_as_ids(joke, add_bos=True, add_eos=True)
        return torch.tensor(encoded)

    def collate_function(self, batch):
        return torch.nn.utils.rnn.pad_sequence([item for item in batch], batch_first=True, padding_value=3)

In [55]:
file_path = '../data/processed_jokes_separated.json'  # Path to your JSON dataset
max_seq_length = MAX_SEQ_LEN # Define the maximum sequence length for your model
model_file = './jokes_model_10000.model'  # Path to your Sentence

from sklearn.model_selection import train_test_split

# Dataloaders

In [56]:
# Load the dataset from the JSON file
with open(file_path, 'r', encoding='utf-8') as f:
    full_dataset = json.load(f)

# Split the dataset into train, validation, and test sets
train_data, temp_data = train_test_split(full_dataset, test_size=0.2, random_state=42)  # 80% train, 20% temp
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)  # Split temp data into 50% val and 50% test

# Create dataset instances for each split
train_dataset = JokeDataset(train_data, max_seq_length, model_file)
val_dataset = JokeDataset(val_data, max_seq_length, model_file)
test_dataset = JokeDataset(test_data, max_seq_length, model_file)

# Create DataLoader instances for each split
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, collate_fn=train_dataset.collate_function, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, collate_fn=val_dataset.collate_function, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, collate_fn=test_dataset.collate_function, shuffle=False)

# # Example: Print a batch of tokenized jokes for train
# for batch in train_dataloader:
#     print(batch.shape)
#     break  # Just print one batch as an example

# Sample joke from the dataset

In [None]:
joke = test_data[2]['joke']
print(joke)

I dreamt I was forced to eat a giant marshmallow...... <|sep|> I dreamt I was forced to eat a giant marshmallow. When I woke up, my pillow was gone. <|endofjoke|>


# Define the Positional Encoding

In [58]:
import torch
import torch.nn as nn
import math

#implement positional encoder
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()

        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

# MultiHeadAttention Class

In [59]:
import torch
import torch.nn as nn
import math

#implement multihead attention mechanism
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        #create queries, keys, values and output
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    #impelement attention formula
    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
        attn_probs = torch.softmax(attn_scores, dim=-1)
        output = torch.matmul(attn_probs, V)
        return output

    #function to split input values into multi-head format
    def split_heads(self, x):
        batch_size, seq_length, d_model = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)

    #function to concatenate multihead attention values together
    def combine_heads(self, x):
        batch_size, _, seq_length, d_k = x.size()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)

    def forward(self, Q, K, V, mask=None):
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))

        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
        output = self.W_o(self.combine_heads(attn_output))
        return output

# Feed Forward Class

In [60]:
import torch
import torch.nn as nn

#define decoder feedforward layers
class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

# Decoder Class

In [61]:
import torch
import torch.nn as nn

#define decoder layer
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

# Full Decoder Model

In [62]:
import torch
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#define and implement decoder only transformer architecture
class Transformer(nn.Module):
    def __init__(self, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout, pre_trained_embeddings=None):
        super(Transformer, self).__init__()

        # Use pre-trained embeddings if available, otherwise initialize randomly
        if pre_trained_embeddings is not None:
            self.decoder_embedding = nn.Embedding.from_pretrained(pre_trained_embeddings, freeze=True)  # Optional: Set freeze=True if you want to keep embeddings fixed
        else:
            self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)

        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        # Decoder layers
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        # Final linear layer to project the output to vocabulary size
        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    # Function to generate look-ahead mask for autoregressive decoding
    def generate_mask(self, tgt):
        tgt_mask = (tgt != 3).unsqueeze(1).unsqueeze(3)  # sentencepiece pad_id = 3
        seq_length = tgt.size(1)
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length), diagonal=1)).bool().to(device)
        tgt_mask = tgt_mask & nopeak_mask
        return tgt_mask

    def forward(self, tgt):
        # Generate mask for autoregressive decoding
        tgt_mask = self.generate_mask(tgt)

        # Apply embeddings and positional encoding
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        # Pass through the decoder layers
        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, tgt_mask)

        # Output projection to vocab size
        output = self.fc(dec_output)
        return output

# Initialize the model and optimizer

In [63]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import sentencepiece as spm
import tqdm
from datetime import datetime

transformer = Transformer(len(sp), DIMENSIONS, NUM_HEADS, NUM_LAYERS, D_FF, MAX_SEQ_LEN, DROPOUT, pre_trained_embeddings=glove_embeddings).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=3) # sentencepiece pad_id = 3
optimizer = optim.Adam(transformer.parameters(), lr=LEARNING_RATE, betas=(0.9, 0.98), eps=1e-9)

# Training Loop

In [70]:
def train_model(model, train_dataloader, val_dataloader, test_dataloader,
                optimizer, criterion, device, num_epochs=NUM_OF_EPOCHS):

    # Learning rate scheduler
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='min',
        patience=2,
        factor=0.5,
        verbose=True
    )

    best_val_loss = float('inf')
    training_stats = []

    print("Starting training...")
    print(f"Training on device: {device}")

    for epoch in range(num_epochs):
        # Training Phase
        model.train()
        total_loss = 0
        num_batches = len(train_dataloader)

        progress_bar = tqdm.tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")

        for batch_idx, tgt_data in enumerate(progress_bar):
            # Clear gradients
            optimizer.zero_grad()

            # Ensure data is within sequence length limit
            if tgt_data.size(1) > MAX_SEQ_LEN:
                tgt_data = tgt_data[:, :MAX_SEQ_LEN]

            # Move data to device
            tgt_data = tgt_data.to(device)

            try:
                # Forward pass
                output = model(tgt_data[:, :-1])

                # Calculate loss
                loss = criterion(
                    output.contiguous().view(-1, VOCAB_SIZE),
                    tgt_data[:, 1:].contiguous().view(-1)
                )

                # Backward pass
                loss.backward()

                # Gradient clipping
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

                # Update embeddings only if not frozen
                if not model.decoder_embedding.weight.requires_grad:
                    model.decoder_embedding.weight.grad = None

                # Optimize
                optimizer.step()

                # Update total loss
                total_loss += loss.item()

                # Update progress bar
                progress_bar.set_postfix({
                    'loss': f'{loss.item():.4f}',
                    'avg_loss': f'{total_loss/(batch_idx+1):.4f}',
                    'lr': f'{optimizer.param_groups[0]["lr"]:.6f}'
                })

            except RuntimeError as e:
                print(f"Error in batch {batch_idx}: {e}")
                continue

        avg_train_loss = total_loss / num_batches

        # Validation Phase
        model.eval()
        val_loss = 0
        num_val_batches = len(val_dataloader)

        print("\nRunning validation...")
        with torch.no_grad():
            for val_data in tqdm.tqdm(val_dataloader, desc="Validation"):
                if val_data.size(1) > MAX_SEQ_LEN:
                    val_data = val_data[:, :MAX_SEQ_LEN]

                val_data = val_data.to(device)
                output = model(val_data[:, :-1])
                loss = criterion(
                    output.contiguous().view(-1, VOCAB_SIZE),
                    val_data[:, 1:].contiguous().view(-1)
                )
                val_loss += loss.item()

        avg_val_loss = val_loss / num_val_batches

        # Learning rate scheduling
        scheduler.step(avg_val_loss)

        # Save statistics
        training_stats.append({
            'epoch': epoch + 1,
            'train_loss': avg_train_loss,
            'val_loss': avg_val_loss,
            'learning_rate': optimizer.param_groups[0]['lr']
        })

        # Print epoch results
        print(f"\nEpoch {epoch+1}/{num_epochs}")
        print(f"Average Training Loss: {avg_train_loss:.4f}")
        print(f"Average Validation Loss: {avg_val_loss:.4f}")
        print(f"Learning Rate: {optimizer.param_groups[0]['lr']:.6f}")

        # Save best model
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            print(f"New best validation loss: {best_val_loss:.4f}")
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'train_loss': avg_train_loss,
                'val_loss': avg_val_loss,
                'embedding_state': model.decoder_embedding.state_dict()
            }, f'../models/custom_separated_{datetime.now().strftime("%Y-%m-%d %H:%M:%S")}.pt')

        # Regular checkpoint
        if (epoch + 1) % 5 == 0:  # Save every 5 epochs
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
                'train_loss': avg_train_loss,
                'val_loss': avg_val_loss,
                'embedding_state': model.decoder_embedding.state_dict(),
                'training_stats': training_stats
            }, f'checkpoint_epoch_{epoch+1}.pt')

    # Final Test Phase
    print("\nTraining completed. Running final test evaluation...")
    model.eval()
    test_loss = 0
    num_test_batches = len(test_dataloader)

    with torch.no_grad():
        for test_data in tqdm.tqdm(test_dataloader, desc="Testing"):
            if test_data.size(1) > MAX_SEQ_LEN:
                test_data = test_data[:, :MAX_SEQ_LEN]

            test_data = test_data.to(device)
            output = model(test_data[:, :-1])
            loss = criterion(
                output.contiguous().view(-1, VOCAB_SIZE),
                test_data[:, 1:].contiguous().view(-1)
            )
            test_loss += loss.item()

    avg_test_loss = test_loss / num_test_batches
    print(f"Final Test Loss: {avg_test_loss:.4f}")

    return training_stats

In [71]:
train_model(transformer, train_dataloader, val_dataloader,test_dataloader, optimizer, criterion, device, NUM_OF_EPOCHS)

Starting training...
Training on device: cuda


Epoch 1/10: 100%|██████████| 15564/15564 [05:23<00:00, 48.06it/s, loss=4.1600, avg_loss=4.2205, lr=0.001000]



Running validation...


Validation: 100%|██████████| 1946/1946 [00:14<00:00, 138.52it/s]



Epoch 1/10
Average Training Loss: 4.2205
Average Validation Loss: 4.0775
Learning Rate: 0.001000
New best validation loss: 4.0775


Epoch 2/10: 100%|██████████| 15564/15564 [05:14<00:00, 49.42it/s, loss=4.8939, avg_loss=4.1838, lr=0.001000]



Running validation...


Validation: 100%|██████████| 1946/1946 [00:13<00:00, 144.94it/s]



Epoch 2/10
Average Training Loss: 4.1838
Average Validation Loss: 4.0615
Learning Rate: 0.001000
New best validation loss: 4.0615


Epoch 3/10: 100%|██████████| 15564/15564 [05:06<00:00, 50.77it/s, loss=4.6351, avg_loss=4.1610, lr=0.001000]



Running validation...


Validation: 100%|██████████| 1946/1946 [00:13<00:00, 142.92it/s]



Epoch 3/10
Average Training Loss: 4.1610
Average Validation Loss: 4.0379
Learning Rate: 0.001000
New best validation loss: 4.0379


Epoch 4/10: 100%|██████████| 15564/15564 [05:01<00:00, 51.57it/s, loss=4.0231, avg_loss=4.1464, lr=0.001000]



Running validation...


Validation: 100%|██████████| 1946/1946 [00:12<00:00, 157.11it/s]



Epoch 4/10
Average Training Loss: 4.1464
Average Validation Loss: 4.0398
Learning Rate: 0.001000


Epoch 5/10: 100%|██████████| 15564/15564 [04:58<00:00, 52.14it/s, loss=3.9925, avg_loss=4.1337, lr=0.001000]



Running validation...


Validation: 100%|██████████| 1946/1946 [00:13<00:00, 149.44it/s]



Epoch 5/10
Average Training Loss: 4.1337
Average Validation Loss: 4.0074
Learning Rate: 0.001000
New best validation loss: 4.0074


Epoch 6/10: 100%|██████████| 15564/15564 [05:17<00:00, 49.04it/s, loss=4.3450, avg_loss=4.1266, lr=0.001000]



Running validation...


Validation: 100%|██████████| 1946/1946 [00:13<00:00, 139.69it/s]



Epoch 6/10
Average Training Loss: 4.1266
Average Validation Loss: 4.0097
Learning Rate: 0.001000


Epoch 7/10: 100%|██████████| 15564/15564 [05:28<00:00, 47.43it/s, loss=3.4944, avg_loss=4.1219, lr=0.001000]



Running validation...


Validation: 100%|██████████| 1946/1946 [00:14<00:00, 138.85it/s]



Epoch 7/10
Average Training Loss: 4.1219
Average Validation Loss: 3.9965
Learning Rate: 0.001000
New best validation loss: 3.9965


Epoch 8/10: 100%|██████████| 15564/15564 [05:29<00:00, 47.24it/s, loss=2.8279, avg_loss=4.1196, lr=0.001000]



Running validation...


Validation: 100%|██████████| 1946/1946 [00:14<00:00, 133.48it/s]



Epoch 8/10
Average Training Loss: 4.1196
Average Validation Loss: 3.9951
Learning Rate: 0.001000
New best validation loss: 3.9951


Epoch 9/10: 100%|██████████| 15564/15564 [05:26<00:00, 47.65it/s, loss=3.1810, avg_loss=4.1192, lr=0.001000]



Running validation...


Validation: 100%|██████████| 1946/1946 [00:14<00:00, 138.88it/s]



Epoch 9/10
Average Training Loss: 4.1192
Average Validation Loss: 3.9914
Learning Rate: 0.001000
New best validation loss: 3.9914


Epoch 10/10: 100%|██████████| 15564/15564 [05:17<00:00, 49.01it/s, loss=3.6716, avg_loss=4.1160, lr=0.001000]



Running validation...


Validation: 100%|██████████| 1946/1946 [00:13<00:00, 145.20it/s]



Epoch 10/10
Average Training Loss: 4.1160
Average Validation Loss: 4.0040
Learning Rate: 0.001000

Training completed. Running final test evaluation...


Testing: 100%|██████████| 1946/1946 [00:13<00:00, 148.15it/s]

Final Test Loss: 4.0050





[{'epoch': 1,
  'train_loss': 4.220533724356298,
  'val_loss': 4.077464536797229,
  'learning_rate': 0.001},
 {'epoch': 2,
  'train_loss': 4.183756681188339,
  'val_loss': 4.0615309446223105,
  'learning_rate': 0.001},
 {'epoch': 3,
  'train_loss': 4.161027529756811,
  'val_loss': 4.037873423234287,
  'learning_rate': 0.001},
 {'epoch': 4,
  'train_loss': 4.146352089499177,
  'val_loss': 4.039787588604675,
  'learning_rate': 0.001},
 {'epoch': 5,
  'train_loss': 4.133659765641323,
  'val_loss': 4.007425496536561,
  'learning_rate': 0.001},
 {'epoch': 6,
  'train_loss': 4.126628857120331,
  'val_loss': 4.009669998444364,
  'learning_rate': 0.001},
 {'epoch': 7,
  'train_loss': 4.121901280513554,
  'val_loss': 3.996479693321628,
  'learning_rate': 0.001},
 {'epoch': 8,
  'train_loss': 4.119641777544455,
  'val_loss': 3.995092773118955,
  'learning_rate': 0.001},
 {'epoch': 9,
  'train_loss': 4.119187780316278,
  'val_loss': 3.9914432556999184,
  'learning_rate': 0.001},
 {'epoch': 10,
  

In [88]:
import torch
import sentencepiece as spm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def generate(text, model_path):
    # Load pretrained SentencePiece tokenizer model
    sp = spm.SentencePieceProcessor(model_file='jokes_model_separated.model')
    checkpoint = torch.load(model_path)

    # Tokenize input text (add beginning-of-sequence token)
    encodedSentence = torch.tensor(sp.encode_as_ids(text, add_bos=True)).long().unsqueeze(0).to(device)

    # Instantiate the Transformer model
    transformer = Transformer(
        VOCAB_SIZE,
        DIMENSIONS,
        NUM_HEADS,
        NUM_LAYERS,
        D_FF,
        MAX_SEQ_LEN,
        DROPOUT
    ).to(device)

    transformer.load_state_dict(checkpoint['model_state_dict'])
    transformer.eval()  # Set the model to evaluation mode


    # Generate a 20-word output
    for _ in range(50):
        with torch.no_grad():
            logits = transformer(encodedSentence)  # Get output from the transformer
            logits = logits[:, -1, :] / 1.0  # Scaling logits (temperature can be applied here)

            # Apply softmax to get probabilities
            probs = torch.nn.functional.softmax(logits, dim=-1)

            # Sample the next token based on the probabilities
            next_token = torch.multinomial(probs, num_samples=1)


            # Stop generation if the end-of-sequence token is generated (adjust token id if necessary)
            if next_token.item() == sp.piece_to_id('</s>'):  # Assuming <eos> is the EOS token
                break

            # Append the generated token to the input sequence for next step
            encodedSentence = torch.cat([encodedSentence, next_token], dim=1)

    # Decode the output tokens back to text
    output = sp.decode(encodedSentence.tolist()[0])

    # Print the generated joke/story
    print(f"{text} - {output}")

    return {"joke": output}


# Generating jokes on test dataset

In [129]:
import torch
import json
from tqdm import tqdm

def generate_test_jokes(test_data, model_path, sp_model_path, transformer_model, max_length):
    """
    Generate jokes using a model for all samples in the test dataset.

    Args:
        test_dataloader (DataLoader): DataLoader for the test dataset.
        model_path (str): Path to the model checkpoint.
        sp_model_path (str): Path to the SentencePiece model.
        transformer_model (torch.nn.Module): Transformer model class instance.
        output_file (str): Path to save the generated jokes as JSON.
        max_length (int): Maximum length of generated jokes.

    Returns:
        None
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    sp = spm.SentencePieceProcessor(model_file=sp_model_path)
    
    # Load the model
    checkpoint = torch.load(model_path)
    transformer = transformer_model.to(device)
    transformer.load_state_dict(checkpoint['model_state_dict'])
    transformer.eval()

    # Store results
    results = []

    print("Generating jokes for the test dataset...\n")
    for batch in (test_data[:5]):  # Limit to 5 batches for demonstration
        input_texts = batch['joke'] # Assuming the dataset outputs input texts as 'input_texts
        trimmed_text = ' '.join(input_texts.split()[:8])
        encoded_sentence = torch.tensor(sp.encode_as_ids(trimmed_text, add_bos=True)).long().unsqueeze(0).to(device)

        # Generate text
        for _ in range(max_length):
            with torch.no_grad():
                logits = transformer(encoded_sentence)
                logits = logits[:, -1, :]  # Take logits of the last token
                probs = torch.nn.functional.softmax(logits, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)

                # Stop generation on <eos> token
                if next_token.item() == sp.piece_to_id('<eos>'):
                    break

                # Append the next token and continue
                encoded_sentence = torch.cat([encoded_sentence, next_token], dim=1)

        # Decode the generated tokens
        generated_text = sp.decode(encoded_sentence.tolist()[0])
        results.append({"input": trimmed_text, "generated": generated_text})

        print(f"Input: {trimmed_text} | Generated: {generated_text} \n")

generate_test_jokes(test_data, '../models/best_separated_model.pt', 'jokes_model_separated.model', Transformer(VOCAB_SIZE, DIMENSIONS, NUM_HEADS, NUM_LAYERS, D_FF, MAX_SEQ_LEN, DROPOUT), max_length=50)

Generating jokes for the test dataset...

Input: I dreamt I was forced to eat a | Generated: I dreamt I was forced to eat abilion <|sep|> He's now parked... <|endofjoke|>s. <|endofjoke|>|>|>|> rest|>sual....endofjoke|>ug|>udi|> 

Input: A penguin driving down the road... <|sep|> So, | Generated: A penguin driving down the road... <|sep|> So, when he was staying an airplane that hears up 50,000 on the mountain in a board in greenhole and shook with his tongue in the club and in an hotel. He finishes the bike next to the second way No one camera 

Input: What's green, fuzzy and would kill you if | Generated: What's green, fuzzy and would kill you if you're once wish it <|sep|> 'll be White 7 weeks and swallow. <|endofjoke|> a little one job for anothers blowup with 95 million bucks <|endofjoke|>. <|endofjoke|> 

Input: Why can't you play Uno with a Mexican? | Generated: Why can't you play Uno with a Mexican? <|sep|> Because we are stupid people who is positive. <|endofjoke|>endofjoke|>|>

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
# Use the model
def generate_jokes_on_test_dataset(test_dataset):
    model = GPT2LMHeadModel.from_pretrained("../gpt2-joke-model")
    tokenizer = GPT2Tokenizer.from_pretrained("../gpt2-joke-model")
    print(f"Generating jokes using the fine-tuned GPT-2 model...")
    results = []
    for batch in (test_dataset[:5]):
        input_text = batch['joke']
        trimmed_text = ' '.join(input_text.split()[:10]) # only use the first 5 words so that the model can generate the rest

        encoded_joke = tokenizer(trimmed_text, return_tensors="pt")
        outputs = model.generate(**encoded_joke, max_length=100)

        print(f"Input: {trimmed_text}, Generated Joke: {tokenizer.decode(outputs[0], skip_special_tokens=True)}\n")

generate_jokes_on_test_dataset(test_data)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generating jokes using the fine-tuned GPT-2 model...


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Input: I dreamt I was forced to eat a giant marshmallow......, Generated Joke: I dreamt I was forced to eat a giant marshmallow...... <|sep|> But I woke up with a giant marshmallow on my face. <|endofjoke|>



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Input: A penguin driving down the road... <|sep|> So, a penguin, Generated Joke: A penguin driving down the road... <|sep|> So, a penguin driving down the road is driving down the road and sees a sign that says "Parking Only". So he goes to the nearest parking place and asks the driver if he can park. The driver says "Sure, I'll park right here." So the penguin goes to the nearest parking place and asks the driver if he can park. The driver says "Sure, I'll park right here." So



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Input: What's green, fuzzy and would kill you if it fell, Generated Joke: What's green, fuzzy and would kill you if it fell out of a tree? <|sep|> A pool table. <|endofjoke|>



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Input: Why can't you play Uno with a Mexican? <|sep|> They, Generated Joke: Why can't you play Uno with a Mexican? <|sep|> They don't have Juan. <|endofjoke|>

Input: How much room is needed for fungi to grow? <|sep|>, Generated Joke: How much room is needed for fungi to grow? <|sep|> I don't know, but I'm sure they'll grow it. <|endofjoke|>



# As we can see the GPT2 based model generates much better jokes than the custom decoder model.