# Question 2

# Finetuning GPT-2 

# Data Pre-processing

In [1]:
import json
import re
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import Dataset

import re
import html
import unicodedata

def clean_text(text):
    # 1. Normalize Unicode characters (Replace common Unicode characters)
    text = text.replace("\u201c", '"').replace("\u201d", '"').replace("\u2018", "'").replace("\u2019", "'")
    text = text.replace("\u2013", "-").replace("\u2026", "...").replace("\u2022", "*")  # Handle en dash, ellipsis, bullet points
    
    # 2. Decode HTML entities (e.g., "&amp;" becomes "&")
    text = html.unescape(text)
    
    # 3. Remove non-ASCII characters (if necessary, using a more aggressive approach)
    text = ''.join([c for c in text if ord(c) < 128])  # Keep only ASCII characters (lower-level cleaning)
    
    # 4. Remove unwanted symbols: Strip any non-letter, non-digit, non-punctuation characters
    text = re.sub(r'[^\w\s.,!?\'";:()-]', '', text)  # Keep common punctuation but remove others
    
    # 5. Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)
    
    # 6. Remove leading and trailing spaces
    text = text.strip()
    
    # 7. Ensure proper sentence ending (optional, can be customized)
    if text and text[-1] not in ['.', '!', '?']:
        text += '.'  # Add period if no punctuation at the end

    return text


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Function to structure the jokes
def structure_jokes_from_json(file_path):
    """
    Structure jokes from the original JSON file containing 'title' and 'body'.
    Assumes title is the question/prompt and body is the punchline/response.
    
    Args:
    - file_path (str): Path to the original JSON file containing the jokes.
    
    Returns:
    - list of structured jokes in the form of [{"text": "title <|sep|> body <|endofjoke|>"}]
    """
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    
    structured_jokes = []
    for entry in data:
        title = clean_text(entry["title"])
        body = clean_text(entry["body"])
        
        # Skip jokes that are too short
        if len(title.split()) < 3 or len(body.split()) < 3:
            continue
        
        # Structure joke as 'title <|sep|> body <|endofjoke|>'
        joke = f"{title} <|sep|> {body} <|endofjoke|>"
        structured_jokes.append({"text": joke})
    
    return structured_jokes

# Tokenize
Used the GPT2 tokenizer

Max_length used = 128

In [12]:
from datasets import Dataset
import torch
from transformers import GPT2Tokenizer

# Function to tokenize the jokes
def tokenize_data(dataset, tokenizer):
    """
    Tokenize the joke data for GPT-2 model.
    
    Args:
    - dataset (list): List of jokes to be tokenized.
    - tokenizer (GPT2Tokenizer): The GPT-2 tokenizer.
    
    Returns:
    - tokenized dataset
    """
    tokenizer.pad_token = tokenizer.eos_token  # Set the pad token to be the end-of-text token

    def tokenize_function(examples):
        # Tokenize and ensure tensors are returned
        return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

    # Convert the list of structured jokes into a Dataset object from Hugging Face
    dataset = Dataset.from_list(dataset)
    
    # Tokenize the entire dataset
    return dataset.map(tokenize_function, batched=True, remove_columns=["text"])  # Remove original text column


# Driver Code

In [13]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from datasets import load_dataset
import torch
def fine_tune_gpt2(file_path, model_output_dir, epochs, batch_size, max_length):
    # Load the dataset and tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    structured_jokes = structure_jokes_from_json(file_path)
    dataset = tokenize_data(structured_jokes, tokenizer)  # Ensure this returns tokenized dataset
    
    # Load the model
    model = GPT2LMHeadModel.from_pretrained("gpt2")
    
    # Set the training arguments
    training_args = TrainingArguments(
        output_dir=model_output_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        logging_dir="./logs",
        save_steps=1000,
        save_total_limit=2,
        prediction_loss_only=True,
    )
    
    from torch.utils.data import DataLoader
    import torch

    # Custom collate function to handle the dataset
    def collate_fn(batch):
        input_ids = torch.stack([torch.tensor(x['input_ids']) for x in batch])
        attention_mask = torch.stack([torch.tensor(x['attention_mask']) for x in batch])
        labels = input_ids.clone()  # Shift input_ids for causal language modeling

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

    # Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=collate_fn,  # Use the collate function here
    )

    # Start training
    trainer.train()

    # Save the fine-tuned model and tokenizer
    model.save_pretrained(model_output_dir)
    tokenizer.save_pretrained(model_output_dir)

In [14]:
# Fine-tune GPT-2 using the original JSON file containing jokes
fine_tune_gpt2(
    file_path="../data/reddit_jokes.json",  # Path to your original JSON file
    model_output_dir="../gpt2-joke-model",  # Directory to save the fine-tuned model
    epochs=1,  # Number of training epochs
    batch_size=4,  # Batch size
    max_length=128  # Maximum token length
)

Map: 100%|██████████| 155493/155493 [00:25<00:00, 6206.44 examples/s]
                                         
  0%|          | 0/38874 [03:58<?, ?it/s]            

{'loss': 1.2326, 'grad_norm': 1.4769753217697144, 'learning_rate': 4.935689664042805e-05, 'epoch': 0.01}


                                         
  0%|          | 0/38874 [05:41<?, ?it/s]             

{'loss': 1.2066, 'grad_norm': 2.4955990314483643, 'learning_rate': 4.87137932808561e-05, 'epoch': 0.03}


                                         
  0%|          | 0/38874 [07:25<?, ?it/s]             

{'loss': 1.2018, 'grad_norm': 1.7053908109664917, 'learning_rate': 4.807068992128415e-05, 'epoch': 0.04}


                                         
  0%|          | 0/38874 [09:07<?, ?it/s]             

{'loss': 1.201, 'grad_norm': 1.6881585121154785, 'learning_rate': 4.74275865617122e-05, 'epoch': 0.05}


                                         
  0%|          | 0/38874 [10:51<?, ?it/s]             

{'loss': 1.1722, 'grad_norm': 2.1805477142333984, 'learning_rate': 4.678448320214025e-05, 'epoch': 0.06}


                                         
  0%|          | 0/38874 [12:32<?, ?it/s]             

{'loss': 1.1299, 'grad_norm': 2.1538000106811523, 'learning_rate': 4.61413798425683e-05, 'epoch': 0.08}


                                         
  0%|          | 0/38874 [14:13<?, ?it/s]             

{'loss': 1.1285, 'grad_norm': 1.8449656963348389, 'learning_rate': 4.549827648299635e-05, 'epoch': 0.09}


                                         
  0%|          | 0/38874 [15:54<?, ?it/s]             

{'loss': 1.1653, 'grad_norm': 1.6678621768951416, 'learning_rate': 4.48551731234244e-05, 'epoch': 0.1}


                                         
  0%|          | 0/38874 [17:36<?, ?it/s]             

{'loss': 1.148, 'grad_norm': 2.092423439025879, 'learning_rate': 4.421206976385245e-05, 'epoch': 0.12}


                                         
  0%|          | 0/38874 [19:17<?, ?it/s]             

{'loss': 1.15, 'grad_norm': 1.3931502103805542, 'learning_rate': 4.35689664042805e-05, 'epoch': 0.13}


                                         
  0%|          | 0/38874 [20:59<?, ?it/s]             

{'loss': 1.1354, 'grad_norm': 1.550413966178894, 'learning_rate': 4.292586304470855e-05, 'epoch': 0.14}


                                         
  0%|          | 0/38874 [22:40<?, ?it/s]             

{'loss': 1.1461, 'grad_norm': 0.9359981417655945, 'learning_rate': 4.2282759685136595e-05, 'epoch': 0.15}


                                         
  0%|          | 0/38874 [24:23<?, ?it/s]             

{'loss': 1.138, 'grad_norm': 1.2201206684112549, 'learning_rate': 4.163965632556465e-05, 'epoch': 0.17}


                                         
  0%|          | 0/38874 [26:06<?, ?it/s]             

{'loss': 1.0849, 'grad_norm': 1.0785069465637207, 'learning_rate': 4.0996552965992696e-05, 'epoch': 0.18}


                                         
  0%|          | 0/38874 [27:44<?, ?it/s]             

{'loss': 1.1096, 'grad_norm': 1.6145451068878174, 'learning_rate': 4.035344960642074e-05, 'epoch': 0.19}


                                         
  0%|          | 0/38874 [29:16<?, ?it/s]             

{'loss': 1.1336, 'grad_norm': 0.8259145021438599, 'learning_rate': 3.97103462468488e-05, 'epoch': 0.21}


                                         
  0%|          | 0/38874 [30:50<?, ?it/s]             

{'loss': 1.1306, 'grad_norm': 1.0573372840881348, 'learning_rate': 3.9067242887276844e-05, 'epoch': 0.22}


                                         
  0%|          | 0/38874 [32:22<?, ?it/s]             

{'loss': 1.1405, 'grad_norm': 0.967609703540802, 'learning_rate': 3.84241395277049e-05, 'epoch': 0.23}


                                         
  0%|          | 0/38874 [33:55<?, ?it/s]             

{'loss': 1.1241, 'grad_norm': 1.5707956552505493, 'learning_rate': 3.7781036168132945e-05, 'epoch': 0.24}


                                         
  0%|          | 0/38874 [35:27<?, ?it/s]              

{'loss': 1.0818, 'grad_norm': 0.9450043439865112, 'learning_rate': 3.713793280856099e-05, 'epoch': 0.26}


                                         
  0%|          | 0/38874 [37:00<?, ?it/s]              

{'loss': 1.1175, 'grad_norm': 1.4024978876113892, 'learning_rate': 3.6494829448989046e-05, 'epoch': 0.27}


                                         
  0%|          | 0/38874 [38:31<?, ?it/s]              

{'loss': 1.1142, 'grad_norm': 1.8105716705322266, 'learning_rate': 3.585172608941709e-05, 'epoch': 0.28}


                                         
  0%|          | 0/38874 [40:04<?, ?it/s]              

{'loss': 1.0848, 'grad_norm': 1.8111623525619507, 'learning_rate': 3.520862272984514e-05, 'epoch': 0.3}


                                         
  0%|          | 0/38874 [41:36<?, ?it/s]              

{'loss': 1.0697, 'grad_norm': 1.3567249774932861, 'learning_rate': 3.4565519370273194e-05, 'epoch': 0.31}


                                         
  0%|          | 0/38874 [43:09<?, ?it/s]              

{'loss': 1.0563, 'grad_norm': 1.0733556747436523, 'learning_rate': 3.392241601070124e-05, 'epoch': 0.32}


                                         
  0%|          | 0/38874 [44:41<?, ?it/s]              

{'loss': 1.1066, 'grad_norm': 1.3575106859207153, 'learning_rate': 3.327931265112929e-05, 'epoch': 0.33}


                                         
  0%|          | 0/38874 [46:14<?, ?it/s]              

{'loss': 1.1018, 'grad_norm': 1.4190309047698975, 'learning_rate': 3.263620929155734e-05, 'epoch': 0.35}


                                         
  0%|          | 0/38874 [47:46<?, ?it/s]              

{'loss': 1.0853, 'grad_norm': 1.3183685541152954, 'learning_rate': 3.199310593198539e-05, 'epoch': 0.36}


                                         
  0%|          | 0/38874 [49:19<?, ?it/s]              

{'loss': 1.1045, 'grad_norm': 1.1297410726547241, 'learning_rate': 3.1350002572413436e-05, 'epoch': 0.37}


                                         
  0%|          | 0/38874 [50:51<?, ?it/s]              

{'loss': 1.0681, 'grad_norm': 1.7300455570220947, 'learning_rate': 3.070689921284149e-05, 'epoch': 0.39}


                                         
  0%|          | 0/38874 [52:24<?, ?it/s]              

{'loss': 1.096, 'grad_norm': 1.666089653968811, 'learning_rate': 3.0063795853269537e-05, 'epoch': 0.4}


                                         
  0%|          | 0/38874 [53:56<?, ?it/s]              

{'loss': 1.0723, 'grad_norm': 1.381811499595642, 'learning_rate': 2.9420692493697588e-05, 'epoch': 0.41}


                                         
  0%|          | 0/38874 [55:28<?, ?it/s]              

{'loss': 1.0839, 'grad_norm': 1.5768322944641113, 'learning_rate': 2.8777589134125638e-05, 'epoch': 0.42}


                                         
  0%|          | 0/38874 [57:00<?, ?it/s]              

{'loss': 1.0776, 'grad_norm': 1.0329643487930298, 'learning_rate': 2.813448577455369e-05, 'epoch': 0.44}


                                         
  0%|          | 0/38874 [58:33<?, ?it/s]              

{'loss': 1.0427, 'grad_norm': 1.422408938407898, 'learning_rate': 2.7491382414981736e-05, 'epoch': 0.45}


                                         
  0%|          | 0/38874 [1:00:05<?, ?it/s]            

{'loss': 1.0634, 'grad_norm': 1.4973145723342896, 'learning_rate': 2.6848279055409786e-05, 'epoch': 0.46}


                                           
  0%|          | 0/38874 [1:01:38<?, ?it/s]            

{'loss': 1.0644, 'grad_norm': 1.6034085750579834, 'learning_rate': 2.6205175695837837e-05, 'epoch': 0.48}


                                           
  0%|          | 0/38874 [1:03:10<?, ?it/s]              

{'loss': 1.053, 'grad_norm': 1.7707070112228394, 'learning_rate': 2.5562072336265887e-05, 'epoch': 0.49}


                                           
  0%|          | 0/38874 [1:04:43<?, ?it/s]            

{'loss': 1.0732, 'grad_norm': 0.8504739999771118, 'learning_rate': 2.4918968976693934e-05, 'epoch': 0.5}


                                           
  0%|          | 0/38874 [1:06:15<?, ?it/s]            

{'loss': 1.049, 'grad_norm': 1.1779659986495972, 'learning_rate': 2.4275865617121985e-05, 'epoch': 0.51}


                                           
  0%|          | 0/38874 [1:07:48<?, ?it/s]            

{'loss': 1.0871, 'grad_norm': 1.1296643018722534, 'learning_rate': 2.3632762257550035e-05, 'epoch': 0.53}


                                           
  0%|          | 0/38874 [1:09:20<?, ?it/s]            

{'loss': 1.0512, 'grad_norm': 0.9055140018463135, 'learning_rate': 2.2989658897978082e-05, 'epoch': 0.54}


                                           
  0%|          | 0/38874 [1:10:53<?, ?it/s]            

{'loss': 1.0894, 'grad_norm': 1.0274111032485962, 'learning_rate': 2.2346555538406133e-05, 'epoch': 0.55}


                                           
  0%|          | 0/38874 [1:12:24<?, ?it/s]            

{'loss': 1.0758, 'grad_norm': 1.4623013734817505, 'learning_rate': 2.1703452178834183e-05, 'epoch': 0.57}


                                           
  0%|          | 0/38874 [1:13:57<?, ?it/s]            

{'loss': 1.0569, 'grad_norm': 1.0185227394104004, 'learning_rate': 2.1060348819262234e-05, 'epoch': 0.58}


                                           
  0%|          | 0/38874 [1:15:29<?, ?it/s]            

{'loss': 1.0686, 'grad_norm': 1.8561128377914429, 'learning_rate': 2.041724545969028e-05, 'epoch': 0.59}


                                           
  0%|          | 0/38874 [1:17:02<?, ?it/s]            

{'loss': 1.0346, 'grad_norm': 1.0581450462341309, 'learning_rate': 1.977414210011833e-05, 'epoch': 0.6}


                                           
  0%|          | 0/38874 [1:18:34<?, ?it/s]            

{'loss': 1.0685, 'grad_norm': 0.9050993323326111, 'learning_rate': 1.9131038740546382e-05, 'epoch': 0.62}


                                           
  0%|          | 0/38874 [1:20:07<?, ?it/s]            

{'loss': 1.0618, 'grad_norm': 0.9520531892776489, 'learning_rate': 1.848793538097443e-05, 'epoch': 0.63}


                                           
  0%|          | 0/38874 [1:21:39<?, ?it/s]            

{'loss': 1.0558, 'grad_norm': 0.9051138758659363, 'learning_rate': 1.784483202140248e-05, 'epoch': 0.64}


                                           
  0%|          | 0/38874 [1:23:12<?, ?it/s]            

{'loss': 1.0895, 'grad_norm': 1.3443220853805542, 'learning_rate': 1.720172866183053e-05, 'epoch': 0.66}


                                           
  0%|          | 0/38874 [1:24:44<?, ?it/s]            

{'loss': 1.0433, 'grad_norm': 1.8273134231567383, 'learning_rate': 1.655862530225858e-05, 'epoch': 0.67}


                                           
  0%|          | 0/38874 [1:26:16<?, ?it/s]            

{'loss': 1.0712, 'grad_norm': 0.8624415993690491, 'learning_rate': 1.5915521942686627e-05, 'epoch': 0.68}


                                           
  0%|          | 0/38874 [1:27:48<?, ?it/s]            

{'loss': 1.0609, 'grad_norm': 1.157494068145752, 'learning_rate': 1.5272418583114678e-05, 'epoch': 0.69}


                                           
  0%|          | 0/38874 [1:29:21<?, ?it/s]            

{'loss': 1.0657, 'grad_norm': 1.504683256149292, 'learning_rate': 1.4629315223542728e-05, 'epoch': 0.71}


                                           
  0%|          | 0/38874 [1:30:53<?, ?it/s]            

{'loss': 1.0418, 'grad_norm': 1.0027798414230347, 'learning_rate': 1.3986211863970777e-05, 'epoch': 0.72}


                                           
  0%|          | 0/38874 [1:32:26<?, ?it/s]            

{'loss': 1.0585, 'grad_norm': 1.1035927534103394, 'learning_rate': 1.3343108504398828e-05, 'epoch': 0.73}


                                           
  0%|          | 0/38874 [1:33:58<?, ?it/s]            

{'loss': 1.055, 'grad_norm': 1.3983980417251587, 'learning_rate': 1.2700005144826877e-05, 'epoch': 0.75}


                                           
  0%|          | 0/38874 [1:35:31<?, ?it/s]            

{'loss': 1.0327, 'grad_norm': 1.463358759880066, 'learning_rate': 1.2056901785254927e-05, 'epoch': 0.76}


                                           
  0%|          | 0/38874 [1:37:03<?, ?it/s]            

{'loss': 1.0587, 'grad_norm': 1.3753905296325684, 'learning_rate': 1.1413798425682977e-05, 'epoch': 0.77}


                                           
  0%|          | 0/38874 [1:38:35<?, ?it/s]            

{'loss': 1.0157, 'grad_norm': 1.3243924379348755, 'learning_rate': 1.0770695066111026e-05, 'epoch': 0.78}


                                           
  0%|          | 0/38874 [1:40:07<?, ?it/s]            

{'loss': 1.0829, 'grad_norm': 1.5162155628204346, 'learning_rate': 1.0127591706539077e-05, 'epoch': 0.8}


                                           
  0%|          | 0/38874 [1:41:40<?, ?it/s]            

{'loss': 1.0849, 'grad_norm': 1.1967121362686157, 'learning_rate': 9.484488346967126e-06, 'epoch': 0.81}


                                           
  0%|          | 0/38874 [1:43:12<?, ?it/s]            

{'loss': 1.0426, 'grad_norm': 1.53708016872406, 'learning_rate': 8.841384987395174e-06, 'epoch': 0.82}


                                           
  0%|          | 0/38874 [1:44:45<?, ?it/s]            

{'loss': 1.0481, 'grad_norm': 1.3926535844802856, 'learning_rate': 8.198281627823225e-06, 'epoch': 0.84}


                                           
  0%|          | 0/38874 [1:46:17<?, ?it/s]            

{'loss': 1.0857, 'grad_norm': 1.0185165405273438, 'learning_rate': 7.5551782682512745e-06, 'epoch': 0.85}


                                           
  0%|          | 0/38874 [1:47:50<?, ?it/s]            

{'loss': 1.0464, 'grad_norm': 0.9658646583557129, 'learning_rate': 6.912074908679324e-06, 'epoch': 0.86}


                                           
  0%|          | 0/38874 [1:49:21<?, ?it/s]            

{'loss': 1.0444, 'grad_norm': 1.1222537755966187, 'learning_rate': 6.268971549107373e-06, 'epoch': 0.87}


                                           
  0%|          | 0/38874 [1:50:54<?, ?it/s]            

{'loss': 1.0831, 'grad_norm': 0.928777277469635, 'learning_rate': 5.6258681895354226e-06, 'epoch': 0.89}


                                           
  0%|          | 0/38874 [1:52:26<?, ?it/s]            

{'loss': 1.0458, 'grad_norm': 1.2031396627426147, 'learning_rate': 4.982764829963471e-06, 'epoch': 0.9}


                                           
  0%|          | 0/38874 [1:53:59<?, ?it/s]            

{'loss': 1.0248, 'grad_norm': 1.236107349395752, 'learning_rate': 4.339661470391521e-06, 'epoch': 0.91}


                                           
  0%|          | 0/38874 [1:55:31<?, ?it/s]            

{'loss': 1.0585, 'grad_norm': 1.2515966892242432, 'learning_rate': 3.6965581108195706e-06, 'epoch': 0.93}


                                           
  0%|          | 0/38874 [1:57:04<?, ?it/s]            

{'loss': 1.0378, 'grad_norm': 1.3884307146072388, 'learning_rate': 3.0534547512476207e-06, 'epoch': 0.94}


                                           
  0%|          | 0/38874 [1:58:36<?, ?it/s]            

{'loss': 1.0566, 'grad_norm': 2.2116827964782715, 'learning_rate': 2.4103513916756703e-06, 'epoch': 0.95}


                                           
  0%|          | 0/38874 [2:00:09<?, ?it/s]            

{'loss': 1.0646, 'grad_norm': 1.098183274269104, 'learning_rate': 1.7672480321037198e-06, 'epoch': 0.96}


                                           
  0%|          | 0/38874 [2:01:40<?, ?it/s]            

{'loss': 1.0224, 'grad_norm': 1.3034898042678833, 'learning_rate': 1.1241446725317694e-06, 'epoch': 0.98}


                                           
  0%|          | 0/38874 [2:03:13<?, ?it/s]            

{'loss': 1.0358, 'grad_norm': 1.5755958557128906, 'learning_rate': 4.81041312959819e-07, 'epoch': 0.99}


                                           
100%|██████████| 38874/38874 [2:02:04<00:00,  5.31it/s]


{'train_runtime': 7324.2511, 'train_samples_per_second': 21.23, 'train_steps_per_second': 5.308, 'train_loss': 1.086230080627074, 'epoch': 1.0}


In [32]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned model
model = GPT2LMHeadModel.from_pretrained("../gpt2-joke-model")
tokenizer = GPT2Tokenizer.from_pretrained("../gpt2-joke-model")

# Use the model
text = "What is the difference between"
inputs = tokenizer(text, return_tensors="pt")
outputs = model.generate(**inputs, max_length = 100)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


What is the difference between a black man and a pizza? <|sep|> A pizza can feed a family of four. <|endofjoke|>


# The model generates offensive and racist jokes often, because it was trained on the reddit jokes only.