## GPT2 Fine-tuning

In [2]:
import enum
from logging import config
from random import shuffle
import tokenize
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import pandas as pd 
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, random_split
import os 
import nltk 
from transformers import GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
from transformers import AdamW, get_linear_schedule_with_warmup
from colorama import Style, Fore
import time
import torch

In [67]:
def prepare_spam_data(debug = False, batch_size = 2):
    df = pd.read_csv(os.path.join("dataset", "spam.csv"), encoding='latin1')
    df_ham = df[df['v1'] == 'ham'][['v2']]
    '''
    It handles punctuation intelligently (e.g., separating punctuation from words).
    It accounts for contractions (e.g., "don't" is split into ["do", "n't"]).
    '''
    # df_ham['sentence'] = df_ham['v2'].apply(lambda sentence: nltk.word_tokenize(sentence))
    df_ham['sentence'] = df_ham['v2']
    df_ham.drop(columns = ['v2'], axis = 1, inplace = True)
    df_ham['length'] = df_ham['sentence'].apply(lambda x: len(x))
    if debug:
        sns.histplot(data = df_ham, x = 'length', kde = True)
        plt.xlabel('Length of sentence.')
        plt.savefig(os.path.join('plots', 'SentenceLength.png'))
        plt.show()
    class CustomDataset(Dataset):
        def __init__(self, txt_list, tokenizer, max_length = 128):
            self.input_ids, self.attn_masks = [], []
            for txt in txt_list:
                encoding_dict = tokenizer('<|start|>' + txt + '<|end|>', \
                                          truncation = True, \
                                          padding = 'max_length', \
                                          max_length = max_length)
                self.input_ids.append(encoding_dict['input_ids'])
                self.attn_masks.append(encoding_dict['attention_mask'])
        def __len__(self):
            return len(self.input_ids)
        def __getitem__(self, idx):
            return self.input_ids[idx], self.attn_masks[idx]
    '''
        In GPT-2, the default value for tokenizer.bos_token_id is None 
        because GPT-2 does not use a beginning-of-sequence (BOS) token by default.
    '''
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='<|start|>', \
                                              eos_token='<|end|>', pad_token='<|pad|>')
    print(f'{Fore.CYAN}[TOKNIZR] Mx model length: {tokenizer.model_max_length}. '+\
          f'GPT small: 768{Style.RESET_ALL}')
    print(f'{Fore.CYAN}[TOKNIZR] Beginning of seq: {tokenizer.decode(tokenizer.bos_token_id)}, '+\
          f'has token id: {tokenizer.bos_token_id}{Style.RESET_ALL}')
    print(f'{Fore.CYAN}[TOKNIZR] End of seq: {tokenizer.decode(tokenizer.eos_token_id)}, '+\
          f'has token id: {tokenizer.eos_token_id}{Style.RESET_ALL}')
    print(f'{Fore.CYAN}[TOKNIZR] Padding of seq: {tokenizer.decode(tokenizer.pad_token_id)}, '+\
          f'has token id: {tokenizer.pad_token_id}{Style.RESET_ALL}')
    dataset = CustomDataset(df_ham['sentence'], tokenizer)
    tr_size = int(0.95 * len(dataset))
    train_dataset, validation_dataset = random_split(dataset, [tr_size, len(dataset) - tr_size])
    train_loader = DataLoader(train_dataset, shuffle = True, batch_size = batch_size)
    validation_loader = DataLoader(validation_dataset, shuffle = False, batch_size = batch_size)
    return train_loader, validation_loader, tokenizer

In [85]:
def finetune(model, train_loader, validation_loader):
    epochs = 5
    learning_rate = 5e-4
    warmup_steps = 1e2
    epsilon = 1e-8
    total_steps = len(train_loader) * epochs
    optimizer = AdamW(model.parameters(), lr = learning_rate, eps = epsilon)
    '''
        1. Gradually increases the learning rate from 0 to the initial maximum value over num_warmup_step
           This prevents abrupt large updates at the start of training when weights are randomly initialized.
        2. After the warm-up steps, the learning rate linearly decreases to 0 over the remaining num_training_steps
    '''
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps = warmup_steps,
                                                num_training_steps = total_steps)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Selects device
    model = model.to(device)
    tbegin = time.time()
    stats = {"training_loss": [], "validation_loss": []}
    for epoch in range(epochs):
        print(f'======== Epoch {1+epoch} / {epochs} ========')
        # ========================================
        #               Training
        # ========================================
        t0 = time.time()
        total_train_loss = 0
        model.train()
        for batch in train_loader:
            token = torch.stack(batch[0], dim = 0).permute(1, 0)
            attn = torch.stack(batch[1], dim = 0).permute(1, 0)
            b_input_ids = token.to(device)
            b_labels = token.to(device)
            b_masks = attn.to(device)
            model.zero_grad()
            outputs = model(b_input_ids, labels = b_labels, \
                            attention_mask = b_masks, token_type_ids = None)
            loss = outputs.loss 
            total_train_loss += loss.item()
            loss.backward()
            optimizer.step()
            scheduler.step()
        avg_loss = total_train_loss / len(train_loader)
        print(f'Time: {round(time.time() - t0, 3)} sec, Avg Loss: {round(avg_loss, 4)}')
        stats["training_loss"].append(avg_loss)
        # ========================================
        #               Validation
        # ========================================
        t0 = time.time()
        total_val_loss = 0
        model.eval()
        for batch in validation_loader:
            token = torch.stack(batch[0], dim = 0).permute(1, 0)
            attn = torch.stack(batch[1], dim = 0).permute(1, 0)
            b_input_ids = token.to(device)
            b_labels = token.to(device)
            b_masks = attn.to(device)
            with torch.no_grad():
                outputs = model(b_input_ids, labels = b_labels, \
                                attention_mask = b_masks, token_type_ids = None)
                total_val_loss += outputs.loss.item()
        avg_loss = total_val_loss / len(validation_loader)
        print(f'Time: {round(time.time() - t0, 3)} sec, Avg Loss: {round(avg_loss, 4)}')
        stats["validation_loss"].append(avg_loss)
    print(f'------ Training Completed ------ {round(time.time() - tbegin), 3} sec.')


In [86]:
train_loader, validation_loader, tokenizer = prepare_spam_data()
configurations = GPT2Config.from_pretrained('gpt2', output_hidden_states = False)
print(f'{Fore.GREEN}[GPT2]:\n{configurations}{Style.RESET_ALL}')
'''
When you load a model using GPT2LMHeadModel.from_pretrained('gpt2', config=configurations), the model weights are not automatically saved to disk. 
Instead, the model is loaded into memory and available as a Python object (model). 
The source of the weights depends on the context:
'''
model = GPT2LMHeadModel.from_pretrained('gpt2', config = configurations)
model.resize_token_embeddings(len(tokenizer))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


[36m[TOKNIZR] Mx model length: 1024. GPT small: 768[0m
[36m[TOKNIZR] Beginning of seq: <|start|>, has token id: 50257[0m
[36m[TOKNIZR] End of seq: <|end|>, has token id: 50258[0m
[36m[TOKNIZR] Padding of seq: <|pad|>, has token id: 50259[0m
[32m[GPT2]:
GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sampl

Embedding(50260, 768)

In [None]:
# ========================================
#               Fine Tune
# ========================================
finetune(model, train_loader, validation_loader)





## Display statistics & model info.

In [None]:
sns.set(style='darkgrid')
stats_df = pd.DataFrame(stats)

plt.figure(figsize=(10, 6))
sns.lineplot(data = stats_df, x = range(stats_df.shape[0]), y = 'training_loss', \
             label = 'training_loss', color = 'blue', marker = 'o')
sns.lineplot(data = stats_df, x = range(stats_df.shape[0]), y = 'validation_loss', \
             label = 'validation_loss', color = 'red', marker = 's')
plt.legend()
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.show()

In [None]:
params = list(model.named_parameters())
print(f'GPT2 model has {len(params)} named parameters.')
print('\n============ Embedding Layer ============')
'''
This is the word embedding table.

Shape:

50257: The size of the vocabulary. This number corresponds to the unique tokens (words, subwords, punctuation, etc.) that the model can recognize.
768: The dimensionality of the embedding space. Each token is represented as a 768-dimensional vector.
Purpose:

Maps token indices (integers) into dense vectors of size 768.
These vectors capture semantic and syntactic information about the tokens.
'''
'''
This is the positional embedding table.

Shape:

1024: The maximum sequence length that the model can handle. The model can process sequences up to 1024 tokens long.
768: The dimensionality of the embedding space, matching the word embedding size.
Purpose:

Provides positional information to the model by assigning a unique embedding to each position in the sequence.
Since Transformers lack inherent sequence ordering (unlike RNNs), positional embeddings are added to the word embeddings to encode the order of tokens.
'''
for p in params[:2]:
    print(f'{p[0]} {str(tuple(p[1].size()))}')
print('\n============ First Transformer ============')
for p in params[2:14]:
    print(f'{p[0]} {str(tuple(p[1].size()))}')
print('\n============ Output Layer ============')
for p in params[-2:]:
    print(f'{p[0]} {str(tuple(p[1].size()))}')

## Saving & loading fine-tuned model.

In [None]:
import os 
output_dir = './save_model/'

# Create output directory if needed
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

In [None]:
!ls -l --block-size=M ./save_model/

## Generate Text

In [8]:
from transformers import GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
import os
import torch

In [6]:
tokenizer = GPT2Tokenizer.from_pretrained(os.path.join('./save_model/'))
configuration = GPT2Config.from_pretrained(os.path.join('./save_model/', 'config.json'), output_hidden_states=False)
model = GPT2LMHeadModel.from_pretrained(os.path.join('./save_model/', 'pytorch_model.bin'), config = configuration)

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.eval()
prompt = "The sun will not shine"
prompt_tokens = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0).to(device)
'''
1. do_sample = True
Purpose: Enables sampling instead of deterministic (greedy or beam search) decoding.
Effect: The model samples tokens from the probability distribution, introducing randomness. This is essential for generating diverse and creative outputs.
Use case: Creative tasks like story generation or poetry, where multiple plausible continuations are possible.

2. top_k = 10
Purpose: Activates Top-K sampling, which limits the model to consider only the top k tokens with the highest probabilities for the next word.
Effect: Prevents low-probability (and often nonsensical) tokens from being sampled.
Value (10): The model will choose the next token from the top 50 most probable tokens.

3. max_length = 128
Purpose: Sets the maximum length of the generated sequence, including the input prompt.
Effect: Ensures that the output sequence does not exceed 300 tokens. This is useful to control the length of the output for applications like summaries, responses, or articles.
Caution: If the model reaches max_length without completing a meaningful sequence, the output might feel truncated unless eos_token_id is defined.

4. top_p = 0.95
Purpose: Activates Top-P (nucleus) sampling. Instead of selecting tokens based solely on top_k, it considers the smallest set of tokens whose cumulative probability exceeds top_p.
Effect: Combines flexibility and control, ensuring the model samples from high-probability tokens while allowing more diversity than strict Top-K sampling.
Value (0.95): Tokens are sampled until their cumulative probability is 95%, allowing for diversity without overly random choices.

5. num_return_sequences = 3
Purpose: Specifies the number of different sequences to generate for the same input.
Effect: Produces multiple outputs, which can be useful for selecting the best one or exploring different plausible continuations.
Value (3): Generates 3 distinct sequences.
'''
sample_outputs = model.generate(prompt_tokens,\
                                do_sample = True,\
                                top_k = 10,\
                                max_length = 128,\
                                top_p = 0.95,\
                                num_return_sequences = 3)
for i, sample_output in enumerate(sample_outputs):
    print(f'==== [{i}] ====:\n {tokenizer.decode(sample_output, skip_special_tokens=True)}')

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


==== [0] ====:
 The sun will not shine through this window, and it will never be bright enough for me, and for everyone who watches me, I will be lost forever.

The sun will not shine through this window, and it will never be bright enough for me, and for everyone who watches me, I will be lost forever. The world will never stop.

The world will never stop. I have been told that I have been chosen, that I can not be saved.

I have been told that I can not be saved. I am told that it is impossible for me to save this world.

I am
==== [1] ====:
 The sun will not shine on this land, and there will be no moon to look at." (L. G. Wells, On the Origin of the Bible, p. 394)

"The earth is in a state of decay, and the sky is falling down, and there will be no moon to look at. It will be like a cloud in a lake, which will fall, and the sun will not shine on it." (L. G. Wells, On the Origin of the Bible, p. 398)

"The earth will be in a state of decay, and there will be no moon to
==== [2] ====