# GPT Finetuning - PROJECT

In [None]:
import torch
import transformers
# Report the installed torch and transformers versions, one per line
print(torch.__version__, transformers.__version__, sep="\n")

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Seed PyTorch's global RNG so the sampled continuation below is repeatable
seed = 1
torch.manual_seed(seed)

# Load the pretrained model together with the tokenizer of the same checkpoint
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
model = GPT2LMHeadModel.from_pretrained('distilgpt2')

# Encode input context to get token IDs
input_text = "PASTE A TITLE OF A POST FROM YOUR SUBREDDIT"
input_ids = tokenizer.encode(input_text, return_tensors='pt')

# Generate text using the model
output = model.generate(
    input_ids,
    # A single, unpadded prompt: every position is a real token
    attention_mask=torch.ones_like(input_ids),
    do_sample=True,
    max_length=150,
    repetition_penalty=1.1,
    temperature=.5,
    top_k=30,
    top_p=0.95,
    # Pass the padding ID explicitly (EOS, as used for padding elsewhere in
    # this notebook) instead of relying on generate() to pick a fallback
    pad_token_id=tokenizer.eos_token_id,
)

# Decode the generated IDs to text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(generated_text)


🔔 **Question**: Does this output make sense? 

In [None]:
# Pull the token-embedding matrix (`wte`) out of the pretrained model as a NumPy array
model = GPT2LMHeadModel.from_pretrained('distilgpt2')
embeddings = model.transformer.wte.weight.detach().numpy()

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# add as many words as needed
words = ["word1", "word2", "word3", "word4"]

# Each word is represented by the embedding of its FIRST token only.
# Say so explicitly when a word is split into several tokens, because the
# plotted point then belongs to that first piece rather than the whole word.
word_indices = []
for word in words:
    token_ids = tokenizer.encode(word)
    if len(token_ids) > 1:
        print(f"Note: {word!r} is split into {len(token_ids)} tokens; "
              f"only the first ({tokenizer.decode(token_ids[:1])!r}) is used.")
    word_indices.append(token_ids[0])
selected_embeddings = embeddings[word_indices]

In [None]:
# Display the raw embedding vectors picked above (one row per entry in `words`)
selected_embeddings

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Using PCA: project the selected embeddings onto 2 principal components
# so they can be drawn on a 2-D scatter plot
pca = PCA(n_components=2)
reduced_embeddings_pca = pca.fit_transform(selected_embeddings)

In [None]:
import matplotlib.pyplot as plt
from adjustText import adjust_text

# Function to plot
def plot_embeddings(embeddings, labels, title='PCA of GPT-2 Embeddings', save_path=None):
    """
    Scatter-plot 2-D embeddings and annotate every point with its label.

    Parameters:
    - embeddings: 2-D array; column 0 is used as x and column 1 as y.
    - labels: One text label per row of `embeddings`.
    - title: Title displayed above the plot.
    - save_path: Optional file path. When given, the finished figure is
      written there before it is shown.
    """
    plt.figure(figsize=(8, 6))
    plt.scatter(embeddings[:, 0], embeddings[:, 1])  # scatter plot of embeddings

    # Generate text annotations
    texts = []
    for i, label in enumerate(labels):
        x, y = embeddings[i]
        texts.append(plt.text(x, y, label, ha='right', va='bottom', fontsize=9))

    # Use adjust_text to avoid overlapping
    adjust_text(texts, x=embeddings[:, 0], y=embeddings[:, 1], arrowprops=dict(arrowstyle='->', color='red', lw=0.5))

    plt.title(title)
    plt.grid(True)

    # Save while the figure is still current, i.e. before plt.show();
    # saving outside this function would write an empty figure.
    if save_path is not None:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')

    plt.show()

# Example usage (assuming 'reduced_embeddings_pca' and 'words' are defined);
# the plot is also saved to 'gpt2_embeddings_pca.png'
plot_embeddings(reduced_embeddings_pca, words, title='PCA of GPT-2 Embeddings',
                save_path='gpt2_embeddings_pca.png')


<a id="ft"></a>

# Finetuning GPT-2

### ⚠️ Warning
Even though we are only training a small model, running the following code blocks will take a long time on a consumer-grade PC.

In [None]:
import pandas as pd
import numpy as np

# Load the dataset (replace YOUR_DATA.csv with your own file).
# Later cells read its 'selftext' and 'title' columns.
df = pd.read_csv('../../data/YOUR_DATA.csv')

In [None]:
# Keep only rows that actually have body text: missing values are NaN
# (a float), which the tokenizer cannot process
texts = df['selftext'].dropna().astype(str).tolist()

In [None]:
# Preview the first few texts instead of dumping the whole list into the output
texts[:5]

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments

# distilgpt2 tokenizer; its end-of-sequence token is reused as the padding token
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
tokenizer.pad_token = tokenizer.eos_token

# Turn all texts into padded PyTorch tensors, cutting each one off at 512 tokens
encodings = tokenizer(
    texts,
    max_length=512,
    truncation=True,
    padding=True,
    return_tensors="pt",
)

In [None]:
import torch
from torch.utils.data import Dataset

class TextDataset(Dataset):
    """
    Dataset over pre-tokenized texts for causal language-model finetuning.

    Each item is a dict with one row of every field in `encodings` plus a
    `labels` entry. `labels` is a copy of `input_ids` in which padding
    positions (attention_mask == 0) are set to -100.
    """

    def __init__(self, encodings):
        # `encodings` must provide `.input_ids` and `.items()`
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        # as_tensor accepts both existing tensors and plain lists; the clone
        # keeps each item independent of the stored encodings
        item = {
            key: torch.as_tensor(val[idx]).detach().clone()
            for key, val in self.encodings.items()
        }

        # Labels are a plain copy of input_ids: no manual shift is done here
        # (GPT2LMHeadModel shifts them internally when computing the loss).
        labels = item["input_ids"].clone()

        # The pad token is the EOS token, so padding cannot be recognised by
        # token ID; use the attention mask. -100 is the label value the
        # language-model loss ignores.
        attention_mask = item.get("attention_mask")
        if attention_mask is not None:
            labels[attention_mask == 0] = -100

        item["labels"] = labels
        return item

# Wrap the tokenized texts in the dataset that is passed to the Trainer as `train_dataset`
train_dataset = TextDataset(encodings)

## Commence Finetuning

In [None]:
# Start from the pretrained distilgpt2 weights
model = GPT2LMHeadModel.from_pretrained('distilgpt2')

# Training configuration: 3 epochs, batches of 4 per device,
# results and logs written two directories above the notebook
training_args = TrainingArguments(
    output_dir='../../results',
    logging_dir='../../logs',
    num_train_epochs=3,
    per_device_train_batch_size=4,
)

# Bundle model, configuration and data
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)

# Run the finetuning loop
trainer.train()

If you ran the previous code, make sure to save the finetuned model and its tokenizer:

In [None]:
# Target directories, relative to the notebook's working directory
model_save_path = 'my_ft_model'
tokenizer_save_path = 'my_ft_tokenizer'

# Write the model first, then the tokenizer, each into its own directory
model.save_pretrained(save_directory=model_save_path)
tokenizer.save_pretrained(save_directory=tokenizer_save_path)


<a id="int"></a>
# Interpreting Model Output

Let's have a look at the ways DistilGPT2's behavior has been altered due to the finetuning on r/aita.

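I uploaded my finetuned model to [HuggingFace](https://huggingface.co/tvannuenen/finetuned_model). The cell below loads the model and tokenizer you saved locally in the previous step; to use the uploaded model instead, pass the Hub ID `tvannuenen/finetuned_model` to `from_pretrained`: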

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the finetuned tokenizer and model. The names given here match the
# local directories used by the save cell ('my_ft_tokenizer' / 'my_ft_model').
ft_tokenizer = GPT2Tokenizer.from_pretrained('my_ft_tokenizer')
ft_model = GPT2LMHeadModel.from_pretrained('my_ft_model')

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Set the seed for PyTorch's global random number generator so that the
# sampling-based generation below gives repeatable results
seed = 1
torch.manual_seed(seed)

def generate_text(model, prompt, do_sample=True, max_length=50, temperature=1, top_k=50, top_p=0.95, repetition_penalty=1.1, tokenizer=None):
    """
    Generates text based on a given prompt using the specified model.

    Parameters:
    - model: The model whose `generate` method produces the continuation.
    - prompt: The initial text to start generating from.
    - do_sample: Whether to sample the next token instead of decoding greedily.
    - max_length: Maximum length of the generated text.
    - temperature: Sampling temperature for generating text.
    - top_k: The number of highest probability vocabulary tokens to keep for top-k filtering.
    - top_p: Nucleus sampling's cumulative probability cutoff to keep for top-p filtering.
    - repetition_penalty: Penalty passed to `generate` to discourage repeated tokens.
    - tokenizer: Tokenizer used to encode the prompt and decode the output.
      Defaults to the notebook-level `ft_tokenizer`.

    Returns:
    - generated_text: The generated text as a string.
    """
    # Fall back to the notebook's finetuned tokenizer when none is supplied
    if tokenizer is None:
        tokenizer = ft_tokenizer

    # Encode the prompt text to tensor
    input_ids = tokenizer.encode(prompt, return_tensors='pt')

    # Generate with the model that was passed in (not a global one)
    output_ids = model.generate(
        input_ids,
        # A single, unpadded prompt: every position is a real token
        attention_mask=torch.ones_like(input_ids),
        max_length=max_length,
        temperature=temperature,
        do_sample=do_sample,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode the generated tokens to a string
    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    return generated_text

# Starting text for the generation - swap in a title from your own data
prompt = "PASTE A TITLE FROM YOUR DATA"

# Let the finetuned model continue the prompt (max_length=150) and show the result
generated_text = generate_text(ft_model, prompt, max_length=150)
print(f"Generated text from finetuned model: {generated_text} \n")


## Visualizing the Finetuned Model

In [None]:
# Pull the token-embedding matrix (`wte`) out of the finetuned model as a NumPy array
embeddings = ft_model.transformer.wte.weight.detach().numpy()

# Use the same words you did on the pretrained model
words = ["word1", "word2", "word3", "word4", "word5"]

# Each word is represented by the embedding of its FIRST token only.
# Say so explicitly when a word is split into several tokens, because the
# plotted point then belongs to that first piece rather than the whole word.
word_indices = []
for word in words:
    token_ids = ft_tokenizer.encode(word)
    if len(token_ids) > 1:
        print(f"Note: {word!r} is split into {len(token_ids)} tokens; "
              f"only the first ({ft_tokenizer.decode(token_ids[:1])!r}) is used.")
    word_indices.append(token_ids[0])
selected_embeddings = embeddings[word_indices]

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Using PCA: project the selected finetuned-model embeddings onto
# 2 principal components so they can be drawn on a 2-D scatter plot
pca = PCA(n_components=2)
reduced_embeddings_pca = pca.fit_transform(selected_embeddings)

In [None]:
import matplotlib.pyplot as plt
from adjustText import adjust_text

# Function to plot
def plot_embeddings(embeddings, labels, title='PCA of GPT-2 Embeddings',
                    save_path='finetuned_gpt2_embeddings_pca.png'):
    """
    Scatter-plot 2-D embeddings, annotate every point, and save the figure.

    Parameters:
    - embeddings: 2-D array; column 0 is used as x and column 1 as y.
    - labels: One text label per row of `embeddings`.
    - title: Title displayed above the plot.
    - save_path: File the figure is written to before it is shown.
      Pass None to skip saving.
    """
    plt.figure(figsize=(8, 6))
    plt.scatter(embeddings[:, 0], embeddings[:, 1])  # scatter plot of embeddings

    # Generate text annotations
    texts = []
    for i, label in enumerate(labels):
        x, y = embeddings[i]
        texts.append(plt.text(x, y, label, ha='right', va='bottom', fontsize=9))

    # Use adjust_text to avoid overlapping
    adjust_text(texts, x=embeddings[:, 0], y=embeddings[:, 1], arrowprops=dict(arrowstyle='->', color='red', lw=0.5))

    plt.title(title)
    plt.grid(True)

    # Save while the figure is still current, i.e. before plt.show()
    if save_path is not None:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')

    plt.show()


# Example usage (assuming 'reduced_embeddings_pca' and 'words' are defined).
# The title names the finetuned model so this figure is distinguishable
# from a plot of the pretrained embeddings.
plot_embeddings(reduced_embeddings_pca, words, title='PCA of Finetuned GPT-2 Embeddings')


# Create Posts Using Finetuned Model

In [None]:
import pandas as pd
import random
from transformers import GPT2Tokenizer, GPT2LMHeadModel, pipeline

# Load your data
df = pd.read_csv('../../data/YOUR_DATA.csv')

# Only rows with both a title and body text can be used: missing values are
# NaN (a float), which the tokenizer cannot encode
posts_df = df.dropna(subset=['title', 'selftext'])  # Adjust column names if different

# Randomly select up to 10 entries (fewer if the data is smaller);
# the fixed random_state makes the selection reproducible
sample_df = posts_df.sample(n=min(10, len(posts_df)), random_state=1)

original_texts = sample_df['selftext'].astype(str).tolist()
titles = sample_df['title'].astype(str).tolist()

# Initialize the generation pipeline
generator = pipeline('text-generation', model=ft_model, tokenizer=ft_tokenizer, device=-1)  # CPU usage

# Upper bound on prompt + continuation, taken from the GPT-2 config
max_context = ft_model.config.n_positions

# Prepare to generate texts
generated_texts = []
for title, original_text in zip(titles, original_texts):
    # Calculate the length of the original post in tokens
    target_length = len(ft_tokenizer.encode(original_text))

    # Generate a new post of (about) the same length starting from the title
    prompt = title
    prompt_length = len(ft_tokenizer.encode(prompt))

    # max_length counts the prompt as well, so keep it above the prompt length
    # (at least one new token) and within the model's context window
    max_length = min(max(target_length, prompt_length + 1), max_context)

    generated = generator(
        prompt,
        max_length=max_length,
        num_return_sequences=1,
        pad_token_id=ft_tokenizer.eos_token_id,
    )[0]['generated_text']
    generated_texts.append(generated)