# GPT Finetuning - PROJECT

In [None]:
# Package install - restart your kernel after running this!
#%pip install torch transformers adjustText gdown plotly

In [None]:
import torch
import transformers
# Report the installed library versions, one per line
print(torch.__version__, transformers.__version__, sep="\n")

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Set the seed for PyTorch (controls randomness for reproducibility)
seed = 1
torch.manual_seed(seed)

# Load the pretrained model and the tokenizer of the SAME checkpoint.
# The rest of the notebook finetunes 'distilgpt2' and tokenizes with the
# 'distilgpt2' tokenizer, so this cell uses that checkpoint for both as well.
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
model = GPT2LMHeadModel.from_pretrained('distilgpt2')

# Encode input context to get token IDs
input_text = "PASTE A POST FROM YOUR SUBREDDIT HERE"
input_ids = tokenizer.encode(input_text, return_tensors='pt')

# Generate text using the model.
# GPT-2 has no padding token of its own, so the end-of-sequence token is passed
# explicitly as pad_token_id (the same choice generate_text makes later in this
# notebook); without it, generate() falls back to EOS with a warning.
output = model.generate(input_ids,
                        do_sample=True,
                        max_length=150,  # total length in tokens, prompt included
                        repetition_penalty=1.1,
                        temperature=.5,
                        top_k=30,
                        top_p=0.95,
                        pad_token_id=tokenizer.eos_token_id
                        )

# Decode the generated IDs to text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(generated_text)


🔔 **Question**: Does this output make sense? 

In [None]:
# Load the pretrained model and pull out its token-embedding matrix as a NumPy array
model = GPT2LMHeadModel.from_pretrained('distilgpt2')
embeddings = model.transformer.wte.weight.detach().numpy()

# Use the tokenizer of the same checkpoint as the model
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')

# CHOOSE WORDS THAT REPRESENT YOUR SUBREDDIT DISCOURSE, add however many you need
words = ["word1", "word2", "word3", "word4"]

# Look up one embedding row per word. A word can be split into several tokens;
# only the FIRST token's embedding is used, so say so instead of silently
# plotting a word fragment under the full word's label.
word_indices = []
for word in words:
    token_ids = tokenizer.encode(word)
    if not token_ids:
        raise ValueError(f"{word!r} produced no tokens; remove it from `words`.")
    if len(token_ids) > 1:
        print(f"Note: {word!r} is split into {len(token_ids)} tokens; only the first one is used.")
    word_indices.append(token_ids[0])
selected_embeddings = embeddings[word_indices]

In [None]:
# Inspect the selected embedding rows (one row per entry in `words`)
selected_embeddings

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Using PCA: fit on the selected embeddings and project them onto 2 components for plotting
pca = PCA(n_components=2)
reduced_embeddings_pca = pca.fit_transform(selected_embeddings)

In [None]:
import os

import plotly.express as px
import pandas as pd

# Create a DataFrame from the PCA embeddings and labels
df_plot = pd.DataFrame({
    'x': reduced_embeddings_pca[:, 0],
    'y': reduced_embeddings_pca[:, 1],
    'label': words
})

# Create interactive scatter plot with hover labels
fig = px.scatter(
    df_plot,
    x='x',
    y='y',
    text='label',  # show text labels directly (optional)
    title='PCA of GPT-2 Embeddings',
)

# Make it look nicer
fig.update_traces(textposition='top center', marker=dict(size=6, opacity=0.7))
fig.update_layout(
    xaxis_title='PCA 1',
    yaxis_title='PCA 2',
    hovermode='closest',
    showlegend=False
)

# write_html does not create missing folders, so make sure the target exists
os.makedirs("outputs_project", exist_ok=True)
fig.write_html("outputs_project/gpt2_embeddings_pca.html")

fig.show()

In [None]:
# Keep the pretrained-model projection and its labels under separate names so
# they survive when `reduced_embeddings_pca` and `words` are reassigned later
reduced_embeddings_pca_original, words_original = reduced_embeddings_pca, words

<a id="ft"></a>

# Finetuning GPT-2

In [None]:
import pandas as pd
import numpy as np

# Load the posts to finetune on (replace YOUR_DATA.csv with your own file)
df = pd.read_csv('../../data/YOUR_DATA.csv')

In [None]:
# Collect the post bodies as a list of strings for the tokenizer.
# Posts without a body are read by pandas as NaN (a float), which the tokenizer
# cannot handle, so those rows are dropped and everything else is cast to str.
texts = df['selftext'].dropna().astype(str).tolist()

In [None]:
# Preview the first few posts only; showing the whole list floods the notebook output
texts[:5]

## Commence Tokenization


In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments

# GPT-2 ships without a padding token, so the end-of-sequence token doubles as one
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
tokenizer.pad_token = tokenizer.eos_token

# Tokenize all texts in one call: truncate at 512 tokens, pad the batch to a
# common length, and return PyTorch tensors
encodings = tokenizer(
    texts,
    max_length=512,
    truncation=True,
    padding=True,
    return_tensors="pt",
)

In [None]:
import torch
from torch.utils.data import Dataset

class TextDataset(Dataset):
    """Dataset over pre-tokenized texts for causal language-model finetuning.

    Parameters:
    - encodings: Mapping from field name (e.g. "input_ids", "attention_mask")
      to a batch of sequences, such as the object returned by a tokenizer call.
      A plain dict works as well.

    Each item is a dict holding one row of every field plus a "labels" entry.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # Key access works for tokenizer output and for plain dicts alike
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        item = {}
        for key, val in self.encodings.items():
            row = val[idx]
            # Copy the row so callers never alias the stored encodings.
            # torch.tensor(existing_tensor) emits a copy-construct warning,
            # hence the explicit clone for tensors.
            item[key] = row.clone().detach() if torch.is_tensor(row) else torch.tensor(row)

        # The labels are a copy of input_ids; the model shifts them by one
        # position internally when it computes the language-modeling loss.
        labels = item["input_ids"].clone()
        if "attention_mask" in item:
            # Padded positions get -100, the index the loss ignores. Otherwise
            # the model is also trained to predict the padding (the pad token
            # is the EOS token here, so it cannot be told apart by id).
            labels[item["attention_mask"] == 0] = -100
        item["labels"] = labels
        return item

# Initialize the dataset: wrap the tokenized texts so the Trainer can index them
train_dataset = TextDataset(encodings)

## Commence Finetuning

⚠️ **Warning:** Even though we are using a small model, the following operation will take a *long* time on a consumer-grade PC (for reference: around 4 hours on an Apple M2 Pro with 16GB memory). Consider running this on DataHub or Google Colab (check bCourses for a link).

In [None]:
# Start from the pretrained distilgpt2 weights
model = GPT2LMHeadModel.from_pretrained('distilgpt2')

# Training configuration: 3 passes over the data, 4 examples per device per
# step; checkpoints go to ../../results and logs to ../../logs
training_args = TrainingArguments(
    output_dir='../../results',
    logging_dir='../../logs',
    per_device_train_batch_size=4,
    num_train_epochs=3,
)

# The Trainer ties together the model, the configuration and the training data
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
)

# Run the finetuning loop
trainer.train()

If you ran the previous cell, make sure to save the finetuned model and its tokenizer:

In [None]:
# Local folders that receive the finetuned weights and the tokenizer files
model_save_path, tokenizer_save_path = 'ft_model', 'ft_tokenizer'

# Write the model first, then the tokenizer, so both can be reloaded later
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(tokenizer_save_path)


<a id="int"></a>
# Interpreting Model Output


In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the tokenizer and model from the local 'ft_tokenizer' and 'ft_model' folders
ft_tokenizer = GPT2Tokenizer.from_pretrained('ft_tokenizer')
ft_model = GPT2LMHeadModel.from_pretrained('ft_model')

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Set the seed for PyTorch (controls randomness for reproducibility).
# Seeding again here keeps the sampling below repeatable when this cell is re-run.
seed = 1
torch.manual_seed(seed)

def generate_text(model, prompt, do_sample=True, max_length=50, temperature=1, top_k=50, top_p=0.95, repetition_penalty=1.1, tokenizer=None):
    """
    Generates text based on a given prompt using the specified model.

    Parameters:
    - model: The model to use for text generation (e.g. the finetuned model).
    - prompt: The initial text to start generating from.
    - do_sample: Whether to sample the next token instead of decoding greedily.
    - max_length: Maximum length of the generated text, passed to `model.generate`.
    - temperature: Sampling temperature for generating text.
    - top_k: The number of highest probability vocabulary tokens to keep for top-k filtering.
    - top_p: Nucleus sampling's cumulative probability cutoff to keep for top-p filtering.
    - repetition_penalty: Penalty applied to repeated tokens, passed to `model.generate`.
    - tokenizer: Tokenizer used to encode the prompt and decode the output.
      Defaults to the notebook-level `ft_tokenizer`.

    Returns:
    - generated_text: The generated text as a string.
    """
    # Fall back to the finetuned tokenizer loaded earlier in the notebook
    if tokenizer is None:
        tokenizer = ft_tokenizer

    # Encode the prompt text to tensor
    input_ids = tokenizer.encode(prompt, return_tensors='pt')

    # Generate a sequence of tokens following the prompt.
    # Use the `model` argument rather than a global, so the same helper works
    # for any model that is passed in.
    output_ids = model.generate(input_ids, max_length=max_length,
                                temperature=temperature,
                                do_sample=do_sample,
                                top_k=top_k,
                                top_p=top_p,
                                repetition_penalty=repetition_penalty,
                                pad_token_id=tokenizer.eos_token_id)

    # Decode the generated tokens to a string
    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    return generated_text

# Prompt to generate text from - play around with this!
prompt = "PASTE A TEXT FROM YOUR SUBREDDIT"

# Sample one continuation of up to 150 tokens and print it
generated_text = generate_text(model=ft_model, prompt=prompt, max_length=150)
print("Generated text from finetuned model:", generated_text, '\n')


## Visualizing the Finetuned Model

In [None]:
# Pull the token-embedding matrix out of the finetuned model as a NumPy array
embeddings = ft_model.transformer.wte.weight.detach().numpy()

# CHOOSE THE SAME WORDS THAT REPRESENT YOUR SUBREDDIT DISCOURSE, add however many you need
words = ["word1", "word2", "word3", "word4"]

# Look up one embedding row per word. A word can be split into several tokens;
# only the FIRST token's embedding is used, so say so instead of silently
# plotting a word fragment under the full word's label.
word_indices = []
for word in words:
    token_ids = ft_tokenizer.encode(word)
    if not token_ids:
        raise ValueError(f"{word!r} produced no tokens; remove it from `words`.")
    if len(token_ids) > 1:
        print(f"Note: {word!r} is split into {len(token_ids)} tokens; only the first one is used.")
    word_indices.append(token_ids[0])
selected_embeddings = embeddings[word_indices]

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Using PCA: project the finetuned embeddings onto 2 components.
# This is a fresh fit, independent of the PCA fitted on the pretrained
# embeddings earlier, so the axes of the two projections are not the same.
pca = PCA(n_components=2)
reduced_embeddings_pca = pca.fit_transform(selected_embeddings)

In [None]:
import os

import pandas as pd
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Create DataFrames for both
df_original = pd.DataFrame({
    'x': reduced_embeddings_pca_original[:, 0],
    'y': reduced_embeddings_pca_original[:, 1],
    'label': words_original
})

df_finetuned = pd.DataFrame({
    'x': reduced_embeddings_pca[:, 0],
    'y': reduced_embeddings_pca[:, 1],
    'label': words
})


def _embedding_trace(frame):
    """Build the labelled scatter trace for one projection (columns x, y, label)."""
    return go.Scatter(
        x=frame['x'], y=frame['y'],
        mode='markers+text',
        text=frame['label'],
        textposition='top center',
        marker=dict(size=6, opacity=0.8),
        showlegend=False
    )


# Create subplots: pretrained model on the left, finetuned model on the right
fig = make_subplots(rows=1, cols=2, subplot_titles=("Original GPT-2", "Finetuned GPT-2"))
fig.add_trace(_embedding_trace(df_original), row=1, col=1)
fig.add_trace(_embedding_trace(df_finetuned), row=1, col=2)

fig.update_layout(
    title_text='PCA of Token Embeddings: Original vs Finetuned GPT-2',
    height=500, width=1000
)

# write_html does not create missing folders, so make sure the target exists
os.makedirs("outputs_project", exist_ok=True)
fig.write_html("outputs_project/gpt2_comp_embeddings_pca.html")

fig.show()

# Create Posts Using Finetuned Model

In [None]:
import pandas as pd
import random
from transformers import GPT2Tokenizer, GPT2LMHeadModel, pipeline

# Load your data
df = pd.read_csv('../../data/YOUR_DATA.csv')

# Randomly select 10 entries (or all of them if the file has fewer than 10 rows).
# reset_index() keeps the original index as a column; posts without a body are
# read as NaN, so they are replaced by '' to avoid the text "nan" in the post.
sample_df = (
    df.sample(n=min(10, len(df)), random_state=1)
    .reset_index()
    .fillna({'selftext': ''})
)

original_texts = [f"{t}\n\n{s}" for t, s in zip(sample_df['title'], sample_df['selftext'])]
titles = sample_df['title'].tolist()

# Initialize the generation pipeline
generator = pipeline('text-generation', model=ft_model, tokenizer=ft_tokenizer, device=-1)  # CPU usage

# GPT-2 cannot generate past the size of its position-embedding table
max_positions = ft_model.config.n_positions

# Prepare to generate texts
generated_texts = []
for title, original_text in zip(titles, original_texts):
    # Calculate the length of the original post in tokens
    target_length = len(ft_tokenizer.encode(original_text))

    # Generate a new post of the same length starting from the title.
    # max_length counts the prompt as well, so it must leave room for at least
    # one new token, and it must not exceed the model's context window.
    prompt = title
    prompt_length = len(ft_tokenizer.encode(prompt))
    max_length = min(max(target_length, prompt_length + 1), max_positions)
    generated = generator(prompt, max_length=max_length, num_return_sequences=1)[0]['generated_text']
    generated_texts.append(generated)

Let's save the original and generated posts in a new DataFrame so we can easily compare them.

In [None]:
# Side-by-side table with one row per sampled post
df = pd.DataFrame(dict(
    original_text=original_texts,
    generated_text=generated_texts,
    title=titles,
))

In [None]:
# Display the table of original and generated posts
df

## Back to TF-IDF

We can use the TF-IDF algorithm to check the similarity between original texts and the ones we generated!

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Stack the originals first and the generated texts second, so that a single
# vectorizer sees both groups
texts = [*df['original_text'], *df['generated_text']]

# Learn the TF-IDF weights on all texts at once and vectorize them
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(texts)

# Rows before len(df) are originals, rows from len(df) onwards are generated;
# compare every original against every generated text
similarity_matrix = cosine_similarity(tfidf_matrix[:len(df)], tfidf_matrix[len(df):])

# The diagonal pairs each original with the text generated from its own title
for i, similarity in enumerate(similarity_matrix.diagonal()):
    print(f"Text {i+1} Similarity between original and generated: {similarity:.4f}")


## Back to Word Embeddings

In [None]:
import spacy
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Load the medium model, which ships with word vectors. The small model
# ("en_core_web_sm") has no word-vector table, so it is only used as a
# fallback when the medium model is not installed.
try:
    nlp = spacy.load("en_core_web_md")
except OSError:
    print('spaCy model "en_core_web_md" is not installed; '
          'falling back to "en_core_web_sm", which has no word vectors.')
    nlp = spacy.load("en_core_web_sm")

# Prefix every text with "[Post <original row index>]" so the points can be
# identified in the plot's hover text
original_texts = [f"[Post {i}] {title}\n\n{body}"
                  for i, title, body in zip(sample_df['index'], sample_df['title'], sample_df['selftext'])]
# generated_texts is rebuilt from itself, so only add the prefix when it is not
# there yet; otherwise re-running this cell would stack "[Post i]" labels
generated_texts = [gen if gen.startswith(f"[Post {i}] ") else f"[Post {i}] {gen}"
                   for i, gen in zip(sample_df['index'], generated_texts)]

def get_average_embedding(text, nlp_model):
    """Return the mean vector of the non-stop-word tokens of `text`.

    Parameters:
    - text: The text to embed.
    - nlp_model: Callable that turns `text` into an iterable of tokens exposing
      `is_stop`, `has_vector` and `vector`; it must also provide
      `vocab.vectors_length`.

    Returns:
    - The element-wise mean of the kept token vectors, or a zero vector of
      length `nlp_model.vocab.vectors_length` when no token qualifies.
    """
    kept_vectors = []
    for token in nlp_model(text):
        # Skip stop words and tokens that carry no vector
        if token.is_stop or not token.has_vector:
            continue
        kept_vectors.append(token.vector)

    if not kept_vectors:
        return np.zeros((nlp_model.vocab.vectors_length,))
    return np.mean(kept_vectors, axis=0)

# Compute average embeddings for each set of texts (one vector per text)
original_embeddings = np.array([get_average_embedding(text, nlp) for text in original_texts])
generated_embeddings = np.array([get_average_embedding(text, nlp) for text in generated_texts])

# Perform PCA to reduce dimensionality for visualization.
# The projection is fitted on the original posts only; the generated posts are
# then mapped with that same fitted projection, so both sets share one pair of axes.
pca = PCA(n_components=2)
original_pca = pca.fit_transform(original_embeddings)
generated_pca = pca.transform(generated_embeddings)

Plot it with `bokeh`:

In [None]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.io import output_file
import numpy as np

output_notebook()  # Render Bokeh plots inline in the Jupyter notebook

# Optional: shorten long posts so the hover box stays readable
def truncate_text(text, maxlen=500):
    """Return `text` unchanged if it has at most `maxlen` characters,
    otherwise its first `maxlen` characters followed by "..."."""
    if len(text) > maxlen:
        return text[:maxlen] + "..."
    return text

import os

# Truncate if desired
original_texts_display = [truncate_text(t) for t in original_texts]
generated_texts_display = [truncate_text(t) for t in generated_texts]

# Create ColumnDataSources: PCA coordinates plus the text shown on hover
source_original = ColumnDataSource(data=dict(
    x=original_pca[:, 0],
    y=original_pca[:, 1],
    text=original_texts_display,
))

source_generated = ColumnDataSource(data=dict(
    x=generated_pca[:, 0],
    y=generated_pca[:, 1],
    text=generated_texts_display,
))

# Create plot
p = figure(
    title="Original vs Generated Embeddings (PCA)",
    width=1000,
    height=600,
    tools="pan,wheel_zoom,reset,save",
    toolbar_location='right'
)

# Add points. scatter() draws circle markers by default; circle() with a
# screen-unit `size` is deprecated in recent Bokeh releases.
p.scatter('x', 'y', size=10, source=source_original, color='blue', alpha=0.5, legend_label='Original')
p.scatter('x', 'y', size=10, source=source_generated, color='red', alpha=0.5, legend_label='Generated')

# Add hover tool.
# NOTE(review): `{safe}` inserts the text as raw HTML without escaping. The
# posts come from external data, so consider dropping `{safe}` before sharing
# the exported HTML file.
hover = HoverTool(tooltips="""
    <div style="width:400px; white-space:normal;">
        <strong>Post:</strong><br>@text{safe}
    </div>
""")
p.add_tools(hover)

# Final layout settings
p.legend.location = "top_left"
p.xaxis.axis_label = 'PCA Component 1'
p.yaxis.axis_label = 'PCA Component 2'

# Show or export. The output folder must exist before Bokeh writes the file.
os.makedirs("outputs_project", exist_ok=True)
output_file("outputs_project/original_vs_generated_embeddings_pca.html")
show(p)

## Back to Close Reading

In [None]:
# Row label (in the comparison DataFrame) of the post to read closely
index = 2
# Use .loc[row, column] throughout instead of chained indexing
print(f"Title: {df.loc[index, 'title']}")
print("\n")
print(f"Original Text: {df.loc[index, 'original_text']}")
print("\n")
print(f"Generated Text: {df.loc[index, 'generated_text']}")