In [1]:
# Install required libraries
#!pip install torch
#!pip install transformers
#!pip install pandas

In [4]:
import pandas as pd

# Load the dataset with latin1 encoding
df = pd.read_csv('hp_script.csv', encoding='latin1')

# Check the columns and sample data
print(df.columns)
print(df.head())


Index(['ID_number', 'scene', 'character_name', 'dialogue'], dtype='object')
   ID_number  scene      character_name  \
0          1      1    Albus Dumbledore   
1          2      1  Minerva McGonagall   
2          3      1    Albus Dumbledore   
3          4      1  Minerva McGonagall   
4          5      1    Albus Dumbledore   

                                            dialogue  
0  I should have known that you would be here, Pr...  
1  Good evening, Professor Dumbledore. Are the ru...  
2   I'm afraid so, Professor. The good, and the bad.  
3                                       And the boy?  
4                            Hagrid is bringing him.  


In [5]:
# Extract the dialogue column
dialogues = df['dialogue'].dropna().tolist()

# Save to a text file, one dialogue per line
with open('hp_dialogues.txt', 'w', encoding='utf-8') as f:
    for dialogue in dialogues:
        f.write(dialogue.strip() + '\n')

print(f"Total dialogues saved: {len(dialogues)}")



Total dialogues saved: 793


In [6]:
from transformers import GPT2Tokenizer

# Load GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # Avoid padding issues

# Load the prepared text file
with open('hp_dialogues.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()

# Tokenize dialogues
tokens = tokenizer(lines, return_tensors='pt', padding=True, truncation=True, max_length=128)

print(f"Sample tokenized input IDs shape: {tokens['input_ids'].shape}")



W0911 01:25:16.085000 13452 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


Sample tokenized input IDs shape: torch.Size([793, 128])


In [10]:
import pandas as pd

# Load CSV with a more forgiving encoding
df = pd.read_csv('hp_script.csv', encoding='ISO-8859-1')

# Check the columns
print(df.columns)

# Extract dialogues, drop NaN values
dialogues = df['dialogue'].dropna().tolist()

# Save as plain text file
with open('movie_dialogs.txt', 'w', encoding='utf-8') as f:
    for line in dialogues:
        f.write(line.strip() + '\n')

print(f"Total dialogues saved: {len(dialogues)}")


Index(['ID_number', 'scene', 'character_name', 'dialogue'], dtype='object')
Total dialogues saved: 793


In [11]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import os

# Set device to CPU
device = torch.device('cpu')

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # Avoid padding issues

model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)

# Load preprocessed dialogues
with open('movie_dialogs.txt', 'r', encoding='utf-8') as f:
    dialogues = f.readlines()

# Dataset class
class DialogueDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=128):
        self.examples = [tokenizer(text.strip(), max_length=max_length, truncation=True, padding='max_length') for text in texts]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val) for key, val in self.examples[idx].items()}
        return item

# Create dataset and dataloader
dataset = DialogueDataset(dialogues, tokenizer)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Training loop (light, just 1-3 epochs)
epochs = 2

model.train()
for epoch in range(epochs):
    print(f"\nEpoch {epoch+1}/{epochs}")
    for i, batch in enumerate(dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if i % 50 == 0:
            print(f"Step {i}, Loss: {loss.item()}")

# Save the fine-tuned model
output_dir = './gpt2-finetuned-movie-dialogs'
os.makedirs(output_dir, exist_ok=True)
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"\nFine-tuning complete. Model saved to {output_dir}")




Epoch 1/2


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step 0, Loss: 9.260353088378906
Step 50, Loss: 0.38670554757118225
Step 100, Loss: 0.6881702542304993
Step 150, Loss: 0.642801821231842
Step 200, Loss: 0.6312360167503357
Step 250, Loss: 0.7301705479621887
Step 300, Loss: 1.1372640132904053
Step 350, Loss: 0.07480308413505554

Epoch 2/2
Step 0, Loss: 0.3388036787509918
Step 50, Loss: 2.857886791229248
Step 100, Loss: 0.2234956920146942
Step 150, Loss: 0.1591886579990387
Step 200, Loss: 1.3177851438522339
Step 250, Loss: 0.16637687385082245
Step 300, Loss: 1.10332190990448
Step 350, Loss: 0.4850304126739502

Fine-tuning complete. Model saved to ./gpt2-finetuned-movie-dialogs


In [12]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

device = torch.device('cpu')  # Since you're using CPU

# Load base GPT-2
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
base_model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)

# Load fine-tuned GPT-2
fine_tuned_model = GPT2LMHeadModel.from_pretrained('./gpt2-finetuned-movie-dialogs').to(device)

# Sample prompt
prompt = "Harry Potter entered the room and said"

def generate_text(model, prompt, max_length=100):
    inputs = tokenizer.encode(prompt, return_tensors='pt').to(device)
    outputs = model.generate(inputs, max_length=max_length, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Generate text
print("\n--- Base GPT-2 Sample ---")
print(generate_text(base_model, prompt))

print("\n--- Fine-tuned GPT-2 Sample ---")
print(generate_text(fine_tuned_model, prompt))


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



--- Base GPT-2 Sample ---
Harry Potter entered the room and said, "I'm sorry, but I'm not going to be able to do this."

Harry Potter said, "I'm sorry, but I'm not going to be able to do this."

Hermione Granger said, "I'm sorry, but I'm not going to be able to do this."

Harry Potter said, "I'm sorry, but I'm not going to be able to do this."

Hermione Granger

--- Fine-tuned GPT-2 Sample ---
Harry Potter entered the room and said, "Hello, Harry Potter. Welcome to Hogwarts. Welcome to Hogwarts. Welcome to Hogwarts. Welcome to Hogwarts. Welcome to Hogwarts. Welcome to Hogwarts. Welcome to Hogwarts. Welcome to Hogwarts. Welcome to Hogwarts. Welcome to Hogwarts. Welcome to Hogwarts. Welcome to Hogwarts. Welcome to Hogwarts. Welcome to Hogwarts. Welcome to Hogwarts. Welcome to Hogwarts. Welcome to Hogwarts. Welcome to Hogwarts. Welcome to Hogwarts. Welcome to Hogwarts. Welcome to Hogwarts. Welcome to


In [13]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load base GPT-2
base_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
base_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
base_tokenizer.pad_token = base_tokenizer.eos_token

# Load fine-tuned GPT-2
finetuned_model_path = "./gpt2-finetuned-movie-dialogs"  # replace with your path
ft_tokenizer = GPT2Tokenizer.from_pretrained(finetuned_model_path)
ft_model = GPT2LMHeadModel.from_pretrained(finetuned_model_path).to(device)
ft_tokenizer.pad_token = ft_tokenizer.eos_token

# Prompts to test
prompts = [
    "Harry Potter entered the room and said,",
    "Hermione Granger whispered to Harry,"
]

# Function to generate text
def generate_text(model, tokenizer, prompt, max_length=50):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs,
        max_length=max_length,
        do_sample=True,
        top_p=0.9,
        temperature=0.8,
        pad_token_id=tokenizer.eos_token_id
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Save results
with open("sample_generations.txt", "w", encoding="utf-8") as f:
    for prompt in prompts:
        f.write(f"--- Prompt: {prompt} ---\n\n")
        # Base GPT-2
        base_output = generate_text(base_model, base_tokenizer, prompt)
        f.write(f"Base GPT-2: {base_output}\n\n")
        # Fine-tuned GPT-2
        ft_output = generate_text(ft_model, ft_tokenizer, prompt)
        f.write(f"Fine-tuned GPT-2: {ft_output}\n\n")
        f.write("="*80 + "\n\n")

print("Sample generations saved to sample_generations.txt")


Sample generations saved to sample_generations.txt


In [1]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

device = 'cpu'  # Since we're using CPU

# Load base GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)

# Few-shot prompt example
prompt = (
    "Harry Potter: Hello, who are you?\n"
    "Hagrid: I am Hagrid, Keeper of Keys and Grounds at Hogwarts.\n"
    "Harry Potter: "
)

# Tokenize prompt
input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)

# Generate output
output_ids = model.generate(
    input_ids,
    max_length=150,
    num_return_sequences=1,
    no_repeat_ngram_size=2,
    pad_token_id=tokenizer.eos_token_id
)

generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("\n--- Few-Shot Base GPT-2 Sample ---\n")
print(generated_text)


W0912 14:21:44.569000 10632 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



--- Few-Shot Base GPT-2 Sample ---

Harry Potter: Hello, who are you?
Hagrid: I am Hagrid, Keeper of Keys and Grounds at Hogwarts.
Harry Potter:  I am Harry Potter. I'm Hagrind. You are Hagrod. Hagrin is Hagraven. And Hagrah is Hagrah. So, Hagro is a little bit of a bit like Hagrim. He's a very good boy. But he's not a good wizard. It's just that he doesn't know how to spell. That's why he can't spell, and he has to learn how not to. The only way he knows how is by being a wizard, which is to be a great wizard and a master of magic. If he


In [2]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

device = 'cpu'

# Load fine-tuned GPT-2
tokenizer = GPT2Tokenizer.from_pretrained('./gpt2-finetuned-movie-dialogs')
model = GPT2LMHeadModel.from_pretrained('./gpt2-finetuned-movie-dialogs').to(device)

# Few-shot prompt example
prompt = (
    "Harry Potter: Hello, who are you?\n"
    "Hagrid: I am Hagrid, Keeper of Keys and Grounds at Hogwarts.\n"
    "Harry Potter: "
)

input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)

output_ids = model.generate(
    input_ids,
    max_length=150,
    num_return_sequences=1,
    no_repeat_ngram_size=2,
    pad_token_id=tokenizer.eos_token_id
)

generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("\n--- Few-Shot Fine-Tuned GPT-2 Sample ---\n")
print(generated_text)



--- Few-Shot Fine-Tuned GPT-2 Sample ---

Harry Potter: Hello, who are you?
Hagrid: I am Hagrid, Keeper of Keys and Grounds at Hogwarts.
Harry Potter:  I am Harry Potter.


In [None]:
#

✅ Key Observations:

Base GPT-2 Output: Generic, rambling, and sometimes irrelevant.

Fine-Tuned GPT-2 Output: Short, concise, and contextually appropriate (just like an actual movie script dialogue).

👉 This shows that fine-tuning has successfully specialized the GPT-2 model to better follow the Harry Potter domain.

In [1]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load fine-tuned model
tokenizer = GPT2Tokenizer.from_pretrained('./gpt2-finetuned-movie-dialogs')
model = GPT2LMHeadModel.from_pretrained('./gpt2-finetuned-movie-dialogs').to('cpu')

def calculate_perplexity(text):
    encodings = tokenizer(text, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**encodings, labels=encodings['input_ids'])
    loss = outputs.loss
    return torch.exp(loss).item()

prompt = "Harry Potter entered the room and said,"
base_sample = "Harry Potter entered the room and said, 'I must find the stone.'"
fine_tuned_sample = "Harry Potter entered the room and said, 'Welcome to Hogwarts!'"

base_perplexity = calculate_perplexity(base_sample)
fine_tuned_perplexity = calculate_perplexity(fine_tuned_sample)

print(f"Base GPT-2 Perplexity: {base_perplexity}")
print(f"Fine-tuned GPT-2 Perplexity: {fine_tuned_perplexity}")


W0912 16:46:15.405000 4784 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Base GPT-2 Perplexity: 34.75503921508789
Fine-tuned GPT-2 Perplexity: 23.251800537109375


In [None]:
# Results:

Base GPT-2 model (before fine-tuning):
Perplexity ≈ 34.75

Fine-tuned GPT-2 model:
Perplexity ≈ 23.25

This significant drop in perplexity indicates the fine-tuned model became much better at predicting domain-specific text from Harry Potter scripts.