In [1]:
pip install transformers torch datasets

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
!pip install colorama
!pip install -U transformers


Collecting colorama
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: colorama
Successfully installed colorama-0.4.6
Collecting transformers
  Downloading transformers-4.53.2-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.53.2-py3-none-any.whl (10.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m57.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.53.1
    Uninstalling transformers-4.53.1:
      Successfully uninstalled transformers-4.53.1
Successfully installed transformers-4.53.2


In [3]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

class DialogueGenerator:
    def __init__(self, model_ref="gpt2"):
        try:
            self.tokenizer = GPT2Tokenizer.from_pretrained(model_ref)
            self.model = GPT2LMHeadModel.from_pretrained(model_ref)
            self.model.eval()
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            self.model.to(self.device)
        except Exception as e:
            print(f"Error loading model: {e}")


In [6]:
!pip install transformers datasets pandas --quiet

import pandas as pd
from datasets import Dataset
from transformers import GPT2Tokenizer
import random

# ✅ Step 1: Load CSV using Pandas
df = pd.read_csv("data_iconic.csv")

# ✅ Step 2: Clean the data
df = df[df["text"].notnull()]
df = df[df["text"].str.strip() != ""]

# ✅ Step 3: Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df[["text"]])

# ✅ Step 4: Augmentation
def augment_dialogue(example):
    text = example["text"]
    tweaks = [
        lambda t: t.lower().capitalize(),
        lambda t: t.replace(".", "..."),
        lambda t: "👉 " + t,
        lambda t: t + " 🎬",
        lambda t: t.replace("I", "i") if t.startswith("I ") else t,
    ]
    tweak = random.choice(tweaks)
    return {"text": tweak(text)}

augmented_dataset = dataset.map(augment_dialogue)

# ✅ Step 5: Tokenization
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(example):
    return tokenizer(example["text"], padding="max_length", truncation=True, max_length=64)

tokenized_dataset = augmented_dataset.map(tokenize_function, batched=True)

# ✅ Show one tokenized example
print(tokenized_dataset[0])


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

{'text': '👉 I am Batman.', 'input_ids': [41840, 231, 314, 716, 9827, 13, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [7]:
from transformers import GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer # Import GPT2Tokenizer

# Define tokenizer (copying from ShKHmLiC9Fj4)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token


# ✅ Load pre-trained GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))  # Ensure tokenizer and model match

# ✅ Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # No masked language modeling, since we're doing causal LM (GPT-2 style)
)

# ✅ Training arguments
training_args = TrainingArguments(
    output_dir="./batman_gpt2_dialogue_model",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=200,
    save_total_limit=1,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none"  # Turn off wandb for now
)

# ✅ Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset, # Assuming tokenized_dataset is available from a previous cell
    data_collator=data_collator,
)

# 🚀 Start training
trainer.train()

# 💾 Save final model
model.save_pretrained("batman_gpt2_dialogue_model")
tokenizer.save_pretrained("batman_gpt2_dialogue_model")

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
10,4.1469
20,3.3934
30,3.2745
40,2.5881
50,2.4407
60,2.4435
70,2.0956


('batman_gpt2_dialogue_model/tokenizer_config.json',
 'batman_gpt2_dialogue_model/special_tokens_map.json',
 'batman_gpt2_dialogue_model/vocab.json',
 'batman_gpt2_dialogue_model/merges.txt',
 'batman_gpt2_dialogue_model/added_tokens.json')

In [None]:
import re
import random
from transformers import pipeline, GPT2Tokenizer, GPT2LMHeadModel

# Load model and tokenizer
model = GPT2LMHeadModel.from_pretrained("./batman_gpt2_dialogue_model")
tokenizer = GPT2Tokenizer.from_pretrained("./batman_gpt2_dialogue_model")

generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Cinematic emojis (controlled)
cinematic_emojis = ["🦇", "🎬", "🔥", "💥", "🃏", "🛡️", "😈"]

def clean_output(text):
    # Remove all non-alphabetic symbols except .?!,
    text = re.sub(r'[^a-zA-Z0-9\s\.\?\!]', '', text)
    # Remove duplicate spaces
    text = re.sub(r'\s{2,}', ' ', text)
    # Fix trailing cutoff
    if not text.endswith(('.', '?', '!')):
        text += '.'
    return text.strip()

# Input loop
while True:
    prompt = input("🎬 Enter your dialogue prompt (or 'exit' to quit): ")
    if prompt.lower() == "exit":
        break

    output = generator(
        prompt.strip().capitalize(),
        max_new_tokens=80,
        num_return_sequences=1,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=1.0,
        no_repeat_ngram_size=2,
        pad_token_id=tokenizer.eos_token_id
    )

    raw = output[0]["generated_text"]
    cleaned = clean_output(raw)


    print("\n🗣️ AI Dialogue:\n" + cleaned + " " )
    print("-" * 50)



Device set to use cpu



🗣️ AI Dialogue:
Im batmane hark. st. t t s s.t . 
--------------------------------------------------

🗣️ AI Dialogue:
Why so serious? I had to change the subject. I have the answers. I see. I am what I want to be. I just wanted to see . 
--------------------------------------------------

🗣️ AI Dialogue:
Your mission is to give us freedom. Im going to stop you you. I will not let you stand in my way. G.I. Joe. Dont call me Gwen bro. Youll never get me. Ill be back. 
--------------------------------------------------
