In [2]:
# Step 1: Parse movie_lines.txt
from google.colab import files
uploaded = files.upload()

id2line = {}
with open("movie_lines.txt", encoding="ISO-8859-1") as f:  # You can try changing this and see what changes in the response you get :)
    for line in f:
        parts = line.strip().split(" +++$+++ ")
        if len(parts) == 5:
            line_id = parts[0]
            text = parts[4]
            id2line[line_id] = text

# Step 2: Parse movie_conversations.txt into a list of conversations
conversations = []
with open("movie_conversations.txt", encoding="ISO-8859-1") as f:  # Make sure to keet encodings consistent
    for line in f:
        parts = line.strip().split(" +++$+++ ")
        if len(parts) == 4:
            try:
                utterance_ids = eval(parts[3])  # Converts string list to actual list
                conversations.append(utterance_ids)
            except Exception as e:
                print(f"Skipping line due to eval error: {e}")

# Step 3: Build input-output pairs (prompt-response)
pairs = []
context_length = 4  # or 3 or 5 — how many past turns you want to use

for conv in conversations:
    for i in range(context_length, len(conv)):
        if all(utt in id2line for utt in conv[i - context_length:i + 1]):
            context_lines = [id2line[utt].strip() for utt in conv[i - context_length:i]]
            response_line = id2line[conv[i]].strip()

            # Make sure there’s actual text
            if all(context_lines) and response_line:
                input_text = tokenizer.eos_token.join(context_lines) + tokenizer.eos_token
                pairs.append((input_text, response_line))



print(f"Loaded {len(pairs)} dialog pairs.")


Saving movie_lines.txt to movie_lines.txt
Saving movie_conversations.txt to movie_conversations.txt


NameError: name 'tokenizer' is not defined

In [None]:
import random

# Parameters
SAMPLE_SIZE = 5_000      # how many pairs you want, you can change it
RANDOM_SEED = 42          # set this if you need deterministic sampling

# Draw the sample
random.seed(RANDOM_SEED)          # comment this out for a fresh shuffle each run
sample_pairs = random.sample(pairs, SAMPLE_SIZE)

print(f"Sampled {len(sample_pairs)} pairs.")

In [None]:
from datasets import Dataset

# Create a Hugging Face Dataset from your list of (input, output) pairs
data = [{"input": q, "output": a} for q, a in sample_pairs]
hf_dataset = Dataset.from_list(data)

print(hf_dataset[0])  # sanity check


In [None]:
# If not installed, uncomment the code and install
# !pip install transformers
import transformers
print(transformers.__version__)



In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
tokenizer.pad_token = tokenizer.eos_token  # Fix the pad token issue

def tokenize(example):
    full_text = example["input"] + example["output"] + tokenizer.eos_token
    tokens = tokenizer(full_text, truncation=True, padding="max_length", max_length=512)
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

# def tokenize(example):
#     input_text = example["input"] + tokenizer.eos_token
#     output_text = example["output"] + tokenizer.eos_token
#     full_text = input_text + output_text
#     tokens = tokenizer(full_text, truncation=True, padding="max_length", max_length=256)
#     tokens["labels"] = tokens["input_ids"].copy()  # Causal language modeling
#     return tokens

tokenized_dataset = hf_dataset.map(tokenize, batched=False)


In [None]:
# If error persists, uncomment this code, execute it and run below snippet again
# !pip install -U transformers


In [None]:
import os
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
)
# Need to import get_last_checkpoint from the trainer_utils module
from transformers.trainer_utils import get_last_checkpoint


# 1. Detect an existing checkpoint (if any)

output_dir = "./dialogpt-finetuned"
last_ckpt  = get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None
if last_ckpt:
    print(f"  Found checkpoint at: {last_ckpt} – resuming from there.")


# 2. Load model (fresh or from checkpoint)

model_name_or_path = last_ckpt or "microsoft/DialoGPT-medium"
model = AutoModelForCausalLM.from_pretrained(model_name_or_path)

# (Optional but tidy) – make sure pad token is set

tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
tokenizer.pad_token = tokenizer.eos_token


training_args = TrainingArguments(
    output_dir           = output_dir,
    per_device_train_batch_size = 4,
    num_train_epochs     = 2,
    dataloader_num_workers = 4,

    # logging & checkpointing
    logging_strategy     = "steps",
    logging_steps        = 200,
    save_strategy        = "steps",
    save_steps           = 500,
    save_total_limit     = 2,

    # misc
    fp16                 = True,     # comment out if GPU doesn’t support fp16
    report_to            = "none",   # no WandB/HF Hub logging
)


# 4. Trainer
trainer = Trainer(
    model         = model,
    args          = training_args,
    train_dataset = tokenized_dataset,
    tokenizer     = tokenizer,  # keeps pad/eos alignment neat
)


# 5. Train – resume if we have a checkpoint
trainer.train(resume_from_checkpoint=last_ckpt)

In [None]:
# Saving the freshly trained moel and its tokeniser
trainer.save_model("./dialogpt-finetuned/final")
tokenizer.save_pretrained("./dialogpt-finetuned/final")


In [None]:
# from transformers import AutoModelForCausalLM, AutoTokenizer
# import torch

# # Load the fine-tuned model
# model_path = "./dialogpt-finetuned/final"
# tokenizer = AutoTokenizer.from_pretrained(model_path)
# model     = AutoModelForCausalLM.from_pretrained(model_path)
# model.eval()


# chat_history_ids = []

# while True:
#     user_input = input("You: ")
#     if user_input.lower() in ["exit", "quit"]:
#         break

#     # Encode user input + eos
#     new_user_input_ids = tokenizer.encode(user_input + tokenizer.eos_token, return_tensors='pt')

#     # Append user input to chat history
#     chat_history_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if chat_history_ids != [] else new_user_input_ids

#     # Generate a response
#     output_ids = model.generate(
#         chat_history_ids,
#         max_length=chat_history_ids.shape[-1] + 100,
#         pad_token_id=tokenizer.eos_token_id,
#         do_sample=True,
#         top_k=50,
#         top_p=0.95,
#         temperature=0.9,
#     )

#     # Extract and print only the new part of the reply
#     new_tokens = output_ids[:, chat_history_ids.shape[-1]:]
#     response = tokenizer.decode(new_tokens[0], skip_special_tokens=True)
#     new_bot_input_ids = tokenizer.encode(response + tokenizer.eos_token, return_tensors='pt')
#     chat_history_ids = torch.cat([chat_history_ids, new_bot_input_ids], dim=-1)
#     print(chat_history_ids)
#     print("Generated token IDs:", new_tokens)
#     print("Decoded raw text:", tokenizer.decode(new_tokens[0]))

#     print(f"Bot: {response}")
#     # Keep only last 3 messages (adjust as needed)
#     if len(chat_history_ids) > 3:
#         chat_history_ids = chat_history_ids[-4:]
#     # Append bot reply to chat history (with eos!)
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load the fine-tuned model
model_path = "./dialogpt-finetuned/final"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)
model.eval()

chat_history_ids = None  # Use None instead of empty list for cleaner checks

while True:
    user_input = input("You: ")
    if user_input.lower() in ["exit", "quit"]:
        break

    # Encode user input + <|endoftext|> token
    new_user_input_ids = tokenizer.encode(user_input + tokenizer.eos_token, return_tensors='pt')

    # Append user input to chat history
    if chat_history_ids is not None:
        bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1)
    else:
        bot_input_ids = new_user_input_ids

    # Generate a response
    output_ids = model.generate(
        bot_input_ids,
        max_length=bot_input_ids.shape[-1] + 100,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.9,
    )
    print("output_ids.shape:", output_ids.shape)
    print("bot_input_ids.shape:", bot_input_ids.shape)


    # Get new tokens (model reply only)
    new_tokens = output_ids[:, bot_input_ids.shape[-1]:]
    response = tokenizer.decode(new_tokens[0], skip_special_tokens=True)
    print(chat_history_ids)

    print(f"Bot: {response}")

    # Append the *generated tokens* directly to history (not re-encoded)
    chat_history_ids = torch.cat([bot_input_ids, new_tokens], dim=-1)

    # Optional: Limit history size to avoid going beyond context window
    if chat_history_ids.shape[-1] > 1000:  # You can adjust this threshold
        chat_history_ids = chat_history_ids[:, -1000:]
    print("Generated token IDs:", new_tokens)
    print("Decoded raw text:", tokenizer.decode(new_tokens[0]))


# **This is the snippet to use Dialogpt-medium, for those who are stuck try running this cell first to get a clearer idea about how to proceed and code**

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
model     = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium").eval()

chat_history = []

while True:
    user_input = input("You: ")
    if user_input.lower() in ["exit", "quit"]: break

    new_ids = tokenizer.encode(user_input + tokenizer.eos_token, return_tensors="pt")
    bot_ids = torch.cat(chat_history + [new_ids], dim=-1) if chat_history else new_ids

    generated_ids = model.generate(
        bot_ids,
        max_length=bot_ids.shape[-1] + 100,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.7,
        top_k=40,
        top_p=0.9,
    )

    reply = tokenizer.decode(generated_ids[:, bot_ids.shape[-1]:][0],
                             skip_special_tokens=True)
    print(f"Bot: {reply}")
    chat_history.append(new_ids)
