In [16]:
import pandas as pd

train = pd.read_csv("/kaggle/input/chatbot/train.csv")
val = pd.read_csv("/kaggle/input/chatbot/validation.csv")
test = pd.read_csv("/kaggle/input/chatbot/test.csv")

print("Train size:", len(train))
print("Validation size:", len(val))
print("Test size:", len(test))
print(train.head())
print(train.columns)
print(train.head())

import ast

def preprocess_dialogue(dialog_str):
    # Convert string representation of list → actual Python list
    turns = ast.literal_eval(dialog_str)
    # Strip whitespace and remove empty turns
    turns = [t.strip() for t in turns if t.strip()]
    # Join turns with <eos> token
    return " <eos> ".join(turns)

# Apply to train, validation, and test sets
train["processed"] = train["dialog"].apply(preprocess_dialogue)
val["processed"] = val["dialog"].apply(preprocess_dialogue)
test["processed"] = test["dialog"].apply(preprocess_dialogue)

# Preview
print(train["processed"].head())

from transformers import AutoTokenizer

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")

# Set pad_token to eos_token
tokenizer.pad_token = tokenizer.eos_token

# Now tokenize
train_encodings = tokenizer(list(train["processed"]),
                            truncation=True,
                            padding=True,
                            max_length=128)
val_encodings = tokenizer(list(val["processed"]),
                          truncation=True,
                          padding=True,
                          max_length=128)
test_encodings = tokenizer(list(test["processed"]),
                           truncation=True,
                           padding=True,
                           max_length=128)

# Check a sample
print(train_encodings["input_ids"][0])

import torch

class ChatDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.input_ids = encodings["input_ids"]
        self.attention_mask = encodings["attention_mask"]
        
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        return {
            "input_ids": torch.tensor(self.input_ids[idx]),
            "attention_mask": torch.tensor(self.attention_mask[idx])
        }

# Create datasets
train_dataset = ChatDataset(train_encodings)
val_dataset = ChatDataset(val_encodings)
test_dataset = ChatDataset(test_encodings)

from torch.utils.data import DataLoader

# Batch size
batch_size = 2  # you can increase if GPU allows

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Check a batch
batch = next(iter(train_loader))
print(batch["input_ids"])
print(batch["attention_mask"])

from transformers import AutoTokenizer, AutoModelForCausalLM

# Load DialoGPT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small")

# Set pad token (important for batching)
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

from torch.optim import AdamW

# Optimizer for model parameters
optimizer = AdamW(model.parameters(), lr=5e-5)

import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

from torch.nn import CrossEntropyLoss

# Number of epochs
epochs = 2  # start small to test
loss_fn = CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)  # ignore padding

for epoch in range(epochs):
    model.train()  # set model to training mode
    total_loss = 0

    for batch in train_loader:
        # Move batch to device (GPU/CPU)
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        # Reset gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss

        # Backward pass
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} | Average Loss: {avg_loss}")














Train size: 11118
Validation size: 1000
Test size: 1000
                                              dialog                    act  \
0  ['Say , Jim , how about going for a few beers ...  [3 4 2 2 2 3 4 1 3 4]   
1  ['Can you do push-ups ? '\n " Of course I can ...          [2 1 2 2 1 1]   
2  ['Can you study with the radio on ? '\n ' No ,...            [2 1 2 1 1]   
3  ['Are you all right ? '\n ' I will be all righ...              [2 1 1 1]   
4  ['Hey John , nice skates . Are they new ? '\n ...    [2 1 2 1 1 2 1 3 4]   

                 emotion  
0  [0 0 0 0 0 0 4 4 4 4]  
1          [0 0 6 0 0 0]  
2            [0 0 0 0 0]  
3              [0 0 0 0]  
4    [0 0 0 0 0 6 0 6 0]  
Index(['dialog', 'act', 'emotion'], dtype='object')
                                              dialog                    act  \
0  ['Say , Jim , how about going for a few beers ...  [3 4 2 2 2 3 4 1 3 4]   
1  ['Can you do push-ups ? '\n " Of course I can ...          [2 1 2 2 1 1]   
2  ['Can you study

KeyboardInterrupt: 

In [17]:
# Step 1: Save the fine-tuned model and tokenizer

# Save the model
model.save_pretrained("my_finetuned_dialoGPT")

# Save the tokenizer
tokenizer.save_pretrained("my_finetuned_dialoGPT")

print("Model and tokenizer saved successfully!")

Model and tokenizer saved successfully!


In [40]:
!pip install ipywidgets
from IPython.display import display
import ipywidgets as widgets
import torch

# Assume your model and tokenizer are already loaded
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model.to(device)

input_box = widgets.Text(
    value='',
    placeholder='Type your message here...',
    description='You:',
    disabled=False
)

output_area = widgets.Output()

def on_submit(sender):
    user_msg = input_box.value.strip()
    if not user_msg:
        return
    input_box.value = ''  # clear input box

    # Encode user input only (no memory)
    input_ids = tokenizer.encode(user_msg + tokenizer.eos_token, return_tensors="pt").to(device)

    # Generate a short response
    output_ids = model.generate(
        input_ids,
        max_length=20,                  # short reply
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        top_k=30,
        top_p=0.9,
        temperature=0.5,
        early_stopping=True,
        no_repeat_ngram_size=2
    )

    response = tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True)

    with output_area:
        print("You:", user_msg)
        print("Chatbot:", response)
        print()

input_box.on_submit(on_submit)
display(input_box, output_area)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




  input_box.on_submit(on_submit)


Text(value='', description='You:', placeholder='Type your message here...')

Output()