In [1]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
# df = pd.read_csv("data/train.csv").dropna(how="any")

# train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

dataset = load_dataset("daily_dialog")

train_data = dataset["train"]
valid_data = dataset["validation"]
test_data = dataset["test"]

### Load pre-trained model

In [4]:
model_name = "microsoft/DialoGPT-small"
model = AutoModelForCausalLM.from_pretrained(model_name)

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [6]:
def tokenize_function(examples):
    inputs = [" ".join(utterance) for utterance in examples["dialog"]]
    tokenized_inputs = tokenizer(inputs, truncation=True, padding="max_length", max_length=256)
    return tokenized_inputs

train_tokenized = train_data.map(tokenize_function, batched=True)
valid_tokenized = valid_data.map(tokenize_function, batched=True)
test_tokenized = test_data.map(tokenize_function, batched=True)

train_tokenized.set_format(type="torch", columns=["input_ids", "attention_mask"])
valid_tokenized.set_format(type="torch", columns=["input_ids", "attention_mask"])
test_tokenized.set_format(type="torch", columns=["input_ids", "attention_mask"])

Map:   0%|          | 0/11118 [00:00<?, ? examples/s]

In [7]:
# class ConversationDataset(Dataset):
#     def __init__(self, tokenized_data):
#         self.tokenized_data = tokenized_data

#     def __len__(self):
#         return len(self.tokenized_data)

#     def __getitem__(self, idx):
#         item = self.tokenized_data.iloc[idx]
#         return {k: torch.tensor(v, dtype=torch.long) for k, v in item.items()}

In [8]:
# train_dataset = ConversationDataset(train_tokenized)
# test_dataset = ConversationDataset(test_tokenized)

train_loader = DataLoader(train_tokenized, batch_size=8, shuffle=True)
valid_loader = DataLoader(valid_tokenized, batch_size=8)

In [9]:
from torch.optim import AdamW
from transformers import get_scheduler

model.to(device)

optimizer = AdamW(params=model.parameters(), lr=5e-5)
num_epochs = 3
num_training_steps = num_epochs * len(train_loader)

scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

In [10]:
for epoch in range(num_epochs):
    # Training phase
    model.train()
    total_train_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1} - Training")

    for batch in progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['input_ids'])
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_train_loss += loss.item()
        progress_bar.set_postfix({'Training Loss': loss.item()})

    avg_train_loss = total_train_loss / len(train_loader)
    print(f"Epoch {epoch+1}: Average Training Loss = {avg_train_loss:.4f}")

    # Evaluation phase
    model.eval()
    total_eval_loss = 0
    progress_bar = tqdm(valid_loader, desc=f"Epoch {epoch+1} - Evaluation")

    with torch.no_grad():
        for batch in progress_bar:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['input_ids'])
            loss = outputs.loss

            total_eval_loss += loss.item()
            progress_bar.set_postfix({'Evaluation Loss': loss.item()})

    avg_eval_loss = total_eval_loss / len(valid_loader)
    print(f"Epoch {epoch+1}: Average Evaluation Loss = {avg_eval_loss:.4f}")

Epoch 1 - Training:   0%|          | 0/1390 [00:00<?, ?it/s]

Epoch 1: Average Training Loss = 1.4017


Epoch 1 - Evaluation:   0%|          | 0/125 [00:00<?, ?it/s]

Epoch 1: Average Evaluation Loss = 1.2397


Epoch 2 - Training:   0%|          | 0/1390 [00:00<?, ?it/s]

Epoch 2: Average Training Loss = 1.2162


Epoch 2 - Evaluation:   0%|          | 0/125 [00:00<?, ?it/s]

Epoch 2: Average Evaluation Loss = 1.2025


Epoch 3 - Training:   0%|          | 0/1390 [00:00<?, ?it/s]

Epoch 3: Average Training Loss = 1.1660


Epoch 3 - Evaluation:   0%|          | 0/125 [00:00<?, ?it/s]

Epoch 3: Average Evaluation Loss = 1.1930


In [12]:
model.eval()

def chat_with_model(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    reply_ids = model.generate(**inputs, 
                               max_length=100, 
                               pad_token_id=tokenizer.eos_token_id, 
                               eos_token_id=tokenizer.eos_token_id, 
                               do_sample=True, 
                               top_k=50, 
                               top_p=0.9, 
                               temperature=0.7,
                               repetition_penalty=1.2)
    reply = tokenizer.decode(reply_ids[0], skip_special_tokens=True)
    return reply

# Test the conversational agent
user_input = "How are you!"
response = chat_with_model(user_input)
print("Model:", response)

Model: How are you!   Pretty good.You â€™ re so cute! How old is your sister? You look like she could be twenty-one. I think my older brother looks forty and thirty when he was a kid, but younger than that now at least 20 years later in life... maybe not quite as young though actually ; how about yourself if it were me to ask for some advice on the matter here first. What do they say then after seeing this photo of our daughter's face
