In [None]:
# Offline Chat-Reply Recommendation (GPT-2)
# - Loads processed context->reply pairs
# - Fine-tunes GPT-2 offline
# - Evaluates BLEU, ROUGE-L, Perplexity
# - Saves model + tokenizer + Model.joblib

import os, math, random, json
from pathlib import Path
import numpy as np
import pandas as pd
import torch

from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import nltk

# Make sure the NLTK "punkt" tokenizer data can be found; fetch it only when the lookup fails.
def _ensure_nltk_resource(resource_path, package_name):
    """Look up `resource_path` in the NLTK data directories; download `package_name` if it is absent."""
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name)


_ensure_nltk_resource('tokenizers/punkt', 'punkt')

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction


In [None]:
# Location of the processed context->reply pairs.
DATA_PATH = Path("/mnt/data/offline_chatbot/processed_conversations.csv")
assert DATA_PATH.exists(), f"Processed pairs not found at {DATA_PATH}"

# Columns the rest of the notebook reads from every row.
REQUIRED_COLUMNS = ["context", "reply"]

df = pd.read_csv(DATA_PATH)

# Fail early, with a clear message, if the file does not have the expected schema.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise KeyError(f"{DATA_PATH} is missing required column(s): {missing_columns}")

# read_csv turns empty cells into NaN (a float). Such rows would later break the
# string operations on context/reply (concatenation, .replace, .split), so drop
# them here and make sure both columns really hold strings.
n_loaded = len(df)
df = df.dropna(subset=REQUIRED_COLUMNS).reset_index(drop=True)
df[REQUIRED_COLUMNS] = df[REQUIRED_COLUMNS].astype(str)

print("Total pairs:", len(df))
if len(df) != n_loaded:
    print("Dropped rows with missing context/reply:", n_loaded - len(df))
df.head()


In [None]:
# Hold out 20% of the pairs for validation; the fixed random_state keeps the
# shuffled split identical from run to run.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
(len(train_df), len(val_df))


In [None]:
MODEL_NAME = "gpt2"  # must be preloaded offline in the environment
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Dialogue markers used by build_prompt / ChatDataset; registered as special
# tokens so each one maps to a single id instead of being split into pieces.
special_tokens = {"additional_special_tokens": ["<BOS>", "<EOS>", "<SEP>", "<USER_A>", "<USER_B>"]}
num_added = tokenizer.add_special_tokens(special_tokens)

# The dataset pads every example with padding="max_length", which raises if the
# tokenizer has no padding token. GPT-2 ships without one, so reuse its
# end-of-text token for padding (no new embedding row is needed for that).
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
if num_added > 0:
    # New token ids need matching rows in the embedding matrix.
    model.resize_token_embeddings(len(tokenizer))
# Keep the model config in agreement with the tokenizer's padding id.
model.config.pad_token_id = tokenizer.pad_token_id

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def build_prompt(context: str) -> str:
    """Turn a raw conversation context into the prompt fed to the model.

    The plain-text role tags "User A:" / "User B:" are replaced with the
    "<USER_A>:" / "<USER_B>:" marker tokens, the result is prefixed with
    "<BOS> ", and a trailing "<USER_A>: " cue is appended on a new line so the
    model continues with User A's reply.

    Args:
        context: Conversation history, e.g. "User B: ...\\nUser A: ...".

    Returns:
        The prompt string, ending in "<USER_A>: ".
    """
    ctx = context.replace("User A:", "<USER_A>:").replace("User B:", "<USER_B>:")
    # The newline must be written as an escape: a literal line break inside a
    # single-quoted f-string is a SyntaxError.
    return f"<BOS> {ctx}\n<USER_A>: "  # we want the model to complete A's reply


In [None]:
class ChatDataset(torch.utils.data.Dataset):
    """Fixed-length (prompt + reply) training examples for causal-LM fine-tuning.

    Each item is a dict of three 1-D LongTensors of length `max_length`:
    `input_ids`, `attention_mask` and `labels`. Labels are -100 (ignored by the
    loss) on every prompt and padding position, so only the reply tokens
    (including the trailing "<EOS>" marker) are trained on.

    Args:
        df: DataFrame with `context` and `reply` columns.
        tokenizer: Callable returning a mapping with an "input_ids" list when
            called as `tokenizer(text, add_special_tokens=False)`, and exposing
            `pad_token_id` / `eos_token_id`.
        max_length: Length every example is truncated or padded to.

    Raises:
        ValueError: If the tokenizer has neither a pad nor an EOS token id.
    """

    IGNORE_INDEX = -100  # label value skipped by the cross-entropy loss

    def __init__(self, df, tokenizer, max_length=512):
        self.df = df.reset_index(drop=True)
        self.tok = tokenizer
        self.max_length = max_length

        # Pad manually so a tokenizer without a pad token still works; the pad
        # positions are masked out of both attention and loss, so the id used
        # there only has to be a valid vocabulary index.
        pad_id = tokenizer.pad_token_id
        if pad_id is None:
            pad_id = tokenizer.eos_token_id
        if pad_id is None:
            raise ValueError("tokenizer must define pad_token_id or eos_token_id")
        self.pad_id = pad_id

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        prompt = build_prompt(str(row['context']))
        reply = str(row['reply']) + " <EOS>"

        # Tokenise prompt and reply separately and concatenate the ids. This
        # makes the prompt/reply boundary exact (tokenising the joined string
        # can merge tokens across it) and reproduces what the model sees at
        # inference time, where the prompt is tokenised on its own.
        prompt_ids = list(self.tok(prompt, add_special_tokens=False)["input_ids"])
        reply_ids = list(self.tok(reply, add_special_tokens=False)["input_ids"])
        reply_ids = reply_ids[: self.max_length]

        # When the pair is too long, drop the OLDEST context tokens so the
        # reply (the training target) is never truncated away.
        overflow = len(prompt_ids) + len(reply_ids) - self.max_length
        if overflow > 0:
            prompt_ids = prompt_ids[overflow:]

        n_prompt = len(prompt_ids)
        n_real = n_prompt + len(reply_ids)
        n_pad = self.max_length - n_real

        input_ids = torch.tensor(
            prompt_ids + reply_ids + [self.pad_id] * n_pad, dtype=torch.long
        )
        attention_mask = torch.tensor([1] * n_real + [0] * n_pad, dtype=torch.long)

        # Build labels: ignore context tokens and padding.
        labels = input_ids.clone()
        labels[:n_prompt] = self.IGNORE_INDEX
        labels[n_real:] = self.IGNORE_INDEX
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }

# Wrap both splits in the same dataset class, sharing one tokenizer and the
# default max_length.
train_ds, val_ds = (ChatDataset(split_df, tokenizer) for split_df in (train_df, val_df))


In [None]:
OUT_DIR = Path("./gpt2_offline_chatbot")
OUT_DIR.mkdir(exist_ok=True, parents=True)

# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy`; confirm against the version installed in this environment.
training_args = TrainingArguments(
    output_dir=str(OUT_DIR / "checkpoints"),
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=0,
    logging_steps=10,
    save_steps=200,
    evaluation_strategy="epoch",
    fp16=torch.cuda.is_available(),
    report_to=[],
)


def collate_with_dataset_labels(features):
    """Stack equally-sized example tensors into a batch, keeping the dataset's labels.

    DataCollatorForLanguageModeling(mlm=False) rebuilds `labels` from
    `input_ids`, which throws away the prompt masking done in ChatDataset and
    makes the loss (and perplexity) cover the context as well as the reply.
    This collator keeps the labels the dataset produced and additionally sets
    them to -100 wherever the attention mask is 0, so padding never
    contributes to the loss.

    Args:
        features: List of dicts with same-length `input_ids`,
            `attention_mask` and `labels` tensors.

    Returns:
        Dict mapping each key to a stacked (batch, seq_len) tensor.
    """
    batch = {key: torch.stack([example[key] for example in features]) for key in features[0]}
    batch["labels"] = batch["labels"].masked_fill(batch["attention_mask"] == 0, -100)
    return batch


data_collator = collate_with_dataset_labels

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=data_collator,
)

trainer.train()
eval_metrics = trainer.evaluate()

# Perplexity is exp(mean token loss); with the labels above it is measured on
# reply tokens only.
eval_loss = eval_metrics.get('eval_loss', None)
perplexity = math.exp(eval_loss) if eval_loss is not None else None
print("Eval loss:", eval_loss, "Perplexity:", perplexity)


In [None]:
@torch.no_grad()
def generate_reply(context, max_new_tokens=64, temperature=0.7, top_p=0.9):
    """Sample User A's next reply for a conversation context.

    Args:
        context: Conversation history in the raw "User A:/User B:" format.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature passed to `model.generate`.
        top_p: Nucleus-sampling threshold passed to `model.generate`.

    Returns:
        The generated reply text only (prompt and special tokens removed,
        surrounding whitespace stripped).
    """
    model.eval()  # make sure dropout is off while sampling

    prompt = build_prompt(context)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Keep prompt + generated tokens inside the model's position limit by
    # dropping the oldest context tokens when necessary.
    max_positions = getattr(model.config, "n_positions", None)
    if max_positions is not None:
        keep = max(1, max_positions - max_new_tokens)
        if inputs["input_ids"].shape[1] > keep:
            inputs = {name: tensor[:, -keep:] for name, tensor in inputs.items()}
    prompt_len = inputs["input_ids"].shape[1]

    # Training targets end with the "<EOS>" marker, so stop generation on it
    # instead of only on the tokenizer's built-in end-of-text token.
    end_of_reply_id = tokenizer.convert_tokens_to_ids("<EOS>")

    out = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        eos_token_id=end_of_reply_id,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode only what was generated after the prompt. Splitting the decoded
    # text on "<USER_A>:" cannot work here: skip_special_tokens=True removes
    # that marker, so the whole prompt would be returned as part of the reply.
    new_tokens = out[0][prompt_len:]
    reply = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    return reply

def rouge_l_score(ref, hyp, beta=1.0):
    """ROUGE-L F-measure between a reference and a hypothesis string.

    Both strings are split on whitespace. With LCS the length of their longest
    common subsequence, precision is LCS / len(hyp), recall is LCS / len(ref),
    and the score is

        F = (1 + beta^2) * P * R / (R + beta^2 * P)

    which is always within [0, 1].

    Args:
        ref: Reference text.
        hyp: Generated text.
        beta: Weight of recall relative to precision; 1.0 gives the harmonic
            mean (F1).

    Returns:
        The F-measure as a float; 0.0 when either side has no tokens or the
        two share no token.
    """
    ref_tokens = ref.split()
    hyp_tokens = hyp.split()
    if not ref_tokens or not hyp_tokens:
        return 0.0

    # LCS by dynamic programming, keeping only the previous row of the table.
    prev_row = [0] * (len(hyp_tokens) + 1)
    for ref_tok in ref_tokens:
        row = [0]
        for j, hyp_tok in enumerate(hyp_tokens, start=1):
            if ref_tok == hyp_tok:
                row.append(prev_row[j - 1] + 1)
            else:
                row.append(max(prev_row[j], row[j - 1]))
        prev_row = row
    lcs = prev_row[-1]
    if lcs == 0:
        return 0.0

    prec = lcs / len(hyp_tokens)
    rec = lcs / len(ref_tokens)
    beta_sq = beta ** 2
    return (1 + beta_sq) * prec * rec / (rec + beta_sq * prec)

# Evaluate on validation set
# Replies are sampled (do_sample=True in generate_reply), so fix the RNG state
# first; otherwise BLEU/ROUGE change every time this cell is re-run.
EVAL_SEED = 42
torch.manual_seed(EVAL_SEED)

smooth = SmoothingFunction().method1
bleus, rouges = [], []
samples = []

for ctx, ref in zip(val_df['context'], val_df['reply']):
    ctx, ref = str(ctx), str(ref)
    hyp = generate_reply(ctx)
    # BLEU restricted to unigrams and bigrams (weights 0.5/0.5): replies are
    # short, so 3-/4-gram overlap is mostly zero. Smoothing method1 avoids a
    # hard 0 when one n-gram order has no match.
    bleu = sentence_bleu([ref.split()], hyp.split(), smoothing_function=smooth, weights=(0.5, 0.5, 0, 0))
    rouge = rouge_l_score(ref, hyp)
    bleus.append(bleu)
    rouges.append(rouge)
    samples.append({"context": ctx, "ref": ref, "hyp": hyp})

metrics = {
    "val_bleu_mean": float(np.mean(bleus) if bleus else 0.0),
    "val_rougeL_mean": float(np.mean(rouges) if rouges else 0.0),
    "val_perplexity": float(perplexity) if perplexity is not None else None
}
metrics


In [None]:
from joblib import dump

SAVE_DIR = OUT_DIR / "artifact"
SAVE_DIR.mkdir(parents=True, exist_ok=True)

# Fine-tuned weights and the tokenizer (including the added special tokens).
model.save_pretrained(SAVE_DIR.as_posix())
tokenizer.save_pretrained(SAVE_DIR.as_posix())

# Text artifacts are written as UTF-8 explicitly: samples are dumped with
# ensure_ascii=False, so relying on the platform default encoding can raise
# UnicodeEncodeError or produce files that do not read back correctly.
with open(SAVE_DIR / "metrics.json", "w", encoding="utf-8") as f:
    json.dump(metrics, f, indent=2)

with open(SAVE_DIR / "samples.jsonl", "w", encoding="utf-8") as f:
    for s in samples:
        f.write(json.dumps(s, ensure_ascii=False) + "\n")

# Small manifest describing the export. The model name and special tokens are
# taken from the values used at setup time so the manifest cannot drift from
# what was actually trained.
dump({
    "model_dir": SAVE_DIR.as_posix(),
    "model_name": MODEL_NAME,
    "special_tokens": list(special_tokens["additional_special_tokens"]),
    "notes": "Causal LM fine-tuned for next-reply generation from context"
}, SAVE_DIR / "Model.joblib")

with open(SAVE_DIR / "ReadMe.txt", "w", encoding="utf-8") as f:
    f.write(
"""
Offline Chat-Reply Recommendation (GPT-2)

Steps to run:
1) Ensure processed_conversations.csv exists (path set at top).
2) Run all cells to fine-tune GPT-2.
3) Metrics written to artifact/metrics.json; sample generations in artifact/samples.jsonl.
4) Exported model/tokenizer under artifact/ (use from_pretrained to reload).

Generation:
- Use generate_reply(context_str) to get User A's predicted reply.

Notes:
- Adjust WINDOW, max_length, epochs, and hyperparameters per dataset size.
- Perplexity is exp(eval_loss). BLEU/ROUGE-L are approximate indicators for dialogue.
""")
print("Artifacts saved to:", SAVE_DIR.as_posix())
