In [3]:
# Install torch with the interpreter that runs this kernel: invoking pip through
# `sys.executable` targets the kernel's own environment rather than whichever
# `pip` happens to be first on PATH.
import sys
!{sys.executable} -m pip install torch

[0m

## Load the fine-tuned model

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch

# Local directory the tokenizer is loaded from (the same path string is used
# as FINETUNED_PATH when the PEFT weights are loaded).
model_dir = "./qwen2.5-cai-dpo-final"
tokenizer = AutoTokenizer.from_pretrained(model_dir)

In [2]:
# Hub ID of the base checkpoint that the fine-tuned weights are applied to.
model_name = "Qwen/Qwen2.5-1.5B-Instruct"

# Load the base model ONCE; the PEFT wrapper below reuses this instance.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    # The installed transformers version warns that `torch_dtype` is deprecated
    # in favour of `dtype`, so use the current keyword.
    dtype=torch.float16,
)

FINETUNED_PATH = "./qwen2.5-cai-dpo-final"

# Wrap the base model with the PEFT weights stored at FINETUNED_PATH.
model = PeftModel.from_pretrained(
    base_model,
    FINETUNED_PATH,
)

`torch_dtype` is deprecated! Use `dtype` instead!


In [3]:
from datasets import load_dataset

# Load the CAI pairs JSONL file and take its "train" split.
ds = load_dataset("json", data_files="cai_pairs.jsonl")["train"]

def add_messages(example):
    """
    Attach a chat-style ``messages`` list to one CAI record.

    Reads ``emotion``, ``urgency``, ``user_input`` and ``revised_reply`` from
    ``example`` and stores a system / user / assistant conversation under
    ``example["messages"]``, with the revised reply as the assistant turn.

    Returns the same (mutated) ``example`` so the function can be passed to
    ``ds.map``.
    """
    # retrieve emotion and urgency from example
    emotion = example['emotion']
    urgency = example['urgency']

    # Adjacent string literals are concatenated with nothing in between, so
    # every sentence needs its own terminator; without the explicit "\n" the
    # instructions run together (e.g. "...booking servicesThe user's ...").
    # NOTE(review): the "(1=low, 2=medium, 3=high)" legend assumes a numeric
    # urgency, but the judged output in this notebook shows values like 'low'
    # -- confirm which encoding the data actually uses.
    system_msg = (
        "You are a GENERAL helpful assistant, NOT a ticket agent.\n"
        "NEVER mention company names, websites, or booking services.\n"
        f"The user's detected emotion is '{emotion}'.\n"
        f"The urgency level is {urgency} (1=low, 2=medium, 3=high).\n"
        "ANSWER THE QUESTION using ONLY the Context and prompt provided.\n"
        "NEVER make up information. NEVER mention companies or websites.\n"
        "Respond naturally and helpfully, but do NOT apply any extra constitutional self-critique or revision here. This is only the initial draft reply.\n"
        "Maximum 2-3 sentences."
    )

    example["messages"] = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": example["user_input"]},
        {"role": "assistant", "content": example["revised_reply"]},
    ]
    return example

# Add the chat-format `messages` column to every record.
ds = ds.map(add_messages)

# Column names after the map (the new `messages` column is included).
orig_cols = ds.column_names
print(orig_cols)

['user_input', 'emotion', 'urgency', 'draft_reply', 'revised_reply', 'critiques', 'messages']


In [4]:
import os
from getpass import getpass
from openai import OpenAI

# SECURITY: never store an API key in the notebook -- anything committed or
# shared with the file must be treated as leaked and revoked. Take the key from
# the environment instead, and prompt (without echo) only when it is not set.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

client = OpenAI()   # Client automatically reads the environment variable
print("Client initialized.")

Client initialized.


In [5]:
import json
from datasets import Dataset
from tqdm import tqdm

def make_dpo_example_with_judge(cai_pair: dict,
                                judge_model: str = "gpt-4o-mini",
                                judge_client=None) -> dict:
    """
    Label one CAI pair with a judge model and return a single DPO example.

    The judge is shown the system prompt (built from emotion + urgency), the
    user input, the draft reply as response "A" and the revised reply as
    response "B", and is asked to answer with a single character.

    Args:
        cai_pair: Record with keys ``user_input``, ``emotion``, ``urgency``,
            ``draft_reply`` and ``revised_reply``.
        judge_model: Model name passed to the chat-completions call.
        judge_client: Optional object exposing ``chat.completions.create``.
            When omitted, the notebook-level ``client`` is used.

    Returns:
        Dict with keys ``prompt``, ``chosen_response`` and
        ``rejected_response``. When the judge's answer is neither 'A' nor 'B'
        a warning is emitted and the revised reply is treated as chosen.
    """
    import warnings

    # 1. Extract fields from CAI pair
    user_input   = cai_pair["user_input"]
    emotion      = cai_pair["emotion"]
    urgency      = cai_pair["urgency"]
    draft_reply  = cai_pair["draft_reply"]
    revised_reply = cai_pair["revised_reply"]

    # 2. Build system prompt (same style as your SFT prompt)
    system_msg = (
        "You are a GENERAL helpful assistant, NOT a ticket agent.\n"
        "NEVER mention company names, websites, or booking services.\n"
        f"The user's detected emotion is '{emotion}'.\n"
        f"The urgency level is '{urgency}'.\n"
        "ANSWER THE QUESTION using ONLY the context and prompt provided.\n"
        "NEVER make up information. NEVER mention companies or websites.\n"
        "Respond naturally and helpfully.\n"
        "Maximum 2–3 sentences."
    )

    # 3. Build judge prompt
    judge_prompt = f"""
You are an expert evaluator for conversational AI systems.
Your job is to choose which assistant reply is better.

Evaluation criteria:
- Alignment with the system prompt
- Tone matches emotion: '{emotion}'
- Urgency handling matches level: '{urgency}'
- No hallucinated assumptions
- Concise, friendly, and helpful

SYSTEM PROMPT GIVEN TO THE MODEL:
\"\"\"{system_msg}\"\"\"

USER INPUT:
\"\"\"{user_input}\"\"\"

RESPONSE A (Draft):
\"\"\"{draft_reply}\"\"\"

RESPONSE B (Revised):
\"\"\"{revised_reply}\"\"\"

Which reply is better? Answer with ONLY a single character: 'A' or 'B'.
"""

    # 4. Ask judge model (explicit client if given, otherwise the global one)
    api = client if judge_client is None else judge_client
    resp = api.chat.completions.create(
        model=judge_model,
        messages=[{"role": "user", "content": judge_prompt}],
        max_tokens=1,
        temperature=0,
    )

    # The message content is optional in the API response, and the judge may
    # echo the quotes from the instruction ('A'), so normalise before comparing.
    raw_answer = resp.choices[0].message.content or ""
    decision = raw_answer.strip().strip("'\"`.").strip().upper()

    # Fallback safety: keep preferring the revised reply, but make it visible,
    # since silently defaulted labels count towards the "revised preferred" rate.
    if decision not in ("A", "B"):
        warnings.warn(
            f"Judge returned an unusable answer {raw_answer!r}; defaulting to 'B'."
        )
        decision = "B"

    if decision == "A":
        chosen, rejected = draft_reply, revised_reply
    else:
        chosen, rejected = revised_reply, draft_reply

    # 5. Build DPO training prompt (same format as before)
    prompt = (
        f"System: {system_msg}\n"
        f"User: {user_input}\n"
        "Assistant:"
    )

    return {
        "prompt": prompt,
        "chosen_response": chosen,
        "rejected_response": rejected,
    }


In [6]:
def load_cai_pairs_and_build_dpo_dataset_with_judge(
    cai_path: str,
    judge_model: str = "gpt-4o-mini",
) -> Dataset:
    """
    Read a CAI-pairs JSONL file and label every pair with the judge model.

    Args:
        cai_path: Path to a JSONL file with one CAI pair per line; blank lines
            are skipped.
        judge_model: Model name forwarded to ``make_dpo_example_with_judge``.

    Returns:
        A ``Dataset`` built from the per-pair DPO examples, in file order.
    """
    # Pin the encoding: without it `open` uses the platform default, which is
    # not UTF-8 everywhere and would break on non-ASCII text in the replies.
    with open(cai_path, "r", encoding="utf-8") as f:
        cai_pairs = [json.loads(line) for line in f if line.strip()]

    # One judge request per pair; tqdm reports progress over the whole file.
    dpo_examples = [
        make_dpo_example_with_judge(p, judge_model)
        for p in tqdm(cai_pairs, desc="Labeling with judge model")
    ]

    return Dataset.from_list(dpo_examples)


In [7]:
# Label every CAI pair with the judge model. This issues one API request per
# record (the recorded run took about 16 minutes for 1999 records).
dpo_ds = load_cai_pairs_and_build_dpo_dataset_with_judge("cai_pairs.jsonl")
print(dpo_ds[0])

Labeling with judge model: 100%|██████████| 1999/1999 [15:58<00:00,  2.08it/s] 

{'prompt': "System: You are a GENERAL helpful assistant, NOT a ticket agent.\nNEVER mention company names, websites, or booking services.\nThe user's detected emotion is 'happy'.\nThe urgency level is 'low'.\nANSWER THE QUESTION using ONLY the context and prompt provided.\nNEVER make up information. NEVER mention companies or websites.\nRespond naturally and helpfully.\nMaximum 2–3 sentences.\nUser: hi good morning\nAssistant:", 'chosen_response': 'Hey there! Good morning! :)', 'rejected_response': 'Hello! Welcome to AT&T Customer Service. How may I assist you today?'}





AttributeError: 'Dataset' object has no attribute 'head'

## Output

#### Valid Comparison

In [52]:
import random
import numpy as np

In [66]:
SAMPLE_SEED = 42   # fixed seed so the sampled subset, and the reported rate, are reproducible

# The comparison loop indexes ds and dpo_ds with the same positions, so only
# draw indices that exist in both datasets.
# NOTE(review): this assumes the two datasets are row-aligned -- confirm.
total = min(len(ds), len(dpo_ds))
sample_size = min(1000, total)   # sampling without replacement cannot exceed the population
indices = np.random.default_rng(SAMPLE_SEED).choice(total, size=sample_size, replace=False)

def normalize(txt: str) -> str:
    """Collapse each run of whitespace in ``txt`` to one space and trim both ends."""
    # A no-argument split drops leading, trailing and repeated whitespace.
    words = txt.split()
    collapsed = " ".join(words)
    return collapsed

# Count the sampled rows whose judge-chosen response equals the revised reply
# (both sides whitespace-normalized before comparison).
n_revised_preferred = sum(
    1
    for i in indices
    if normalize(ds[i]["revised_reply"]) == normalize(dpo_ds[i]["chosen_response"])
)

rate = n_revised_preferred / sample_size
print(f"Valid comparisons: {n_revised_preferred}/{sample_size}, Revised preferred rate = {rate:.2%}")


Valid comparisons: 411/1000, Revised preferred rate = 41.10%


#### Percentage of critiques that identified tone/urgency issues

In [71]:
# Inspect the critique records attached to the first example.
ds[0]['critiques']

[{'principle_name': 'Global_Safety',
  'condition': 'any',
  'critic_instruction': "Critique the assistant's reply only for safety issues. Does it include harmful, illegal, self-harm, dangerous, or expert-only advice? Point out any unsafe content and explain what is wrong with it.",
  'critique': "Thank you for reaching out to us. We're glad to be able to help you in this regards. Could you please let me know your city name and zip code?"},
 {'principle_name': 'Global_Helpfulness_Clarity',
  'condition': 'any',
  'critic_instruction': "Critique the assistant's reply for helpfulness and clarity. Does it answer the user's question directly? Is it specific, concise, and easy to understand? Identify vagueness, over-verbosity, or irrelevant content.",
  'critique': 'Surely, could you please help me with your query?'},
 {'principle_name': 'Emotion_Positive_Tone',
  'condition': 'emotion_positive',
  'critic_instruction': "The user's detected emotion is happy. Critique whether the reply appro

In [96]:
import numpy as np
from difflib import SequenceMatcher

def has_tone_or_urgency_change(example, threshold=0.4):
    """
    Return True if the emotion/urgency critique is substantially
    different from the draft reply (i.e., likely identified an issue).

    Args:
        example: Record with a ``draft_reply`` string and a ``critiques`` list
            of dicts carrying ``principle_name`` and ``critique``.
        threshold: Similarity ratio below which the critique text counts as
            different from the draft.

    Returns:
        False when the record has no usable emotion/urgency critique,
        otherwise whether the ``SequenceMatcher`` ratio between the draft and
        the joined critique text is below ``threshold``.
    """
    # NOTE(review): records loaded through `datasets` may carry None for
    # absent fields -- guard instead of assuming every value is a string.
    draft = example["draft_reply"] or ""

    # collect all emotion/urgency critiques
    emo_urg_crits = []
    for c in example["critiques"] or []:
        name = (c.get("principle_name") or "").lower()
        text = c.get("critique")
        if ("emotion" in name or "urgency" in name) and text is not None:
            emo_urg_crits.append(text)

    if not emo_urg_crits:
        return False  # no emo/urg critique at all

    # concatenate them (often there is just one of each)
    crit_text = " ".join(emo_urg_crits)

    # similarity between draft and critique text
    sim = SequenceMatcher(None, draft, crit_text).ratio()

    # if similarity is low, treat as "issue identified"
    return sim < threshold


In [97]:
# Evaluate the heuristic once per record; the result is one boolean per row of ds.
per_record_flags = list(map(has_tone_or_urgency_change, ds))
flags = np.array(per_record_flags)

# The mean of a boolean array is the fraction of True entries.
issue_rate = flags.mean()
print(f"Critique identified tone/urgency issues in {issue_rate:.2%} of cases.")

Critique identified tone/urgency issues in 94.25% of cases.
