In [9]:
#!/usr/bin/env python
# coding: utf-8

import json
import os
import re
import time
import uuid
from typing import List

import openai
import wandb
from dotenv import load_dotenv
from openai import OpenAI
from pydantic import BaseModel, Field, ValidationError
from tqdm import tqdm

In [10]:
# ✅ Load API key
# load_dotenv() pulls variables from a local .env file (if one exists) into the
# process environment, so the key never has to be hardcoded in the notebook.
load_dotenv()
# os.getenv returns None when OPENAI_API_KEY is not set.
api_key = os.getenv("OPENAI_API_KEY")
# Shared client used by every OpenAI call in this notebook.
client = OpenAI(api_key=api_key)

# ✅ Cost per 1K tokens (USD)
# One flat rate per model: call_chat multiplies (prompt + completion tokens) / 1000
# by this value, and raises KeyError for any model that is not listed here.
# NOTE(review): confirm these figures against the current OpenAI price list.
MODEL_PRICES = {
    "gpt-3.5-turbo": 0.0015,
    "gpt-4.1-mini": 0.00015,
    "gpt-4.1-nano": 0.00000525
}

In [11]:
from ChatbotPrompts import GUARD_RAIL_TESTS, CUSTOMER_QUERIES, GUARDRAIL_DETECTOR_PROMPT, EVAL_PROMPT_TEMPLATE, FINAL_PROMPT_TEMPLATE

In [12]:
# Define the Pydantic model
class GuardrailResult(BaseModel):
    """Schema that the guardrail detector's JSON reply is validated against.

    check_guardrails parses the model output with ``model_validate_json`` and
    returns the validated fields as a plain dict. All five fields are required;
    a reply missing any of them fails validation.
    """

    # The three boolean flags are set by the detector model; their exact
    # criteria live in GUARDRAIL_DETECTOR_PROMPT (not shown in this notebook).
    toxicity: bool
    profanity: bool
    prompt_injection: bool
    # presumably one short description per detected problem — TODO confirm against the prompt
    issues: List[str]
    # Free-text summary returned by the detector.
    summary: str

In [13]:
# ✅ Wrapper for OpenAI call
def call_chat(model, messages, temperature=0.7, max_tokens=500):
    """Send one chat-completion request and return the reply with usage metrics.

    Args:
        model: Model name; must be a key of MODEL_PRICES.
        messages: Chat messages in the OpenAI ``[{"role": ..., "content": ...}]`` form.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Completion token cap forwarded to the API.

    Returns:
        A ``(content, usage, cost, latency)`` tuple:
        ``content`` is the stripped reply text ("" when the API returns no text),
        ``usage`` is the response's usage object,
        ``cost`` is the estimated USD cost, and
        ``latency`` is the wall-clock duration of the request in seconds.

    Raises:
        KeyError: If ``model`` has no entry in MODEL_PRICES.
    """
    # Resolve the price first so an unpriced model fails before a billable
    # request is made, instead of after the response has already been paid for.
    price_per_1k = MODEL_PRICES[model]

    # perf_counter is monotonic, so the measurement is immune to clock adjustments.
    start = time.perf_counter()
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens
    )
    latency = time.perf_counter() - start

    usage = response.usage
    # message.content is optional in the SDK (e.g. refusals); avoid None.strip().
    content = response.choices[0].message.content or ""
    # A single blended rate is applied to prompt and completion tokens alike.
    cost = (usage.prompt_tokens + usage.completion_tokens) / 1000 * price_per_1k
    return content.strip(), usage, cost, latency

In [14]:
# Guardrail checker function
def check_guardrails(text: str, model="gpt-4.1-mini") -> dict:
    """Ask the detector model to classify ``text`` and return the parsed verdict.

    The text is inserted into GUARDRAIL_DETECTOR_PROMPT, sent at temperature 0,
    and the reply is validated against GuardrailResult.

    Args:
        text: The string to inspect (a customer query or a chatbot response).
        model: Model used as the detector.

    Returns:
        A dict with keys ``toxicity``, ``profanity``, ``prompt_injection``,
        ``issues`` and ``summary``. When the reply is not valid JSON or does not
        match the schema, a fallback dict with every flag False and summary
        "Invalid JSON or schema mismatch" is returned.
    """
    prompt = GUARDRAIL_DETECTOR_PROMPT.format(text=text)
    messages = [{"role": "user", "content": prompt}]

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0,
        max_tokens=500
    )

    # message.content is optional in the SDK; treat "no text" as an empty reply
    # so it falls through to the validation-error path instead of crashing.
    raw_content = (response.choices[0].message.content or "").strip()
    print("Raw LLM response:\n", raw_content)

    # Clean markdown formatting: accepts ```json, ```JSON or a bare ``` fence,
    # with any whitespace (including CRLF) between the fence and the payload.
    cleaned_content = re.sub(
        r"^```(?:json)?\s*|\s*```$", "", raw_content, flags=re.IGNORECASE
    )

    try:
        result = GuardrailResult.model_validate_json(cleaned_content)
    except ValidationError as e:
        print("Pydantic validation error:", e)
        # NOTE(review): this fallback fails open — an unparseable reply is
        # reported as "no issues". Confirm that is acceptable for a guardrail.
        return {
            "toxicity": False,
            "profanity": False,
            "prompt_injection": False,
            "issues": [],
            "summary": "Invalid JSON or schema mismatch"
        }

    # model_dump() yields the same plain dict as a dump-to-JSON/parse round trip
    # for this schema (bools, a list of strings, a string).
    return result.model_dump()

In [15]:
# ✅ Customer queries
# Alias for the query list imported from ChatbotPrompts; the evaluation loop
# iterates over this name.
customer_queries = CUSTOMER_QUERIES

In [16]:
# ✅ Init W&B
# Starts the tracking run; everything logged by later cells is attached to it.
run = wandb.init(
    project="customer-support-bot-GuardRails",
    name="customer-support-final-prompt-with guardrails",
    config={
        "prompt_version": "finalized-v1",
        # Model that generates the chatbot reply.
        "model": "gpt-4.1-mini",
        # Model that scores the reply.
        "eval_model": "gpt-4.1-mini",
        # Same values as call_chat's own defaults (temperature=0.7, max_tokens=500).
        "temperature": 0.7,
        "max_tokens": 500
    },
    save_code=True
)
# Attribute-style access to the values above (e.g. config.model, config.eval_model).
config = wandb.config

wandb: Currently logged in as: aayush-drishte (aayush-drishte-tredence) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin




In [None]:
# ✅ W&B Table setup
# Column order is the row order: add_data() must receive exactly one value per
# column, so each row is built by column name and emitted in this order.
TABLE_COLUMNS = [
    "id", "persona", "category", "query", "chatbot_response",
    "ground_helpfulness", "ground_correctness", "ground_tone", "ground_safety",
    "eval_helpfulness", "eval_correctness", "eval_tone", "eval_clarity",
    "eval_safety", "eval_toxicity", "eval_profanity",
    "comment_helpfulness", "comment_correctness", "comment_tone",
    "comment_clarity", "comment_safety", "comment_toxicity", "comment_profanity",
    "overall_comment", "tokens_prompt", "tokens_completion", "tokens_total", "cost_usd",
    "query_toxicity", "query_profanity", "query_prompt_injection",
    "response_toxicity", "response_profanity", "response_prompt_injection",
    "guardrail_summary"
]
table = wandb.Table(columns=TABLE_COLUMNS)

# Metrics that have both an eval_<metric> and a comment_<metric> column.
EVAL_METRICS = (
    "helpfulness", "correctness", "tone", "clarity",
    "safety", "toxicity", "profanity",
)


def _parse_eval_json(raw):
    """Parse the evaluator reply into a dict; return {} if it is not a JSON object."""
    text = (raw or "").strip()
    # The evaluator may wrap its JSON in a markdown fence; strip it before parsing.
    text = re.sub(r"^```(?:json)?\s*|\s*```$", "", text, flags=re.IGNORECASE)
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError as exc:
        # Best effort: one malformed reply must not abort the run, but say so.
        print("Could not parse evaluator response as JSON:", exc)
        return {}
    return parsed if isinstance(parsed, dict) else {}


def _metric_field(parsed, metric, field, default):
    """Return parsed[metric][field], or ``default`` when either level is missing or malformed."""
    entry = parsed.get(metric)
    if isinstance(entry, dict):
        return entry.get(field, default)
    return default


# ✅ Loop through queries
for query in tqdm(customer_queries):
    query_id = str(uuid.uuid4())

    # Generate bot response (sampling settings come from the run config so the
    # logged config matches what was actually sent)
    prompt = FINAL_PROMPT_TEMPLATE.format(customer_query=query)
    bot_response, usage_gen, cost_gen, latency = call_chat(
        config.model,
        [{"role": "user", "content": prompt}],
        temperature=config.temperature,
        max_tokens=config.max_tokens,
    )

    # Evaluate (deterministic scoring: temperature 0)
    eval_prompt = EVAL_PROMPT_TEMPLATE.format(query=query, chatbot_response=bot_response)
    eval_response, _usage_eval, cost_eval, _ = call_chat(
        config.eval_model,
        [{"role": "user", "content": eval_prompt}],
        temperature=0.0,
        max_tokens=config.max_tokens,
    )
    parsed = _parse_eval_json(eval_response)

    # Guardrail checks on both sides of the exchange; these feed the
    # query_* / response_* / guardrail_summary columns.
    query_guard = check_guardrails(query)
    response_guard = check_guardrails(bot_response)

    # Start with None in every column, then fill in what this loop knows.
    # NOTE(review): persona, category and the ground_* columns stay None —
    # nothing visible in this notebook provides them; confirm their source.
    row = dict.fromkeys(TABLE_COLUMNS)
    row.update({
        "id": query_id,
        "query": query,
        "chatbot_response": bot_response,
        "overall_comment": parsed.get("overall_comment", ""),
        # Token counts cover the generation call only.
        "tokens_prompt": usage_gen.prompt_tokens,
        "tokens_completion": usage_gen.completion_tokens,
        "tokens_total": usage_gen.total_tokens,
        # Generation + evaluation; guardrail calls are not costed because
        # check_guardrails does not report usage.
        "cost_usd": round(cost_gen + cost_eval, 6),
        "query_toxicity": query_guard["toxicity"],
        "query_profanity": query_guard["profanity"],
        "query_prompt_injection": query_guard["prompt_injection"],
        "response_toxicity": response_guard["toxicity"],
        "response_profanity": response_guard["profanity"],
        "response_prompt_injection": response_guard["prompt_injection"],
        "guardrail_summary": (
            f"query: {query_guard['summary']} | response: {response_guard['summary']}"
        ),
    })
    for metric in EVAL_METRICS:
        row[f"eval_{metric}"] = _metric_field(parsed, metric, "score", 0)
        row[f"comment_{metric}"] = _metric_field(parsed, metric, "comment", "")

    # Catches a typo'd key above, which would otherwise add a stray entry.
    assert len(row) == len(TABLE_COLUMNS), "row has keys that are not table columns"

    # Add to W&B table
    table.add_data(*(row[column] for column in TABLE_COLUMNS))

    wandb.log({
        "query": query,
        "helpfulness": row["eval_helpfulness"],
        "correctness": row["eval_correctness"],
        "safety": row["eval_safety"],
        "clarity": row["eval_clarity"],
        "toxicity": row["eval_toxicity"],
        "cost_usd": cost_gen + cost_eval,
        "latency": latency
    })

# ✅ Finish logging
wandb.log({"eval_table": table})
wandb.finish()