In [None]:
import dspy
import os
import sys
from pathlib import Path
from dotenv import load_dotenv
import pandas as pd

In [None]:
# Load variables from a .env file into the process environment
# (the API key is read from the environment in a later cell).
load_dotenv()

# Put <cwd>/src at the front of the import path so project packages resolve
# from this notebook; the guard keeps re-runs from adding it twice.
project_root = Path.cwd()
src_path = project_root.joinpath("src")
src_entry = str(src_path)
if src_entry not in sys.path:
    sys.path.insert(0, src_entry)

print(
    f"Added {src_path} to Python path",
    f"Current working directory: {project_root}",
    sep="\n",
)

In [None]:
# Default language model for every DSPy call in this notebook.
# `api_key` is kept as a module-level name because a later cell reuses it.
api_key = os.environ.get('GEMINI_API_KEY')
lm = dspy.LM(
    model="gemini/gemini-2.5-flash-lite",
    temperature=1,
    api_key=api_key,
)
dspy.configure(lm=lm)

In [None]:
# Read the labelled guardrail examples from the working directory.
df = pd.read_csv("pre_guardrails_dataset.csv")

# `is_valid` is expected to be parsed as a boolean column; printing the
# inferred dtype confirms that before the labels are used downstream.
is_valid_dtype = df['is_valid'].dtype
n_loaded = len(df)
print(f"is_valid dtype: {is_valid_dtype}")
print(f"Loaded {n_loaded} examples")
df.head()

In [None]:
def _row_to_example(row):
    """Convert one dataset row into a labelled ``dspy.Example``.

    Args:
        row: A row of ``df`` providing the ``user_input``,
            ``conversation_history``, ``reason`` and ``is_valid`` columns.

    Returns:
        A ``dspy.Example`` whose inputs are ``user_message``,
        ``previous_conversation`` and ``page_context``; the remaining
        fields are the expected outputs.
    """
    # Missing or empty history is represented as None.
    history = row['conversation_history']
    if pd.isna(history) or history == '':
        history = None

    # Missing reason becomes an empty string.
    rejection_reason = row['reason']
    if pd.isna(rejection_reason):
        rejection_reason = ""

    # Valid rows carry no violation details; invalid rows reuse the dataset's
    # reason as the user-facing message and a placeholder violation type.
    if row['is_valid']:
        violation, message = "", ""
    else:
        violation, message = "unknown", str(rejection_reason)

    labelled = dspy.Example(
        user_message=row['user_input'],
        previous_conversation=history,
        page_context="",  # Not provided in dataset
        is_valid=row['is_valid'],
        violation_type=violation,
        user_friendly_message=message,
    )
    return labelled.with_inputs('user_message', 'previous_conversation', 'page_context')


training_examples = [_row_to_example(row) for _, row in df.iterrows()]

print(f"Created {len(training_examples)} DSPy Examples")
print(f"\nSample Example:")
first_example = training_examples[0]
print(f"  Input: {first_example.user_message}")
print(f"  Expected is_valid: {first_example.is_valid}")
print(f"  Expected message: {first_example.user_friendly_message}")

In [None]:
def _as_bool_label(value):
    """Map a string label to a bool; return any non-string value unchanged.

    Strings are compared case-insensitively after stripping surrounding
    whitespace, so ``" True\\n"`` counts as True. Any string other than
    'true', 'yes' or '1' maps to False.
    """
    if isinstance(value, str):
        return value.strip().lower() in ('true', 'yes', '1')
    return value


def guardrail_accuracy_metric(example, prediction, trace=None):
    """
    DSPy metric for evaluating input guardrail predictions.

    This metric evaluates the core LLM validation (Layer 2) only,
    not the pattern-based quick checks (Layer 1).

    Args:
        example: Object exposing the ground truth as an ``is_valid`` attribute
            (e.g. a dspy.Example).
        prediction: Model prediction exposing ``is_valid`` either as an
            attribute or as a dict key. Anything else (e.g. a plain error
            string) is treated as a prediction of False.
        trace: Optional trace for debugging (unused).

    Returns:
        float: 1.0 if prediction matches ground truth, 0.0 otherwise
    """
    # Handle different prediction formats
    if hasattr(prediction, 'is_valid'):
        predicted = prediction.is_valid
    elif isinstance(prediction, dict) and 'is_valid' in prediction:
        predicted = prediction['is_valid']
    else:
        # No is_valid available: likely a string error response -> invalid.
        predicted = False

    # Normalize BOTH sides the same way so string labels compare consistently.
    # Non-string values (bool, None, ...) are compared exactly as they are.
    ground_truth = _as_bool_label(example.is_valid)
    predicted = _as_bool_label(predicted)

    return float(ground_truth == predicted)

In [None]:
from app.llm.guardrails import PreGuardrails, GuardrailViolation

# Instantiate the production guardrail module with its default configuration.
# NOTE(review): the original note says it auto-loads the optimized model;
# confirm in app.llm.guardrails, since that affects what the baseline measures.
guardrail_module = PreGuardrails()

print(
    "GuardrailModule created successfully",
    "Module uses ChainOfThought with InputGuardrailSignature",
    "This module ONLY does Layer 2 LLM validation (not Layer 1 pattern checks)",
    sep="\n",
)

In [None]:
from sklearn.model_selection import train_test_split

# Stratify on the label so both splits keep the dataset's valid/invalid ratio;
# the fixed random_state makes the split reproducible across runs.
split_labels = [ex.is_valid for ex in training_examples]
train_set, val_set = train_test_split(
    training_examples,
    test_size=0.2,
    random_state=42,
    stratify=split_labels,
)


def _label_counts(examples):
    """Return ``(valid, invalid)`` counts for a list of examples."""
    valid = sum(1 for ex in examples if ex.is_valid)
    return valid, len(examples) - valid


train_valid, train_invalid = _label_counts(train_set)
val_valid, val_invalid = _label_counts(val_set)

print(f"Training set: {len(train_set)} examples")
print(f"Validation set: {len(val_set)} examples")
print(f"\nTraining set distribution:")
print(f"  Valid inputs: {train_valid}")
print(f"  Invalid inputs: {train_invalid}")
print(f"\nValidation set distribution:")
print(f"  Valid inputs: {val_valid}")
print(f"  Invalid inputs: {val_invalid}")

In [None]:
from dspy.evaluate import Evaluate

# One evaluation harness reused for the before/after comparison: it scores a
# program on val_set with the accuracy metric and shows a 5-row result table.
evaluator_settings = dict(
    devset=val_set,
    metric=guardrail_accuracy_metric,
    num_threads=2,
    display_progress=True,
    display_table=5,
)
evaluator = Evaluate(**evaluator_settings)

In [None]:
# Baseline: score the guardrail module as loaded, before the GEPA run below.
# Kept as the cell's last expression so the notebook displays the result.
evaluator(guardrail_module)

In [None]:
def feedback_is_valid(gold, pred, trace=None, pred_name=None, pred_trace=None):
    """
    GEPA feedback metric for the ``is_valid`` classification.

    Args:
        gold: Ground-truth example; must support ``gold['is_valid']``.
        pred: Model prediction; must support ``pred['is_valid']``.
        trace: Unused; accepted because GEPA passes it to the metric.
        pred_name: Unused; accepted because GEPA passes it to the metric.
        pred_trace: Unused; accepted because GEPA passes it to the metric.

    Returns:
        dspy.Prediction with ``score`` (1.0 when the labels match, else 0.0)
        and ``feedback`` (text describing whether the label was right).
    """
    expected = gold['is_valid']
    predicted = pred['is_valid']

    # Compare only the labels. Comparing the whole example with the whole
    # prediction never matches (they carry different fields), which made
    # correct answers receive the "incorrectly classified" feedback.
    is_correct = expected == predicted
    score = 1.0 if is_correct else 0.0

    if is_correct:
        feedback = (
            f"You correctly classified the is_valid of the message as `{expected}`. "
            f"This message is indeed of `{expected}` is_valid."
        )
    else:
        feedback = (
            f"You incorrectly classified the is_valid of the user message as `{predicted}`. "
            f"The correct is_valid is `{expected}`. "
            "Think about how you could have reasoned to get the correct is_valid label."
        )

    return dspy.Prediction(score=score, feedback=feedback)

In [None]:
from dspy import GEPA

# Separate, larger model passed to GEPA as its reflection LM; it reuses the
# API key read earlier in the notebook.
reflection_lm = dspy.LM(
    model="gemini/gemini-2.5-pro",
    temperature=1.0,
    max_tokens=32000,
    api_key=api_key,
)

optimizer = GEPA(
    metric=feedback_is_valid,
    # "light" keeps the optimization budget small; "heavy" is the
    # recommended setting when optimizing for best performance.
    auto="light",
    num_threads=3,
    track_stats=True,
    use_merge=False,
    reflection_lm=reflection_lm,
)

In [None]:
# Run the GEPA optimization on the guardrail module. This is the expensive
# step of the notebook: it issues LM calls for both the task and reflection.
optimized_program = optimizer.compile(guardrail_module, trainset=train_set, valset=val_set)

In [None]:
# Show the instructions attached to every predictor of the optimized program.
HEADER_RULE = "================================"
FOOTER_RULE = "*********************************"

for predictor_name, predictor in optimized_program.named_predictors():
    print(
        HEADER_RULE,
        f"Predictor: {predictor_name}",
        HEADER_RULE,
        "Prompt:",
        predictor.signature.instructions,
        FOOTER_RULE,
        sep="\n",
    )

In [None]:
# Re-score the optimized program with the same evaluator for comparison
# against the baseline run.
# NOTE(review): val_set is both the evaluator's devset and the valset given to
# optimizer.compile, so this score may be optimistic; consider a held-out split.
evaluator(optimized_program)