# Context/Conversation Sweeps

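Run the evaluator across multiple `--context-filter` and `--conversation-window` settings. Before running, make your OpenAI key available to the configuration cell below, preferably by exporting `OPENAI_API_KEY` in the environment that launches the notebook, so that no real key is ever saved in the notebook itself.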

In [1]:
import getpass
import os
from pathlib import Path

# Resolve the OpenAI key without writing it into the notebook source.
# Order of precedence: the OPENAI_API_KEY environment variable, then an
# interactive prompt (getpass does not echo what is typed).
# "yourkeyhere" is the old placeholder value; treat it as "not set" so a stale
# placeholder left in the environment by an earlier run does not reach the API.
api_key = os.getenv("OPENAI_API_KEY", "").strip()
if not api_key or api_key == "yourkeyhere":
    api_key = getpass.getpass("OpenAI API key: ").strip()
if not api_key:
    # Fail here, with a clear message, instead of deep inside the sweep.
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it before launching the notebook "
        "or enter it at the prompt."
    )
os.environ["OPENAI_API_KEY"] = api_key

# Paths and defaults
DATA_PATH = Path("baton-export-2025-11-24-nofullstop.json")
MODEL_NAME = "gpt-4o-mini"  # any OpenAI model supported by your key
SAMPLE_SIZE = 20  # keep small to manage cost/time
CORPUS_RATIO = 0.67

# Stop before any model call if the export is missing from the working directory.
assert DATA_PATH.exists(), f"Data file not found: {DATA_PATH}"

In [None]:
import pandas as pd

from run_evaluation import create_default_methods
from utils.evaluation_utils import ChatHistoryEvaluator

# Sweep axes: every combination of the four lists below is evaluated once.
context_filters = ["none", "time", "geo", "time_geo"]
conversation_windows = [0, 1, 3]
time_windows = [2.0]  # hours
geo_radii = [5.0]    # km

partial_methods, generation_methods, evaluation_metrics = create_default_methods()

# Flatten the four axes into one ordered list of settings. The ordering matches
# nested loops with the context filter outermost and the geo radius innermost.
sweep_grid = [
    (context_filter, conv_window, time_window, geo_radius)
    for context_filter in context_filters
    for conv_window in conversation_windows
    for time_window in time_windows
    for geo_radius in geo_radii
]

all_results = []
for context_filter, conv_window, time_window, geo_radius in sweep_grid:
    print(
        f"Running: context_filter={context_filter}, conversation_window={conv_window}, "
        f"time_window_hours={time_window}, geo_radius_km={geo_radius}"
    )

    # A new evaluator is constructed for every setting in the grid.
    evaluator = ChatHistoryEvaluator(
        chat_data_path=str(DATA_PATH),
        corpus_ratio=CORPUS_RATIO,
        model_name=MODEL_NAME,
    )
    df = evaluator.run_evaluation(
        partial_methods=partial_methods,
        generation_methods=generation_methods,
        evaluation_metrics=evaluation_metrics,
        sample_size=SAMPLE_SIZE,
        context_filter=context_filter,
        time_window_hours=time_window,
        geo_radius_km=geo_radius,
        conversation_window=conv_window,
    )

    if df.empty:
        print("  (no results)")
        continue

    # Tag every row with the setting that produced it so runs can be grouped later.
    df = df.assign(
        context_filter=context_filter,
        conversation_window=conv_window,
        time_window_hours=time_window,
        geo_radius_km=geo_radius,
    )
    all_results.append(df)

# Stack the per-setting frames; fall back to an empty frame when nothing ran.
if not all_results:
    results_df = pd.DataFrame()
else:
    results_df = pd.concat(all_results, ignore_index=True)
    print(f"Total rows: {len(results_df)}")
results_df.head()

Running: context_filter=none, conversation_window=0, time_window_hours=2.0, geo_radius_km=5.0
✅ Models initialized: gpt-4o-mini
✅ Loaded 331 utterances from chat history.
✅ Processed data: 331 utterances with timestamps.
Date range: 2025-11-12 10:41:37.369000+00:00 to 2025-11-19 12:18:52.747000+00:00
Average utterance length: 8.5 words
Corpus (for conditioning): 221 utterances
Test (for evaluation): 110 utterances
Processing example 1/20: 'She could not have swept today...'
Processing example 2/20: 'They are excited as well about...'
Processing example 3/20: 'Are you excited ...'
Processing example 4/20: 'I need that letter, all sides,...'
Processing example 5/20: 'Max's mum has died ...'
Processing example 6/20: 'Any chance of some milk...'
Processing example 7/20: 'How is it looking ...'
Processing example 8/20: 'Are we going to bed ...'
Processing example 9/20: 'I will just bring back a load ...'
Processing example 10/20: 'You must be a bit excited now ...'
Processing example 11/20:

In [None]:
# Columns identifying one sweep setting, and the metric columns averaged per setting.
group_keys = ["context_filter", "conversation_window", "time_window_hours", "geo_radius_km"]
metric_columns = [
    "embedding_similarity",
    "llm_judge_score",
    "character_accuracy",
    "word_accuracy",
]

if results_df.empty:
    print("No results generated.")
else:
    # Mean of each metric per setting, with the grouping keys as ordinary columns.
    per_setting = results_df.groupby(group_keys)
    summary = per_setting[metric_columns].mean().reset_index()
    display(summary)