In [1]:
import os
import sys
import json
import pandas as pd
from dotenv import load_dotenv
from tqdm.notebook import tqdm
import seaborn as sns

sys.path.append(os.path.abspath(os.path.join('..', 'src')))


load_dotenv(dotenv_path=os.path.join('..', '.env'))
from prompt_utils import create_evaluation_prompt
from llm_utils import evaluate_conversation
print("Setup complete.")

Setup complete.


In [2]:
def load_logs(base_dir='../logs'):
    all_conversations = []
    for root, dirs, files in os.walk(base_dir):
        for file in files:
            if file.endswith('.json'):
                log_path = os.path.join(root, file)
                # Extract the tutor type from the directory path
                tutor_type = os.path.basename(root)
                with open(log_path, 'r') as f:
                    history = json.load(f)
                    all_conversations.append({
                        'file_path': log_path,
                        'tutor_type': tutor_type,
                        'history': history
                    })
    return all_conversations

conversation_logs = load_logs()
print(f"Found {len(conversation_logs)} conversation logs to evaluate.")

Found 4 conversation logs to evaluate.


In [3]:
# --- Load the ground truth analysis data ---
analysis_path = '../results/student_drafts_with_analysis.csv'
try:
    df_analysis = pd.read_csv(analysis_path)
    # Set draft_id as the index for easy lookup
    df_analysis.set_index('draft_id', inplace=True)
    print("Loaded ground truth analysis file.")
except FileNotFoundError:
    print(f"ERROR: Ground truth file not found at {analysis_path}. Please run Notebook 01.")
    df_analysis = None

# --- Run the Evaluation ---
evaluation_results = []

if df_analysis is not None:
    for log in tqdm(conversation_logs, desc="Evaluating Conversations"):
        try:
            # Extract draft_id from the filename (e.g., conversation_draft_101_....json)
            draft_id = int(log['file_path'].split('/')[-1].split('_')[2])
            
            # Fetch the ground truth for this specific conversation
            ground_truth = df_analysis.loc[draft_id][['ai_error_type', 'ai_detailed_explanation']].to_dict()
            
            # Call the updated evaluation function with the ground truth
            evaluation = evaluate_conversation(log['history'], ground_truth)
            
            if evaluation:
                result = evaluation.model_dump()
                result['tutor_type'] = log['tutor_type']
                result['file_path'] = log['file_path']
                evaluation_results.append(result)
            else:
                print(f"Failed to evaluate {log['file_path']}")
        except (KeyError, IndexError, ValueError):
            print(f"Could not parse draft_id or find ground truth for {log['file_path']}")

# Convert the list of dictionaries to a DataFrame
eval_df = pd.DataFrame(evaluation_results)

print("\nEvaluation complete!")
eval_df.head()

Loaded ground truth analysis file.


Evaluating Conversations:   0%|          | 0/4 [00:00<?, ?it/s]

E0000 00:00:1759435855.615659 70023622 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.



Evaluation complete!


Unnamed: 0,goal_achievement_score,socratic_guidance_score,empathy_and_tone_score,conciseness_and_clarity_score,student_engagement_score,conversational_efficiency_score,justification,tutor_type,file_path
0,3,4,5,5,4,3,The tutor demonstrated excellent empathy and t...,non_context_aware,../logs/non_context_aware/conversation_draft_1...
1,5,5,5,4,5,5,The tutor excelled at guiding the student to i...,non_context_aware,../logs/non_context_aware/conversation_draft_1...
2,3,5,5,5,4,5,The tutor demonstrated exceptional Socratic gu...,context_aware,../logs/context_aware/conversation_draft_101_2...
3,5,5,5,4,4,5,The tutor excelled at guiding the student to i...,context_aware,../logs/context_aware/conversation_draft_101_2...


In [4]:
if not eval_df.empty:
    # Select only the score columns for the heatmap
    score_columns = [col for col in eval_df.columns if col.endswith('_score')]
    
    # Use pivot_table to average scores if there are multiple logs per tutor type
    comparison_df = eval_df.pivot_table(index='tutor_type', values=score_columns, aggfunc='mean')
    
    # Clean up column names for better display
    comparison_df.columns = [col.replace('_score', '').replace('_', ' ').title() for col in comparison_df.columns]
    
    # Define a modern, high-contrast colormap
    modern_blue_cmap = sns.light_palette("#0d47a1", as_cmap=True)

    print("--- Average Scores Comparison ---")
    styled_df = comparison_df.style.background_gradient(cmap=modern_blue_cmap, axis=None).format("{:.2f}")
    display(styled_df)
    
    # Display the qualitative justifications as well
    print("\n--- Justifications for Each Conversation ---")
    justifications_df = eval_df[['tutor_type', 'justification']].reset_index(drop=True)
    display(justifications_df)
else:
    print("No evaluation results to display.")

--- Average Scores Comparison ---


Unnamed: 0_level_0,Conciseness And Clarity,Conversational Efficiency,Empathy And Tone,Goal Achievement,Socratic Guidance,Student Engagement
tutor_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
context_aware,4.5,5.0,5.0,4.0,5.0,4.0
non_context_aware,4.5,4.0,5.0,4.0,4.5,4.5



--- Justifications for Each Conversation ---


Unnamed: 0,tutor_type,justification
0,non_context_aware,The tutor demonstrated excellent empathy and t...
1,non_context_aware,The tutor excelled at guiding the student to i...
2,context_aware,The tutor demonstrated exceptional Socratic gu...
3,context_aware,The tutor excelled at guiding the student to i...


In [6]:
justifications_df[justifications_df['tutor_type'] == 'context_aware'].to_dict()

{'tutor_type': {2: 'context_aware', 3: 'context_aware'},
 'justification': {2: "The tutor demonstrated exceptional Socratic guidance, empathy, conciseness, and efficiency. They meticulously guided the student digit-by-digit through the multiplication where the error occurred, without giving any direct answers. The tone was consistently supportive and encouraging, and the questions were laser-focused on leading the student to their specific mistake in the hundreds column of the partial product. While the conversation log ends just before the student reaches the 'aha!' moment and fully corrects the error, the tutor's last prompt perfectly sets up this discovery in the very next turn, showing a deep understanding of the ground truth error and an efficient path to its correction. The student remained engaged throughout the process.",
  3: "The tutor excelled at guiding the student to identify and correct the specific calculation error detailed in the ground truth context (forgetting to add

In [7]:
justifications_df[justifications_df['tutor_type'] == 'non_context_aware'].to_dict()

{'tutor_type': {0: 'non_context_aware', 1: 'non_context_aware'},
 'justification': {0: "The tutor demonstrated excellent empathy and tone throughout the conversation, maintaining a supportive and patient demeanor. The responses were consistently clear and concise. The initial guidance was appropriately Socratic, asking the student to explain their steps. The tutor successfully elicited the crucial incorrect multiplication product (3380) from the student, which directly relates to the ground truth error. However, after the student provided '3380?', the tutor took an additional turn asking about the 'reduction' (subtraction) before explicitly guiding the student to double-check the multiplication. While the final turn correctly targets the ground truth error, this slight detour reduced the conversational efficiency and delayed the direct focus on the identified mistake. The student was engaged throughout, but the 'aha!' moment and the full correction of the error had not yet occurred wit