In [1]:
import pandas as pd
import math
from openai import OpenAI
from pydantic import BaseModel, Field
from typing import List, Optional, Dict, Any, Tuple, Literal
from note_generation import run

In [2]:
# Shared OpenAI client used by the evaluation helpers defined in later cells.
# It is constructed with no explicit arguments; credentials are presumably
# picked up from the environment (e.g. OPENAI_API_KEY) — confirm for your setup.
client = OpenAI()

In [3]:
def calc_alignment(transcript: str, note: str):
    """Score how faithfully a note reflects the transcript it summarises.

    The note and transcript are sent to the chat model, which returns a
    structured list of hallucinations (parts of the note not supported by the
    transcript), each tagged with a severity. Every severity carries a penalty
    (none=0, minor=1, moderate=2, major=4); the summed penalty is mapped to a
    score with ``exp(-0.1 * penalty)``, so a note without penalised
    hallucinations scores 1.0 and the score decays towards 0 as penalties grow.

    Args:
        transcript: Source text the note is supposed to represent.
        note: Generated note to check against the transcript.

    Returns:
        A 3-tuple ``(score, error_distribution, alignment_report)``:
        ``score`` is a float that is at most 1.0, ``error_distribution`` maps
        each severity label ("none", "minor", "moderate", "major") to the
        number of reported items with that label, and ``alignment_report`` is
        the model's report dumped to a plain dict with a ``"hallucinations"``
        list.

    Raises:
        ValueError: If the response carries no parsed report, e.g. because the
            model refused the request.
    """
    class HallucinationItem(BaseModel):
        error: str = Field(..., description = "The exact text span from the model answer that is hallucinated")
        explanation: str= Field(..., description = "Why this was flagged as a hallucination")
        revision: str = Field(..., description = "Revision that should be made to correct the hallucination")
        severity: Literal["major", "moderate", "minor", "none"] = Field(..., description="Severity level of the hallucination")

    class HallucinationReport(BaseModel):
        hallucinations: list[HallucinationItem] = Field(..., description = "List of hallucination items found")
        
    res = client.beta.chat.completions.parse(
        model = "gpt-5",
        messages = [
            {
                "role": "system", 
                "content": 
                    """
                    You are a notetaker for a college class. 
                    Evaluate how well a note aligns with the transcript it represents by detecting any hallucinations.
                    Hallucinations are any parts of the note that are not supported by information in the transcript.
                    Implications are acceptable, as long as they can reasonably be concluded form information in the transcript.
                    """
            },
            {
                "role": "user", 
                "content": f"note: {note} \n transcript: {transcript}"
            }
        ],
        response_format = HallucinationReport
    )

    # `parsed` is optional: on a refusal it is None, which would otherwise
    # surface as an opaque AttributeError on `.model_dump()`.
    message = res.choices[0].message
    if message.parsed is None:
        refusal = getattr(message, "refusal", None)
        raise ValueError(
            f"Model returned no structured hallucination report (refusal: {refusal!r})"
        )
    alignment_report = message.parsed.model_dump()

    # penalize hallucinations according to severity
    penalties = {
        "none": 0,
        "minor": 1,
        "moderate": 2,
        "major": 4
    }
    # keep track of errors by severity; derived from `penalties` so the two
    # tables can never disagree on the set of labels
    error_distribution = {severity: 0 for severity in penalties}

    penalty = 0
    for item in alignment_report["hallucinations"]:
        severity = item["severity"]
        penalty += penalties[severity]
        error_distribution[severity] += 1

    # exponential decay: zero penalty -> 1.0, each penalty point scales by e^-0.1
    return math.exp(-0.1 * penalty), error_distribution, alignment_report

In [4]:
def calc_coverage(transcript: str, note: str):
    """Measure how many of a transcript's key points a note captures.

    Two model requests are made: the first extracts the key points of the
    transcript, the second is given those key points plus the note and returns
    the key points the note fails to capture. Coverage is
    ``1 - uncaptured / total``.

    Args:
        transcript: Source text the note is supposed to represent.
        note: Generated note whose coverage is being measured.

    Returns:
        A 3-tuple ``(coverage, key_points, uncaptured_points)``. ``coverage``
        is a float in [0.0, 1.0]; the other two are lists of strings as
        returned by the model. When the model finds no key points at all there
        is nothing to miss, so coverage is 1.0, ``uncaptured_points`` is empty,
        and the second request is skipped.

    Raises:
        ValueError: If either response carries no parsed result, e.g. because
            the model refused the request.
    """
    class KeyPoints(BaseModel):
        key_points: list[str] = Field(..., description = "Key points identified")

    def extract_points(res, stage: str):
        # `parsed` is optional: on a refusal it is None, which would otherwise
        # surface as an opaque AttributeError on `.model_dump()`.
        message = res.choices[0].message
        if message.parsed is None:
            refusal = getattr(message, "refusal", None)
            raise ValueError(
                f"Model returned no structured result for {stage} (refusal: {refusal!r})"
            )
        return message.parsed.model_dump()["key_points"]
        
    # identify important points in transcript
    def find_key_points():
        res = client.beta.chat.completions.parse(
            model = "gpt-5",
            messages = [
                {
                    "role": "system", 
                    "content": 
                        """
                        You are a notetaker for a college class.
                        Identify all key points in the following lecture transcript.
                        Key points are relevant to the student's understanding of the course and lecture material.
                        Any point that is not essential for a student's understanding should not be included.
                        """
                },
                {
                    "role": "user", 
                    "content": f"transcript: {transcript}"
                }
            ],
            response_format = KeyPoints
        )
        return extract_points(res, "key point identification")
    
    key_points = find_key_points()

    # Without key points the ratio below is undefined (division by zero);
    # treat "nothing to cover" as full coverage and skip the second request.
    if not key_points:
        return 1.0, key_points, []

    # identify important points that weren't captured by summary
    res = client.beta.chat.completions.parse(
        model = "gpt-5",
        messages = [
            {
                "role": "system", 
                "content": 
                    """
                    You are a notetaker for a college class. 
                    Given a set of key points from a lecture transcript, identify any key points that aren't captured by the following note.
                    """
            },
            {
                "role": "user", 
                "content": f"key points: {key_points} \n note: {note}"
            }
        ],
        response_format = KeyPoints
    )
    uncaptured_points = extract_points(res, "uncaptured key point detection")

    # return capture rate; the model may split or invent points and report
    # more "uncaptured" items than there were key points, so floor at 0.0
    coverage = max(0.0, 1 - len(uncaptured_points) / len(key_points))
    return coverage, key_points, uncaptured_points

In [5]:
def run_eval(json_path: str, transcript_path: str):
    """Build a per-note table of alignment and coverage metrics.

    Notes are produced by ``run(json_path, transcript_path)``. Only notes whose
    matching ``checks`` entry is truthy are scored; the n-th such note is
    compared against the n-th entry of ``transcript_sections``.

    Returns:
        A DataFrame with one row per scored note and the columns "alignment",
        "coverage", "false errors" (items the model labelled with severity
        "none"), "minor errors", "moderate errors" and "major errors".
    """
    notes, checks, _structured_output, transcript_sections, _transcript = run(
        json_path, transcript_path
    )

    rows = []
    section_cursor = 0  # advances only when a note is actually scored
    for note_idx, note in enumerate(notes):
        if not checks[note_idx]:
            continue

        section = transcript_sections[section_cursor]
        alignment_score, severity_counts = calc_alignment(section, note)[:2]
        coverage_score = calc_coverage(section, note)[0]

        rows.append(
            (
                alignment_score,
                coverage_score,
                severity_counts["none"],
                severity_counts["minor"],
                severity_counts["moderate"],
                severity_counts["major"],
            )
        )
        section_cursor += 1

    column_names = [
        "alignment",
        "coverage",
        "false errors",
        "minor errors",
        "moderate errors",
        "major errors",
    ]
    return pd.DataFrame(data = rows, columns = column_names)

In [6]:
# Inputs for the evaluation run; both are passed straight through to `run`
# by `run_eval`. Paths are relative to the notebook's working directory.
json_path = "outline_template.json"
transcript_path = "transcripts/ww1.txt"

In [7]:
# Score every checked note and preview the first rows of the metrics table.
# This issues live model requests for each scored note, so it can be slow and
# incurs API cost on every re-run.
metrics = run_eval(json_path, transcript_path)
metrics.head()

Unnamed: 0,alignment,coverage,false errors,minor errors,moderate errors,major errors
0,0.182684,0.666667,0,5,6,0
1,0.045049,1.0,0,7,12,0
2,0.272532,0.75,0,5,2,1
3,0.182684,1.0,0,3,3,2
4,0.020242,1.0,0,5,11,3
