In [2]:
from __future__ import annotations

import re
import time
from dataclasses import dataclass
from typing import Any, Dict, Optional, Tuple, List, Literal

import pandas as pd
from pandas.core.computation.check import NUMEXPR_INSTALLED

from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.tools import tool


In [3]:
# --- File locations -------------------------------------------------------
# Workbook read by main() and the path the labeled copy is written to.
# NOTE(review): absolute, machine-specific paths — consider a configurable data directory.
INPUT_XLSX = "/Users/oashii/Desktop/ai agent test/accuracy.xlsx"
OUTPUT_XLSX = "/Users/oashii/Desktop/ai agent test/units1_accuracy.xlsx"
# --- Input columns --------------------------------------------------------
# MESSAGE_COL: counter column that build_group_ids() scans for the value 1.
# GROUP_COL:   column main() adds to hold the derived group id.
# UNIT_COL:    column holding the sentence that is sent to the agent.
MESSAGE_COL = "message"
GROUP_COL = "Group"
UNIT_COL = "Unit"
# --- Label vocabulary -----------------------------------------------------
# LABELS keeps a fixed order (normalize_label() iterates it in this order);
# LABEL_SET is the same vocabulary for membership tests.
LABELS = ["Not in text", "In text", "Wrong", "Incomplete"]
LABEL_SET = set(LABELS)
# --- Output columns appended by process_groups() --------------------------
OUTPUT_LABEL_COL = "Final Category"
OUTPUT_CONF_COL = "Confidence"
OUTPUT_DISAGREE_COL = "Verifier Disagreed"
OUTPUT_NEEDS_REVIEW_COL = "Needs Review"
OUTPUT_CLASSIFIER_RAW_COL = "Classifier Raw"
OUTPUT_VERIFIER_RAW_COL = "Verifier Raw"
OUTPUT_ACTION_COL = "Agent Action"

## Overview

We are labeling each **unit** (one sentence) into exactly one category:

- **Not in text**
- **In text**
- **Wrong**
- **Incomplete**
To make the labeling more reliable, we use **two LLM steps**:
1. **Classifier**: proposes a label for the unit.
2. **Verifier**: checks the proposed label and either agrees or corrects it.
We then use the verifier's result to decide the final label and its confidence.
## Prompt 1 — SYSTEM_PROMPT (Classifier Instructions + Ground Truth)
It contains:
- The task context (participants saw materials and wrote summaries)
- The full **materials** (abstract + press release)
- The **definitions** of the 4 categories
- A strict instruction that the model must output only one category label
## Prompt 2 — CLASSIFIER_USER_PROMPT (Unit Input + Format Enforcement)
Injects the specific unit we want to classify:
- It provides the **unit sentence**
- It repeats the allowed labels
- It repeats “Return ONLY one label”
## Prompt 3 — VERIFIER_SYSTEM_PROMPT (Quality Control Role)
This prompt turns the model into a strict checker.
It tells the model:
- You will see the unit + the proposed label
- Your job is to validate the label
- Output MUST be one of:
  - AGREE
  - DISAGREE: <CorrectLabel>
## Prompt 4 — VERIFIER_USER_PROMPT (Verifier Inputs)
This passes the verifier exactly:
- The unit
- The proposed label from the classifier
And reminds the output format again.

In [4]:
# Classifier system prompt: task context, the full study materials (abstract +
# press-release summary), the four category definitions, and the output rule.
#
# The typographic characters below (en dash, curly quotes) are stored as real
# Unicode. They previously appeared as mis-decoded byte sequences, which the
# model would have received verbatim as part of its instructions.
#
# This text is used as a prompt template (see build_classifier_chain), so any
# literal curly braces added here would need escaping.
SYSTEM_PROMPT = r"""
Introduction
In a current study, participants were presented with the abstract and press release of a published research article – we refer to this information as their “materials.” Their task was to write a headline and a 3-sentence summary about the research article. Your task is to code the writing according to the instructions below.
Each summary written by a participant has been split into units of information (sentences).

Source or Accuracy of Information: Compare each unit against the text of “materials” that is provided, and use one of the categories below.

**This is the text of "materials"**
Abstract
Sensory processing continues during sleep and can influence brain oscillations. We previously showed that a gentle rocking stimulation (0.25 Hz), during an afternoon nap, facilitates wake-sleep transition and boosts endogenous brain oscillations (i.e., EEG spindles and slow oscillations [SOs]). Here, we tested the hypothesis that the rhythmic rocking stimulation synchronizes sleep oscillations, a neuro-physiological mechanism referred to as ‘‘neural entrainment.’’ We analyzed EEG brain responses related to the stimulation recorded from 18 participants while they had a full night of sleep on a rocking bed. Moreover, because sleep oscillations are considered of critical relevance for memory processes, we also investigated whether rocking influences overnight declarative memory consolidation. We first show that, compared to a stationary night, continuous rocking shortened the latency to non-REM (NREM) sleep and strengthened sleep maintenance, as indexed by increased NREM stage 3 (N3) duration and fewer arousals. These beneficial effects were paralleled by an increase in SOs and in slow and fast spindles during N3, without affecting the physiological SO-spindle phase coupling. We then confirm that, during the rocking night, overnight memory consolidation was enhanced and also correlated with the increase in fast spindles, whose co-occurrence with the SO up-state is considered to foster cortical synaptic plasticity. Finally, supporting the hypothesis that a rhythmic stimulation entrains sleep oscillations, we report a temporal clustering of spindles and SOs relative to the rocking cycle. Altogether, these findings demonstrate that a continuous rocking stimulation strengthens deep sleep via the neural entrainment of intrinsic sleep oscillations.

Summary
"Having a good night's sleep means falling asleep rapidly and then staying asleep during the whole night," says Laurence Bayer of the University of Geneva, Switzerland. "Our volunteers--even if they were all good sleepers--fell asleep more rapidly when rocked and had longer periods of deeper sleep associated with fewer arousals during the night. We thus show that rocking is good for sleep."
Bayer and their colleagues had earlier shown that continuous rocking during a 45-minute nap helped people to fall asleep faster and sleep more soundly. In the new study, led by Laurence Bayer and Sophie Schwartz, University of Geneva, Switzerland, they wanted to explore the effects of rocking on sleep and its associated brain waves throughout the night.
The researchers enlisted 18 healthy young adults to undergo sleep monitoring in the lab. The first night was intended to get them used to sleeping there. They then stayed two more nights--one sleeping on a gently rocking bed and the other sleeping on an identical bed that wasn't moving.
The data showed that participants fell asleep faster while rocking. Once asleep, they also spent more time in non-rapid eye movement sleep, slept more deeply, and woke up less.
Next, the researchers wanted to know how that better sleep influenced memory. To assess memory consolidation, participants studied word pairs. The researchers then measured their accuracy in recalling those paired words in an evening session compared to the next morning when they woke up. They found that people did better on the morning test when they were rocked during sleep.
Further studies showed that rocking affects brain oscillations during sleep. They saw that the rocking motion caused an entrainment of specific brain oscillations of non-rapid eye movement sleep (slow oscillations and spindles). As a result, the continuous rocking motion helped to synchronize neural activity in the thalamo-cortical networks of the brain, which play an important role in both sleep and memory consolidation.

**These are the categories**
1. “Not in text”: If the unit contains information that was NOT in the materials.
2. “In text”: If the unit contains complete information about a point/finding that was part of the materials.
3. “Wrong”: If the unit contains at least one incorrect statement about the materials (including misleading causal language, design errors, etc.).
4. “Incomplete”: If the unit contains information that is in the materials but is vague or missing key context/details.

IMPORTANT OUTPUT RULE:
Return ONLY ONE of these exact labels, with no extra text:
Not in text
In text
Wrong
Incomplete
"""

# User-turn template for the classifier. `{unit}` is the only placeholder; it is
# filled from the dict passed to the chain's invoke() in make_tools().
CLASSIFIER_USER_PROMPT = """Unit to categorize:
"{unit}"

Return ONLY one label:
Not in text
In text
Wrong
Incomplete
"""

# Role instructions for the verifier. build_verifier_chain() concatenates this
# with SYSTEM_PROMPT to form the verifier's system message.
VERIFIER_SYSTEM_PROMPT = r"""
You are a strict verifier for a research coding task.
You will be given:
(1) The same coding instructions + materials
(2) A unit (one sentence)
(3) A proposed label

Rules:
- If proposed label is correct: output exactly "AGREE"
- If incorrect: output exactly "DISAGREE: <CorrectLabel>"

<CorrectLabel> must be exactly one of:
Not in text
In text
Wrong
Incomplete

No other text.
"""

# User-turn template for the verifier. Placeholders: `{unit}` and
# `{proposed_label}`, both filled from the dict passed to invoke() in make_tools().
VERIFIER_USER_PROMPT = """Unit:
"{unit}"

Proposed label: {proposed_label}

Output:
AGREE
or
DISAGREE: <CorrectLabel>
"""


- Standardize the model's output into one of the 4 valid labels, or return None if it can't.

In [5]:
def normalize_label(text: str) -> Optional[str]:
    """Map raw model output onto one of the four canonical labels.

    Resolution order:
      1. Exact match against ``LABEL_SET`` after trimming whitespace.
      2. Case-insensitive match after collapsing every run of whitespace,
         underscores and hyphens into a single space, so ``in_text``,
         ``In-Text`` and ``not  in  text`` all resolve consistently.
      3. The first label in ``LABELS`` whose lowercase form occurs anywhere in
         the collapsed text (handles wrappers such as ``Label: Wrong.``).

    Args:
        text: Raw string returned by the model. ``None``, empty strings and
            non-string values are treated as unparseable.

    Returns:
        The canonical label, or ``None`` when no label can be recognised.
    """
    if not isinstance(text, str):
        return None
    stripped = text.strip()
    if not stripped:
        return None
    if stripped in LABEL_SET:
        return stripped

    # One canonical spelling for every separator variant.
    canonical = re.sub(r"[\s_\-]+", " ", stripped.lower()).strip()

    by_lower = {label.lower(): label for label in LABELS}
    if canonical in by_lower:
        return by_lower[canonical]

    # Order matters: "not in text" contains "in text", so LABELS must be
    # scanned in its declared order ("Not in text" first).
    for label in LABELS:
        if label.lower() in canonical:
            return label
    return None


- Interpret verifier output into a simple program decision:

(True, None) means the verifier said AGREE

(False, corrected_label) means the verifier said DISAGREE: <CorrectLabel>

(False, None) means verifier output was malformed/unexpected

In [6]:
def parse_verifier(text: str) -> Tuple[bool, Optional[str]]:
    """Turn the verifier's raw reply into an (agrees, corrected_label) pair.

    Accepted forms, compared case-insensitively and after removing surrounding
    whitespace, quotes, backticks, asterisks and trailing ``.``/``!``:

      * ``AGREE``                     -> ``(True, None)``
      * ``DISAGREE: <CorrectLabel>``  -> ``(False, normalize_label(<CorrectLabel>))``

    Anything else (including ``None``, an empty reply, or a multi-line reply)
    is reported as ``(False, None)``, which callers treat as malformed output.

    Args:
        text: Raw verifier output.

    Returns:
        ``(agrees, corrected_label)``; ``corrected_label`` is ``None`` unless
        the verifier disagreed and supplied a recognisable label.
    """
    if not isinstance(text, str):
        return False, None

    # Models frequently decorate a correct answer ("AGREE.", "**AGREE**",
    # '"Agree"'); strip that decoration so agreement is not misread as malformed.
    core = text.strip().strip("\"'`*.! \t\r\n")
    if not core:
        return False, None

    if core.upper() == "AGREE":
        return True, None

    match = re.match(r"^DISAGREE\s*:\s*(.+)$", core, flags=re.IGNORECASE)
    if match is None:
        return False, None
    return False, normalize_label(match.group(1))

- Our Excel file has a message column that resets to 1 at the start of each new participant summary (e.g., 1, 2, 3, ... and then back to 1).
- LLM API calls can fail sometimes (for example, rate limits, temporary network issues, or server timeouts). Instead of crashing the whole script, we retry the call a few times.

In [7]:
def build_group_ids(df: pd.DataFrame, message_col: str) -> pd.Series:
    """Derive a running group id from a counter column.

    Every row whose ``message_col`` value equals 1 starts a new group; the id
    is the cumulative count of such rows up to and including the current row.
    Rows that appear before the first 1 therefore get id 0.

    Args:
        df: Frame containing ``message_col``.
        message_col: Name of the counter column.

    Returns:
        An integer Series aligned with ``df``.
    """
    starts_new_group = df[message_col].eq(1)
    return starts_new_group.astype(int).cumsum()


def invoke_with_retries(fn, *args, max_retries: int = 2, backoff: float = 1.0, **kwargs):
    """Call ``fn(*args, **kwargs)``, retrying on any ``Exception``.

    ``fn`` is always attempted at least once. After a failed attempt the
    function sleeps ``backoff * attempt_number`` seconds (linear back-off)
    before trying again, up to ``max_retries`` additional attempts.

    Args:
        fn: Callable to invoke.
        *args: Positional arguments forwarded to ``fn``.
        max_retries: Number of retries after the first attempt. Values below
            zero are treated as zero so the call is never silently skipped.
        backoff: Base delay in seconds between attempts.
        **kwargs: Keyword arguments forwarded to ``fn``.

    Returns:
        Whatever ``fn`` returns on its first successful attempt.

    Raises:
        Exception: The error raised by the final attempt, re-raised unchanged.
    """
    # A negative budget used to skip the loop entirely and return None without
    # ever calling fn; clamp so there is always one attempt.
    retries = max(0, max_retries)

    for attempt in range(retries + 1):
        try:
            return fn(*args, **kwargs)
        except Exception:
            if attempt >= retries:
                # Bare raise keeps the original traceback intact.
                raise
            time.sleep(backoff * (attempt + 1))

- build_classifier_chain() creates a LangChain pipeline (prompt → LLM → string) that takes a unit and returns exactly one label: Not in text / In text / Wrong / Incomplete.
- build_verifier_chain() creates a second pipeline that takes the unit plus the proposed label and returns AGREE or DISAGREE: <CorrectLabel> as a quality check.

In [8]:
def build_classifier_chain(llm: ChatOpenAI):
    """Compose the classifier pipeline: prompt template, then ``llm``, then a string parser.

    The system message is ``SYSTEM_PROMPT`` and the user message is
    ``CLASSIFIER_USER_PROMPT``.

    Args:
        llm: Chat model placed in the middle of the pipeline.

    Returns:
        The composed pipeline object.
    """
    messages = [
        ("system", SYSTEM_PROMPT),
        ("user", CLASSIFIER_USER_PROMPT),
    ]
    template = ChatPromptTemplate.from_messages(messages)
    to_text = StrOutputParser()
    return template | llm | to_text

def build_verifier_chain(llm: ChatOpenAI):
    """Compose the verifier pipeline: prompt template, then ``llm``, then a string parser.

    The system message is ``VERIFIER_SYSTEM_PROMPT`` followed by the shared
    task description and materials from ``SYSTEM_PROMPT``. The classifier-only
    "IMPORTANT OUTPUT RULE" section of ``SYSTEM_PROMPT`` is left out: it tells
    the model to return ONLY a bare label, which directly contradicts the
    verifier's required ``AGREE`` / ``DISAGREE: <CorrectLabel>`` format. The
    label list itself is still present in ``VERIFIER_SYSTEM_PROMPT``.

    Args:
        llm: Chat model placed in the middle of the pipeline.

    Returns:
        The composed pipeline object.
    """
    # If the marker is absent, split() returns the whole prompt unchanged, so
    # this stays correct even if SYSTEM_PROMPT is reworded later.
    shared_instructions = SYSTEM_PROMPT.split("IMPORTANT OUTPUT RULE:", 1)[0].rstrip()

    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", VERIFIER_SYSTEM_PROMPT + "\n\n" + shared_instructions),
            ("user", VERIFIER_USER_PROMPT),
        ]
    )
    return prompt | llm | StrOutputParser()

- Wraps our two LangChain pipelines into “tools” (named callable actions) so the agent/router can call them like functions.

In [9]:
def make_tools(classifier_chain, verifier_chain):
    """Wrap the two chains as named tools and return them as a pair.

    Args:
        classifier_chain: Object whose ``invoke({"unit": ...})`` returns a string.
        verifier_chain: Object whose
            ``invoke({"unit": ..., "proposed_label": ...})`` returns a string.

    Returns:
        ``(classify_unit, verify_label)`` — the two decorated callables.
    """
    # NOTE(review): the inner docstrings are presumably consumed by @tool as the
    # tool descriptions — treat wording changes there as behavior changes.
    @tool("classify_unit")
    def classify_unit(unit: str) -> str:
        """Classify a unit into exactly one label: Not in text / In text / Wrong / Incomplete."""
        # Whitespace is trimmed here; label normalization happens in the caller.
        return classifier_chain.invoke({"unit": unit}).strip()

    @tool("verify_label")
    def verify_label(unit: str, proposed_label: str) -> str:
        """Verify a proposed label. Return AGREE or DISAGREE: <CorrectLabel>."""
        # Keys must match the {unit} / {proposed_label} placeholders in VERIFIER_USER_PROMPT.
        return verifier_chain.invoke({"unit": unit, "proposed_label": proposed_label}).strip()

    return classify_unit, verify_label

## ToolCallingRouterAgent.run():
- (1) calls the classifier tool, normalizes the label, and reruns once if the label is invalid; 
- (2) if still invalid, returns a low-confidence fallback; 
- (3) otherwise calls the verifier tool — if the verifier says AGREE, it accepts with high confidence; if it says DISAGREE with a valid corrected label, it overrides with medium confidence; if verifier output is malformed, it keeps the original label but marks low confidence and needs review.

In [10]:
# Routing outcomes recorded in AgentResult.action:
#   ACCEPT_HIGH              - verifier agreed with the classifier's label
#   OVERRIDE_MEDIUM          - verifier disagreed and supplied a valid label
#   MALFORMED_CLASSIFIER_LOW - no valid classifier label (also used for empty units)
#   MALFORMED_VERIFIER_LOW   - verifier reply could not be parsed into a valid label
AgentAction = Literal[
    "ACCEPT_HIGH",
    "OVERRIDE_MEDIUM",
    "MALFORMED_CLASSIFIER_LOW",
    "MALFORMED_VERIFIER_LOW",
]

@dataclass
class AgentResult:
    """Outcome of routing one unit through the classifier and verifier."""

    # Label written to the output; may be a fallback or an error marker.
    final_label: str
    # "high", "medium" or "low", as assigned by the code that builds the result.
    confidence: str
    # True when the verifier did not return AGREE (including malformed replies).
    verifier_disagreed: bool
    # True when a human should look at this row.
    needs_review: bool
    # Last raw classifier reply ("" when the classifier was never called).
    classifier_raw: str
    # Raw verifier reply ("" when the verifier was never called).
    verifier_raw: str
    # Which routing branch produced this result.
    action: AgentAction


class ToolCallingRouterAgent:
    """Route a unit through the classifier tool, then the verifier tool.

    The classifier is asked once and re-asked up to ``max_classifier_reruns``
    times while its reply does not normalize to a valid label. If no valid
    label is obtained, a low-confidence fallback is returned and the verifier
    is not called. Otherwise the verifier's reply selects one of three
    outcomes: accept, override, or keep-with-low-confidence.
    """

    def __init__(self, classify_tool, verify_tool, max_classifier_reruns: int = 1):
        self.classify_tool = classify_tool
        self.verify_tool = verify_tool
        self.max_classifier_reruns = max_classifier_reruns

    def _classify(self, unit: str):
        """Ask the classifier once; return ``(raw_reply, normalized_label_or_None)``."""
        raw_reply = invoke_with_retries(self.classify_tool.invoke, {"unit": unit})
        return raw_reply, normalize_label(raw_reply)

    def run(self, unit: str) -> AgentResult:
        """Label one unit and report how the decision was reached."""
        raw_label, label = self._classify(unit)

        reruns_left = self.max_classifier_reruns
        while label is None and reruns_left > 0:
            reruns_left -= 1
            raw_label, label = self._classify(unit)

        # Guard clause: the classifier never produced a usable label.
        if label is None:
            return AgentResult(
                final_label="Not in text",  # fallback
                confidence="low",
                verifier_disagreed=False,
                needs_review=True,
                classifier_raw=raw_label,
                verifier_raw="",
                action="MALFORMED_CLASSIFIER_LOW",
            )

        raw_verdict = invoke_with_retries(
            self.verify_tool.invoke, {"unit": unit, "proposed_label": label}
        )
        agrees, corrected = parse_verifier(raw_verdict)

        # (final_label, confidence, verifier_disagreed, needs_review, action)
        if agrees:
            outcome = (label, "high", False, False, "ACCEPT_HIGH")
        elif corrected in LABEL_SET:
            outcome = (corrected, "medium", True, True, "OVERRIDE_MEDIUM")
        else:
            outcome = (label, "low", True, True, "MALFORMED_VERIFIER_LOW")

        final_label, confidence, disagreed, review, action = outcome
        return AgentResult(
            final_label=final_label,
            confidence=confidence,
            verifier_disagreed=disagreed,
            needs_review=review,
            classifier_raw=raw_label,
            verifier_raw=raw_verdict,
            action=action,
        )


- The script runs the agent on every row (unit) in the Excel file.

- For each group, it loops through the rows, reads row[UNIT_COL], and calls agent.run(unit) to produce the label + confidence + flags.

- It then writes those results back into the same original table, adding new columns — so the output stays flat: one row = one unit.

In [11]:
def process_groups(df: pd.DataFrame, agent: ToolCallingRouterAgent) -> pd.DataFrame:
    results: List[AgentResult] = []

    groups = list(df.groupby(GROUP_COL))
    total_groups = len(groups)

    for i, (group_id, group) in enumerate(groups):
        print(f"\nProcessing group {group_id} ({i+1}/{total_groups})...")

        for _, row in group.iterrows():
            unit = row.get(UNIT_COL, "")
            if pd.isna(unit) or str(unit).strip() == "":
                res = AgentResult(
                    final_label="Error: Empty or Invalid Unit",
                    confidence="low",
                    verifier_disagreed=False,
                    needs_review=True,
                    classifier_raw="",
                    verifier_raw="",
                    action="MALFORMED_CLASSIFIER_LOW",
                )
            else:
                res = agent.run(str(unit))

            print(f"Unit: {str(unit)[:60]} -> {res.final_label} ({res.confidence}) [{res.action}]")
            results.append(res)

        if (i + 1) % 30 == 0:
            print(f"\nProcessed {i + 1} groups out of {total_groups}. Pausing briefly...")
            time.sleep(3)

    out = df.copy()
    out[OUTPUT_LABEL_COL] = [r.final_label for r in results]
    out[OUTPUT_CONF_COL] = [r.confidence for r in results]
    out[OUTPUT_DISAGREE_COL] = [r.verifier_disagreed for r in results]
    out[OUTPUT_NEEDS_REVIEW_COL] = [r.needs_review for r in results]
    out[OUTPUT_CLASSIFIER_RAW_COL] = [r.classifier_raw for r in results]
    out[OUTPUT_VERIFIER_RAW_COL] = [r.verifier_raw for r in results]
    out[OUTPUT_ACTION_COL] = [r.action for r in results]
    return out


In [12]:
def main():
    """Load the workbook, label every unit with the agent, and save the result.

    Steps: read ``INPUT_XLSX``; check the required columns; derive
    ``GROUP_COL`` from ``MESSAGE_COL``; build the classifier and verifier
    chains, tools and agent; run ``process_groups``; write ``OUTPUT_XLSX``.

    Raises:
        KeyError: If ``MESSAGE_COL`` or ``UNIT_COL`` is missing from the workbook.
    """
    df = pd.read_excel(INPUT_XLSX)

    # Fail fast on a misnamed column. Without this check a missing UNIT_COL is
    # not an error downstream: every row is quietly labeled
    # "Error: Empty or Invalid Unit" and the run appears to succeed.
    missing = [col for col in (MESSAGE_COL, UNIT_COL) if col not in df.columns]
    if missing:
        raise KeyError(f"{INPUT_XLSX} is missing required column(s): {missing}")

    df[GROUP_COL] = build_group_ids(df, MESSAGE_COL)
    print(f"File loaded successfully! Total rows: {len(df)}. Total groups: {df[GROUP_COL].nunique()}.")
    llm_classifier = ChatOpenAI(model="gpt-4o-mini", temperature=0)
    llm_verifier = ChatOpenAI(model="gpt-4o-mini", temperature=0)
    classifier_chain = build_classifier_chain(llm_classifier)
    verifier_chain = build_verifier_chain(llm_verifier)

    # Tools
    classify_tool, verify_tool = make_tools(classifier_chain, verifier_chain)

    # Agent
    agent = ToolCallingRouterAgent(
        classify_tool=classify_tool,
        verify_tool=verify_tool,
        max_classifier_reruns=1,
    )

    # Run
    out_df = process_groups(df, agent)
    out_df.to_excel(OUTPUT_XLSX, index=False)
    print(f"\nDone. Saved to: {OUTPUT_XLSX}")

# Entry point: run the whole pipeline when this code is executed as the main module.
if __name__ == "__main__":
    main()

File loaded successfully! Total rows: 30. Total groups: 7.

Processing group 1 (1/7)...
Unit: To Sleep lightly or briefly -> Not in text (medium) [OVERRIDE_MEDIUM]
Unit: Imagine, You Fall asleep faster and sleep more soundly -> Wrong (medium) [OVERRIDE_MEDIUM]
Unit: According to new Research,
 Rocking can improve your Sleep Q -> Wrong (medium) [OVERRIDE_MEDIUM]

Processing group 2 (2/7)...
Unit: Research shows and suggests that gentle rocking can be a hel -> Wrong (medium) [OVERRIDE_MEDIUM]
Unit: Researchers have taken 18 participants to study how a gentle -> Wrong (medium) [OVERRIDE_MEDIUM]
Unit: Researchers also found that the rocking helped the brain‚Äôs o -> In text (high) [ACCEPT_HIGH]
Unit: Additionally, they discovered that the rocking helped people -> Wrong (medium) [OVERRIDE_MEDIUM]
Unit: Meaning, they play a role in memory consolidation -> Not in text (medium) [OVERRIDE_MEDIUM]
Unit: The finding suggests that rocking can be useful in improving -> In text (medium) [OVERRIDE_ME