In [2]:
import json
import re
from pathlib import Path
from typing import Dict, Any
import random
import pandas as pd
from tqdm import tqdm
import ollama

In [5]:

# --- Model settings ---------------------------------------------------------
# Model name and sampling temperature handed to ollama.chat.
MODEL = "deepseek-r1:1.5b"
TEMPERATURE = 0.2

# --- Input / output locations -----------------------------------------------
PERSONAS_CSV = "Data/synthetic_climate_personas.csv"
CLAIMS_JSON = "Data/climate-fever-dataset.json"
CLAIMS_CSV = "Data/climate-fever-dataset.csv"
# NOTE(review): not referenced anywhere in the visible code; confirm it is still needed.
OUT_CSV = "agents_claim_outputs.csv"

# --- Sub-sampling for quick test runs ---------------------------------------
# How many personas / claims to run.
N_PERSONAS = 1
N_CLAIMS = 2
# Seed for random sampling; set None to keep original order (first N rows).
SEED = 42



In [None]:
# System prompt that frames the model as a single persona. The
# {PERSONA_DESCRIPTION} placeholder is filled in once per persona.
# The stance labels are spelled exactly as in BELIEF_ALLOWED / CLAIM_ALLOWED.
SYSTEM_TMPL = """You are simulating the thought process of a person with the following persona:

{PERSONA_DESCRIPTION}

Your task is to evaluate claims about climate change.
- Always respond as this person would, considering their background, values, and beliefs.
- You cannot access new facts beyond what is given.
- Your reasoning may be influenced by your prior beliefs (this is natural).
- Be consistent in personality and tone across all answers.

For each claim you see, you will:
1. Read the claim carefully.
2. Decide whether you accept the claim or not.
3. Give your stance on whether you support or do not support the claim. You should respond with either "Support" or "Not Support".
4. Give your stance on the existence of climate change. You should respond with only one of the following: "Strongly disagree", "Slightly Disagree", "Neutral", "Slightly Agree" or "Strongly Agree".
"""

# Per-claim user prompt. {CLAIM_TEXT} is the placeholder for the claim; the
# doubled braces are str.format escapes and render as single braces when the
# template is filled with str.format.
# The example output uses quoted string values (not bracketed lists) so that it
# agrees with the JSON shape described by REGEX_HINT.
CLAIM_TMPL = """Claim: {CLAIM_TEXT}
Given the above claim, return:
1. Your stance on whether you support or do not support the claim. You should respond with either "Support" or "Not Support".
2. Your stance on the existence of climate change. You should respond with only one of the following: "Strongly disagree", "Slightly Disagree", "Neutral", "Slightly Agree" or "Strongly Agree".

Your output should be strictly in the following format:
{{
"climateChangeStance": "<your climate change stance>",
"claimStance": "<your claim stance>"
}}"""

# Compact JSON shape of the expected reply; chat_once appends this pattern to
# every user prompt as a formatting hint for the model.
REGEX_HINT = r'^\{"climateChangeStance":"(Strongly disagree|Slightly Disagree|Neutral|Slightly Agree|Strongly Agree)","claimStance":"(Support|Not Support)"\}$'

# Canonical spellings of the climate-change belief scale.
BELIEF_ALLOWED = {
    "Strongly disagree",
    "Slightly Disagree",
    "Neutral",
    "Slightly Agree",
    "Strongly Agree",
}

# Canonical spellings of the two possible claim stances.
CLAIM_ALLOWED = {
    "Support",
    "Not Support",
}

def build_persona_description(row: pd.Series) -> str:
    """Render one persona row as a bulleted ``- Field: value`` list.

    Args:
        row: Persona record; each field is looked up with ``row.get``, so a
            missing field is rendered as ``None`` instead of raising.

    Returns:
        The fields below, one per line in this fixed order, with no trailing
        newline.
    """
    field_names = (
        "PersonaID",
        "AgeGroup",
        "Gender",
        "EducationLevel",
        "OccupationSector",
        "Region",
        "PoliticalIdeology",
        "Trust_ScienceInstitutions",
        "Belief_ClimateExists",
        "Belief_HumanContribution",
        "Emotional_WorryAboutClimate",
        "BehaviouralOrientation",
        "SocialConnectivity",
    )
    bullets = [f"- {name}: {row.get(name)}" for name in field_names]
    return "\n".join(bullets)

def chat_once(system_msg: str, user_msg: str) -> str:
    """Send a single system + user exchange to the model and return its reply.

    Args:
        system_msg: Content of the ``system`` message.
        user_msg: Content of the ``user`` message; an instruction to return
            JSON matching ``REGEX_HINT`` is appended before sending.

    Returns:
        The reply's message content with surrounding whitespace stripped.
    """
    # Every user turn carries the output-format hint.
    prompt = user_msg + f"\n\nReturn JSON matching this regex: {REGEX_HINT}"
    conversation = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": prompt},
    ]
    response = ollama.chat(
        model=MODEL,
        options={"temperature": TEMPERATURE},
        messages=conversation,
    )
    reply = response["message"]["content"]
    return reply.strip()

def _extract_json_object(text: str) -> Dict[str, Any]:
    """Return the most plausible JSON object embedded in *text*.

    Candidates are tried from widest to narrowest: the whole text, the span
    from the first ``{`` to the last ``}``, then every flat ``{...}`` span
    (last one first, since the answer normally follows any preamble). A parsed
    object that carries one of the expected stance keys wins; otherwise the
    first object that parsed at all is returned.

    Raises:
        ValueError: If no candidate parses to a JSON object.
    """
    candidates = [text]
    start, end = text.find("{"), text.rfind("}")
    if start != -1 and end > start:
        candidates.append(text[start:end + 1])
    # Flat spans also recover replies wrapped in doubled braces ``{{ ... }}``.
    candidates.extend(reversed(re.findall(r"\{[^{}]*\}", text)))

    fallback = None
    for candidate in candidates:
        try:
            obj = json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        if not isinstance(obj, dict):
            continue
        if "claimStance" in obj or "climateChangeStance" in obj:
            return obj
        if fallback is None:
            fallback = obj
    if fallback is not None:
        return fallback
    raise ValueError(f"No JSON object found in: {text[:120]}...")


def _unwrap_single(value: Any) -> Any:
    """Return the only element of a one-item list/tuple, else *value* unchanged."""
    if isinstance(value, (list, tuple)) and len(value) == 1:
        return value[0]
    return value


def coerce_json(text: str) -> Dict[str, Any]:
    """Extract the two stance answers from a raw model reply and canonicalise them.

    Args:
        text: Raw reply text. It may contain extra prose, code fences or a
            ``<think>...</think>`` block around the JSON object (reasoning
            models may emit one; it is removed before parsing).

    Returns:
        ``{"claimStance": ..., "climateChangeStance": ...}`` where
        ``claimStance`` is a member of ``CLAIM_ALLOWED`` and
        ``climateChangeStance`` is a member of ``BELIEF_ALLOWED``.
        An unrecognised claim stance becomes ``"Support"`` only when it
        mentions "support" without "not", otherwise ``"Not Support"``;
        an unrecognised belief becomes ``"Neutral"``.

    Raises:
        ValueError: If no JSON object can be found in *text*.
    """
    cleaned = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL | re.IGNORECASE).strip()
    obj = _extract_json_object(cleaned)

    # A reply such as ["Strongly Agree"] is a valid answer wrapped in a list.
    claim = str(_unwrap_single(obj.get("claimStance", ""))).strip()
    belief = str(_unwrap_single(obj.get("climateChangeStance", ""))).strip()

    if claim not in CLAIM_ALLOWED:
        lowered = claim.lower()
        claim = "Support" if "support" in lowered and "not" not in lowered else "Not Support"

    # Case- and whitespace-insensitive match against the canonical spellings.
    canon_map = {v.lower(): v for v in BELIEF_ALLOWED}
    belief = canon_map.get(" ".join(belief.lower().split()), "Neutral")

    return {"claimStance": claim, "climateChangeStance": belief}

def load_claims_with_label_JSON(path: str):
    """Load claims and their stance labels from a JSON file holding an array of objects.

    Args:
        path: Location of the JSON file (read as UTF-8).

    Returns:
        A list of ``{"claim_id", "claim_text", "claim_stance_label"}`` dicts,
        one per item that has non-empty claim text. The text is taken from the
        first non-empty key among ``claim``, ``claim_text``, ``statement`` and
        ``text``; the id from ``claim_id`` or ``id``; the label from ``label``,
        ``claim_label``, ``verdict`` or ``stance``. Missing id/label become
        ``None``.
    """

    def first_present(item, keys):
        # Only None / "" count as missing: ``or``-chaining would also discard
        # legitimate falsy values such as an id or label of 0.
        for key in keys:
            value = item.get(key)
            if value is not None and value != "":
                return value
        return None

    # Explicit encoding: the platform default is not UTF-8 everywhere.
    raw = json.loads(Path(path).read_text(encoding="utf-8"))
    out = []
    for it in raw:
        claim_text = (
            it.get("claim") or it.get("claim_text") or it.get("statement") or it.get("text") or ""
        )
        if not claim_text:
            continue
        out.append({
            "claim_id": first_present(it, ("claim_id", "id")),
            "claim_text": claim_text,
            "claim_stance_label": first_present(it, ("label", "claim_label", "verdict", "stance")),
        })
    return out

def load_claims_with_label_CSV(path: str):
    """Load claims and their stance labels from a CSV file.

    Args:
        path: Location of the CSV file.

    Returns:
        A list of ``{"claim_id", "claim_text", "claim_stance_label"}`` dicts,
        one per row that has claim text. The text is taken from the first
        populated column among ``claim``, ``claim_text``, ``statement`` and
        ``text``; the id from ``claim_id`` or ``id``; the label from
        ``stance_label`` first, then ``label``, ``claim_label``, ``verdict``
        or ``stance``. Empty cells become ``None``. If the file cannot be
        read, the error is printed and an empty list is returned.
    """
    try:
        df = pd.read_csv(Path(path))
    except Exception as e:
        # Best-effort by design: report the problem and let the caller continue.
        print(f"Error reading CSV file at {path}: {e}")
        return []

    def first_present(record, keys):
        # pandas represents empty cells as NaN, which is truthy, so a plain
        # ``or`` chain would accept it; 0 is a valid id and must not be dropped.
        for key in keys:
            value = record.get(key)
            if value is None or pd.isna(value) or value == "":
                continue
            return value
        return None

    out = []
    for it in df.to_dict('records'):
        claim_text = first_present(it, ("claim", "claim_text", "statement", "text"))
        if not claim_text:
            continue

        out.append({
            "claim_id": first_present(it, ("claim_id", "id")),
            "claim_text": claim_text,
            "claim_stance_label": first_present(
                it, ("stance_label", "label", "claim_label", "verdict", "stance")
            ),
        })

    return out

def filter_balanced_claims(df: pd.DataFrame, n_each: int = 100, random_state: int = 42) -> pd.DataFrame:
    """Return an equal-sized random sample of SUPPORTS and REFUTES claims.

    Args:
        df: Claims table containing a label column named ``stance_label``,
            ``label`` or ``verdict`` (the first one present is used).
        n_each: Desired number of rows per class; capped at the size of the
            smaller class.
        random_state: Seed for both samples, so the selection is reproducible.

    Returns:
        The sampled SUPPORTS rows followed by the sampled REFUTES rows, with a
        fresh 0..n-1 index.

    Raises:
        ValueError: If none of the expected label columns is present.
    """
    label_col = next(
        (c for c in ("stance_label", "label", "verdict") if c in df.columns), None
    )
    if label_col is None:
        raise ValueError("No stance label column found (expected 'stance_label', 'label', or 'verdict').")

    # Compare on a normalised copy so case and stray whitespace do not hide rows;
    # astype(str) keeps non-string cells from breaking the .str accessor.
    normalized = df[label_col].astype(str).str.strip().str.upper()
    supports = df[normalized == "SUPPORTS"]
    refutes = df[normalized == "REFUTES"]

    n_each = min(n_each, len(supports), len(refutes))
    supports_sample = supports.sample(n=n_each, random_state=random_state)
    refutes_sample = refutes.sample(n=n_each, random_state=random_state)

    return pd.concat([supports_sample, refutes_sample]).reset_index(drop=True)



def main():
    """Ask the model for every selected persona's stance on every selected claim.

    Reads personas from ``PERSONAS_CSV`` and claims from ``CLAIMS_CSV``, keeps
    ``N_PERSONAS`` personas and ``N_CLAIMS`` claims (randomly sampled with
    ``SEED``, or the first N when ``SEED`` is None), queries the model once per
    persona/claim pair and writes all results as one JSON array to
    ``outputs/agent_claim_outputs.json``.

    Each record keeps the raw reply and a ``parse_error`` field: ``None`` when
    the reply was parsed, otherwise the reason the fallback stance
    ("Not Support" / "Neutral") was recorded.
    """
    personas = pd.read_csv(PERSONAS_CSV)
    # Alternative source: claims = load_claims_with_label_JSON(CLAIMS_JSON)
    claims = load_claims_with_label_CSV(CLAIMS_CSV)

    # For test purpose: restrict the run to a small subset.
    if SEED is not None:
        personas = personas.sample(n=min(N_PERSONAS, len(personas)), random_state=SEED)
        random.seed(SEED)
        claims = (
            pd.DataFrame(claims)
            .sample(n=min(N_CLAIMS, len(claims)), random_state=SEED)
            .to_dict(orient="records")
        )
    else:
        personas = personas.head(N_PERSONAS)
        claims = claims[:N_CLAIMS]

    records = []
    for _, prow in tqdm(personas.iterrows(), total=len(personas), desc="Personas"):
        persona_desc = build_persona_description(prow)
        # str.format rather than str.replace: the templates escape literal braces
        # as {{ }}, which must reach the model as single braces.
        system_msg = SYSTEM_TMPL.format(PERSONA_DESCRIPTION=persona_desc)

        for c in claims:
            user_msg = CLAIM_TMPL.format(CLAIM_TEXT=str(c["claim_text"]))
            raw = chat_once(system_msg, user_msg)
            parse_error = None
            try:
                parsed = coerce_json(raw)
            except Exception as exc:
                # Best-effort: keep the run going, but record the failure so the
                # fallback stance is not mistaken for a genuine answer.
                parse_error = f"{type(exc).__name__}: {exc}"
                parsed = {"claimStance": "Not Support", "climateChangeStance": "Neutral"}

            records.append({
                "persona_id": prow.get("PersonaID"),
                "belief_climate_exists": prow.get("Belief_ClimateExists"),
                "claim_id": c["claim_id"],
                "claim": c["claim_text"],
                "claim_stance_label": c.get("claim_stance_label"),
                "llm_responses": {
                    "claimStance": parsed["claimStance"],
                    "climateChangeStance": parsed["climateChangeStance"]
                },
                "parse_error": parse_error,
                "raw": raw  # keep for audit/debug; remove if you prefer smaller files
            })

    # Write ONE JSON file (array)
    out_path = Path("outputs/agent_claim_outputs.json")
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", encoding="utf-8") as f:
        json.dump(records, f, ensure_ascii=False, indent=2)
    print(f"Saved {len(records)} records -> {out_path}")

# Run the pipeline only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()


Personas: 100%|██████████| 1/1 [00:10<00:00, 10.63s/it]

Saved 2 records -> outputs/agent_claim_outputs.json



