# captions

In [None]:
import pandas as pd
def count_reasoning(x):
    """Count the ``[REASONING]`` markers in a model output.

    Args:
        x: The raw ``output_prompt`` cell. Non-string values (for example the
            NaN that pandas produces for an empty CSV cell) are treated as
            containing no markers.

    Returns:
        int: Number of non-overlapping occurrences of ``"[REASONING]"``.
    """
    if not isinstance(x, str):
        return 0
    return x.count("[REASONING]")

# Per-question caption annotations: the question id plus three flag columns.
caption_flag_columns = ["id", "wrong_caption", "legal_interference", "wrong_legal"]
captions = pd.read_csv("../code/caption_analysis.csv")[caption_flag_columns]


def _load_vqa_strategy(strategy_id, type_label):
    """Load splits 3 and 4 of one VQA strategy and attach the caption flags.

    ``output_prompt`` is replaced by its number of ``[REASONING]`` markers and a
    constant ``type`` column is added before the inner join with ``captions``.
    """
    split_frames = [
        pd.read_csv(f"../results/vqa/vqa_strat_{strategy_id}_split_{split_id}.csv")
        for split_id in (3, 4)
    ]
    runs = pd.concat(split_frames, ignore_index=True)[['id', "exact_match", "output_prompt"]]
    runs["output_prompt"] = runs["output_prompt"].apply(count_reasoning)
    runs['type'] = type_label
    return pd.merge(runs, captions, on=["id"], how="inner")


df1 = _load_vqa_strategy(7, 'caption_only')
df2 = _load_vqa_strategy(8, 'image_only')
df3 = _load_vqa_strategy(9, 'image_caption')

merged_df = pd.concat([df1, df2, df3], ignore_index=True)
print(merged_df)
# columns: id, exact_match, output_prompt (marker count), type, wrong_caption, legal_interference, wrong_legal

In [None]:
# Row count, mean exact-match rate and mean number of reasoning markers for every
# combination of the three caption flags and the input type.
group_keys = ["wrong_caption", "legal_interference", "wrong_legal", "type"]
per_flag_group = merged_df.groupby(group_keys)
grouped = per_flag_group.agg(
    count=("id", "size"),
    exact_match_rate=("exact_match", lambda values: values.mean()),  # booleans average as 0/1
    reasoning_count=("output_prompt", lambda values: values.mean()),
).reset_index()

grouped

In [None]:
import pandas as pd

# Read the caption annotations
path = "../code/caption_analysis.csv"
df = pd.read_csv(path)

# Number of rows for each combination of the three flags
flag_columns = ["wrong_caption", "legal_interference", "wrong_legal"]
rows_per_combination = df.groupby(flag_columns).size()
counts = rows_per_combination.reset_index(name="count")

print(counts)


# judge

In [None]:
import pandas as pd

def load_merge_df(splits=None, specs=None):
    """Load the QA result CSVs of several models and stack them into one frame.

    Args:
        splits: Split identifiers substituted into each path template.
            Defaults to ``["1_train", "1_test", "2"]``.
        specs: ``(model_display_name, path_template)`` pairs, where the template
            contains one ``{}`` placeholder for the split. Defaults to the three
            model runs used in this notebook.

    Returns:
        pd.DataFrame: Columns ``model``, ``split``, ``id``, ``output_prompt``,
        ``exact_match`` and ``input_prompt``; rows are ordered by spec and then
        by split, with a fresh integer index.

    Raises:
        KeyError: If a result file lacks one of the four expected columns.
    """
    if splits is None:
        splits = ["1_train", "1_test", "2"]
    if specs is None:
        specs = [
            # (model_display_name, base_path_with_{split})
            ("o4-mini (best reasoning)", "../results/qa/qa_strat_6_split_{}.csv"),
            ("Mistral (worst)",     "../results/qa_vllm/qa_strat_4_split_{}_vllm.csv"),
            ("Gemma (best open)",         "../results/qa_vllm2/qa_strat_4_split_{}_vllm.csv"),
        ]

    kept_columns = ["id", "output_prompt", "exact_match", "input_prompt"]
    frames = []

    for model_name, base in specs:
        for split in splits:
            df = pd.read_csv(base.format(split))

            sub = df[kept_columns].copy()
            # Provenance columns go first so every row can be traced to its run.
            sub.insert(0, "model", model_name)
            sub.insert(1, "split", split)

            frames.append(sub)

    return pd.concat(frames, ignore_index=True)

In [None]:
import pandas as pd
import ast

def load_inputs():
    """Join the model outputs with the reference questions of dataset V3.

    The outputs come from ``load_merge_df``; the references are the three split
    files stacked together and reduced to ``id``, ``question``, ``answers`` and
    ``explanation``. The two are inner-joined on ``id``.
    """
    reference_paths = [
        "../data/dataset_V3/split_1_train.csv",
        "../data/dataset_V3/split_1_test.csv",
        "../data/dataset_V3/split_2.csv",
    ]
    model_outputs = load_merge_df()
    references = pd.concat(
        [pd.read_csv(ref_path, index_col=False) for ref_path in reference_paths],
        ignore_index=True,
    )
    references = references[["id", "question", "answers" ,"explanation"]]
    return pd.merge(model_outputs, references, on=["id"], how="inner")

In [None]:
from langchain_openai import ChatOpenAI
import os

# Reasoning options forwarded to the model.
reasoning = {
    "effort": "low",  # 'low', 'medium', or 'high'
    "summary": "detailed",  # 'detailed', 'auto', or None
}

# The API key is taken from the OPENAI_API_KEY environment variable so that no
# credential is stored in (or accidentally committed with) the notebook.
llm_o4 = ChatOpenAI(
    model_name="o4-mini",
    api_key=os.environ.get("OPENAI_API_KEY"),
    reasoning=reasoning,
    output_version="responses/v1",
)

In [None]:
from langchain_core.messages import HumanMessage
import asyncio
import pandas as pd
from tqdm import tqdm
import ast

# Prompt template (in Romanian) sent to the judge model. `judge` fills it in with
# str.format using the placeholders question, options_refs, options_correct,
# documents_ref, answer, is_correct and answer_ref. The template asks for an XML
# <verdict> holding eight True/False criteria, each with a short explanation.
llm_judge_prompt = """
Vei primi:
- O întrebare grilă (răspuns multiplu) din testele auto în limba română.
- Documentele legale folosite de o persoană pentru a răspunde.
- Explicația și răspunsul final oferit de acea persoană.
- Explicația oficială și răspunsul corect.

Scopul tău este să evaluezi următoarele criterii (True/False):
1. Există pretenții greșite sau parafrazări incorecte ale legii.
2. Sunt introduse excepții care nu există în lege.
3. Legea este aplicată unei categorii greșite de participanți la trafic.
4. Valorile numerice (distante, viteze, procente etc.) sunt interpretate greșit.
5. Este citat un articol sau paragraf care nu apare identic în referințe.
6. Explicația se contrazice în interiorul ei.
7. Se face o generalizare incorectă dintr-o evidență parțială.
8. Daca deși răspunsul final este corect, este posibil să fi ajuns acolo printr-o greșeală în raționament.

Instrucțiuni:
- Pentru fiecare criteriu, oferă o scurtă explicație (1–2 fraze).
- Marchează explicit <True> sau <False>.
- Răspunde STRICT în format XML, fără text suplimentar.

Iată detaliile de analizat:

<Intrebare>
{question}
</Intrebare>

<VarianteRaspuns>
{options_refs}
</VarianteRaspuns>

<VarianteCorecte>
{options_correct}
</VarianteCorecte>

<DocumenteReferinta>
{documents_ref}
</DocumenteReferinta>

<RaspunsOferit>
{answer}
</RaspunsOferit>

<RaspunsCorect>
{is_correct}
</RaspunsCorect>

<ExplicatieOficiala>
{answer_ref}
</ExplicatieOficiala>

Returnează rezultatul în structura:

<verdict>
  <criteriu id="1">
    <explicatie>...</explicatie>
    <valoare>True/False</valoare>
  </criteriu>
  <criteriu id="2">
    <explicatie>...</explicatie>
    <valoare>True/False</valoare>
  </criteriu>
  ...
  <criteriu id="8">
    <explicatie>...</explicatie>
    <valoare>True/False</valoare>
  </criteriu>
</verdict>
"""


# Upper bound on judge requests in flight at the same time; `judge` holds this
# semaphore around each LLM call.
MAX_CONCURRENT_REQUESTS = 15
semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)

async def judge(item):
    """Ask the judge model to grade one model answer against the references.

    Args:
        item: A row (mapping) with the keys ``answers`` (string literal of a
            list of dicts holding ``answer_text`` and ``is_correct``),
            ``input_prompt``, ``question``, ``output_prompt``, ``exact_match``
            and ``explanation``.

    Returns:
        tuple: ``(input_prompt, output_full, output)`` where ``input_prompt`` is
        the prompt sent to the judge, ``output`` is the judge's answer text and
        ``output_full`` is that answer preceded by one ``[REASONING]`` line per
        reasoning summary.

    Raises:
        IndexError: If ``input_prompt`` does not contain the marker sentence
            that introduces the legal documents.
    """
    # Parse the answer options once; both derived values below use them.
    answer_entries = ast.literal_eval(item['answers'])
    answers = " | ".join([entry['answer_text'] for entry in answer_entries])
    # Only the first character of each correct option is kept
    # (presumably the option letter; verify against the dataset).
    correct_answers = [entry['answer_text'][0] for entry in answer_entries if entry['is_correct']]

    # Keep only the documents section of the original prompt: the text after the
    # marker sentence and before the first separator line.
    documents = item["input_prompt"]
    documents = documents.split("Aceastea sunt legile relevante, dar nu neaparat toate sunt relevante:")[1]
    documents = documents.split("===================")[0]
    input_prompt = llm_judge_prompt.format(
        question=item["question"],
        options_refs=answers,
        options_correct=correct_answers,
        documents_ref=documents,
        answer=item["output_prompt"],
        is_correct="Raspunsul final este corect" if item["exact_match"] else "Raspunsul final este incorect",
        answer_ref=item["explanation"]
    )
    print(input_prompt)
    message = HumanMessage(
        content=[
            {"type": "text", "text": input_prompt}
        ]
    )

    # The semaphore caps the number of concurrent requests.
    async with semaphore:
        res = await llm_o4.ainvoke([message])
        res = res.content

    # The response is read positionally: the first block carries the reasoning
    # summaries and the second block the answer text.
    reasoning_parts = [x['text'] for x in res[0]['summary']]
    output = res[1]['text']
    output_full = "\n".join([f"[REASONING]{x}" for x in reasoning_parts]) + f"\n[OUTPUT]{output}"
    # Printed only after it has been built (the previous order raised UnboundLocalError).
    print(output_full)

    return input_prompt, output_full, output

async def mass_qa_runner(data: pd.DataFrame):
    """Run ``judge`` on every row of ``data`` concurrently and attach the results.

    Args:
        data: Frame whose rows are accepted by ``judge``.

    Returns:
        pd.DataFrame: A copy of ``data`` with three added columns:
        ``judge_in`` (prompt sent to the judge), ``judge_out_full`` (reasoning
        plus answer) and ``judge_out`` (answer only). The input frame is left
        untouched, which also avoids pandas' SettingWithCopyWarning when a slice
        such as ``df.head(1)`` is passed in.
    """
    # Creating the coroutines is instantaneous; the work happens inside gather,
    # with concurrency bounded by the semaphore used in `judge`.
    tasks = [judge(item) for _, item in tqdm(data.iterrows(), total=len(data))]

    # gather returns the results in the order the tasks were given, i.e. row order.
    results = await asyncio.gather(*tasks)

    final_input_prompt = []
    final_output_prompt = []
    final_result = []
    for input_prompt, output_full, output in results:
        final_input_prompt.append(input_prompt)
        final_output_prompt.append(output_full)
        final_result.append(output)
        print(output)

    judged = data.copy()
    judged['judge_in'] = final_input_prompt
    judged['judge_out_full'] = final_output_prompt
    judged['judge_out'] = final_result

    return judged

# Build the judge inputs and run the judge on the first row only.
# NOTE: this rebinds `merged_df`, the same name the caption-analysis cell at the
# top of the notebook uses for a different frame.
merged_df = load_inputs()
result_df = await mass_qa_runner(merged_df.head(1))

#### next

In [None]:
# Scratch copy of the (model_display_name, base_path_with_{split}) entries used by
# `specs` in the next cell. The lines are dedented so the cell parses; they were
# previously indented inconsistently, which raises IndentationError when run.
("o4-mini (best reasoning)", "../results/qa/qa_strat_6_split_{}"),
("Mistral (worst)",     "../results/qa_vllm/qa_strat_4_split_{}_vllm"),
("Gemma (best open)",         "../results/qa_vllm2/qa_strat_4_split_{}_vllm"),

In [None]:
# Single-cell script to read the 9 result DataFrames, normalize, and concatenate
import pandas as pd
from pathlib import Path
import json

# --- Config (edit here if paths change) ---
# Split identifiers substituted into the `{}` placeholder of every base path below.
splits = ["1_train", "1_test", "2"]

# One entry per model run. The base paths carry no file extension; the loading
# loop resolves them to a concrete file with `_guess_file`.
specs = [
    # (model_display_name, base_path_with_{split})
    ("o4-mini (best reasoning)", "../results/qa/qa_strat_6_split_{}"),
    ("Mistral (worst)",     "../results/qa_vllm/qa_strat_4_split_{}_vllm"),
    ("Gemma (best open)",         "../results/qa_vllm2/qa_strat_4_split_{}_vllm"),
]

# --- Helpers ---
def _guess_file(path_str: str) -> Path:
    """
    Given a base path that may be a file without extension or a directory,
    try to resolve to a concrete file we can read.
    Tries common extensions and common filenames inside directories.
    """
    p = Path(path_str)
    # If path already points to a file, use it
    if p.is_file():
        return p

    # Try with common single-file extensions
    for ext in (".parquet", ".csv", ".jsonl", ".json"):
        q = Path(path_str + ext)
        if q.exists() and q.is_file():
            return q

    # If it's a directory, try common filenames; else try first matching file by extension
    if p.exists() and p.is_dir():
        preferred_names = [
            "predictions.parquet", "results.parquet", "df.parquet", "data.parquet",
            "predictions.csv", "results.csv", "df.csv", "data.csv",
            "predictions.jsonl", "results.jsonl", "df.jsonl", "data.jsonl",
            "predictions.json", "results.json", "df.json", "data.json",
        ]
        for name in preferred_names:
            q = p / name
            if q.exists() and q.is_file():
                return q
        # Fallback: first file by extension priority
        for ext in ("*.parquet", "*.csv", "*.jsonl", "*.json"):
            matches = sorted(p.glob(ext))
            if matches:
                return matches[0]

    raise FileNotFoundError(f"Could not resolve a file for base path: {path_str}")

def _read_df(resolved: Path) -> pd.DataFrame:
    if resolved.suffix == ".parquet":
        return pd.read_parquet(resolved)
    if resolved.suffix == ".csv":
        return pd.read_csv(resolved)
    if resolved.suffix in (".jsonl",):
        return pd.read_json(resolved, lines=True)
    if resolved.suffix in (".json",):
        # try records; if not, fallback to auto-infer
        try:
            with open(resolved, "r", encoding="utf-8") as f:
                data = json.load(f)
            if isinstance(data, list):
                return pd.DataFrame(data)
            # try nested under "data" or "results"
            for key in ("data", "results"):
                if key in data and isinstance(data[key], list):
                    return pd.DataFrame(data[key])
            # last resort
            return pd.json_normalize(data)
        except Exception:
            return pd.read_json(resolved)
    raise ValueError(f"Unsupported file type: {resolved}")

def _pick_col(df: pd.DataFrame, candidates) -> str:
    for c in candidates:
        if c in df.columns:
            return c
    # try case-insensitive
    lower = {c.lower(): c for c in df.columns}
    for c in candidates:
        if c.lower() in lower:
            return lower[c.lower()]
    raise KeyError(f"None of the candidate columns found: {candidates}\nAvailable: {list(df.columns)}")

# Accepted spellings for each logical column, in priority order
ID_CANDS          = ["id", "example_id", "sample_id", "question_id", "qid", "row_id"]
OUTPUT_PROMPT_CANDS = ["output_prompt", "final_prompt", "prompt_out", "model_output", "output", "response", "answer"]
EXACT_MATCH_CANDS = ["exact_match", "em", "is_exact_match", "match_exact", "exact"]


def _coerce_flag(value):
    """Map common textual truth values to bool; leave anything else unchanged."""
    token = str(value).strip().lower()
    if token in {"1","true","t","yes","y"}:
        return True
    if token in {"0","false","f","no","n"}:
        return False
    return value


# --- Load, normalize, concat ---
frames = []
provenance = []  # one record per loaded file, for debugging

for model_name, base in specs:
    for split in splits:
        resolved = _guess_file(base.format(split))
        loaded = _read_df(resolved)

        # Resolve the real column names in the order id, output, exact match
        source_columns = [
            _pick_col(loaded, ID_CANDS),
            _pick_col(loaded, OUTPUT_PROMPT_CANDS),
            _pick_col(loaded, EXACT_MATCH_CANDS),
        ]

        piece = loaded[source_columns].copy()
        piece.columns = ["id", "output_prompt", "exact_match"]
        piece.insert(0, "model", model_name)
        piece.insert(1, "split", split)

        frames.append(piece)
        provenance.append({"model": model_name, "split": split, "path": str(resolved), "rows": len(piece)})

# Final aggregated DataFrame
agg = pd.concat(frames, ignore_index=True)

# Object-typed flags may hold strings like "true"/"false"/"1"/"0"; coerce those to bool
if agg["exact_match"].dtype == object:
    agg["exact_match"] = agg["exact_match"].map(_coerce_flag)

# Fixed column order for the saved file
agg = agg[["model", "split", "id", "output_prompt", "exact_match"]]

# Summary of what was loaded, plus the first rows as a sanity check
print("Loaded pieces:")
for entry in provenance:
    print(f"- {entry['model']} | {entry['split']} | {entry['rows']} rows | {entry['path']}")
print("\nAggregated shape:", agg.shape)
display(agg.head(10))

# If you want to save the result as parquet, uncomment:
# agg.to_parquet("./results/qa_aggregated.parquet", index=False)
agg.to_csv("../citations-analytics.csv", index=False)


In [None]:
# Single-cell: add `citations_number` by counting specified regex occurrences in `output_prompt` (case-insensitive)

import re
import pandas as pd

# --- Preconditions ---
# This cell depends on `agg` having been built by an earlier cell in the same
# kernel session; fail with an explicit message when it has not.
if "agg" not in globals():
    raise NameError("Expected a DataFrame named `agg` in memory from the previous step.")
if "output_prompt" not in agg.columns:
    raise KeyError("`agg` must have an `output_prompt` column.")

# Missing outputs become empty strings so that the regex search always receives a str
texts = agg["output_prompt"].fillna("").astype(str)

# --- Citation patterns (matched case-insensitively) ---
# For the {number} placeholder we use (\d+). Hyphens and spaces are matched literally as provided.
# Because matching ignores case, each pattern must appear only once in any casing:
# a former separate lower-case "reg. {number}" entry was identical to the "Reg. {number}"
# entry under IGNORECASE and made every such citation count twice.
patterns = [
    r"\bRegulament-(\d+)\b",
    r"\bReg-(\d+)\b",
    r"\bReg\.\s*(\d+)\b",         # covers "Reg. {number}" and "reg. {number}"
    r"\breg\.-\s*(\d+)\b",        # covers "reg.-{number}"
    r"\bRegulament\s+(\d+)\b",
    r"\bRegulamentul-(\d+)\b",
    r"\bRegulamentul\s+(\d+)\b",

    r"\boug-(\d+)\b",
    r"\boug\s+(\d+)\b",
    r"ordonanței de urgen",       # literal substring, as provided

    r"\bITP-(\d+)\b",

    r"\bRCA-(\d+)\b",

    r"\bPENAL-(\d+)\b",

    r"\bCodul\s+penal\b",
    r"\bCod\s+penal\b",
]

compiled = [re.compile(p, flags=re.IGNORECASE | re.UNICODE) for p in patterns]

def count_all_patterns(text: str) -> int:
    """Return the total number of citation matches in ``text``.

    Each pattern in ``compiled`` is searched independently and its
    non-overlapping matches are counted; the per-pattern counts are summed.
    """
    return sum(1 for rx in compiled for _ in rx.finditer(text))

# One citation count per row, computed from the cleaned output text
agg["citations_number"] = texts.map(count_all_patterns)

# Quick look at the new column, then persist the frame for the later report cells
print("Added `citations_number`. Example rows:")
preview_columns = ["model", "split", "id", "output_prompt", "citations_number"]
display(agg[preview_columns].head(10))
agg.to_csv("../citations-analytics-number.csv", index=False)


In [None]:
import pandas as pd
import ast

# --- Helpers ---
def _parse_list(x):
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        try:
            v = ast.literal_eval(x)
            return v if isinstance(v, list) else []
        except Exception:
            return []
    return []

def read_ir(path: str) -> pd.DataFrame:
    """Read the ``id`` and ``retrieved_documents`` columns of a retrieval result file.

    ``id`` is converted to ``str`` and ``retrieved_documents`` is parsed into a
    list with ``_parse_list``.
    """
    raw = pd.read_csv(path, usecols=["id","retrieved_documents"])
    return raw.assign(
        id=raw["id"].astype(str),
        retrieved_documents=raw["retrieved_documents"].apply(_parse_list),
    )

# Rows are identified by (id, model, split): the same question id occurs once per
# model run, so joining on `id` alone would pair one model's citation count with
# another model's rows.
key_cols = ["id", "model", "split"]

# --- 1. Citations number ---
cit_num = pd.read_csv(
    "../citations-analytics-number.csv",
    usecols=key_cols + ["citations_number"]
)
cit_num["id"] = cit_num["id"].astype(str)
cit_num["split"] = cit_num["split"].astype(str)

# --- 2. Citations list ---
# assumes this file also has one row per (id, model, split) — TODO confirm
cit_list = pd.read_csv(
    "../citations-analytics-list-llm.csv",
    usecols=key_cols + ["citations_list_llm"]
)
cit_list["id"] = cit_list["id"].astype(str)
cit_list["split"] = cit_list["split"].astype(str)
cit_list["citations"] = cit_list["citations_list_llm"].apply(_parse_list)
cit_list = cit_list[key_cols + ["citations"]]

# --- 3. Merge both citation files ---
cit = cit_num.merge(cit_list, on=key_cols, how="inner")

# --- 4. Read IR refs ---
ir_train = read_ir("../results/ir/ir_strat_6_train.csv")
ir_test  = read_ir("../results/ir/ir_strat_6_test.csv")
# `ir_refs` must be built here; with this line commented out the merge below only
# worked when a stale `ir_refs` happened to be left in the kernel.
ir_refs = pd.concat([ir_train, ir_test], ignore_index=True)

# --- 5. Merge citations with refs ---
# The retrieved documents are keyed by question id only.
cit = cit.merge(ir_refs, on="id", how="inner")

# --- 6. Merge with agg (only id, model, split, exact_match) ---
agg_ids = agg[["id","model","split","exact_match"]].copy()
agg_ids["id"] = agg_ids["id"].astype(str)
agg_ids["split"] = agg_ids["split"].astype(str)
cit = cit.merge(agg_ids, on=key_cols, how="left")

# --- 7. Keep only the 1_test split of the o4-mini run ---
cit = cit[cit["split"].isin(["1_test"])].copy()

cit = cit[cit["model"] == "o4-mini (best reasoning)"]

# --- 8. Mark incorrect citations ---
def has_incorrect(citations, refs):
    """Tell whether at least one cited item is absent from ``refs``.

    An empty (falsy) ``citations`` never counts as incorrect.
    """
    if not citations:
        return False
    allowed = set(refs)
    for cited in citations:
        if cited not in allowed:
            return True
    return False

def _row_has_incorrect(row):
    """Apply `has_incorrect` to one row's citations and retrieved documents."""
    return has_incorrect(row["citations"], row["retrieved_documents"])


def _count_true(flags):
    """Number of entries equal to True."""
    return (flags == True).sum()


cit["incorrect_citation"] = cit.apply(_row_has_incorrect, axis=1)

# --- 9. Aggregate report ---
# One row per (model, split, citations_number) bucket
per_bucket = cit.groupby(["model","split","citations_number"])
stats = per_bucket.agg(
    correct=("exact_match", _count_true),
    total=("exact_match","size"),
    incorrect_citations=("incorrect_citation", _count_true),
).reset_index()
stats["accuracy"] = stats["correct"] / stats["total"]
stats["pct_incorrect"] = stats["incorrect_citations"] / stats["total"]

# --- Save / Display table ---
sort_keys = ["model","split","citations_number"]
stats_sorted = stats.sort_values(sort_keys).reset_index(drop=True)

print("Aggregated report per model/split/citations_number:")
display(stats_sorted)

# Persist the table
# NOTE(review): the other report cells in this notebook write to this same path,
# so the file holds whichever report ran last.
out_path = "../results/citations_accuracy_report.csv"
stats_sorted.to_csv(out_path, index=False)
print(f"Saved table to {out_path}")


# Plot: accuracy lines + % incorrect citation bars per model, grouped by citations_number and split

import matplotlib.pyplot as plt
import numpy as np

# --- Preconditions ---
# Fail early with a clear message when the aggregation step has not been run.
req_cols = {"model","split","citations_number","accuracy","pct_incorrect"}
if "stats" not in globals():
    raise NameError("Expected a DataFrame named `stats` from the previous step.")
missing = req_cols - set(stats.columns)
if missing:
    raise KeyError(f"`stats` is missing columns: {missing}")


def _draw_model_panel(ax, model_stats, title):
    """Draw one model's panel: % incorrect bars and accuracy lines per split."""
    # consistent x order
    citation_counts = sorted(model_stats["citations_number"].unique())
    positions = np.array(citation_counts, dtype=float)

    split_names = list(model_stats["split"].unique())
    n_split = len(split_names)
    bar_width = 0.35 / max(n_split, 1)  # bar width per split
    if n_split > 1:
        # Spread the bars symmetrically around each x position to avoid overlap
        half_span = bar_width * (n_split - 1) / 2
        shifts = np.linspace(-half_span, half_span, n_split)
    else:
        shifts = [0.0]

    indexed = {
        name: model_stats[model_stats["split"] == name].set_index("citations_number")
        for name in split_names
    }

    # Bars first (missing buckets drawn as 0) ...
    for shift, name in zip(shifts, split_names):
        frame = indexed[name]
        heights = [frame.loc[k, "pct_incorrect"] if k in frame.index else 0 for k in citation_counts]
        ax.bar(positions + shift, heights, width=bar_width, alpha=0.4, label=f"{name} (% incorrect)")

    # ... then the accuracy lines (missing buckets left as gaps)
    for name in split_names:
        frame = indexed[name]
        accuracies = [frame.loc[k, "accuracy"] if k in frame.index else np.nan for k in citation_counts]
        ax.plot(positions, accuracies, marker="o", label=f"{name} (accuracy)")

    ax.set_title(title)
    ax.set_xlabel("Citations number")
    ax.set_ylabel("Accuracy (lines) / % Incorrect (bars)")
    ax.set_xticks(citation_counts)
    ax.grid(True, linestyle="--", alpha=0.6)
    ax.legend()


models = stats["model"].unique()
n_models = len(models)

fig, axes = plt.subplots(1, n_models, figsize=(7*n_models, 5), sharey=False)
if n_models == 1:
    axes = [axes]  # plt.subplots returns a bare Axes for a single panel

for ax, model in zip(axes, models):
    _draw_model_panel(ax, stats[stats["model"] == model], model)

plt.suptitle("Accuracy vs Citations Count per Model & Split\n(+ % Incorrect Citations)", y=1.02)
plt.tight_layout()
plt.show()



In [None]:
import pandas as pd
import ast

# --- Helpers ---
def _parse_list(x):
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        try:
            v = ast.literal_eval(x)
            return v if isinstance(v, list) else []
        except Exception:
            return []
    return []

def read_ir(path: str) -> pd.DataFrame:
    """Read the ``id`` and ``retrieved_documents`` columns of a retrieval result file.

    ``id`` is converted to ``str`` and ``retrieved_documents`` is parsed into a
    list with ``_parse_list``.
    """
    raw = pd.read_csv(path, usecols=["id","retrieved_documents"])
    return raw.assign(
        id=raw["id"].astype(str),
        retrieved_documents=raw["retrieved_documents"].apply(_parse_list),
    )

# Rows are identified by (id, model, split): the same question id occurs once per
# model run, so joining on `id` alone would pair one model's citation count with
# another model's rows.
key_cols = ["id", "model", "split"]

# --- 1. Citations number ---
cit_num = pd.read_csv(
    "../citations-analytics-number.csv",
    usecols=key_cols + ["citations_number"]
)
cit_num["id"] = cit_num["id"].astype(str)
cit_num["split"] = cit_num["split"].astype(str)

# --- 2. Citations list ---
# assumes this file also has one row per (id, model, split) — TODO confirm
cit_list = pd.read_csv(
    "../citations-analytics-list-llm.csv",
    usecols=key_cols + ["citations_list_llm"]
)
cit_list["id"] = cit_list["id"].astype(str)
cit_list["split"] = cit_list["split"].astype(str)
cit_list["citations"] = cit_list["citations_list_llm"].apply(_parse_list)
cit_list = cit_list[key_cols + ["citations"]]

# --- 3. Merge both citation files ---
cit = cit_num.merge(cit_list, on=key_cols, how="inner")

# --- 4. Read IR refs ---
ir_train = read_ir("../results/ir/ir_strat_6_train.csv")
ir_test  = read_ir("../results/ir/ir_strat_6_test.csv")
ir_refs = pd.concat([ir_train, ir_test], ignore_index=True)

# --- 5. Merge citations with refs ---
# The retrieved documents are keyed by question id only.
cit = cit.merge(ir_refs, on="id", how="inner")

# --- 6. Merge with agg (only id, model, split, exact_match) ---
agg_ids = agg[["id","model","split","exact_match"]].copy()
agg_ids["id"] = agg_ids["id"].astype(str)
agg_ids["split"] = agg_ids["split"].astype(str)
cit = cit.merge(agg_ids, on=key_cols, how="left")

# --- 7. Keep only the 1_train split of the o4-mini run ---
cit = cit[cit["split"].isin(["1_train"])].copy()

cit = cit[cit["model"] == "o4-mini (best reasoning)"]

# --- 8. Mark incorrect citations ---
def has_incorrect(citations, refs):
    """Tell whether at least one cited item is absent from ``refs``.

    An empty (falsy) ``citations`` never counts as incorrect.
    """
    if not citations:
        return False
    allowed = set(refs)
    for cited in citations:
        if cited not in allowed:
            return True
    return False

def _row_has_incorrect(row):
    """Apply `has_incorrect` to one row's citations and retrieved documents."""
    return has_incorrect(row["citations"], row["retrieved_documents"])


def _count_true(flags):
    """Number of entries equal to True."""
    return (flags == True).sum()


cit["incorrect_citation"] = cit.apply(_row_has_incorrect, axis=1)

# --- 9. Aggregate report ---
# One row per (model, split, citations_number) bucket
per_bucket = cit.groupby(["model","split","citations_number"])
stats = per_bucket.agg(
    correct=("exact_match", _count_true),
    total=("exact_match","size"),
    incorrect_citations=("incorrect_citation", _count_true),
).reset_index()
stats["accuracy"] = stats["correct"] / stats["total"]
stats["pct_incorrect"] = stats["incorrect_citations"] / stats["total"]

# --- Save / Display table ---
sort_keys = ["model","split","citations_number"]
stats_sorted = stats.sort_values(sort_keys).reset_index(drop=True)

print("Aggregated report per model/split/citations_number:")
display(stats_sorted)

# Persist the table
# NOTE(review): the other report cells in this notebook write to this same path,
# so the file holds whichever report ran last.
out_path = "../results/citations_accuracy_report.csv"
stats_sorted.to_csv(out_path, index=False)
print(f"Saved table to {out_path}")


# Plot: accuracy lines + % incorrect citation bars per model, grouped by citations_number and split

import matplotlib.pyplot as plt
import numpy as np

# --- Preconditions ---
# Fail early with a clear message when the aggregation step has not been run.
req_cols = {"model","split","citations_number","accuracy","pct_incorrect"}
if "stats" not in globals():
    raise NameError("Expected a DataFrame named `stats` from the previous step.")
missing = req_cols - set(stats.columns)
if missing:
    raise KeyError(f"`stats` is missing columns: {missing}")


def _draw_model_panel(ax, model_stats, title):
    """Draw one model's panel: % incorrect bars and accuracy lines per split."""
    # consistent x order
    citation_counts = sorted(model_stats["citations_number"].unique())
    positions = np.array(citation_counts, dtype=float)

    split_names = list(model_stats["split"].unique())
    n_split = len(split_names)
    bar_width = 0.35 / max(n_split, 1)  # bar width per split
    if n_split > 1:
        # Spread the bars symmetrically around each x position to avoid overlap
        half_span = bar_width * (n_split - 1) / 2
        shifts = np.linspace(-half_span, half_span, n_split)
    else:
        shifts = [0.0]

    indexed = {
        name: model_stats[model_stats["split"] == name].set_index("citations_number")
        for name in split_names
    }

    # Bars first (missing buckets drawn as 0) ...
    for shift, name in zip(shifts, split_names):
        frame = indexed[name]
        heights = [frame.loc[k, "pct_incorrect"] if k in frame.index else 0 for k in citation_counts]
        ax.bar(positions + shift, heights, width=bar_width, alpha=0.4, label=f"{name} (% incorrect)")

    # ... then the accuracy lines (missing buckets left as gaps)
    for name in split_names:
        frame = indexed[name]
        accuracies = [frame.loc[k, "accuracy"] if k in frame.index else np.nan for k in citation_counts]
        ax.plot(positions, accuracies, marker="o", label=f"{name} (accuracy)")

    ax.set_title(title)
    ax.set_xlabel("Citations number")
    ax.set_ylabel("Accuracy (lines) / % Incorrect (bars)")
    ax.set_xticks(citation_counts)
    ax.grid(True, linestyle="--", alpha=0.6)
    ax.legend()


models = stats["model"].unique()
n_models = len(models)

fig, axes = plt.subplots(1, n_models, figsize=(7*n_models, 5), sharey=False)
if n_models == 1:
    axes = [axes]  # plt.subplots returns a bare Axes for a single panel

for ax, model in zip(axes, models):
    _draw_model_panel(ax, stats[stats["model"] == model], model)

plt.suptitle("Accuracy vs Citations Count per Model & Split\n(+ % Incorrect Citations)", y=1.02)
plt.tight_layout()
plt.show()



In [None]:
import pandas as pd
import ast

# --- Helpers ---
def _parse_list(x):
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        try:
            v = ast.literal_eval(x)
            return v if isinstance(v, list) else []
        except Exception:
            return []
    return []

def read_ir(path: str) -> pd.DataFrame:
    """Read the ``id`` and ``retrieved_documents`` columns of a retrieval result file.

    ``id`` is converted to ``str`` and ``retrieved_documents`` is parsed into a
    list with ``_parse_list``.
    """
    raw = pd.read_csv(path, usecols=["id","retrieved_documents"])
    return raw.assign(
        id=raw["id"].astype(str),
        retrieved_documents=raw["retrieved_documents"].apply(_parse_list),
    )

# Rows are identified by (id, model, split): the same question id occurs once per
# model run, so joining on `id` alone would pair one model's citation count with
# another model's rows.
key_cols = ["id", "model", "split"]

# --- 1. Citations number ---
cit_num = pd.read_csv(
    "../citations-analytics-number.csv",
    usecols=key_cols + ["citations_number"]
)
cit_num["id"] = cit_num["id"].astype(str)
cit_num["split"] = cit_num["split"].astype(str)

# --- 2. Citations list ---
# assumes this file also has one row per (id, model, split) — TODO confirm
cit_list = pd.read_csv(
    "../citations-analytics-list-llm.csv",
    usecols=key_cols + ["citations_list_llm"]
)
cit_list["id"] = cit_list["id"].astype(str)
cit_list["split"] = cit_list["split"].astype(str)
cit_list["citations"] = cit_list["citations_list_llm"].apply(_parse_list)
cit_list = cit_list[key_cols + ["citations"]]

# --- 3. Merge both citation files ---
cit = cit_num.merge(cit_list, on=key_cols, how="inner")

# --- 4. Read IR refs ---
ir_train = read_ir("../results/ir/ir_strat_6_train.csv")
ir_test  = read_ir("../results/ir/ir_strat_6_test.csv")
ir_refs = pd.concat([ir_train, ir_test], ignore_index=True)

# --- 5. Merge citations with refs ---
# The retrieved documents are keyed by question id only.
cit = cit.merge(ir_refs, on="id", how="inner")

# --- 6. Merge with agg (only id, model, split, exact_match) ---
agg_ids = agg[["id","model","split","exact_match"]].copy()
agg_ids["id"] = agg_ids["id"].astype(str)
agg_ids["split"] = agg_ids["split"].astype(str)
cit = cit.merge(agg_ids, on=key_cols, how="left")

# --- 7. Keep only the 1_test split of the Gemma run ---
cit = cit[cit["split"].isin(["1_test"])].copy()

cit = cit[cit["model"] == "Gemma (best open)"]

# --- 8. Mark incorrect citations ---
def has_incorrect(citations, refs):
    """Tell whether at least one cited item is absent from ``refs``.

    An empty (falsy) ``citations`` never counts as incorrect.
    """
    if not citations:
        return False
    allowed = set(refs)
    for cited in citations:
        if cited not in allowed:
            return True
    return False

def _row_has_incorrect(row):
    """Apply `has_incorrect` to one row's citations and retrieved documents."""
    return has_incorrect(row["citations"], row["retrieved_documents"])


def _count_true(flags):
    """Number of entries equal to True."""
    return (flags == True).sum()


cit["incorrect_citation"] = cit.apply(_row_has_incorrect, axis=1)

# --- 9. Aggregate report ---
# One row per (model, split, citations_number) bucket
per_bucket = cit.groupby(["model","split","citations_number"])
stats = per_bucket.agg(
    correct=("exact_match", _count_true),
    total=("exact_match","size"),
    incorrect_citations=("incorrect_citation", _count_true),
).reset_index()
stats["accuracy"] = stats["correct"] / stats["total"]
stats["pct_incorrect"] = stats["incorrect_citations"] / stats["total"]

# --- Save / Display table ---
sort_keys = ["model","split","citations_number"]
stats_sorted = stats.sort_values(sort_keys).reset_index(drop=True)

print("Aggregated report per model/split/citations_number:")
display(stats_sorted)

# Persist the table
# NOTE(review): the other report cells in this notebook write to this same path,
# so the file holds whichever report ran last.
out_path = "../results/citations_accuracy_report.csv"
stats_sorted.to_csv(out_path, index=False)
print(f"Saved table to {out_path}")


# Plot: accuracy lines + % incorrect citation bars per model, grouped by citations_number and split

import matplotlib.pyplot as plt
import numpy as np

# --- Preconditions ---
# Fail early with a clear message when the aggregation step has not been run.
req_cols = {"model","split","citations_number","accuracy","pct_incorrect"}
if "stats" not in globals():
    raise NameError("Expected a DataFrame named `stats` from the previous step.")
missing = req_cols - set(stats.columns)
if missing:
    raise KeyError(f"`stats` is missing columns: {missing}")


def _draw_model_panel(ax, model_stats, title):
    """Draw one model's panel: % incorrect bars and accuracy lines per split."""
    # consistent x order
    citation_counts = sorted(model_stats["citations_number"].unique())
    positions = np.array(citation_counts, dtype=float)

    split_names = list(model_stats["split"].unique())
    n_split = len(split_names)
    bar_width = 0.35 / max(n_split, 1)  # bar width per split
    if n_split > 1:
        # Spread the bars symmetrically around each x position to avoid overlap
        half_span = bar_width * (n_split - 1) / 2
        shifts = np.linspace(-half_span, half_span, n_split)
    else:
        shifts = [0.0]

    indexed = {
        name: model_stats[model_stats["split"] == name].set_index("citations_number")
        for name in split_names
    }

    # Bars first (missing buckets drawn as 0) ...
    for shift, name in zip(shifts, split_names):
        frame = indexed[name]
        heights = [frame.loc[k, "pct_incorrect"] if k in frame.index else 0 for k in citation_counts]
        ax.bar(positions + shift, heights, width=bar_width, alpha=0.4, label=f"{name} (% incorrect)")

    # ... then the accuracy lines (missing buckets left as gaps)
    for name in split_names:
        frame = indexed[name]
        accuracies = [frame.loc[k, "accuracy"] if k in frame.index else np.nan for k in citation_counts]
        ax.plot(positions, accuracies, marker="o", label=f"{name} (accuracy)")

    ax.set_title(title)
    ax.set_xlabel("Citations number")
    ax.set_ylabel("Accuracy (lines) / % Incorrect (bars)")
    ax.set_xticks(citation_counts)
    ax.grid(True, linestyle="--", alpha=0.6)
    ax.legend()


models = stats["model"].unique()
n_models = len(models)

fig, axes = plt.subplots(1, n_models, figsize=(7*n_models, 5), sharey=False)
if n_models == 1:
    axes = [axes]  # plt.subplots returns a bare Axes for a single panel

for ax, model in zip(axes, models):
    _draw_model_panel(ax, stats[stats["model"] == model], model)

plt.suptitle("Accuracy vs Citations Count per Model & Split\n(+ % Incorrect Citations)", y=1.02)
plt.tight_layout()
plt.show()



In [None]:
import pandas as pd
import ast
import matplotlib.pyplot as plt
import numpy as np

# --- Helpers ---
def _parse_list(x):
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        try:
            v = ast.literal_eval(x)
            return v if isinstance(v, list) else []
        except Exception:
            return []
    return []

def read_ir(path: str) -> pd.DataFrame:
    """Read the ``id`` and ``retrieved_documents`` columns of a retrieval result file.

    ``id`` is converted to ``str`` and ``retrieved_documents`` is parsed into a
    list with ``_parse_list``.
    """
    raw = pd.read_csv(path, usecols=["id","retrieved_documents"])
    return raw.assign(
        id=raw["id"].astype(str),
        retrieved_documents=raw["retrieved_documents"].apply(_parse_list),
    )

# --- Select model and split early ---
target_model = "o4-mini (best reasoning)"
target_split = "1_test"

# Rows are identified by (id, model, split): the same question id occurs once per
# model run, so filtering or joining on `id` alone would let the other models'
# citation rows through.
key_cols = ["id", "model", "split"]

agg_sub = agg[(agg["model"] == target_model) & (agg["split"] == target_split)].copy()
agg_sub["id"] = agg_sub["id"].astype(str)
agg_sub["split"] = agg_sub["split"].astype(str)

# --- 1. Citations number ---
cit_num = pd.read_csv(
    "../citations-analytics-number.csv",
    usecols=key_cols + ["citations_number"]
)
cit_num["id"] = cit_num["id"].astype(str)
cit_num["split"] = cit_num["split"].astype(str)
cit_num = cit_num[(cit_num["model"] == target_model) & (cit_num["split"] == target_split)]

# --- 2. Citations list ---
# assumes this file also has one row per (id, model, split) — TODO confirm
cit_list = pd.read_csv(
    "../citations-analytics-list-llm.csv",
    usecols=key_cols + ["citations_list_llm"]
)
cit_list["id"] = cit_list["id"].astype(str)
cit_list["split"] = cit_list["split"].astype(str)
cit_list["citations"] = cit_list["citations_list_llm"].apply(_parse_list)
cit_list = cit_list[key_cols + ["citations"]]
cit_list = cit_list[(cit_list["model"] == target_model) & (cit_list["split"] == target_split)]

# --- 3. Merge both citation files ---
cit = cit_num.merge(cit_list, on=key_cols, how="inner")

# --- 4. Read IR refs ---
ir_train = read_ir("../results/ir/ir_strat_6_train.csv")
ir_test  = read_ir("../results/ir/ir_strat_6_test.csv")
ir_refs = pd.concat([ir_train, ir_test], ignore_index=True)

# --- 5. Merge citations with refs ---
# The retrieved documents are keyed by question id only.
cit = cit.merge(ir_refs, on="id", how="inner")

# --- 6. Merge with filtered agg ---
cit = cit.merge(agg_sub[["id","model","split","exact_match"]], on=key_cols, how="left")

# --- 7. Mark incorrect citations ---
def has_incorrect(citations, refs):
    """Return True when at least one citation is absent from ``refs``.

    An empty (or otherwise falsy) ``citations`` value is never incorrect.
    """
    if not citations:
        return False
    known = set(refs)
    for cited in citations:
        if cited not in known:
            return True
    return False

# Flag rows whose citation list contains a document that was not retrieved.
cit["incorrect_citation"] = cit.apply(
    lambda row: has_incorrect(row["citations"], row["retrieved_documents"]), axis=1
)

# --- 8. Aggregate report ---
# Per (model, split, citations_number): number of exact matches, bucket size,
# and number of rows flagged with an incorrect citation.
stats = (
    cit.groupby(["model","split","citations_number"])
       .agg(
           correct=("exact_match", lambda x: (x==True).sum()),
           total=("exact_match","size"),
           incorrect_citations=("incorrect_citation", lambda x: (x==True).sum())
       )
       .reset_index()
)
stats["accuracy"] = stats["correct"] / stats["total"]
stats["pct_incorrect"] = stats["incorrect_citations"] / stats["total"]

print("Aggregated report:")
display(stats.sort_values(["citations_number"]).reset_index(drop=True))

# --- 9. Plot ---
fig, ax = plt.subplots(figsize=(7,5))

sub = stats
x_vals = sorted(sub["citations_number"].unique())
x = np.array(x_vals, dtype=float)
# Build the citations_number index once instead of once per x value.
sub_idx = sub.set_index("citations_number")

# Bars: % incorrect
y_bars = [sub_idx.loc[k, "pct_incorrect"] for k in x_vals]
ax.bar(x - 0.15, y_bars, width=0.3, alpha=0.4, label="Citation hallucination rate")

# Line: accuracy
y_line = [sub_idx.loc[k, "accuracy"] for k in x_vals]
ax.plot(x, y_line, marker="o", color="C1", label="Accuracy")

ax.set_title(f"{target_model} | {target_split}")
ax.set_xlabel("Citations number")
ax.set_ylabel("Rate")
ax.set_xticks(x_vals)
ax.grid(True, linestyle="--", alpha=0.6)
ax.legend()

plt.suptitle("Accuracy vs Citation Hallucination Rate", y=1.02)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import ast
df3 = pd.read_csv("../citations-analytics-list-llm.csv", index_col=False)[["model", "split", "output_prompt", "citations_list_llm"]]
# Keep only answers for which the extracted citation list is non-empty.
df3 = df3[df3["citations_list_llm"].apply(ast.literal_eval).apply(len) > 0]

# Citation spellings targeted by the pattern below (X = number), each with
# either a hyphen or a space before the number:
#   OUG-X / OUG X, Regulament-X / Regulament X, Regulamentul-X / Regulamentul X,
#   ITP-X / ITP X, RCA-X / RCA X, PENAL-X / PENAL X

import re

# number = digits with optional .segments, but no slashes
number_part = r'\d+(?:\.\d+)*'

# main regex (case-insensitive)
pattern = re.compile(
    rf'\b(?:OUG|Regulamentul|Regulament|ITP|RCA|PENAL)[ -]{number_part}\b',
    flags=re.IGNORECASE
)

# True if NO valid citation pattern is found in output_prompt
df3['wrong_cit_format'] = ~df3['output_prompt'].astype(str).str.contains(pattern, na=False)

# Per (model, split): number of answers, number without a well-formed citation,
# and that share as a percentage.
stats = (
    df3.groupby(["model", "split"])
       .agg(
           total=("wrong_cit_format", "size"),
           wrong=("wrong_cit_format", "sum")
       )
       .assign(
           wrong_pct=lambda d: 100 * d["wrong"] / d["total"]
       )
       .reset_index()
)

print(stats.round(2).to_latex(
    index=False,          # don't print row indices
    escape=True,          # escape LaTeX specials, e.g. the "_" in the wrong_pct header
    column_format="l l r r r",  # adjust alignment: l=left, r=right
    # The caption is emitted verbatim, so it must not contain a bare "&".
    caption="Wrong citation format per model and split",
    label="tab:wrong_citation_format"
))


### Citation hallucination rate vs. accuracy


In [None]:
import pandas as pd
import ast

# Per-answer analytics, plus the citation-count and citation-list side tables
# trimmed to their join keys and payload column.
df1 = pd.read_csv("../citations-analytics.csv", index_col=False)
df2 = pd.read_csv("../citations-analytics-number.csv", index_col=False)
df2 = df2[["model", "split", "id", "citations_number"]]
df3 = pd.read_csv("../citations-analytics-list-llm.csv", index_col=False)
df3 = df3[["model", "split", "id", "citations_list_llm"]]

# Retrieved documents for the train and test IR runs, stacked into one table.
ir_train = pd.read_csv("../results/ir/ir_strat_6_train.csv", index_col=False)
ir_test  = pd.read_csv("../results/ir/ir_strat_6_test.csv", index_col=False)
ir_refs = pd.concat([ir_train, ir_test], ignore_index=True)[["id", "retrieved_documents"]]

# Inner joins: analytics tables on (model, split, id), IR references on id only.
merged_df = (
    df1
    .merge(df2, on=["model", "split", "id"], how="inner")
    .merge(df3, on=["model", "split", "id"], how="inner")
    .merge(ir_refs, on=["id"], how="inner")
)


def has_incorrect(citations, refs):
    """Return True when at least one citation is absent from ``refs``.

    Parameters
    ----------
    citations : list
        Cited document ids; an empty/falsy value is never incorrect.
    refs : str or iterable
        Retrieved document ids, either as the string repr of a list (as read
        from the CSV) or as an already-parsed collection.
    """
    # Short-circuit first so refs are not parsed when there is nothing to check.
    if not citations:
        return False
    if isinstance(refs, str):
        refs = ast.literal_eval(refs)
    ref_set = set(refs)
    return any(c not in ref_set for c in citations)

def deduplicate(citations):
    """Return the unique citations in first-seen order.

    ``citations`` may be the string repr of a list (as read from the CSV) or an
    already-parsed list/tuple, so applying the function twice is harmless.
    ``dict.fromkeys`` keeps insertion order, which makes the result
    deterministic (a plain ``set`` does not guarantee any order).
    """
    if isinstance(citations, str):
        citations = ast.literal_eval(citations)
    return list(dict.fromkeys(citations))

def transform_len(citations):
    """Return the number of entries in ``citations``."""
    n_citations = len(citations)
    return n_citations

# Parse + deduplicate each citation list. This is a single-column transform, so
# Series.map is used instead of a row-wise DataFrame.apply (which is slower and
# returns a DataFrame rather than a Series when the frame has no rows).
merged_df['citations_list_llm'] = merged_df['citations_list_llm'].map(deduplicate)

# Flag rows citing at least one document outside their retrieved set.
merged_df["incorrect_citation"] = [
    has_incorrect(cits, refs)
    for cits, refs in zip(merged_df["citations_list_llm"], merged_df["retrieved_documents"])
]

# Replace the CSV-provided count with the size of the deduplicated list.
merged_df["citations_number"] = merged_df["citations_list_llm"].map(transform_len)

# Show head
merged_df.head()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# --- 1) Aggregate report from merged_df ---
# Fail early if merged_df lacks any column the aggregation reads.
needed = {"model","split","citations_number","incorrect_citation","exact_match"}
missing = needed.difference(merged_df.columns)
if missing:
    raise ValueError(f"'merged_df' is missing required columns: {missing}")

by_bucket = merged_df.groupby(["model", "split", "citations_number"], as_index=False)
stats = by_bucket.agg(
    correct=("exact_match", lambda x: (x == True).sum()),
    total=("exact_match", "size"),
    incorrect_citations=("incorrect_citation", lambda x: (x == True).sum()),
)
stats["accuracy"] = stats["correct"] / stats["total"]
stats["pct_incorrect"] = stats["incorrect_citations"] / stats["total"]
# Keep only buckets with more than two answers.
stats = stats[stats["total"] > 2]

# One markdown table per model, restricted to the "1_test" split.
report_cols = ["model", "split", "citations_number", "total", "accuracy", "pct_incorrect"]
for model_name in ("Gemma (best open)", "Mistral (worst)", "o4-mini (best reasoning)"):
    rows = (stats["model"] == model_name) & (stats["split"] == "1_test")
    print(stats[report_cols][rows].to_markdown())



# print("Aggregated report:")
# print(stats)
# display(stats.sort_values(["model","split","citations_number"]).reset_index(drop=True))

# --- 2) Plot per (model, split): bars = pct_incorrect, line = accuracy ---
def plot_model_split(sub_stats, target_model, target_split):
    """Plot hallucination-rate bars and an accuracy line for one (model, split).

    Parameters
    ----------
    sub_stats : pd.DataFrame
        Must provide the columns ``model``, ``split``, ``citations_number``,
        ``pct_incorrect`` and ``accuracy``.
    target_model, target_split
        Values matched against the ``model`` and ``split`` columns.

    Returns
    -------
    None
        Nothing is drawn when no row matches the requested pair.
    """
    sub = sub_stats[(sub_stats["model"] == target_model) & (sub_stats["split"] == target_split)]
    if sub.empty:
        return

    x_vals = sorted(sub["citations_number"].unique())
    x = np.array(x_vals, dtype=float)
    sub_idx = sub.set_index("citations_number")

    # Bars: % incorrect
    y_bars = [float(sub_idx.loc[k, "pct_incorrect"]) for k in x_vals]

    # Line: accuracy
    y_line = [float(sub_idx.loc[k, "accuracy"]) for k in x_vals]

    fig, ax = plt.subplots(figsize=(7, 5))
    ax.bar(x - 0.15, y_bars, width=0.3, alpha=0.4, label="Citation hallucination rate")
    # Explicit colour keeps the line distinguishable from the bars, as in the
    # single-model plots of this notebook, instead of relying on default cycles.
    ax.plot(x, y_line, marker="o", color="C1", label="Accuracy")

    ax.set_title(f"{target_model} | {target_split}")
    ax.set_xlabel("Citations number")
    ax.set_ylabel("Rate")
    ax.set_xticks(x_vals)
    ax.grid(True, linestyle="--", alpha=0.6)
    ax.legend()

    plt.suptitle("Accuracy vs Citation Hallucination Rate", y=1.02)
    plt.tight_layout()
    plt.show()
    # One figure is created per call; close it so repeated calls do not
    # accumulate open figures.
    plt.close(fig)

# Draw one figure for every (model, split) pair; pairs without rows are skipped
# inside plot_model_split.
model_names = sorted(stats["model"].unique())
split_names = sorted(stats["split"].unique())
for m in model_names:
    for s in split_names:
        plot_model_split(stats, m, s)


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# --- 1) Aggregate report from merged_df ---
import os

needed = {"model", "split", "citations_number", "incorrect_citation", "exact_match"}
missing = needed - set(merged_df.columns)
if missing:
    raise ValueError(f"'merged_df' is missing required columns: {missing}")

# Per (model, split, citations_number): exact matches, bucket size, and rows
# flagged with an incorrect citation.
stats = (
    merged_df
    .groupby(["model", "split", "citations_number"], as_index=False)
    .agg(
        correct=("exact_match", lambda x: (x == True).sum()),
        total=("exact_match", "size"),
        incorrect_citations=("incorrect_citation", lambda x: (x == True).sum()),
    )
)
stats["accuracy"] = stats["correct"] / stats["total"]
stats["pct_incorrect"] = stats["incorrect_citations"] / stats["total"]
# Keep only buckets with more than two answers.
stats = stats[stats["total"] > 2]

# --- 2) Combined plot ---
fig, ax = plt.subplots(figsize=(10, 6))

# consistent color per MODEL (not per split)
models = stats["model"].unique().tolist()
palette = plt.cm.tab10.colors
# Cycle through the palette so that more models than palette entries still map
# to a colour (zip() would silently drop the extra models and fail later).
color_map = {m: palette[i % len(palette)] for i, m in enumerate(models)}

# style maps for splits
bar_hatch_map = {"1_train": None, "1_test": "//"}
line_style_map = {"1_train": "-", "1_test": "--"}

groups = list(stats.groupby(["model", "split"]))
n_groups = len(groups)
width = 0.12  # bar width / horizontal offset step

for i, ((model, split), sub) in enumerate(groups):
    sub = sub.sort_values("citations_number")
    x_vals = sub["citations_number"].to_numpy(dtype=float)
    y_halluc = sub["pct_incorrect"].to_numpy(dtype=float)
    y_acc = sub["accuracy"].to_numpy(dtype=float)

    # center the grouped bars around each x by offsetting per (model, split)
    offset = (i - (n_groups - 1) / 2) * width

    # Bars: hallucination rate
    ax.bar(
        x_vals + offset,
        y_halluc,
        width=width,
        alpha=0.45,
        color=color_map[model],
        hatch=bar_hatch_map.get(split, None),
        label=f"{model} | {split} (Halluc.)"
    )

    # Line: accuracy
    ax.plot(
        x_vals,
        y_acc,
        marker="o",
        linewidth=1.8,
        color=color_map[model],
        linestyle=line_style_map.get(split, "-"),
        label=f"{model} | {split} (Acc.)"
    )

ax.set_title("Accuracy vs Citation Hallucination Rate (all models/splits)")
ax.set_xlabel("Citations number")
ax.set_ylabel("Rate")
ax.set_xticks(sorted(stats["citations_number"].unique()))
ax.grid(True, linestyle="--", alpha=0.6)
ax.legend(bbox_to_anchor=(1.02, 1), loc="upper left")
plt.tight_layout()
# savefig does not create missing directories; make sure the target exists.
out_path = '../plots/new_plots/acc_vs_hallucination.pdf'
os.makedirs(os.path.dirname(out_path), exist_ok=True)
plt.savefig(out_path, bbox_inches='tight')
plt.show()
plt.close(fig)


In [None]:
import pandas as pd
import ast
import matplotlib.pyplot as plt
import numpy as np

# --- Helpers ---
def _parse_list(x):
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        try:
            v = ast.literal_eval(x)
            return v if isinstance(v, list) else []
        except Exception:
            return []
    return []

def read_ir(path: str) -> pd.DataFrame:
    """Load an IR results CSV, keeping only ``id`` and ``retrieved_documents``.

    ``id`` is cast to ``str`` and each ``retrieved_documents`` cell is passed
    through ``_parse_list``.
    """
    return pd.read_csv(path, usecols=["id", "retrieved_documents"]).assign(
        id=lambda d: d["id"].astype(str),
        retrieved_documents=lambda d: d["retrieved_documents"].apply(_parse_list),
    )

# --- Select model and split early ---
target_model = "Gemma (best open)"
target_split = "1_test"

# The citation CSVs carry `model` and `split` columns next to `id`, so every
# filter/merge on them is keyed on all three. Matching on `id` alone would pair
# this model's answers with citation rows of other models/splits that share the
# same id and multiply rows in the merges below.
key_cols = ["model", "split", "id"]

agg_sub = agg[(agg["model"] == target_model) & (agg["split"] == target_split)].copy()
agg_sub["id"] = agg_sub["id"].astype(str)

# --- 1. Citations number ---
cit_num = pd.read_csv(
    "../citations-analytics-number.csv",
    usecols=key_cols + ["citations_number"]
)
cit_num["id"] = cit_num["id"].astype(str)
cit_num = cit_num[
    (cit_num["model"] == target_model)
    & (cit_num["split"] == target_split)
    & cit_num["id"].isin(agg_sub["id"])
]

# --- 2. Citations list ---
cit_list = pd.read_csv(
    "../citations-analytics-list-llm.csv",
    usecols=key_cols + ["citations_list_llm"]
)
cit_list["id"] = cit_list["id"].astype(str)
cit_list["citations"] = cit_list["citations_list_llm"].apply(_parse_list)
cit_list = cit_list[key_cols + ["citations"]]
cit_list = cit_list[
    (cit_list["model"] == target_model)
    & (cit_list["split"] == target_split)
    & cit_list["id"].isin(agg_sub["id"])
]

# --- 3. Merge both citation files ---
cit = cit_num.merge(cit_list, on=key_cols, how="inner")

# --- 4. Read IR refs ---
ir_train = read_ir("../results/ir/ir_strat_6_train.csv")
ir_test  = read_ir("../results/ir/ir_strat_6_test.csv")
ir_refs = pd.concat([ir_train, ir_test], ignore_index=True)

# --- 5. Merge citations with refs ---
# The IR files are read with only `id` and `retrieved_documents`, so `id` is the only key here.
cit = cit.merge(ir_refs, on="id", how="inner")

# --- 6. Merge with filtered agg ---
cit = cit.merge(agg_sub[key_cols + ["exact_match"]], on=key_cols, how="left")

# --- 7. Mark incorrect citations ---
def has_incorrect(citations, refs):
    """Return True when at least one citation is absent from ``refs``.

    An empty (or otherwise falsy) ``citations`` value is never incorrect.
    """
    if not citations:
        return False
    known = set(refs)
    for cited in citations:
        if cited not in known:
            return True
    return False

# Flag rows whose citation list contains a document that was not retrieved.
cit["incorrect_citation"] = cit.apply(
    lambda row: has_incorrect(row["citations"], row["retrieved_documents"]), axis=1
)

# --- 8. Aggregate report ---
# Per (model, split, citations_number): number of exact matches, bucket size,
# and number of rows flagged with an incorrect citation.
stats = (
    cit.groupby(["model","split","citations_number"])
       .agg(
           correct=("exact_match", lambda x: (x==True).sum()),
           total=("exact_match","size"),
           incorrect_citations=("incorrect_citation", lambda x: (x==True).sum())
       )
       .reset_index()
)
stats["accuracy"] = stats["correct"] / stats["total"]
stats["pct_incorrect"] = stats["incorrect_citations"] / stats["total"]

print("Aggregated report:")
display(stats.sort_values(["citations_number"]).reset_index(drop=True))

# --- 9. Plot ---
fig, ax = plt.subplots(figsize=(7,5))

sub = stats
x_vals = sorted(sub["citations_number"].unique())
x = np.array(x_vals, dtype=float)
# Build the citations_number index once instead of once per x value.
sub_idx = sub.set_index("citations_number")

# Bars: % incorrect
y_bars = [sub_idx.loc[k, "pct_incorrect"] for k in x_vals]
ax.bar(x - 0.15, y_bars, width=0.3, alpha=0.4, label="Citation hallucination rate")

# Line: accuracy
y_line = [sub_idx.loc[k, "accuracy"] for k in x_vals]
ax.plot(x, y_line, marker="o", color="C1", label="Accuracy")

ax.set_title(f"{target_model} | {target_split}")
ax.set_xlabel("Citations number")
ax.set_ylabel("Rate")
ax.set_xticks(x_vals)
ax.grid(True, linestyle="--", alpha=0.6)
ax.legend()

plt.suptitle("Accuracy vs Citation Hallucination Rate", y=1.02)
plt.tight_layout()
plt.show()


In [None]:
# TODO: read "../citations-analytics.csv" (index_col=False) and show its head

In [None]:
# LLM-extracted citation lists, restricted to the id and list columns.
cit_list = pd.read_csv("../citations-analytics-list-llm.csv", usecols=["id", "citations_list_llm"])

# Split names; each one is meant to fill the "{}" of a path template below.
splits = ["1_train", "1_test", "2"]

# (model display name, results-path template with a "{}" slot for the split).
specs = [
    (
        "o4-mini (best reasoning)",
        "../results/qa/qa_strat_6_split_{}.csv",
    ),
    (
        "Mistral (worst)",
        "../results/qa_vllm/qa_strat_4_split_{}_vllm.csv",
    ),
    (
        "Gemma (best open)",
        "../results/qa_vllm2/qa_strat_4_split_{}_vllm.csv",
    ),
]

In [None]:
# Single-cell script to read the 9 result DataFrames, normalize, and concatenate
import ast
import pandas as pd
from pathlib import Path

# --- Config (edit here if paths change) ---
# Split names; each one fills the "{}" slot of a path template in `specs`.
splits = ["1_train", "1_test", "2"]

# (model display name, results-path template with a "{}" slot for the split).
specs = [
    (
        "o4-mini (best reasoning)",
        "../results/qa/qa_strat_6_split_{}.csv",
    ),
    (
        "Mistral (worst)",
        "../results/qa_vllm/qa_strat_4_split_{}_vllm.csv",
    ),
    (
        "Gemma (best open)",
        "../results/qa_vllm2/qa_strat_4_split_{}_vllm.csv",
    ),
]

# --- Helpers ---
VALID_CHOICES = {"A", "B", "C"}

def parse_qa_result(cell):
    """
    Parse one ``qa_result`` cell into a list of answer choices.

    Parameters
    ----------
    cell : str, list, tuple, or missing value
        Strings are parsed with ``ast.literal_eval``; any other value is used
        as-is.

    Returns
    -------
    (parsed, wrong_format) : tuple
        ``parsed`` is the list of stripped string choices, or ``None`` when the
        cell is missing, cannot be parsed, or is not a list/tuple.
        ``wrong_format`` is True in those cases and also when any element is
        not exactly one of ``VALID_CHOICES``.
    """
    # Missing -> wrong. Only scalars are tested: pd.isna() on a list/tuple
    # returns an array whose truth value is ambiguous, so an already-parsed
    # cell with two or more items would raise here.
    if pd.api.types.is_scalar(cell) and pd.isna(cell):
        return None, True
    try:
        parsed = ast.literal_eval(cell) if isinstance(cell, str) else cell
    except Exception:
        return None, True

    # Must be a list/tuple; its elements are validated below.
    if not isinstance(parsed, (list, tuple)):
        return None, True

    # Normalize to list[str]
    wrong = False
    out = []
    for x in parsed:
        if isinstance(x, str):
            choice = x.strip()
        else:
            # Non-string elements are kept as their str() form; they count as
            # valid only if that text is itself one of VALID_CHOICES.
            try:
                choice = str(x).strip()
            except Exception:
                wrong = True
                continue

        out.append(choice)
        if choice not in VALID_CHOICES:
            wrong = True

    return out, wrong

# --- Load, normalize, concat ---
frames = []
missing_files = []

for model_name, tmpl in specs:
    for split in splits:
        path = Path(tmpl.format(split))
        if not path.exists():
            # Record the gap and continue; missing paths are listed at the end.
            missing_files.append(str(path))
            continue

        df = pd.read_csv(path)
        # Ensure qa_result column exists
        if "qa_result" not in df.columns:
            # Create placeholder to keep schema consistent, mark all wrong
            df["qa_result"] = None

        # Parse/validate qa_result. The two columns are built with
        # comprehensions rather than zip(*...), because unpacking an empty zip
        # raises ValueError for a CSV without data rows.
        results = [parse_qa_result(v) for v in df["qa_result"]]
        parsed = [p for p, _ in results]
        wrong = [w for _, w in results]
        df["qa_result_parsed"] = parsed
        # Explicit bool dtype so the column stays boolean even with zero rows.
        df["wrong_format"] = pd.Series(wrong, index=df.index, dtype=bool)

        # Normalize essential metadata columns
        df["model"] = model_name
        df["split"] = split
        df["source_path"] = str(path)

        # Optional: move common columns to front if present
        front_cols = [c for c in ["model", "split", "source_path", "qa_result", "qa_result_parsed", "wrong_format"] if c in df.columns]
        other_cols = [c for c in df.columns if c not in front_cols]
        df = df[front_cols + other_cols]

        frames.append(df)

# Concatenate all available frames
if frames:
    all_df = pd.concat(frames, ignore_index=True)
else:
    all_df = pd.DataFrame(columns=["model", "split", "source_path", "qa_result", "qa_result_parsed", "wrong_format"])

# --- Stats: incorrectness rate per model & split ---
if not all_df.empty:
    # total = rows per (model, split); wrong_count = rows flagged wrong_format.
    grp = (
        all_df.groupby(["model", "split"], dropna=False)["wrong_format"]
        .agg(total="size", wrong_count="sum")
        .reset_index()
    )
    grp["incorrectness_rate"] = grp["wrong_count"] / grp["total"] * 100
    grp = grp.sort_values(["model", "split"]).reset_index(drop=True)

    # Display results
    pd.set_option("display.max_rows", 200)
    print("=== Incorrectness rate per model & split ===")
    print(grp.to_string(index=False))
else:
    print("No data loaded. Check that the expected CSV files exist.")

# --- Optional: also print any missing files to help debugging ---
if missing_files:
    print("\nThe following expected files were not found:")
    for p in missing_files:
        print(" -", p)


In [None]:
# Notes: citation formats written with a slash
#   OUG x/y
#   Regulamentul x/y
#   Regulamentul 147/2003

In [None]:
# LaTeX export of `grp`.
# NOTE(review): assumes `grp` is still the wrong-format rate table (model, split,
# total, wrong_count, incorrectness_rate) built by the loading cell; confirm.
# Caption and label describe that table and no longer reuse the ones of the
# picked-vs-reference table, which would produce a duplicate LaTeX label.
latex_str = grp.round(2).to_latex(
    index=False,          # don't print row indices
    escape=True,          # escape LaTeX specials, e.g. "_" in column headers
    column_format="l l r r r",  # adjust alignment: l=left, r=right
    # The caption is emitted verbatim, so it must not contain a bare "&".
    caption="Wrong answer format rate per model and split",
    label="tab:wrong_answer_format"
)

print("\n=== LaTeX table ===\n")
print(latex_str)

In [None]:
# --- Bias analysis: distribution of A/B/C selections (robust) ---
if all_df.empty:
    print("\nNo data loaded to analyze.")
else:
    # Rows whose qa_result parsed cleanly, expanded to one row per picked choice.
    parsed_only = all_df.loc[~all_df["wrong_format"]].copy()
    exploded = parsed_only.explode("qa_result_parsed")
    exploded = exploded[exploded["qa_result_parsed"].isin(VALID_CHOICES)]

    if not exploded.empty:
        # Overall share of each choice across all models & splits
        overall_counts = exploded["qa_result_parsed"].value_counts().reindex(["A","B","C"], fill_value=0)
        overall_pct = overall_counts / overall_counts.sum() * 100
        print("\n=== Overall distribution of answers (all models/splits) ===")
        print(overall_pct.round(2).to_string())

        # Count of each choice per (model, split)
        counts = (
            exploded
            .groupby(["model", "split", "qa_result_parsed"])
            .size()
            .reset_index(name="n")
        )
        # Share of each choice within its (model, split) group
        counts["total"] = counts.groupby(["model", "split"])["n"].transform("sum")
        counts["percent"] = counts["n"] / counts["total"] * 100

        # Wide layout: one row per (model, split), one column per choice
        grp_pivot = counts.pivot_table(
            index=["model", "split"],
            columns="qa_result_parsed",
            values="percent",
            fill_value=0,
            aggfunc="sum",
        ).reset_index()

        # Add any choice that never occurred, then fix the column order
        for col in ["A", "B", "C"]:
            if col not in grp_pivot.columns:
                grp_pivot[col] = 0.0
        grp_pivot = grp_pivot[["model", "split", "A", "B", "C"]]

        print("\n=== Distribution of answers per model & split (percent) ===")
        print(grp_pivot.round(2).to_string(index=False))
    else:
        print("\nNo valid A/B/C answers to summarize.")


In [None]:
# --- Distribution analysis for correct_answers ---
# The two "nothing to do" cases get separate messages so that an empty all_df
# is not reported as a missing column.
if all_df.empty:
    print("\nNo data loaded.")
elif "correct_answers" not in all_df.columns:
    print("\nNo correct_answers column found in data.")
else:
    parsed_correct = all_df.copy()

    def parse_correct(cell):
        """Parse a correct_answers cell into a list of valid choices.

        Strings are evaluated with ``ast.literal_eval``. A list/tuple yields
        its elements (as stripped strings) that are in ``VALID_CHOICES``; a
        single string yields a one-element list if it is a valid choice.
        Missing, unparsable or otherwise unsupported cells yield ``None``.
        """
        if pd.isna(cell):
            return None
        try:
            parsed = ast.literal_eval(cell) if isinstance(cell, str) else cell
        except Exception:
            return None
        if isinstance(parsed, (list, tuple)):
            stripped = (str(x).strip() for x in parsed)
            return [choice for choice in stripped if choice in VALID_CHOICES]
        if isinstance(parsed, str):
            choice = parsed.strip()
            return [choice] if choice in VALID_CHOICES else None
        return None

    parsed_correct["correct_answers_parsed"] = parsed_correct["correct_answers"].map(parse_correct)

    # Explode to one row per reference choice
    exploded_ca = parsed_correct.explode("correct_answers_parsed")
    exploded_ca = exploded_ca[exploded_ca["correct_answers_parsed"].isin(VALID_CHOICES)]

    if exploded_ca.empty:
        print("\nNo valid A/B/C values in correct_answers to summarize.")
    else:
        # Overall distribution
        overall_counts_ca = exploded_ca["correct_answers_parsed"].value_counts().reindex(["A","B","C"], fill_value=0)
        overall_pct_ca = overall_counts_ca / overall_counts_ca.sum() * 100
        print("\n=== Overall distribution of correct_answers (all models/splits) ===")
        print(overall_pct_ca.round(2).to_string())

        # Per model × split distribution
        counts_ca = (
            exploded_ca.groupby(["model", "split", "correct_answers_parsed"])
            .size()
            .rename("n")
            .reset_index()
        )
        counts_ca["total"] = counts_ca.groupby(["model", "split"])["n"].transform("sum")
        counts_ca["percent"] = counts_ca["n"] / counts_ca["total"] * 100

        grp_pivot_ca = counts_ca.pivot_table(
            index=["model", "split"],
            columns="correct_answers_parsed",
            values="percent",
            fill_value=0,
            aggfunc="sum",
        ).reset_index()

        # Add any choice that never occurred, then fix the column order
        for col in ["A", "B", "C"]:
            if col not in grp_pivot_ca.columns:
                grp_pivot_ca[col] = 0.0
        grp_pivot_ca = grp_pivot_ca[["model", "split", "A", "B", "C"]]

        print("\n=== Distribution of correct_answers per model & split (percent) ===")
        print(grp_pivot_ca.round(2).to_string(index=False))


In [None]:
# --- Combined table: picked (qa_result) vs reference (correct_answers) ---
if all_df.empty:
    print("\nNo data loaded.")
elif "correct_answers" not in all_df.columns:
    print("\nNo correct_answers column found in data.")
else:
    def parse_correct(cell):
        """Parse a correct_answers cell into a list of A/B/C values.

        Values outside ``VALID_CHOICES`` are dropped from lists/tuples; a lone
        string is wrapped in a list when valid. Missing, unparsable or
        unsupported cells yield ``None``.
        """
        if pd.isna(cell):
            return None
        parsed = cell
        if isinstance(cell, str):
            try:
                parsed = ast.literal_eval(cell)
            except Exception:
                return None
        if isinstance(parsed, str):
            s = parsed.strip()
            return [s] if s in VALID_CHOICES else None
        if isinstance(parsed, (list, tuple)):
            stripped = (str(x).strip() for x in parsed)
            return [c for c in stripped if c in VALID_CHOICES]
        return None

    df = all_df.copy()
    df["correct_answers_parsed"] = df["correct_answers"].map(parse_correct)

    def percent_table(df_in, colname):
        """Percentage of A/B/C per (model, split) for the list-like column ``colname``.

        Returns a frame with columns model, split, A, B, C; it has no rows when
        ``colname`` holds no valid choice.
        """
        exploded = df_in.explode(colname)
        exploded = exploded[exploded[colname].isin(VALID_CHOICES)]
        if exploded.empty:
            # Same columns as the populated result so the merge below still works
            return pd.DataFrame(columns=["model", "split", "A", "B", "C"])
        counts = (
            exploded
            .groupby(["model", "split", colname])
            .size()
            .reset_index(name="n")
        )
        counts["total"] = counts.groupby(["model", "split"])["n"].transform("sum")
        counts["percent"] = counts["n"] / counts["total"] * 100
        pivot = counts.pivot_table(
            index=["model", "split"],
            columns=colname,
            values="percent",
            fill_value=0,
            aggfunc="sum",
        ).reset_index()
        # Add any choice that never occurred, then fix the column order
        for c in ["A", "B", "C"]:
            if c not in pivot.columns:
                pivot[c] = 0.0
        return pivot[["model", "split", "A", "B", "C"]]

    # "Picked" uses only rows whose qa_result parsed cleanly; "reference" uses all rows
    picked_base = all_df.loc[~all_df["wrong_format"]].copy()
    picked_pct = percent_table(picked_base, "qa_result_parsed")
    ref_pct = percent_table(df, "correct_answers_parsed")

    # Outer merge keeps (model, split) pairs present on either side
    combined = picked_pct.merge(ref_pct, on=["model", "split"], how="outer", suffixes=("_picked", "_ref"))
    # Interleave picked/reference columns per choice; absent values become 0.0
    ordered_cols = [
        "model", "split",
        "A_picked", "A_ref",
        "B_picked", "B_ref",
        "C_picked", "C_ref",
    ]
    combined = combined[ordered_cols].fillna(0.0)

    print("\n=== Picked vs Reference distribution per model & split (percent) ===")
    print(combined.round(2).to_string(index=False))


In [None]:
# LaTeX export of the picked-vs-reference table (`combined`).
latex_str = combined.round(2).to_latex(
    index=False,          # don't print row indices
    escape=True,          # escape LaTeX specials, e.g. "_" in headers such as A_picked
    column_format="l l r r r r r r",  # adjust alignment: l=left, r=right
    # The caption is emitted verbatim, so it must not contain a bare "&".
    caption="Picked vs Reference distribution per model and split",
    label="tab:picked_ref_distribution"
)

print("\n=== LaTeX table ===\n")
print(latex_str)