In [24]:
from utils_parallel import exponential_backoff, process_in_parallel

from pathlib import Path
from typing import List, Dict, Tuple
import numpy as np
import pandas as pd
from datasets import load_dataset
from dotenv import load_dotenv

load_dotenv()


True

In [11]:
# Local CSV with the decoded outlines; a later cell filters it to chunk 999.
DECODED_CSV = (
    "/workspace/ALGOVERSE/yas/yulia/parascopes/src/yulia/outlines/"
    "decoded_outputs_chunk_999.csv"
)

# Hugging Face parquet shard (described by the author as holding the last 100k
# completions); only its last 1000 rows are used further down.
HF_PQ = (
    "yulia-volkova/parascopes-outlines-data@main/v5.0/data/"
    "outlines_009.parquet"
)

# Output files. The directory is created up front; an existing one is reused.
OUT_DIR = Path("./eval_outlines")
OUT_DIR.mkdir(parents=True, exist_ok=True)
OUT_JSONL_PRED = OUT_DIR.joinpath("rubric_predicted.jsonl")
OUT_JSONL_ORIG = OUT_DIR.joinpath("rubric_original.jsonl")
OUT_SUMMARY_CSV = OUT_DIR.joinpath("rubric_summary.csv")

In [12]:
REF_COL = "outline_generated"

# The decoded CSV is filtered to a single chunk, and parquet rows are matched to it
# by their position inside the tail slice.
# NOTE(review): this assumes the last TAIL_ROWS rows of the shard are exactly the
# rows of chunk CHUNK_ID, in order -- confirm against how the chunks were written.
CHUNK_ID = 999
TAIL_ROWS = 1000

# --- Load parquet from HF and take the last TAIL_ROWS rows ---
ds = load_dataset("parquet", data_files=f"hf://datasets/{HF_PQ}")["train"]
n = len(ds)
if n < TAIL_ROWS:
    # Without this guard the slice below would start at a negative offset and the
    # positional index assignment would fail with an unrelated length error.
    raise RuntimeError(f"Parquet has only {n} rows; need at least {TAIL_ROWS} to take the tail.")
tail = ds.select(range(n - TAIL_ROWS, n))

ref_df = tail.to_pandas().reset_index(drop=True)
ref_df["index"] = np.arange(TAIL_ROWS)  # local 0..TAIL_ROWS-1 position, used as the merge key

if REF_COL not in ref_df.columns:
    raise RuntimeError(f"Column '{REF_COL}' not found in parquet. Available: {ref_df.columns.tolist()}")

# --- Load the decoded outputs and keep only the rows of CHUNK_ID ---
dec_df = pd.read_csv(DECODED_CSV)
need_cols = {"chunk","index","decoded_original","decoded_predicted"}
missing = need_cols - set(dec_df.columns)
if missing:
    raise RuntimeError(f"Decoded CSV missing columns: {missing}")

dec_df = dec_df[dec_df["chunk"] == CHUNK_ID].copy()
dec_df = dec_df.sort_values("index").reset_index(drop=True)

# --- Merge on 'index' so rows align: original (ref_df[REF_COL]) vs decoded_* ---
merged = ref_df.merge(dec_df[["index","decoded_original","decoded_predicted"]], on="index", how="inner")

# An inner merge silently drops unmatched rows (and duplicates repeated keys), so
# surface any deviation from the expected row count instead of hiding it.
if len(merged) != TAIL_ROWS:
    print(f"WARNING: expected {TAIL_ROWS} aligned rows after merge, got {len(merged)}")

print("Merged shape:", merged.shape)
merged.head(3)

Merged shape: (1000, 10)


Unnamed: 0,example_id,dataset_idx,model,completion,outline_generated,reconstructed_text,embedding_id,index,decoded_original,decoded_predicted
0,999000,999000,meta-llama/Llama-3.3-70B-Instruct,**The Sugar Maple: From Forest to Farm**\n\nTh...,Outline:\n1. Sugar Maple Characteristics\n -...,,999000,0,Summary: 1.Sugar maple leaves - valuable chara...,Summary: 1. Design of Sugarcane Branch - 2. Li...
1,999001,999001,meta-llama/Llama-3.3-70B-Instruct,A Legacy of Union: The United Church of Christ...,Outline:\n1. United Church of Christ history\n...,,999001,1,Description: 1. History of the United Christ C...,Summary: 1. Origin - Church of England 2. Foun...
2,999002,999002,meta-llama/Llama-3.3-70B-Instruct,**Discontinuity Proofs with Epsilon-Delta**\n\...,Outline:\n1. Epsilon-Delta Definition\n - Di...,,999002,2,Summary: 1.Epsilon-Delta Definition - Disconti...,Summary: 1. Explanation of deferential consona...


In [13]:
# Scoring rubric that is interpolated verbatim into the evaluator prompt built in
# ruberic_compare. It defines ten criteria (numbered 0-9) and the JSON layout the
# model is asked to return ("reasoning" and "scoring", keyed by criterion name).
# Score ranges differ per criterion: most are 0-3, subject/entities go up to 4,
# conciseness up to 2, identical is 0/1, and subject/entities/details also allow
# -1 for "nothing to compare".
# NOTE(review): this text is runtime prompt content, so any wording change alters
# what the model sees. The level markers under "Complexity" mix "0:" and "1."
# styles; left untouched here on purpose.
rubric = """
#### 0. Complexity
How complex is the outline text?
0: Trivial (e.g: just says "** Section **")
1. Simple (e.g: "** Section 1: Green Tea **")
2. Some detail (e.g: a short undetailed sentence or two about something)
3. Many details (e.g: a detailed paragraph with specific information)

#### 1. Coherence (Outline-Level)
Does Outline 2 make sense as an outline compared to Outline 1?
0: Completely incoherent (e.g., excessive repetition, nonsensical phrases, strange symbols).
1: Partially coherent, but repetitive or has formatting issues (e.g., repeated key phrases, awkward pauses).
2: Mostly coherent with minor grouping/order issues.
3: Clear, logical, coherent outline structure.

#### 2. Hierarchy / Structure
How well does Outline 2 preserve the hierarchical levels (headings vs sub-bullets)?
0: No recognizable hierarchy; flat or malformed.
1: Basic levels exist but often mis-nested.
2: Mostly correct hierarchy with some mismatches.
3: Hierarchy closely matches with minimal deviations, highly similar structure.

#### 3. Coverage of Key Sections
Do the major sections in Outline 1 appear in Outline 2?
0: Most key sections missing or unrelated.
1: About half of major sections appear.
2: Most sections present; minor omissions.
3: All major sections present (allow synonyms/regrouping).

#### 4. Ordering / Flow
Does the order of major sections and sub-sections follow Outline 1?
0: Largely shuffled or illogical.
1: Partial overlap but frequent swaps.
2: Mostly consistent with minor swaps.
3: Order closely matches.

#### 5. Subject Match
How similar is the subject of Outline 2 to Outline 1?
-1: No subjects to compare.
0: Completely unrelated subjects ("corporate law" vs "particle physics")
1: Vaguely similar field (e.g: "biology" vs "physics" are both sciences)
2: Related general domain or adjacent fields (e.g., "history" vs. "archaeology" or "alternative medicine" vs. "traditional remedies").
3: Same subject (e.g., both discuss "ancient mayans" or "the properties of the human body").
4: Identical focus (e.g., both analyze "ancient mayan architecture").

#### 6. Entities / Key Concepts
How well does Outline 2 preserve entities or technical terms from Outline 1?
-1: No entities to compare.
0: Unrelated entities.
1: Same category but little overlap.
2: Some overlap or synonyms.
3: Most entities/terms preserved.
4: Nearly all preserved.

#### 7. Details
How similar are the details in the response (Outline 2) to the reference (Outline 1)?
-1: Neither outline text has details to compare.
0: Details differ completely (e.g., Outline 1 lists benefits; Outline 2 is generic).
1: Minimal depth (e.g., both mention "anti-inflammatory properties" with no specifics).
2: Moderate depth (e.g., discuss benefits + 1-2 supporting facts).
3: Highly specific details (e.g., "40% reduction in inflammation").

#### 8. Conciseness of Headings
Are headings concise and outline-appropriate (not full sentences)?
0: Often verbose, unclear, or sentence-like.
1: Mixed clarity—some concise, some verbose.
2: Mostly concise, descriptive headings.

#### 9. Identical
Is Outline 2 essentially identical to Outline 1 (ignoring trivial formatting)?
0: Not identical.
1: Identical.
---

JSON output: {
    "reasoning": {complexity, coherence, hierarchy, coverage, ordering, subject, entities, details, conciseness, identical} - each with explanation string
    "scoring":  {Same keys as above} - each with number score
}

Reasoning should be concise, and explicitly state the "name" of the level (such
as for entities: "[reasoning about the texts], Thus: similar category and partial identical entities: thus out of 4, score 3").

If the specific category does not apply to many things (e.g: complexity is 0 or
1), and there is no specifics between either, then by default give full points.
"""



# Inclusive (minimum, maximum) score per rubric criterion, listed in rubric order.
_SCORE_BOUNDS = {
    "complexity": (0, 3),
    "coherence": (0, 3),
    "hierarchy": (0, 3),
    "coverage": (0, 3),
    "ordering": (0, 3),
    "subject": (-1, 4),
    "entities": (-1, 4),
    "details": (-1, 3),
    "conciseness": (0, 2),
    "identical": (0, 1),
}

# Description of the evaluator's JSON reply: one explanation string and one bounded
# integer score per criterion.
# NOTE(review): "reasoning" and "scoring" list the criteria directly instead of
# nesting them under their own "type"/"properties" keys, so a JSON Schema validator
# would likely not enforce them. The shape is kept because evaluate_outlines_df
# reads the criterion names from json_schema["properties"]["scoring"].
json_schema = {
    "type": "object",
    "properties": {
        "reasoning": {criterion: {"type": "string"} for criterion in _SCORE_BOUNDS},
        "scoring": {
            criterion: {"type": "integer", "minimum": lowest, "maximum": highest}
            for criterion, (lowest, highest) in _SCORE_BOUNDS.items()
        },
    },
    "required": ["reasoning", "scoring"],
}



In [None]:
import os, json, numpy as np, pandas as pd
from typing import List, Dict, Tuple
from openai import OpenAI
import sys


@exponential_backoff
def ruberic_compare(ref_text: str, comp_text: str):
    """
    Ask the LLM to score a candidate outline against a reference outline.

    The prompt embeds the module-level ``rubric`` text followed by both outlines,
    and the request is made in JSON mode so the reply is a single JSON object.

    Args:
        ref_text: Reference outline ("Outline 1" in the rubric).
        comp_text: Candidate outline ("Outline 2" in the rubric).

    Returns:
        The raw message content returned by the API, expected to be a JSON string
        with "reasoning" and "scoring" objects. It is not parsed or validated here.

    Notes:
        Credentials and endpoint come from the OPENAI_API_KEY / OPENAI_BASE_URL
        environment variables. Retry behaviour is whatever ``exponential_backoff``
        (imported from utils_parallel, not shown here) provides.
    """
    prompt = (
        f"Using the following rubric, compare the two outlines:\n\n"
        f"Rubric: {rubric}\n\n"
        f"Outline 1 (reference): {ref_text}\n\n"
        f"Outline 2 (candidate): {comp_text}\n\n"
        "The output must be a valid JSON object and nothing else."
    )

    client = OpenAI(
        api_key=os.getenv("OPENAI_API_KEY"),
        base_url=os.getenv("OPENAI_BASE_URL")
    )
    # JSON mode guarantees syntactically valid JSON only; the expected keys are
    # described by the rubric text in the prompt. No `function_call` argument is
    # sent: that parameter selects an entry of a `functions` list (none is defined
    # for this request) and has no `parameters` field, and a forced function call
    # would put the reply in `message.function_call` rather than `message.content`.
    response = client.chat.completions.create(
        model="openai/gpt-4o-mini",
        messages=[{
            "role": "user",
            "content": "You are an expert evaluator.\n\n" + prompt
        }],
        temperature=0.3,
        max_tokens=4000,
        response_format={"type": "json_object"},  # enforce JSON
    )
    # return raw JSON string
    return response.choices[0].message.content

def get_ruberic_comparison(ref_texts: List[str], comp_texts: List[str], dataset_idxs: List[int], label=None):
    """
    Score outline pairs one at a time, logging each comparison as it completes.

    The three input lists are walked in lockstep (stopping at the shortest). For
    every pair the rubric evaluation is requested via ``ruberic_compare`` and a
    dict describing the comparison is printed.

    Args:
        ref_texts: Reference outlines.
        comp_texts: Candidate outlines, aligned with ``ref_texts``.
        dataset_idxs: Identifiers reported in the log line for each pair.
        label: Free-form tag reported under the "type" key of the log line.

    Returns:
        List of ``ruberic_compare`` results, in input order.
    """
    def _score_and_log(position, reference, candidate, source_idx):
        verdict = ruberic_compare(reference, candidate)
        log_entry = {
            "index": position,
            "dataset_idx": source_idx,
            "type": label,
            "reference": reference,
            "comparison": candidate,
            "result": verdict
        }
        print(log_entry)
        return verdict

    aligned = zip(ref_texts, comp_texts, dataset_idxs)
    return [_score_and_log(position, *triple) for position, triple in enumerate(aligned)]

def get_ruberic_parallel(ref_texts: List[str], comp_texts: List[str], dataset_idxs: List[int], label=None, max_workers: int = 20):
    """
    Score outline pairs through ``process_in_parallel``, logging each comparison.

    Args:
        ref_texts: Reference outlines.
        comp_texts: Candidate outlines, aligned with ``ref_texts``.
        dataset_idxs: Identifiers reported in the log line for each pair.
        label: Free-form tag reported under the "label" key of the log line.
        max_workers: Forwarded to ``process_in_parallel``.

    Returns:
        Whatever ``process_in_parallel`` returns for the per-pair results.
        NOTE(review): callers assign this positionally to DataFrame rows, which
        presumes results come back in input order -- confirm in utils_parallel.
    """
    # enumerate() yields plain Python ints, so the logged "index" prints as 0, 1, ...
    # rather than as a NumPy scalar, and stays JSON-serialisable.
    items = [
        (index, ref_text, comp_text, ds_idx)
        for index, (ref_text, comp_text, ds_idx) in enumerate(zip(ref_texts, comp_texts, dataset_idxs))
    ]
    print(f"Processing {len(items)} comparisons in parallel")

    def get_rubric(item):
        """Score one (index, reference, candidate, dataset_idx) tuple and log it."""
        index, ref_text, comp_text, ds_idx = item
        result = ruberic_compare(ref_text, comp_text)
        # Flush right away so log lines from worker threads show up as they finish.
        print({
            "index": index,
            "dataset_idx": ds_idx,
            "label": label,
            "reference": ref_text,
            "comparison": comp_text,
            "result": result
        }, flush=True)
        return result

    results = process_in_parallel(items, get_rubric, max_workers=max_workers)
    return results


def evaluate_outlines_df(
    df_merged: pd.DataFrame,
    ref_col: str = "outline_generated",
    cand_col: str = "decoded_predicted",
    out_csv: str = "outline_eval_with_rubric.csv",
    parallel_workers: int = 20,
) -> pd.DataFrame:
    """
    Run the rubric evaluation on df_merged[ref_col] vs df_merged[cand_col].

    Each row pair is scored via ``get_ruberic_parallel``. The raw reply is stored
    in ``rubric_json``, every criterion named in ``json_schema`` becomes a numeric
    ``score_<criterion>`` column, and ``score_sum`` / ``score_mean`` aggregate the
    scores available for the row. The result is written to ``out_csv``.

    Replies that cannot be parsed, are not JSON objects, or carry non-numeric
    scores do not abort the run: the affected scores become NaN, and a row with no
    usable score at all gets NaN (not 0) for both aggregates.

    Args:
        df_merged: Input frame; it is not modified.
        ref_col: Column holding the reference outlines.
        cand_col: Column holding the candidate outlines.
        out_csv: Destination path of the CSV export.
        parallel_workers: Worker count forwarded to ``get_ruberic_parallel``.

    Returns:
        A copy of ``df_merged`` with the rubric columns appended.

    Raises:
        AssertionError: If ``ref_col`` or ``cand_col`` is missing.

    NOTE(review): the rubric's -1 ("nothing to compare") scores are included in
    the aggregates as-is; decide whether they should be masked instead.
    """
    assert ref_col in df_merged.columns and cand_col in df_merged.columns, \
        f"DataFrame must have columns '{ref_col}' and '{cand_col}'"

    ref_texts  = df_merged[ref_col].astype(str).tolist()
    comp_texts = df_merged[cand_col].astype(str).tolist()
    # dataset_idx is only forwarded as a log label, so fall back to the frame's own
    # index rather than failing when the column is absent.
    if "dataset_idx" in df_merged.columns:
        dataset_idxs = df_merged["dataset_idx"].tolist()
    else:
        dataset_idxs = df_merged.index.tolist()

    # get JSON strings (one per row)
    raw_results = get_ruberic_parallel(ref_texts, comp_texts, dataset_idxs, label="probe-decoded", max_workers=parallel_workers)

    # attach raw JSON so nothing is lost even when parsing fails
    df_out = df_merged.copy()
    df_out["rubric_json"] = raw_results

    def _safe_load(js):
        """Parse one reply; anything that is not a JSON object yields {}."""
        try:
            loaded = json.loads(js)
        except (TypeError, ValueError, RecursionError):
            # TypeError: reply was None / not text; ValueError: malformed JSON.
            return {}
        return loaded if isinstance(loaded, dict) else {}

    def _score_of(reply, key):
        """Fetch one criterion score, tolerating a missing or malformed "scoring"."""
        scoring = reply.get("scoring")
        return scoring.get(key) if isinstance(scoring, dict) else None

    parsed = [_safe_load(s) for s in raw_results]

    # Criterion names come from the schema; accept them either directly under
    # "scoring" or nested under a "properties" key.
    scoring_schema = json_schema["properties"]["scoring"]
    score_keys = list(scoring_schema.get("properties", scoring_schema).keys())
    score_cols = [f"score_{k}" for k in score_keys]

    # add each score as its own numeric column (unusable values become NaN)
    for k, col in zip(score_keys, score_cols):
        values = pd.Series([_score_of(p, k) for p in parsed], index=df_out.index, dtype=object)
        df_out[col] = pd.to_numeric(values, errors="coerce")

    # Aggregates skip NaN; min_count=1 keeps an all-missing row at NaN instead of 0.
    score_frame = df_out[score_cols]
    df_out["score_sum"] = score_frame.sum(axis=1, min_count=1)
    df_out["score_mean"] = score_frame.mean(axis=1)

    # save
    df_out.to_csv(out_csv, index=False)
    print(f"Saved rubric evaluation to: {out_csv}")
    return df_out


In [23]:
# df_scored = evaluate_outlines_df(
#     merged,
#     ref_col="outline_generated",
#     cand_col="decoded_predicted",
#     out_csv="outline_eval_chunk999.csv",
#     parallel_workers=20,
# )

# score_cols = [c for c in df_scored.columns if c.startswith("score_")]
# print("Mean scores:")
# print(df_scored[score_cols].mean(numeric_only=True).sort_values(ascending=False))