In [None]:
import pandas as pd

In [None]:
# Rating dimensions analysed in this notebook. The ratings file is read with
# one "<dimension>_gold" column per entry in this list.
DIMENSIONS = [
    "correctness_topical",
    "coherence_logical",
    "coherence_stylistic",
    "coverage_broad",
    "coverage_deep",
    "consistency_internal",
    "quality_overall",
]

In [None]:
# Load the pairwise ratings and keep only the query id, the two compared
# responses and the "<dimension>_gold" label of every dimension; the gold
# columns are renamed to the bare dimension name.
df = pd.read_json("../data/artifacts/ratings.jsonl.gz", lines=True)[
    ["query_id", "response_a", "response_b", *(f"{d}_gold" for d in DIMENSIONS)]
].rename(columns={f"{d}_gold": d for d in DIMENSIONS})

In [None]:
from src.aggregation import BradleyTerryAggregator
from tqdm import tqdm

# For every query and every rating dimension, aggregate the pairwise labels
# with BradleyTerryAggregator and replace the resulting scores by their rank
# position (1 = lowest score). The final frame is indexed by "docno" and has
# one column per dimension.
preference_data = []

for qid in tqdm(df.query_id.unique()):
    # Select this query's ratings once instead of once per dimension.
    query_ratings = df.loc[df.query_id == qid]
    score_df = []
    for dim in DIMENSIONS:
        label = query_ratings[dim]
        # Labels "a" and "n" keep the (response_a, response_b) order; any
        # other label swaps the pair. "n" is additionally flagged as a tie.
        # NOTE(review): presumably the aggregator reads id_a as the preferred
        # item of each comparison -- confirm against src.aggregation.
        keep_order = label.isin(["a", "n"])
        comparisons = pd.DataFrame(
            {
                "id_a": query_ratings["response_a"].where(keep_order, query_ratings["response_b"]),
                "id_b": query_ratings["response_b"].where(keep_order, query_ratings["response_a"]),
                "tie": label == "n",
            }
        )
        scores = (
            BradleyTerryAggregator(normalize_scores=True)(comparisons)
            .sort_values("score", ascending=True)
            .reset_index(drop=True)
            .reset_index()
            # After the sort, the fresh 0-based position + 1 is the rank.
            .assign(score=lambda ranked: ranked["index"] + 1)
            .drop(columns="index")
        )
        score_df.append(scores.set_index("docno").rename(columns={"score": dim}))

    # Side-by-side: one rank column per dimension for this query's responses.
    preference_data.append(pd.concat(score_df, axis=1))

preference_data = pd.concat(preference_data, axis=0)
preference_data

In [None]:
# Attach the "style" and "kind" metadata of each response to its ranks.
# Left join: every row of preference_data is kept, matched on docno == response.
df_prefs = (
    pd.merge(
        preference_data.reset_index(),
        pd.read_json("../data/artifacts/responses.jsonl.gz", lines=True)[
            ["response", "style", "kind"]
        ],
        how="left",
        left_on="docno",
        right_on="response",
    )
    .drop(columns="response")
    .set_index("docno")
)

df_prefs

In [None]:
# Mean rank per response kind: one row per dimension ("variable"), one column
# per kind, rounded to two decimals.
(
    df_prefs
    .drop(columns="style")
    .groupby("kind")
    .mean()
    .reset_index()
    .melt(id_vars="kind")
    .pivot(index="variable", columns="kind", values="value")
    .round(2)
)

In [None]:
# Mean and standard deviation of the ranks for every (style, kind) group:
# one row per dimension, columns keyed by (style, kind, statistic).
(
    df_prefs
    .groupby(["style", "kind"])
    .agg(["mean", "std"])
    .T
    .unstack()
    .round(2)
)

In [None]:
# Per (style, kind) group: the group mean and the group standard deviation,
# each averaged over all dimensions, shown as the columns "mean" and "std".
(
    pd.concat(
        [
            df_prefs.groupby(["style", "kind"]).agg([stat]).T.mean(axis=0)
            for stat in ("mean", "std")
        ],
        axis=1,
    )
    .rename(columns={0: "mean", 1: "std"})
    .round(2)
)

In [None]:
# Mean and standard deviation of the ranks per response kind (styles pooled):
# one row per dimension, columns keyed by (kind, statistic).
(
    df_prefs
    .drop(columns="style")
    .groupby("kind")
    .agg(["mean", "std"])
    .T
    .unstack()
    .round(2)
)

In [None]:
# Same per-kind mean/std table as the previous cell, then averaged over the
# dimensions: one value per (kind, statistic).
(
    df_prefs
    .drop(columns="style")
    .groupby("kind")
    .agg(["mean", "std"])
    .T
    .unstack()
    .mean(axis=0)
    .round(2)
)

In [None]:
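# Optional export (disabled): uncomment to write the per-response ranks, without style/kind, to ../data/artifacts/grades.jsonl.gz.
#df_prefs.reset_index().rename(columns={"docno": "response"}).drop(columns=["style", "kind"]).to_json("../data/artifacts/grades.jsonl.gz", lines=True, orient='records', compression='gzip')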

In [None]:
from scipy.stats import wilcoxon
from scipy.stats import false_discovery_control

# Human vs. LLM ranks, tested separately for every (dimension, style) with a
# two-sided Wilcoxon signed-rank test.
# NOTE(review): the test pairs x[i] with y[i] by row order -- confirm that the
# human and llm rows of a style line up (e.g. same queries in the same order).
p_values = {}
for dim in DIMENSIONS:
    for style in ["bullet", "essay", "news"]:
        in_style = df_prefs["style"] == style
        human_ranks = df_prefs.loc[(df_prefs["kind"] == "human") & in_style, dim].values
        llm_ranks = df_prefs.loc[(df_prefs["kind"] == "llm") & in_style, dim].values
        p = wilcoxon(
            x=human_ranks,
            y=llm_ranks,
            zero_method='wilcox',
            correction=False,
            alternative='two-sided',
            method='auto',
        ).pvalue
        p_values[(dim, style)] = (float(p), )

# Benjamini-Hochberg adjustment over all tests at once; the displayed table is
# True where the adjusted p-value is below 0.05.
df_pvalues = (
    pd.DataFrame(p_values)
    .T
    .reset_index()
    .rename(columns={"level_0": "dim", "level_1": "style", 0: "pvalue"})
    .assign(pvalue=lambda tests: false_discovery_control(tests["pvalue"], method="bh"))
    .pivot(index="dim", columns="style", values="pvalue")
    .lt(0.05)
)
df_pvalues

In [None]:
from scipy.stats import wilcoxon
from scipy.stats import false_discovery_control

# Human vs. LLM ranks per dimension, pooled over all styles, using a two-sided
# Wilcoxon signed-rank test. The filter is on `kind` only: this cell has no
# style loop, so no style variable may appear in the queries.
# NOTE(review): the test pairs x[i] with y[i] by row order of df_prefs --
# confirm that the human and llm rows line up as intended when styles are pooled.
p_values = {}
for dim in DIMENSIONS:
    p = wilcoxon(
        x=df_prefs.query("kind == 'human'").loc[:, dim].values,
        y=df_prefs.query("kind == 'llm'").loc[:, dim].values,
        zero_method='wilcox',
        correction=False,
        alternative='two-sided',
        method='auto'
    ).pvalue
    p_values[dim] = (float(p), )

# Benjamini-Hochberg adjustment over the dimensions; the displayed table is
# True where the adjusted p-value is below 0.05.
df_pvalues = (
    pd.DataFrame(p_values)
    .transpose()
    .reset_index()
    # A single-level index comes back from reset_index() as the column "index".
    .rename(columns={"index": "dim", 0: "pvalue"})
    .assign(pvalue=lambda tests: false_discovery_control(tests["pvalue"], method="bh"))
    .set_index("dim")
    < 0.05
)
df_pvalues

In [None]:
from scipy.stats import wilcoxon
from scipy.stats import false_discovery_control
import itertools

# Style vs. style (both kinds pooled), for every dimension and every unordered
# pair of styles, using a two-sided Wilcoxon signed-rank test.
# NOTE(review): the test pairs x[i] with y[i] by row order -- confirm that the
# rows of the two styles line up as intended.
p_values = {}
for dim in DIMENSIONS:
    for style_a, style_b in itertools.combinations(["bullet", "essay", "news"], 2):
        ranks_a = df_prefs.loc[df_prefs["style"] == style_a, dim].values
        ranks_b = df_prefs.loc[df_prefs["style"] == style_b, dim].values
        p = wilcoxon(
            x=ranks_a,
            y=ranks_b,
            zero_method='wilcox',
            correction=False,
            alternative='two-sided',
            method='auto',
        ).pvalue
        p_values[(dim, style_a, style_b)] = (float(p), )

# Benjamini-Hochberg adjustment over all tests at once; the displayed table is
# True where the adjusted p-value is below 0.05. Columns are the style pairs
# (still named level_1 / level_2), rows are the dimensions.
df_pvalues = (
    pd.DataFrame(p_values)
    .T
    .reset_index()
    .rename(columns={"level_0": "dim", 0: "pvalue"})
    .assign(pvalue=lambda tests: false_discovery_control(tests["pvalue"], method="bh"))
    .pivot(columns=["level_1", "level_2"], index="dim", values="pvalue")
    .lt(0.05)
)
df_pvalues