In [1]:
import pandas as pd
import numpy as np
import json
import scipy.stats as stats

In [2]:
# Human evaluation data (per the original note, the CSV holds only valid FUQs).
df_human = pd.read_csv("../mode_2/human_eval_all_questions_annotated.csv")

# Keep just the lookup keys and the human rating; cast both keys to pandas'
# nullable string dtype so the string comparisons done later are well-defined.
df_new_info_human = df_human[["group", "prefix", "newInformation"]].astype(
    {"group": pd.StringDtype(), "prefix": pd.StringDtype()}
)
df_new_info_human

Unnamed: 0,group,prefix,newInformation
0,full_1,3000_1,
1,full_1,3000_2,2.0
2,full_1,3000_3,2.0
3,full_1,3000_4,2.0
4,full_1,3001_1,2.0
...,...,...,...
1123,org_10,3099_3,0.0
1124,org_10,3099_4,0.0
1125,org_10,3099_5,0.0
1126,org_10,3099_6,1.0


In [111]:
# Automatic evaluation data: one JSON export per model variant.
# Sentinel stored in place of a parsed answer when the LLM call produced nothing.
ERROR_MSG = "LLM failed to generate a response"

_auto_eval_paths = {
    "full": 'informativeness_output/full_df_with_answerability.json',
    "org": 'informativeness_output/org_df_with_answerability.json',
    "gpt": 'informativeness_output/gpt_df_with_answerability.json',
}

# Insertion order (full, org, gpt) is preserved; later cells iterate this dict.
dfs_auto = {
    model_name: pd.read_json(json_path)
    for model_name, json_path in _auto_eval_paths.items()
}

In [120]:
# Bucket the human "newInformation" ratings by the automatic verdict attached to
# each generated follow-up question, both per model and pooled over all models.
follow_up_scores = {
    model_name: {"informative": [], "not_informative": []}
    for model_name in ("full", "gpt", "org")
}

# Pooled buckets (all models together). These feed the pooled summary, t-test and
# Cohen's d cells, so they must be filled here for a fresh top-to-bottom run to
# reproduce those results.
informative_follow_up_scores = []
not_informative_follow_up_scores = []
ca = "complete answer"
oa = "original answer"
matches = 0

for df_name, df in dfs_auto.items():
    for _, data in df.iterrows():
        sample_id = data["id"]
        raw_answerability = data["generated_follow_up_answerability"]

        # Rows where generation failed carry the sentinel instead of a payload.
        if raw_answerability == ERROR_MSG:
            continue

        # The payload is stored as text; curly and single quotes are normalised to
        # straight double quotes so that it parses as JSON.
        # NOTE(review): this also rewrites apostrophes inside string values, which
        # would break parsing for such a payload - confirm the data never has them.
        normalised = (
            raw_answerability.replace("“", '"').replace("”", '"').replace("'", '"')
        )
        follow_ups = json.loads(normalised)

        # Human rows are keyed "<id>_<n>", with n counting follow-ups from 1.
        for follow_up_idx, follow_up in enumerate(follow_ups, start=1):
            matching_row = df_new_info_human[
                (df_new_info_human["prefix"] == f"{sample_id}_{follow_up_idx}")
                & (df_new_info_human["group"].str.startswith(df_name))
            ]
            if matching_row.empty:
                continue

            matches += 1

            # Unrated rows (NaN) are skipped so they cannot turn every downstream
            # mean / variance / test statistic into NaN.
            human_scores = matching_row["newInformation"].dropna().tolist()

            # A follow-up counts as informative when its first element is the
            # "complete answer" label (case-insensitive); empty entries do not.
            # presumably each entry is a [label, ...] list - TODO confirm
            is_informative = bool(follow_up) and follow_up[0].lower() == ca

            if is_informative:
                follow_up_scores[df_name]["informative"].extend(human_scores)
                informative_follow_up_scores.extend(human_scores)
            else:
                follow_up_scores[df_name]["not_informative"].extend(human_scores)
                not_informative_follow_up_scores.extend(human_scores)

# Number of generated follow-ups that had at least one human-rated counterpart.
print(matches)

366


In [130]:
def informative_score_per_model(df_name, scores=None):
    """Print and return mean/variance of human ratings for one model's buckets.

    Args:
        df_name: Key of the model in the score mapping (e.g. "full").
        scores: Optional mapping ``{model: {"informative": [...],
            "not_informative": [...]}}``. Defaults to the notebook-level
            ``follow_up_scores`` when omitted.

    Returns:
        A two-row DataFrame with columns "Category", "Mean" and "Variance".
        Variance is ``np.var`` with its default ``ddof=0`` (population variance).
    """
    if scores is None:
        scores = follow_up_scores

    informative = scores[df_name]["informative"]
    not_informative = scores[df_name]["not_informative"]

    data = {
        "Category": ["Informative Follow-Up", "Not Informative Follow-Up"],
        "Mean": [np.mean(informative), np.mean(not_informative)],
        "Variance": [np.var(informative), np.var(not_informative)],
    }

    df_stats = pd.DataFrame(data)
    print(df_stats)
    # Return the table that was just built (not an unrelated global frame).
    return df_stats

# Report the human-rating statistics separately for every model variant.
models = ["full", "gpt", "org"]

print("Mean/Var of values obtained from Human Labelling for \nQuestions classified as Informative/Not Informative by GPT-4o")

for model in models:
    # Upper-cased model key serves as the section heading for its table.
    print(model.upper())
    df_stats = informative_score_per_model(model)

Mean/Var of values obtained from Human Labelling for 
Questions classified as Informative/Not Informative by GPT-4o
FULL
                    Category      Mean  Variance
0      Informative Follow-Up  1.421053  0.699908
1  Not Informative Follow-Up  1.237500  0.881094
GPT
                    Category      Mean  Variance
0      Informative Follow-Up  1.578947  0.998153
1  Not Informative Follow-Up  1.398010  1.194822
ORG
                    Category      Mean  Variance
0      Informative Follow-Up  0.861111  1.064043
1  Not Informative Follow-Up  0.753894  0.926971


In [113]:
print("Mean/Var of values obtained from Human Labelling for \nQuestions classified as Informative/Not Informative by GPT-4o")

# Pooled (all models together) summary of the two rating buckets.
_pooled_buckets = (informative_follow_up_scores, not_informative_follow_up_scores)
data = {
    "Category": ["Informative Follow-Up", "Not Informative Follow-Up"],
    "Mean": [np.mean(bucket) for bucket in _pooled_buckets],
    "Variance": [np.var(bucket) for bucket in _pooled_buckets],
}

# Tabulate and display as the cell's output.
df_stats = pd.DataFrame(data)
df_stats

Mean/Var of values obtained from Human Labelling for 
Questions classified as Informative/Not Informative by GPT-4o


Unnamed: 0,Category,Mean,Variance
0,Informative Follow-Up,1.294643,1.0114
1,Not Informative Follow-Up,1.076115,1.062448


In [114]:
# Independent two-sample t-test between the pooled rating buckets;
# equal_var=False selects the variant that does not assume equal variances.
_ttest_result = stats.ttest_ind(
    informative_follow_up_scores,
    not_informative_follow_up_scores,
    equal_var=False,
)
t_stat, p_value = _ttest_result

# Report both numbers rounded to four decimals.
print(f"T-Statistic: {t_stat:.4f}")
print(f"P-Value: {p_value:.4f}")

T-Statistic: 3.2887
P-Value: 0.0011


In [117]:
# Effect size between the pooled rating buckets (numpy is imported in the first cell).
# The denominator is the square root of the unweighted average of the two bucket
# variances; np.var is called with its default ddof=0 (population variance).
mean_diff = np.mean(informative_follow_up_scores) - np.mean(not_informative_follow_up_scores)
pooled_std = np.sqrt(
    (np.var(informative_follow_up_scores) + np.var(not_informative_follow_up_scores)) / 2
)
cohen_d = mean_diff / pooled_std

print(f"Cohen's d: {cohen_d:.4f}")

Cohen's d: 0.2146
