In [104]:
import pandas as pd
import numpy as np
import json
import scipy.stats as stats

In [109]:
# Human Evaluation Data
df_human = pd.read_csv("human_eval_all_questions_annotated.csv") # only valid FUQs
df_new_info_human = df_human[["group", "prefix", "newInformation"]].copy()
df_new_info_human["group"] = df_new_info_human["group"].astype(pd.StringDtype())
df_new_info_human["prefix"] = df_new_info_human["prefix"].astype(pd.StringDtype())
df_new_info_human

Unnamed: 0,group,prefix,newInformation
0,full_1,3000_1,0
1,full_1,3000_2,2
2,full_1,3000_3,2
3,full_1,3000_4,2
4,full_1,3001_1,2
...,...,...,...
1123,org_10,3099_3,0
1124,org_10,3099_4,0
1125,org_10,3099_5,0
1126,org_10,3099_6,1


In [110]:
df_new_info_human.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1128 entries, 0 to 1127
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   group           1128 non-null   string
 1   prefix          1128 non-null   string
 2   newInformation  1128 non-null   int64 
dtypes: int64(1), string(2)
memory usage: 26.6 KB


In [111]:
# Automatic Evaluation Data
dfs_auto = {
    "full": pd.read_json('informativeness_output/full_df_with_answerability.json'),
    "org": pd.read_json('informativeness_output/org_df_with_answerability.json'),
    "gpt": pd.read_json('informativeness_output/gpt_df_with_answerability.json')
}

ERROR_MSG = "LLM failed to generate a response"

Unnamed: 0,id,question,answer,follow-up,relation,generated_follow_up,complete_answer,generated_follow_up_answerability
0,3000,ELI5 Do animals tan?,"Animals can get sunburned like we do, pigs for...",But can they tan? Does their body create color...,Related,[What are the primary sources of heat for anim...,"Yes, some animals can tan. Similar to humans, ...","[[], [""Complete Answer""], [""Complete Answer""],..."
1,3001,ELI5 What is GERD?,Gerd (GastroEsophageal Reflux Disease) is a pr...,I c this was the answer I was looking for. I m...,Related,[What are the potential long-term effects of f...,"GERD, or Gastroesophageal Reflux Disease, is a...","[[“Original Answer”, “Complete Answer”], [], [..."
2,3002,eli5 Why didn’t the dwarves fight in the War o...,"They did, the movies skip a lot of side detail...",Thanks. I’m listening to the audiobooks and to...,Related,[What are some examples of the side conflicts ...,The dwarves did not actively participate in th...,"[[“Original Answer”], [“Complete Answer”], []]"
3,3003,ELI5: How does our voice turn into code / get ...,In a nutshell: The sound waves from your voic...,"Hey, I totally respect and appreciate this. Bu...",Related,[What is the role of the human ear in interpre...,"When you speak into a phone, your voice genera...","[[], [""Original Answer"", ""Complete Answer""], [..."
4,3004,eli5 how is the basic of minecraft. game....li...,"The endgame is to get to ""the end"" where there...",Wasn't the End dimension only added to minecra...,Slightly Related,"[Can you explain what ""sandbox game"" means in ...",Minecraft is an open-world sandbox game where ...,"[[""Original Answer"", ""Complete Answer""], [""Com..."


In [120]:
follow_up_scores = {
    "full": {
        "informative":[],
        "not_informative":[]
    },
    "gpt": {
        "informative":[],
        "not_informative":[]
    },
    "org": {
        "informative":[],
        "not_informative":[]
    },
}

informative_follow_up_scores = []
not_informative_follow_up_scores = []
ca = "complete answer"
oa = "original answer"
matches = 0

for df_name, df in dfs_auto.items():
    for _, data in df.iterrows():
        id = data["id"]
        follow_up_idx = 1
        follow_up_answerability = data["generated_follow_up_answerability"].replace("“", '"').replace("”", '"').replace("'", '"')

        if follow_up_answerability == ERROR_MSG: continue

        follow_up_answerability = json.loads(follow_up_answerability)
        
        for follow_up in follow_up_answerability:
            matching_row = df_new_info_human[
                (df_new_info_human["prefix"] == f"{id}_{follow_up_idx}") &
                (df_new_info_human["group"].str.startswith(df_name))
            ]

            if not matching_row.empty:
                matches+=1
                if follow_up and follow_up[0].lower() == ca:
                    follow_up_scores[df_name]["informative"].extend(matching_row["newInformation"].tolist())

                    # informative_follow_up_scores.extend(matching_row["newInformation"].tolist())
                else:
                    follow_up_scores[df_name]["not_informative"].extend(matching_row["newInformation"].tolist())
                    # not_informative_follow_up_scores.extend(matching_row["newInformation"].tolist())
                    
            follow_up_idx+=1
            
print(matches)
# len(informative_follow_up_scores + not_informative_follow_up_scores)

366


In [125]:
follow_up_scores["org"]["not_informative"]

[0,
 0,
 0,
 1,
 2,
 0,
 1,
 1,
 0,
 2,
 2,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 1,
 2,
 0,
 1,
 1,
 3,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 2,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 2,
 2,
 3,
 1,
 1,
 0,
 2,
 2,
 3,
 1,
 1,
 3,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 2,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 2,
 2,
 2,
 2,
 2,
 0,
 2,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 2,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 2,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 2,
 0,
 2,
 1,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 3,
 3,
 0,
 0,
 0,
 2,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 2,
 0,
 0,
 2,
 0,
 0,
 2,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 2,
 0,
 0,
 0,
 2,
 0,
 0,
 2,
 0,
 0,
 0,
 1,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 2,
 2,
 1,
 2,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 3,
 0,
 1,
 3,
 2,
 0,
 0,
 2,
 2,
 0,
 0,
 0,
 1,
 0,
 2,
 3,


In [130]:
def informative_score_per_model(df_name):
    data = {
        "Category": ["Informative Follow-Up", "Not Informative Follow-Up"],
        "Mean": [np.mean(follow_up_scores[df_name]["informative"]), np.mean(follow_up_scores[df_name]["not_informative"])],
        "Variance": [np.var(follow_up_scores[df_name]["informative"]), np.var(follow_up_scores[df_name]["not_informative"])]
    }

    # Create DataFrame
    df_stats = pd.DataFrame(data)
    df_stats
    print(df_stats)
    return df

models = ["full", "gpt", "org"]

print("Mean/Var of values obtained from Human Labelling for \nQuestions classified as Informative/Not Informative by GPT-4o")

for model in models:
    print(model.upper())
    df_stats = informative_score_per_model(model)

Mean/Var of values obtained from Human Labelling for 
Questions classified as Informative/Not Informative by GPT-4o
FULL
                    Category      Mean  Variance
0      Informative Follow-Up  1.421053  0.699908
1  Not Informative Follow-Up  1.237500  0.881094
GPT
                    Category      Mean  Variance
0      Informative Follow-Up  1.578947  0.998153
1  Not Informative Follow-Up  1.398010  1.194822
ORG
                    Category      Mean  Variance
0      Informative Follow-Up  0.861111  1.064043
1  Not Informative Follow-Up  0.753894  0.926971


In [113]:
print("Mean/Var of values obtained from Human Labelling for \nQuestions classified as Informative/Not Informative by GPT-4o")
data = {
    "Category": ["Informative Follow-Up", "Not Informative Follow-Up"],
    "Mean": [np.mean(informative_follow_up_scores), np.mean(not_informative_follow_up_scores)],
    "Variance": [np.var(informative_follow_up_scores), np.var(not_informative_follow_up_scores)]
}

# Create DataFrame
df_stats = pd.DataFrame(data)
df_stats

Mean/Var of values obtained from Human Labelling for 
Questions classified as Informative/Not Informative by GPT-4o


Unnamed: 0,Category,Mean,Variance
0,Informative Follow-Up,1.294643,1.0114
1,Not Informative Follow-Up,1.076115,1.062448


In [114]:
# Perform independent t-test
t_stat, p_value = stats.ttest_ind(informative_follow_up_scores, not_informative_follow_up_scores, equal_var=False)

# Print results
print(f"T-Statistic: {t_stat:.4f}")
print(f"P-Value: {p_value:.4f}")

T-Statistic: 3.2887
P-Value: 0.0011


In [117]:
import numpy as np

mean_diff = np.mean(informative_follow_up_scores) - np.mean(not_informative_follow_up_scores)
pooled_std = np.sqrt((np.var(informative_follow_up_scores) + np.var(not_informative_follow_up_scores)) / 2)
cohen_d = mean_diff / pooled_std

print(f"Cohen's d: {cohen_d:.4f}")

Cohen's d: 0.2146
