In [1]:
import sys
import os
import json
import pandas as pd
import numpy as np
import json
from collections import Counter

base_dir = os.path.abspath(os.path.join(os.getcwd(), "../../../"))
sys.path.append(os.path.join(base_dir, "Data_Augmentation"))

from llm import llm_response

In [3]:
full_df = pd.read_json('../informativeness_output/full_df_with_answerability.json')
org_df = pd.read_json('../informativeness_output/org_df_with_answerability.json')
gpt_df = pd.read_json('../informativeness_output/gpt_df_with_answerability.json')

ERROR_MSG = "LLM failed to generate a response"

idx_with_incorrect_eval = []
example_generated_follow_up = []

def analyze_informativeness(df):
    counter = Counter()
    
    for idx, data in df.iterrows():
        follow_up_answerability = data["generated_follow_up_answerability"].replace("“", '"').replace("”", '"').replace("'", '"')

        if follow_up_answerability == ERROR_MSG: continue

        nested_list = json.loads(follow_up_answerability)

        if len(nested_list) != len(data["generated_follow_up"]):
            idx_with_incorrect_eval.append(idx)
            example_generated_follow_up.append(data)

        if ('Complete Answer','Original Answer') in nested_list:
            print(idx)

        counter.update(tuple(sublist) for sublist in nested_list)

    return counter

full_result = analyze_informativeness(full_df)
org_result = analyze_informativeness(org_df)
gpt_result = analyze_informativeness(gpt_df)
print(idx_with_incorrect_eval)
    

[]


In [4]:
full_result

Counter({('Complete Answer',): 712,
         (): 676,
         ('Original Answer', 'Complete Answer'): 397,
         ('Original Answer',): 255})

In [5]:
org_result

Counter({('Original Answer', 'Complete Answer'): 610,
         (): 592,
         ('Original Answer',): 582,
         ('Complete Answer',): 542,
         ('Complete Answer', 'Original Answer'): 1})

In [6]:
# Add the count of ('Complete Answer', 'Original Answer') to ('Original Answer', 'Complete Answer')
org_result[('Original Answer', 'Complete Answer')] += org_result[('Complete Answer', 'Original Answer')]

# Remove the ('Complete Answer', 'Original Answer') entry
del org_result[('Complete Answer', 'Original Answer')]

org_result

Counter({('Original Answer', 'Complete Answer'): 611,
         (): 592,
         ('Original Answer',): 582,
         ('Complete Answer',): 542})

In [7]:
gpt_result

Counter({('Complete Answer',): 674,
         (): 624,
         ('Original Answer', 'Complete Answer'): 400,
         ('Original Answer',): 179})

In [8]:
template_table = pd.DataFrame([
    ["Generated FollowupQ", "Answered by CA", "Unanswered by CA"],
    ["Answered by OA", "No new information", "Inappropriate CA"],
    ["Unanswered by OA", "New Information", "Unrelated followupQ"]
])

def display_result(df, table):
    count = 0

    for value in df.values():
        count += value

    for key, value in df.items():
        percentage = round((value / count), 2)
        match key:
            case ():
                table.iloc[2,2] = percentage
            case ('Complete Answer',):
                table.iloc[2,1] = percentage
            case ('Original Answer',):
                table.iloc[1,2] = percentage
            case ('Original Answer', 'Complete Answer'):
                table.iloc[1,1] = percentage
    
    return table

full_result_table = display_result(full_result, template_table.copy())
org_result_table = display_result(org_result, template_table.copy())
gpt_result_table = display_result(gpt_result, template_table.copy())

In [9]:
full_result_table

Unnamed: 0,0,1,2
0,Generated FollowupQ,Answered by CA,Unanswered by CA
1,Answered by OA,0.19,0.12
2,Unanswered by OA,0.35,0.33


In [10]:
org_result_table

Unnamed: 0,0,1,2
0,Generated FollowupQ,Answered by CA,Unanswered by CA
1,Answered by OA,0.26,0.25
2,Unanswered by OA,0.23,0.25


In [11]:
gpt_result_table

Unnamed: 0,0,1,2
0,Generated FollowupQ,Answered by CA,Unanswered by CA
1,Answered by OA,0.21,0.1
2,Unanswered by OA,0.36,0.33
