In [1]:
import pandas as pd

In [9]:
# Relative path to the CSV of evaluation results that the rest of the notebook
# analyses (resolved against the notebook's working directory).
result_csv_path = "../fpdar/freshqa/gpt-3.5-turbo_fpdar.csv"

# Load the results into a DataFrame. Later cells read the columns 'premise',
# 'final_accuracy', 'effective_year', 'fact_type' and 'num_hops' from it.
df = pd.read_csv(filepath_or_buffer=result_csv_path)

In [10]:
# Number of questions in each evaluation slice; used later as the denominators
# of the percentage accuracies.
# NOTE(review): these counts are hard-coded rather than derived from `df` --
# confirm they match the loaded results file before trusting the percentages.

# False-premise (FP) questions.
total_fp = 124
total_fp_before22 = 91

# True-premise (TP) questions, broken down three independent ways. Each
# breakdown partitions the same TP set, so the aggregate figures are written
# as sums/differences of their parts to make that relationship explicit.
total_tp_fast = 127
total_tp_slow = 125
total_tp_never = 124
total_tp = total_tp_fast + total_tp_slow + total_tp_never  # 376

total_tp_before22 = 140
total_tp_after2022 = total_tp - total_tp_before22  # 236

total_tp_single_hop = 280
total_tp_multi_hop = total_tp - total_tp_single_hop  # 96

# Every question is either FP or TP.
total_no = total_fp + total_tp  # 500

In [11]:
# Overall number of rows judged 'Correct' (all premises together).
# Despite the name, this is a raw count; it is turned into a percentage later.
is_correct = df['final_accuracy'] == 'Correct'
total_accuracy = int(is_correct.sum())

In [12]:
# Keep the rows whose boolean 'premise' flag is True; in this notebook those
# are treated as the false-premise (FP) questions.
has_false_premise = df['premise']
fp_filtered_df = df[has_false_premise]

# Row-wise conditions over the FP subset.
fp_is_correct = fp_filtered_df['final_accuracy'] == 'Correct'
fp_is_before22 = fp_filtered_df['effective_year'] == "before 2022"

# Count of FP questions answered correctly.
accuracy_fp = int(fp_is_correct.sum())

# Count of FP questions answered correctly whose effective year is
# "before 2022".
accuracy_fp_year_before22 = int((fp_is_correct & fp_is_before22).sum())

In [13]:
# Keep the rows whose boolean 'premise' flag is False; in this notebook those
# are treated as the true-premise (TP) questions.
is_true_premise = ~df['premise']
tp_filtered_df = df[is_true_premise]

# Count of TP questions answered correctly.
tp_is_correct = tp_filtered_df['final_accuracy'] == 'Correct'
accuracy_tp = int(tp_is_correct.sum())

In [14]:
# Correct-answer counts for true-premise questions, one per 'fact_type' value.
tp_is_correct = tp_filtered_df['final_accuracy'] == 'Correct'
tp_fact_type = tp_filtered_df['fact_type']

accuracy_tp_fact_fast = int((tp_is_correct & (tp_fact_type == "fast-changing")).sum())

accuracy_tp_fact_slow = int((tp_is_correct & (tp_fact_type == "slow-changing")).sum())

accuracy_tp_fact_never = int((tp_is_correct & (tp_fact_type == "never-changing")).sum())

In [15]:
# Correct-answer counts for true-premise questions, split on 'effective_year'.
tp_is_correct = tp_filtered_df['final_accuracy'] == 'Correct'
tp_effective_year = tp_filtered_df['effective_year']

# Rows labelled exactly "before 2022".
accuracy_tp_year_before22 = int((tp_is_correct & (tp_effective_year == "before 2022")).sum())

# Every row NOT labelled "before 2022" is counted as "after 2022".
accuracy_tp_year_after22 = int((tp_is_correct & (tp_effective_year != "before 2022")).sum())

In [16]:
# Correct-answer counts for true-premise questions, split on 'num_hops'.
tp_is_correct = tp_filtered_df['final_accuracy'] == 'Correct'
tp_num_hops = tp_filtered_df['num_hops']

accuracy_tp_single_hop = int((tp_is_correct & (tp_num_hops == "one-hop")).sum())

accuracy_tp_multi_hop = int((tp_is_correct & (tp_num_hops == "multi-hop")).sum())

In [17]:
# Show the raw correct-answer counts: overall, false-premise and true-premise.
_summary_template = "Total Correct: {}, FP Correct: {}, TP Correct: {}"
print(_summary_template.format(total_accuracy, accuracy_fp, accuracy_tp))

Total Correct: 320, FP Correct: 68, TP Correct: 252


In [18]:
# Print the raw correct-answer counts for every evaluation slice, one per line.
#
# The template is assembled from adjacent string literals (implicit
# concatenation) instead of backslash continuations inside a single literal.
# A backslash-newline inside a string keeps the next line's leading
# indentation as part of the text, which previously injected runs of spaces
# into the "TP Correct" and "TP After 2022" lines of the output.
_fine_grained_template = (
    "Total Correct: {}\n"
    "FP Correct: {}\n"
    "FP Before 2022: {}\n"
    "TP Correct: {}\n"
    "TP Fast: {}\n"
    "TP Slow: {}\n"
    "TP Never: {}\n"
    "TP Before 2022: {}\n"
    "TP After 2022: {}\n"
    "TP Single Hop: {}\n"
    "TP Multi Hop: {}"
)

print(_fine_grained_template.format(
    total_accuracy, accuracy_fp, accuracy_fp_year_before22, accuracy_tp,
    accuracy_tp_fact_fast, accuracy_tp_fact_slow, accuracy_tp_fact_never,
    accuracy_tp_year_before22, accuracy_tp_year_after22,
    accuracy_tp_single_hop, accuracy_tp_multi_hop))

Total Correct: 320
FP Correct: 68
FP Before 2022: 57
TP Correct:       252
TP Fast: 51
TP Slow: 85
TP Never: 116
TP Before 2022: 124
TP After       2022: 128
TP Single Hop: 201
TP Multi Hop: 51


In [19]:
def _as_percent(correct, total):
    """Return ``correct / total`` as a percentage rounded to one decimal place."""
    return round(correct / total * 100, 1)


# Convert each raw correct-answer count into a percentage of its slice size.
total_accuracy_pr = _as_percent(total_accuracy, total_no)

# False-premise slices.
accuracy_fp_pr = _as_percent(accuracy_fp, total_fp)
accuracy_fp_year_before22_pr = _as_percent(accuracy_fp_year_before22, total_fp_before22)

# True-premise slices: overall, by fact type, by effective year, by hops.
accuracy_tp_pr = _as_percent(accuracy_tp, total_tp)
accuracy_tp_fact_fast_pr = _as_percent(accuracy_tp_fact_fast, total_tp_fast)
accuracy_tp_fact_slow_pr = _as_percent(accuracy_tp_fact_slow, total_tp_slow)
accuracy_tp_fact_never_pr = _as_percent(accuracy_tp_fact_never, total_tp_never)
accuracy_tp_year_before22_pr = _as_percent(accuracy_tp_year_before22, total_tp_before22)
accuracy_tp_year_after22_pr = _as_percent(accuracy_tp_year_after22, total_tp_after2022)
accuracy_tp_single_hop_pr = _as_percent(accuracy_tp_single_hop, total_tp_single_hop)
accuracy_tp_multi_hop_pr = _as_percent(accuracy_tp_multi_hop, total_tp_multi_hop)

In [20]:
# Report every percentage accuracy, one "label value" line per slice, in a
# fixed order.
_percent_report = [
    ("Total Accuracy (%):", total_accuracy_pr),
    ("FP Accuracy (%):", accuracy_fp_pr),
    ("FP Before 2022 Accuracy (%):", accuracy_fp_year_before22_pr),
    ("TP Accuracy (%):", accuracy_tp_pr),
    ("TP Fast Fact Accuracy (%):", accuracy_tp_fact_fast_pr),
    ("TP Slow Fact Accuracy (%):", accuracy_tp_fact_slow_pr),
    ("TP Never Fact Accuracy (%):", accuracy_tp_fact_never_pr),
    ("TP Before 2022 Accuracy (%):", accuracy_tp_year_before22_pr),
    ("TP After 2022 Accuracy (%):", accuracy_tp_year_after22_pr),
    ("TP Single Hop Accuracy (%):", accuracy_tp_single_hop_pr),
    ("TP Multi Hop Accuracy (%):", accuracy_tp_multi_hop_pr),
]

for _label, _value in _percent_report:
    print(_label, _value)

Total Accuracy (%): 64.0
FP Accuracy (%): 54.8
FP Before 2022 Accuracy (%): 62.6
TP Accuracy (%): 67.0
TP Fast Fact Accuracy (%): 40.2
TP Slow Fact Accuracy (%): 68.0
TP Never Fact Accuracy (%): 93.5
TP Before 2022 Accuracy (%): 88.6
TP After 2022 Accuracy (%): 54.2
TP Single Hop Accuracy (%): 71.8
TP Multi Hop Accuracy (%): 53.1
