## Convert Evaluations to JSON

In [1]:
import sys
sys.path.append("../")
from llm_utils.post_processing import replace_inner_quotes, clean_json_output

In [7]:
import json
import pandas as pd


def convert_evaluation_to_df(input_path, evaluation_key):
    """Load one judge's evaluations from a JSONL file into a DataFrame.

    Every non-blank line of ``input_path`` is parsed as JSON. The value stored
    under ``evaluation_key`` in each record is passed through
    ``clean_json_output`` and the cleaned results are handed to
    ``pd.DataFrame``.

    Args:
        input_path: Path to the JSONL file holding the evaluation records.
        evaluation_key: Key under which each record keeps the judge's raw
            output (e.g. ``"gemini_flash_evaluation"``).

    Returns:
        pandas.DataFrame built from the cleaned evaluations, in file order.
        Presumably ``clean_json_output`` yields one dict per record, giving
        one row per record -- confirm against ``llm_utils.post_processing``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record (assumed to be a JSON object) has no
            ``evaluation_key`` entry.
    """
    # JSON text is UTF-8; relying on the locale default encoding can mangle or
    # reject non-ASCII characters on machines with a different default.
    with open(input_path, 'r', encoding='utf-8') as f:
        # Skip blank lines (e.g. a stray trailing newline) instead of letting
        # json.loads fail on them.
        records = [json.loads(line) for line in f if line.strip()]
    return pd.DataFrame([clean_json_output(record[evaluation_key]) for record in records])

### Gemini Eval

In [8]:
# gpt4_normal run, gpt4 vs llama3 file: load the "gemini_flash_evaluation"
# entries and tally the "winner" column.
df_eval = convert_evaluation_to_df("/home/watson_chua/efs/axolotl/data/evaluations/hansard/gpt4_normal/gpt4_llama3_evaluation.jsonl", "gemini_flash_evaluation")
df_eval["winner"].value_counts()

winner
b    212
a     92
Name: count, dtype: int64

In [9]:
# gpt4_normal run, gpt4 vs gemma2 file: load the "gemini_flash_evaluation"
# entries and tally the "winner" column.
df_eval = convert_evaluation_to_df("/home/watson_chua/efs/axolotl/data/evaluations/hansard/gpt4_normal/gpt4_gemma2_evaluation.jsonl", "gemini_flash_evaluation")
df_eval["winner"].value_counts()

winner
b    221
a     83
Name: count, dtype: int64

In [10]:
# gpt4_normal run, llama3 vs gemma2 file: load the "gemini_flash_evaluation"
# entries and tally the "winner" column.
df_eval = convert_evaluation_to_df("/home/watson_chua/efs/axolotl/data/evaluations/hansard/gpt4_normal/llama3_gemma2_evaluation.jsonl", "gemini_flash_evaluation")
df_eval["winner"].value_counts()

winner
b    188
a    116
Name: count, dtype: int64

In [11]:
# gpt4_concise run, gpt4 vs llama3 file: load the "gemini_flash_evaluation"
# entries and tally the "winner" column.
df_eval = convert_evaluation_to_df("/home/watson_chua/efs/axolotl/data/evaluations/hansard/gpt4_concise/gpt4_llama3_evaluation.jsonl", "gemini_flash_evaluation")
df_eval["winner"].value_counts()

winner
b    192
a    112
Name: count, dtype: int64

In [12]:
# gpt4_concise run, gpt4 vs gemma2 file: load the "gemini_flash_evaluation"
# entries and tally the "winner" column.
df_eval = convert_evaluation_to_df("/home/watson_chua/efs/axolotl/data/evaluations/hansard/gpt4_concise/gpt4_gemma2_evaluation.jsonl", "gemini_flash_evaluation")
df_eval["winner"].value_counts()

winner
b    205
a     99
Name: count, dtype: int64

### Claude Eval

In [13]:
# gpt4_concise run, gpt4 vs gemma2 file judged by Claude: load the
# "claude_evaluation" entries and tally the "winner" column.
df_eval = convert_evaluation_to_df("/home/watson_chua/efs/axolotl/data/evaluations/hansard/gpt4_concise/gpt4_gemma2_evaluation_claude.jsonl", "claude_evaluation")
df_eval["winner"].value_counts()

winner
a    212
b     92
Name: count, dtype: int64

## Joint Comparison

In [57]:
import pandas as pd
import json

output_root_dir = "/home/watson_chua/efs/axolotl/data/evaluations/hansard/"


def compare_llms_as_a_judge(subdir, llm1, llm2):
    """Print how the stored outcome CSV and Claude's verdicts compare for one pairing.

    Reads ``<output_root_dir><subdir>/<llm1>_<llm2>_evaluation_outcome.csv``
    and takes its ``winner`` column as the first judge's verdict (presumably
    Gemini, as the other cells tally ``gemini_flash_evaluation`` -- confirm).
    Reads the matching ``<llm1>_<llm2>_evaluation_claude.jsonl`` for Claude's
    verdicts, attaches them to the CSV rows by position, and prints in order:

    1. the raw line count, parsed record count and Claude verdict count,
    2. the ``winner`` tally from the CSV,
    3. the ``claude_winner`` tally,
    4. how often the two agree,
    5. the ``winner`` tally restricted to the rows where they agree.

    Args:
        subdir: Run sub-directory under ``output_root_dir`` (e.g. ``"gpt4_concise"``).
        llm1: First model name used in the file names.
        llm2: Second model name used in the file names.

    Returns:
        None. The report is written to stdout only.

    Raises:
        json.JSONDecodeError: If a non-blank JSONL line, or a record's
            ``claude_evaluation`` string, is not valid JSON.
        ValueError: If the number of Claude verdicts differs from the number
            of CSV rows (pandas rejects the column assignment).
    """
    pair_prefix = output_root_dir + subdir + "/" + "{llm1}_{llm2}".format(llm1=llm1, llm2=llm2)
    df_comparison = pd.read_csv(pair_prefix + "_evaluation_outcome.csv")

    # JSON text is UTF-8; do not depend on the locale's default encoding.
    with open(pair_prefix + "_evaluation_claude.jsonl", "r", encoding="utf-8") as f:
        lines = f.readlines()
    # Blank lines carry no record; skipping them avoids a JSONDecodeError.
    json_lines = [json.loads(line) for line in lines if line.strip()]
    # Normalise case/whitespace so "A " and "a" count as the same verdict.
    claude_winner = [
        json.loads(record["claude_evaluation"])["winner"].lower().strip()
        for record in json_lines
    ]
    # Sanity check: the three counts should match each other and the CSV length.
    print(len(lines), len(json_lines), len(claude_winner))

    df_comparison["claude_winner"] = claude_winner
    df_comparison["agree"] = df_comparison["winner"] == df_comparison["claude_winner"]

    print(df_comparison["winner"].value_counts())
    print(df_comparison["claude_winner"].value_counts())

    print(df_comparison["agree"].value_counts())
    print(df_comparison[df_comparison["agree"]]["winner"].value_counts())



In [62]:
# gpt4_concise run: compare the outcome CSV with Claude's verdicts for gpt4 vs llama3.
compare_llms_as_a_judge("gpt4_concise", "gpt4", "llama3")


304 304 304
winner
b    192
a    112
Name: count, dtype: int64
claude_winner
a    204
b    100
Name: count, dtype: int64
agree
True     180
False    124
Name: count, dtype: int64
winner
a    96
b    84
Name: count, dtype: int64


In [63]:
# gpt4_concise run: compare the outcome CSV with Claude's verdicts for gpt4 vs gemma2.
compare_llms_as_a_judge("gpt4_concise", "gpt4", "gemma2")


304 304 304
winner
b    205
a     99
Name: count, dtype: int64
claude_winner
a    212
b     92
Name: count, dtype: int64
agree
True     159
False    145
Name: count, dtype: int64
winner
a    83
b    76
Name: count, dtype: int64


In [50]:
# gpt4_concise run: compare the outcome CSV with Claude's verdicts for llama3 vs gemma2.
# NOTE(review): the saved output below shows only a single "winner" tally and
# this cell's execution count (50) precedes the function definition cell (57),
# so the output looks stale -- re-run this cell.
compare_llms_as_a_judge("gpt4_concise", "llama3", "gemma2")


winner
b    218
a     86
Name: count, dtype: int64


In [60]:
# gpt4_normal run: compare the outcome CSV with Claude's verdicts for gpt4 vs llama3.
# The function prints its own report and returns None, so it is called bare;
# wrapping it in print() only appended a stray "None" to the output.
compare_llms_as_a_judge("gpt4_normal", "gpt4", "llama3")


304 304 304
winner
b    212
a     92
Name: count, dtype: int64
claude_winner
a    219
b     85
Name: count, dtype: int64
agree
False    153
True     151
Name: count, dtype: int64
winner
a    79
b    72
Name: count, dtype: int64
None


In [59]:
# gpt4_normal run: compare the outcome CSV with Claude's verdicts for gpt4 vs gemma2.
# The function prints its own report and returns None, so it is called bare;
# wrapping it in print() only appended a stray "None" to the output.
compare_llms_as_a_judge("gpt4_normal", "gpt4", "gemma2")

304 304 304
winner
b    221
a     83
Name: count, dtype: int64
claude_winner
a    235
b     69
Name: count, dtype: int64
agree
False    168
True     136
Name: count, dtype: int64
winner
a    75
b    61
Name: count, dtype: int64
None


In [61]:
# gpt4_normal run: compare the outcome CSV with Claude's verdicts for llama3 vs gemma2.
# The function prints its own report and returns None, so it is called bare;
# wrapping it in print() only appended a stray "None" to the output.
compare_llms_as_a_judge("gpt4_normal", "llama3", "gemma2")

304 304 304
winner
b    188
a    116
Name: count, dtype: int64
claude_winner
b    203
a    101
Name: count, dtype: int64
agree
True     197
False    107
Name: count, dtype: int64
winner
b    142
a     55
Name: count, dtype: int64
None


## Compare Ragas

In [52]:
output_root_dir = "/home/watson_chua/efs/axolotl/data/evaluations/hansard/"


def compare_ragas(subdir, llm1):
    """Print the mean of each numeric column of one model's Ragas score file.

    Reads ``<output_root_dir><subdir>/<llm1>_ragas_claude.csv``, discards every
    row that is missing any of ``faithfulness``, ``answer_relevancy`` or
    ``answer_correctness``, and prints the column means of what remains.

    Args:
        subdir: Run sub-directory under ``output_root_dir`` (e.g. ``"gpt4_normal"``).
        llm1: Model name used as the file-name prefix.

    Returns:
        None. The means are written to stdout only.
    """
    csv_path = output_root_dir + subdir + "/" + "{llm1}_ragas_claude.csv".format(llm1=llm1)
    scores = pd.read_csv(csv_path)

    # Rows lacking any of these metrics are dropped before averaging, so every
    # reported mean is computed over the same set of rows.
    required_metrics = ["faithfulness", "answer_relevancy", "answer_correctness"]
    complete_rows = scores.dropna(how="any", axis=0, subset=required_metrics)

    print(complete_rows.mean(numeric_only=True))

In [53]:
# Mean Ragas scores from gpt4_concise/gpt4_ragas_claude.csv.
compare_ragas("gpt4_concise", "gpt4")

faithfulness          0.840792
answer_relevancy      0.669053
answer_similarity     0.784777
answer_correctness    0.663694
dtype: float64


In [54]:
# Mean Ragas scores from gpt4_normal/gpt4_ragas_claude.csv.
compare_ragas("gpt4_normal", "gpt4")

faithfulness          0.855116
answer_relevancy      0.662348
answer_similarity     0.785876
answer_correctness    0.653498
dtype: float64


In [56]:
# Mean Ragas scores from gpt4_normal/llama3_ragas_claude.csv.
compare_ragas("gpt4_normal", "llama3")

faithfulness          0.862831
answer_relevancy      0.616783
answer_similarity     0.807248
answer_correctness    0.783219
dtype: float64


In [55]:
# Mean Ragas scores from gpt4_normal/gemma2_ragas_claude.csv.
compare_ragas("gpt4_normal", "gemma2")

faithfulness          0.860389
answer_relevancy      0.609027
answer_similarity     0.801928
answer_correctness    0.782969
dtype: float64


## Get specific examples

In [20]:
import pandas as pd
# Load the gpt4 vs gemma2 outcome CSV of the gpt4_normal run for manual
# inspection. output_root_dir is re-declared here with the same value as in
# the earlier cells.
output_root_dir = "/home/watson_chua/efs/axolotl/data/evaluations/hansard/"
df_gemini = pd.read_csv(output_root_dir + "gpt4_normal" + "/" + "{llm1}_{llm2}_evaluation_outcome.csv".format(llm1="gpt4", llm2="gemma2"))


In [28]:
# Display the loaded outcome table.
df_gemini

Unnamed: 0,winner,reason
0,a,Answer a) is factually correct and concisely a...
1,a,Answer a) is more concise and covers all the p...
2,b,Answer b) is slightly better because it is the...
3,a,"Answer a) accurately summarizes the text, high..."
4,a,Answer a) provides a comprehensive response th...
...,...,...
299,a,Answer a) provides a more comprehensive overvi...
300,a,Answer A is more concise and factually correct...
301,b,Answer b) is the closest to the model answer a...
302,b,Answer b) more concisely highlights the lack o...


In [21]:
# Keep only the rows whose recorded winner is 'b'.
df_gemini[df_gemini['winner']=='b']

Unnamed: 0,winner,reason
2,b,Answer b) is slightly better because it is the...
5,b,"Answer b) is more concise, factual, and aligns..."
7,b,Answer B is more concise. Answer A rehashes al...
9,b,Answer b) is the best because it is more conci...
10,b,Answer b) is slightly better than answer a) be...
...,...,...
295,b,Both answers are factually correct and answer ...
297,b,Answer b) is more similar to the model answer ...
298,b,Answer b) is a more concise and better represe...
301,b,Answer b) is the closest to the model answer a...


In [29]:
# Full 'reason' text for the row at position 5 (one of the 'b' verdicts above).
df_gemini.iloc[5]['reason']

"Answer b) is more concise, factual, and aligns with the model answer. While answer a) provides additional details, it doesn't succinctly address the question posed by Ms. Phua Lay Peng, and lacks the clarity and conciseness of the model answer. Answer b) acknowledges the challenges of applying EASE 2.0 to private homes and reaffirms the Ministry's commitment to supporting seniors in all housing types, mirroring the model answer's approach."

In [22]:
# Load the consolidated predictions of the gpt4_normal run.
# Rows are presumably aligned by position with the outcome CSV -- confirm.
df_all_predictions = pd.read_csv(output_root_dir + "gpt4_normal" + '/consolidated_predictions.csv')


In [23]:
# 'gpt4_answer_by_hy_doc' value for the row at position 5
# ("hy_doc" presumably stands for hypothetical document -- confirm).
df_all_predictions.iloc[5]['gpt4_answer_by_hy_doc']

'The Ministry acknowledges the importance of extending senior-friendly fittings to elderly retirees living in non-HDB estates. However, due to the diverse architectural designs and unique challenges associated with private homes, a one-size-fits-all approach is not feasible. The Ministry is committed to exploring viable strategies to address these challenges, including customised assessments, financial assistance, collaboration with industry experts, pilot programmes, and public awareness initiatives. These efforts aim to ensure that all elderly residents, regardless of their housing type, can access the necessary support to age well.'

In [24]:
# 'gemma2_answer' value for the row at position 5.
df_all_predictions.iloc[5]['gemma2_answer']

'As both HIP and EASE 2.0 improvements are carried out based on common designs across many units within an HDB block, they cannot easily be extended to private homes where there is great diversity in architecture and layout. Nevertheless, my Ministry will continue to look at ways to better enable seniors living in private homes to age well.<|eot_id|'

In [25]:
# 'answer' value for the row at position 5 (presumably the reference answer
# the judges compare against -- confirm).
df_all_predictions.iloc[5]['answer']

"Mr Desmond Lee: For senior-friendly features within homes, the Housing and Development Board's (HDB's) Home Improvement Programme (HIP) and Enhancement for Active Seniors (EASE) 2.0 programme cover a range of enhancements tailored for HDB flats, which are more standardised. In contrast, the designs and layouts of private homes vary greatly. As such, we will continue to study how seniors living in private homes can be better supported to age well within their homes. "

In [30]:
# 'question' value for the row at position 5.
df_all_predictions.iloc[5]['question']

"Ms Denise Phua Lay Peng asked the Minister for National Development whether the senior-friendly fittings under the Housing and Development Board's (HDB's) Enhancement for Active Seniors (EASE) 2.0 programme, which is not applicable to private housing, can be partially or fully extended to eligible elderly retirees living in non-HDB estates."

In [31]:
# 'hypothetical_document' value for the row at position 5.
df_all_predictions.iloc[5]['hypothetical_document']

"**Title: Extension of Senior-friendly Fittings in EASE 2.0 Programme to Elderly Living in Non-HDB Estates**\n\n**Introduction**\n\nThe Housing Development Board (HDB) has long been committed to enhancing the living conditions of Singaporeans, particularly the elderly, through initiatives such as the Home Improvement Programme (HIP) and the Enhancement for Active Seniors (EASE) 2.0. These programmes have been tailored specifically for HDB flats due to their standardised designs, which facilitate the implementation of senior-friendly fittings. However, the diverse architectural landscape of private homes presents unique challenges that require innovative solutions. This report examines the feasibility of extending the EASE 2.0 Programme to elderly residents in non-HDB estates and outlines the Ministry's ongoing efforts to support senior citizens in private residences.\n\n**HDB's Home Improvement Programme (HIP) and EASE 2.0: A Proven Model**\n\nThe HIP and EASE 2.0 initiatives have been