In [1]:
# !pip install "google-cloud-aiplatform>=1.38" vertexai
# TODO: run `gcloud auth application-default login` to authenticate before running this notebook.

In [2]:
import vertexai
from vertexai.generative_models import GenerativeModel

# Google Cloud project ID passed to vertexai.init below.
# TODO(developer): replace with your own project ID before running.
project_id = "gvt0031-gcp-152-govtext-ds"

vertexai.init(project=project_id, location="asia-southeast1")

# Gemini 1.5 Flash handle; later cells call model.generate_content(...) on it to judge answers.
model = GenerativeModel(model_name="gemini-1.5-flash-001")

## Preprocess input

In [4]:
# JSONL files with the fine-tuned models' predictions; later cells read each record's
# 'generation' field (and, for Llama 3, the 'input' prompt).
llama_predictions_path = "/home/watson_chua/efs/axolotl/data/predictions/hansard/llama3_8b_lora.jsonl"
gemma_predictions_path = "/home/watson_chua/efs/axolotl/data/predictions/hansard/gemma2_9b_lora.jsonl"
# JSONL file with the GPT-4 answers; later cells also read 'question', 'answer' and
# 'hypothetical_document' from these records.
gpt4_predictions_path = "/home/watson_chua/efs/axolotl/data/input_data/data/gpt4_answers_by_hy_doc.jsonl"

In [5]:
import json


def _read_jsonl(path):
    """Load a JSON Lines file into a list with one parsed value per non-blank line.

    Args:
        path: Location of the ``.jsonl`` file.

    Returns:
        list: Parsed records, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 so decoding does not depend on the machine's locale; blank
    # lines (e.g. an extra newline at the end of the file) are skipped instead
    # of crashing json.loads.
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


# One list of dicts per prediction file.
llama_lines = _read_jsonl(llama_predictions_path)
gemma_lines = _read_jsonl(gemma_predictions_path)
gpt4_lines = _read_jsonl(gpt4_predictions_path)

In [6]:
# Record counts for the Llama 3, Gemma 2 and GPT-4 files, in that order.
tuple(len(records) for records in (llama_lines, gemma_lines, gpt4_lines))

(304, 304, 343)

In [7]:
# Peek at the first Llama 3 prediction record to see its structure.
llama_lines[0]

{'generation': 'As at end-January 2024, there were 101,464 persons who did not vote in the last Presidential election but have since applied to restore their names to the Registers of Electors. All such applications have been approved after being verified by the Elections Department.',
 'input': "<|start_header_id|>system<|end_header_id|>You are a public servant. Your task is to reply to a parliamentary question given a list of supporting points.<|eot_id|><|start_header_id|>user<|end_header_id|>\nQuestion:Mr Leong Mun Wai asked the Prime Minister (a) since the conclusion of the 2023 Presidential Election, how many non-voters have applied to restore their names to the Registers of Electors; and (b) how many of these applications are successful.\n\nSupporting points: **Title: Applications to Restore Names to Registers of Electors**\n\n**Executive Summary:**\n\nAs of the end of January 2024, a total of 101,464 non-voters from the 2023 Presidential Election have successfully applied to res

In [8]:
# Question text of the first GPT-4 record, for comparison with the question embedded in the Llama 3 prompt.
gpt4_lines[0]['question']

'Mr Leong Mun Wai asked the Prime Minister (a) since the conclusion of the 2023 Presidential Election, how many non-voters have applied to restore their names to the Registers of Electors; and (b) how many of these applications are successful.'

In [9]:
# Try the question extraction on the first Llama 3 prompt: keep the text after the first
# "<|end_header_id|>\nQuestion:" marker and before "\n\nSupporting points:".
llama_lines[0]['input'].split("<|end_header_id|>\nQuestion:", maxsplit=1)[-1].split("\n\nSupporting points:")[0]

'Mr Leong Mun Wai asked the Prime Minister (a) since the conclusion of the 2023 Presidential Election, how many non-voters have applied to restore their names to the Registers of Electors; and (b) how many of these applications are successful.'

In [10]:
def _question_from_prompt(prompt):
    """Return the question text embedded in a Llama 3 prompt string.

    The result is the text that follows the first user-header "Question:"
    marker and precedes the first "Supporting points:" marker. When the first
    marker is absent the whole prompt is searched for the second one.
    """
    head, marker, tail = prompt.partition("<|end_header_id|>\nQuestion:")
    # Marker missing -> keep the entire prompt, exactly like split(...)[-1] would.
    remainder = tail if marker else head
    return remainder.partition("\n\nSupporting points:")[0]


# One extracted question per Llama 3 prediction, in file order.
llama_questions = list(map(_question_from_prompt, (record['input'] for record in llama_lines)))

In [11]:
# Print position and text of every GPT-4 question that has no exact match among the Llama 3 questions.
for position, record in enumerate(gpt4_lines):
    question_text = record['question']
    if question_text in llama_questions:
        continue
    print(position, question_text)

In [12]:
import pandas as pd 
# Tabular view of the GPT-4 records: one row per JSONL record, one column per key.
df_gpt4 = pd.DataFrame(gpt4_lines)

In [13]:
# Rows whose question repeats one seen earlier in the frame (the first occurrence is not included).
df_duplicated = df_gpt4[df_gpt4['question'].duplicated()]

In [14]:
# Keep only the first row for each distinct question; original index labels are retained.
df_gpt4_deduplicated = df_gpt4.drop_duplicates(subset=['question'])

In [15]:
# Preview the first rows of the deduplicated GPT-4 frame.
df_gpt4_deduplicated.head()

Unnamed: 0,status,reference_answer,summary_points,points,title,subtitle,question_speaker,answer_speaker,question,answer,filename,hypothetical_document,date,gpt4_answer_by_hy_doc
0,answered,,"[101,464 non-voters from the 2023 Presidential...","101,464 non-voters from the 2023 Presidential ...",Applications to Restore Names to Registers of ...,,Mr Leong Mun Wai,Mr Chan Chun Sing (for the Prime Minister),Mr Leong Mun Wai asked the Prime Minister (a) ...,Mr Chan Chun Sing (for the Prime Minister): As...,sitting_2024-02-29,**Title: Applications to Restore Names to Regi...,2024-02-29 00:00:00,**Reply:**\n\n**Title: Applications to Restore...
1,answered,,"[From 2021 to 2023, over 23,300 companies obta...","From 2021 to 2023, over 23,300 companies obtai...",Applications under Enterprise Financing Scheme,,Mr Liang Eng Hwa,Mr Gan Kim Yong,Mr Liang Eng Hwa asked the Minister for Trade ...,"Mr Gan Kim Yong: From 2021 to 2023, over 23,30...",sitting_2024-02-29,**Title: Applications under Enterprise Financi...,2024-02-29 00:00:00,**Title: Applications under Enterprise Financi...
2,answered,,"[About 36,400 companies have received support ...","About 36,400 companies have received support u...",Applications for Productivity Solutions Grant ...,,Mr Liang Eng Hwa,Mr Gan Kim Yong,Mr Liang Eng Hwa asked the Minister for Trade ...,"Mr Gan Kim Yong: In the last three years, abou...",sitting_2024-02-29,### Applications for Productivity Solutions Gr...,2024-02-29 00:00:00,**Reply to Parliamentary Question on Applicati...
3,answered,,[Schools have autonomy to decide the end times...,Schools have autonomy to decide the end times ...,Time Guidelines for Schools to Cease Co-curric...,,Mr Leong Mun Wai,Mr Chan Chun Sing,Mr Leong Mun Wai asked the Minister for Educat...,Mr Chan Chun Sing: Schools have the autonomy t...,sitting_2024-02-29,**Title: Time Guidelines for Schools to Cease ...,2024-02-29 00:00:00,**Reply to Parliamentary Question on Time Guid...
4,answered,,[Rail operators have improved track design and...,Rail operators have improved track design and ...,Mitigation Measures to Reduce Noise from Railw...,,Mr Leong Mun Wai,Mr Chee Hong Tat,Mr Leong Mun Wai asked the Minister for Transp...,"Over the years, rail operators have improved t...",sitting_2024-02-29,**Report: Mitigation Measures to Reduce Noise ...,2024-02-29 00:00:00,**Reply to Parliamentary Question on Mitigatio...


In [16]:
# Positional sanity check: the n-th deduplicated GPT-4 question is compared with the
# n-th Llama 3 question, and any GPT-4 question that differs is printed.
for position, gpt4_question in enumerate(df_gpt4_deduplicated['question']):
    if gpt4_question != llama_questions[position]:
        print(gpt4_question)

## Evaluation

In [17]:
# Judge prompt comparing three answers. Placeholders filled via str.format:
# question, context, ground_truth, answer_a, answer_b, answer_c.
# Doubled braces ({{ }}) are literal braces in the formatted prompt.
# Not used by the active code in this notebook: its only call site is commented out.
prompt_three_way_template = """Given the following question, context, and three different answers a), b) and c), assess the answer based on the following criteria: 
 1) factual correctness according to the context
 2) similarity to model answer
 3) conciseness
 
Respond in JSON whether answer a), b), or c) is the better answer and state your reason using the following schema:

{{"winner": a, b, or c,
"reason": reason it is the better answer}}

Here is the information,

{question}

Context: {context}

Model Answer: {ground_truth}

Answer A: {answer_a}

Answer B: {answer_b}

Answer C: {answer_c}



Respond with only the JSON reply and nothing else.
"""


# Judge prompt comparing two answers. Placeholders filled via str.format:
# question, context, ground_truth, answer_a, answer_b.
# Used by the evaluation loop for both the GPT-4 vs Llama 3 and GPT-4 vs Gemma 2 comparisons.
prompt_two_way_template = """Given the following question, context, and two different answers a) and b), assess the answer based on the following criteria: 
 1) factual correctness according to the context
 2) similarity to model answer
 3) conciseness
 
Respond in JSON whether answer a) or b) is the better answer and state your reason using the following schema:

{{"winner": a or b,
"reason": reason it is the better answer}}

Here is the information,

{question}

Context: {context}

Model Answer: {ground_truth}

Answer A: {answer_a}

Answer B: {answer_b}

Respond with only the JSON reply and nothing else.
"""

In [18]:
from tqdm.auto import tqdm
# Judge replies for the two pairwise comparisons (GPT-4 vs Llama 3, GPT-4 vs Gemma 2).
# The three-way comparison is switched off in this run; re-enabling it needs a
# three_way_evaluation_results list and a prompt_three_way_template call per row.
gpt4_llama3_evaluation_results, gpt4_gemma2_evaluation_results = [], []

df_eval = df_gpt4_deduplicated
progress = tqdm(enumerate(df_eval.iterrows()), total=len(df_eval))
for i, (index, row) in progress:
    question = row['question']
    hypothetical_doc = row['hypothetical_document']
    ground_truth = row['answer']
    gpt_4_answer = row['gpt4_answer_by_hy_doc']
    # Fine-tuned model outputs are paired with the GPT-4 row by position, not by key.
    llama3_answer = llama_lines[i]['generation']
    gemma2_answer = gemma_lines[i]['generation']

    # Everything except answer_b is shared by the two prompts; GPT-4 is always answer A.
    shared_fields = {
        'question': question,
        'context': hypothetical_doc,
        'ground_truth': ground_truth,
        'answer_a': gpt_4_answer,
    }
    prompt_gpt4_llama3 = prompt_two_way_template.format(answer_b=llama3_answer, **shared_fields)
    prompt_gpt4_gemma2 = prompt_two_way_template.format(answer_b=gemma2_answer, **shared_fields)

    try:
        response_gpt4_llama3 = model.generate_content(prompt_gpt4_llama3).text
        response_gpt4_gemma2 = model.generate_content(prompt_gpt4_gemma2).text
    except ValueError as e:
        # Report the error; the row is then left out of BOTH result lists.
        print(e)
    else:
        # Each record holds the judge reply, every column of the GPT-4 row, and the rival answer.
        gpt4_llama3_evaluation_results.append(
            {'gemini_flash_evaluation': response_gpt4_llama3, **row, 'llama3_answer': llama3_answer}
        )
        gpt4_gemma2_evaluation_results.append(
            {'gemini_flash_evaluation': response_gpt4_gemma2, **row, 'gemma2_answer': gemma2_answer}
        )




  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 304/304 [12:16<00:00,  2.42s/it]


In [23]:
# Persist the raw judge records, one JSON object per line.
# The three-way results (gpt4_llama3_gemma2_evaluation.jsonl) are not written because
# that evaluation is disabled in this run.
for output_path, records in (
    ('/home/watson_chua/efs/axolotl/data/predictions/gpt4_llama3_evaluation.jsonl', gpt4_llama3_evaluation_results),
    ('/home/watson_chua/efs/axolotl/data/predictions/gpt4_gemma2_evaluation.jsonl', gpt4_gemma2_evaluation_results),
):
    with open(output_path, 'w') as f:
        f.writelines(json.dumps(record) + '\n' for record in records)

In [24]:
import re
def extract_json(text):
    """Parse the JSON value contained in a model reply.

    Accepts bare JSON or JSON wrapped in a Markdown code fence (with or
    without a language tag such as ``json``). Surrounding whitespace is
    ignored and the fenced payload may span several lines.

    Args:
        text (str): Raw reply text.

    Returns:
        The decoded JSON value.

    Raises:
        ValueError: If a code fence is opened but never closed, or if the
            payload is not valid JSON (json.JSONDecodeError is a subclass
            of ValueError).
    """
    payload = text.strip()
    if payload.startswith("```"):
        # DOTALL lets the payload span multiple lines; the lazy group stops at
        # the first closing fence and the \s* parts trim the newlines around it.
        fenced = re.search(r"```[A-Za-z]*\s*(.*?)\s*```", payload, flags=re.DOTALL)
        if fenced is None:
            # Fail loudly with the offending text instead of falling through to
            # an unassigned result.
            raise ValueError(f"unterminated code fence in model reply: {text!r}")
        payload = fenced.group(1)
    return json.loads(payload)

In [25]:
# Three-way outcome parsing is disabled together with the three-way evaluation:
# three_way_eval_json = [extract_json(er['gemini_flash_evaluation']) for er in three_way_evaluation_results]
# df_three_way_eval_results = pd.DataFrame(three_way_eval_json)
# df_three_way_eval_results.to_csv('/home/watson_chua/efs/axolotl/data/predictions/gpt4_llama3_gemma2_evaluation_outcome.csv', index=False)


# Parse each judge reply into a dict, tabulate the verdicts, and save them as CSV.
gpt4_llama3_eval_json = [extract_json(er['gemini_flash_evaluation']) for er in gpt4_llama3_evaluation_results]
df_gpt4_llama3_eval_results = pd.DataFrame(gpt4_llama3_eval_json)
df_gpt4_llama3_eval_results.to_csv('/home/watson_chua/efs/axolotl/data/predictions/gpt4_llama3_evaluation_outcome.csv', index=False)


gpt4_gemma2_eval_json = [extract_json(er['gemini_flash_evaluation']) for er in gpt4_gemma2_evaluation_results]
df_gpt4_gemma2_eval_results = pd.DataFrame(gpt4_gemma2_eval_json)
df_gpt4_gemma2_eval_results.to_csv('/home/watson_chua/efs/axolotl/data/predictions/gpt4_gemma2_evaluation_outcome.csv', index=False)

# The former trailing lookup of `df_eval_results` was removed: that name is never
# defined in this notebook, so the cell raised NameError on a fresh kernel.

In [None]:
# NOTE(review): df_three_way_eval_results is only assigned in commented-out code in this
# notebook, so this line raises NameError on a fresh kernel; the output shown for this cell
# presumably comes from an earlier run with the three-way evaluation enabled - confirm.
df_three_way_eval_results['winner'].value_counts()

winner
a    155
b    104
c     44
Name: count, dtype: int64

In [26]:
# Verdict tally for GPT-4 (answer a) vs Llama 3 (answer b).
df_gpt4_llama3_eval_results['winner'].value_counts()

winner
b    238
a     66
Name: count, dtype: int64

In [27]:
# Verdict tally for GPT-4 (answer a) vs Gemma 2 (answer b).
df_gpt4_gemma2_eval_results['winner'].value_counts()

winner
b    238
a     66
Name: count, dtype: int64

In [28]:
# Display the parsed judge verdicts for GPT-4 vs Llama 3.
df_gpt4_llama3_eval_results

Unnamed: 0,winner,reason
0,b,Answer b) is more concise and does not contain...
1,b,Answer b) is more concise and directly answers...
2,b,Answer b) is more concise and focuses on answe...
3,b,Answer b) is more concise and focuses on the k...
4,b,"Answer b) is more concise, while maintaining f..."
...,...,...
299,b,Answer b) is more concise and focuses on answe...
300,b,Answer b) is concise and provides a direct res...
301,b,Answer b) is a better answer because it is mor...
302,a,Answer a) is more comprehensive in addressing ...


In [29]:
# Display the parsed judge verdicts for GPT-4 vs Gemma 2.
df_gpt4_gemma2_eval_results

Unnamed: 0,winner,reason
0,b,Answer b) is more concise and closer to the mo...
1,b,Answer b) is more concise and provides a succi...
2,b,"Answer b) is more concise, directly answers th..."
3,b,Answer b) is more concise and focuses on the k...
4,a,Answer a) is better because it is more detaile...
...,...,...
299,b,"Answer b) is more concise than answer a), whic..."
300,b,Answer b) is better because it is more concise...
301,b,"Answer B is more concise and to the point, dir..."
302,b,Answer b) is more concise and provides a direc...
