## Split data into train and test sets

In [14]:
from pathlib import Path
# Directory that the cells below write their CSV / JSONL outputs into.
output_dir_path = Path("/home/watson_chua/efs/hansard_finetuning/data/input_data/")
# Create the directory together with any missing parents; do nothing if it is already there.
output_dir_path.mkdir(exist_ok=True, parents=True)

In [15]:
import json
import pandas as pd

input_path="/home/watson_chua/efs/hansard_finetuning/data/input_data/written_question_answers_hy_doc.jsonl"


# The input is JSON Lines: one JSON object per line.
with open(input_path, 'r', encoding='utf-8') as f:
    lines = f.readlines()

# Skip blank lines (e.g. a trailing newline at the end of the file) so json.loads does not fail on them.
data = [json.loads(l) for l in lines if l.strip()]
df = pd.DataFrame(data)

# Keep answered questions only. The explicit .copy() gives an independent frame, so
# adding the 'date' column below does not raise SettingWithCopyWarning.
df_answered = df[df['status'] == 'answered'].copy()
# The date is taken from the last '_'-separated token of the filename. It is parsed from
# the answered rows only, so filenames of rows that were filtered out are never touched.
df_answered['date'] = pd.to_datetime(df_answered['filename'].apply(lambda x: x.split('_')[-1]))

# Time-based split: rows dated 2024 become the test set, all other rows the training set.
is_2024 = df_answered['date'].dt.year == 2024
df_answered_2024 = df_answered[is_2024]
df_answered_others = df_answered[~is_2024]

In [16]:
# Row counts of the 2024 partition and of the remaining partition, in that order.
df_answered_2024.shape[0], df_answered_others.shape[0]


(304, 1785)

In [17]:
# Destination files for the two partitions, both inside output_dir_path.
train_output_path = output_dir_path / "hansard_answered_questions_train.csv"
test_output_path = output_dir_path / "hansard_answered_questions_test.csv"

# Write the 2024 rows as the test CSV first, then every other row as the train CSV.
# The DataFrame index is not written out.
for split_df, split_path in (
    (df_answered_2024, test_output_path),
    (df_answered_others, train_output_path),
):
    split_df.to_csv(split_path, index=False)

## Format train and test data for LLM training

In [18]:
# str.format template for one training/evaluation example, laid out as three chat turns
# (system, user, assistant) using Llama 3 special tokens.
# Placeholders: {question}, {points} and {answer}.
# format_data(..., include_answer=False) cuts the rendered text after the last
# '<|end_header_id|>', so that marker must stay immediately before {answer}.
# NOTE(review): the published Llama 3 chat layout starts with <|begin_of_text|> and puts a
# blank line after every <|end_header_id|>; this template does neither. Presumably
# intentional (the same layout must then be used at inference time) - confirm.
llama3_prompt_template_with_points = """<|start_header_id|>system<|end_header_id|>You are a public servant. Your task is to reply to a parliamentary question given a list of supporting points.<|eot_id|><|start_header_id|>user<|end_header_id|>
Question:{question}

Supporting points: {points}

<|eot_id|><|start_header_id|>assistant<|end_header_id|>{answer}<|eot_id|><|end_of_text|>"""

In [19]:
from tqdm.auto import tqdm
import re

def format_data(df, output_path, template, include_answer=True):
    """Render every row of ``df`` with ``template`` and write the results as JSON Lines.

    Each output line is a JSON object of the form ``{"input": <rendered prompt>}``.

    Args:
        df: DataFrame whose rows provide ``question``, ``hypothetical_document``
            and ``answer``.
        output_path: ``pathlib.Path`` of the file to write; an existing file is
            overwritten.
        template: ``str.format`` template with ``{question}``, ``{points}`` and
            ``{answer}`` fields.
        include_answer: If False, the rendered text is cut right after its last
            ``<|end_header_id|>`` marker, so the answer and the closing tokens
            that follow it are left out.
    """
    # Speaker attribution at the very start of an answer, e.g. "Mr Jane Doe:".
    # The earlier pattern "[Mr|Mrs|Mdm|Ms|Dr]" was a character class: it matched the
    # first occurrence of any single one of those characters anywhere in the text and
    # deleted everything up to the next colon, which could remove real answer content.
    # "Mrs" is listed before "Mr" only for readability; \b already prevents a partial match.
    speaker_prefix = re.compile(r"^\s*(?:Mrs|Mr|Mdm|Ms|Dr)\b.*?:")
    header_end = '<|end_header_id|>'

    with output_path.open('w', encoding='utf-8') as f:
        for _, row in tqdm(df.iterrows(), total=len(df)):
            answer = speaker_prefix.sub('', row.answer, count=1).strip()
            # Use the template that was passed in rather than the module-level global.
            prompt = template.format(
                question=row.question,
                points=row.hypothetical_document,
                answer=answer,
            )
            if not include_answer:
                # Keep everything up to and including the last header marker.
                prompt = prompt.rsplit(header_end, maxsplit=1)[0] + header_end
            f.write(json.dumps({'input': prompt}) + '\n')
    

In [21]:
# Training examples: every non-2024 row, answer included.
format_data(
    df=df_answered_others,
    output_path=output_dir_path / "hansard_answered_questions_llama3_formatted_train.jsonl",
    template=llama3_prompt_template_with_points,
    include_answer=True,
)
# Test examples with the reference answer included.
format_data(
    df=df_answered_2024,
    output_path=output_dir_path / "hansard_answered_questions_llama3_formatted_test.jsonl",
    template=llama3_prompt_template_with_points,
    include_answer=True,
)
# Same test rows, cut off after the assistant header so that no answer is included.
format_data(
    df=df_answered_2024,
    output_path=output_dir_path / "hansard_answered_questions_llama3_formatted_test_no_response.jsonl",
    template=llama3_prompt_template_with_points,
    include_answer=False,
)


  0%|          | 0/1785 [00:00<?, ?it/s]

  0%|          | 0/304 [00:00<?, ?it/s]

  0%|          | 0/304 [00:00<?, ?it/s]