In [36]:
import json
import pandas as pd
from tqdm.auto import tqdm

## Using only points

In [4]:
with open('data/written_question_answers_processed.jsonl','r') as f:
    lines = f.readlines()
    
data = [json.loads(l) for l in lines]
df = pd.DataFrame(data)

df_answered = df[df['status'] == 'answered']
df_answered['date'] = pd.to_datetime(df['filename'].apply(lambda x: x.split('_')[-1]))
df_answered_2024 = df_answered[df_answered.date.dt.year == 2024]
df_answered_before_2024 = df_answered[df_answered.date.dt.year < 2024]    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_answered['date'] = pd.to_datetime(df['filename'].apply(lambda x: x.split('_')[-1]))


## Using hypothetical documents

In [46]:
llama3_prompt_template_with_points = """<|start_header_id|>system<|end_header_id|>You are a public servant. Your task is to reply to a parliamentary question given a list of supporting points.<|eot_id|><|start_header_id|>user<|end_header_id|>
Question:{question}

Supporting points: {points}

<|eot_id|><|start_header_id|>assistant<|end_header_id|>{answer}<|eot_id|>"""

In [29]:
sample = df_answered_2024.iloc[0]
prompt_template.format(question=sample.question, points=sample.points, answer=re.sub("([Mr|Mrs|Mdm|Ms|Dr].*?:)", '', sample.answer, count=1).strip())

'<|start_header_id|>system<|end_header_id|>\nYou are a public servant. Your task is to reply to a parliamentary question given a list of supporting points.<|eot_id|><|start_header_id|>user<|end_header_id|>\nQuestion:Mr Leong Mun Wai asked the Prime Minister (a) since the conclusion of the 2023 Presidential Election, how many non-voters have applied to restore their names to the Registers of Electors; and (b) how many of these applications are successful.\n\nSupporting points: 101,464 non-voters from the 2023 Presidential Election applied to restore their names to the Registers of Electors as of end January 2024.\n\nAll the applications were successful.\n\n<|eot_id|><|start_header_id|>assistant<|end_header_id|>As of end January 2024, 101,464 non-voters from the 2023 Presidential Election applied to restore their names to the Registers of Electors. All the applications were successful.<|eot_id|>\n'

In [44]:
def format_data(df, output_path, template, include_answer=True):
    with open(output_path, 'w') as f:
        for _, row in tqdm(df.iterrows(), total=len(df)):
            formatted_answer = template.format(question=row.question, points=row.points, answer=re.sub("([Mr|Mrs|Mdm|Ms|Dr].*?:)", '', row.answer, count=1).strip())                
            if not include_answer:
                formatted_answer = formatted_answer.rsplit('<|end_header_id|>', maxsplit=1)[0] + '<|end_header_id|>'
            f.write(json.dumps({'input': formatted_answer}) + '\n')
    

In [45]:
format_data(df_answered_2024, 'data/reply_by_points_llama3_formatted_test.jsonl', llama3_prompt_template_with_points, True)
format_data(df_answered_2024, 'data/reply_by_points_llama3_formatted_test_no_response.jsonl', llama3_prompt_template_with_points, False)
format_data(df_answered_before_2024, 'data/reply_by_points_llama3_formatted_train.jsonl', llama3_prompt_template_with_points, True)

  0%|          | 0/304 [00:00<?, ?it/s]

  0%|          | 0/304 [00:00<?, ?it/s]

  0%|          | 0/1785 [00:00<?, ?it/s]

In [51]:
def format_data_alpaca(df, output_path, template):
    with open(output_path, 'w') as f:
        for _, row in tqdm(df.iterrows(), total=len(df)):
            formatted_answer = template.format(question=row.question, points=row.points, answer=re.sub("([Mr|Mrs|Mdm|Ms|Dr].*?:)", '', row.answer, count=1).strip())                
            f.write(json.dumps(
                {"instruction": "You are a public servant. Your task is to reply to a parliamentary question given a list of supporting points.", 
                 "input": "Question: " + row.question + "\n\nSupporting points: " + row.points, 
                 "output": re.sub("([Mr|Mrs|Mdm|Ms|Dr].*?:)", '', row.answer, count=1).strip()}            
            ) + '\n')

In [52]:
format_data_alpaca(df_answered_2024, 'data/reply_by_points_alpaca_formatted_test.jsonl', llama3_prompt_template_with_points)
# format_data_alpaca(df_answered_2024, 'data/reply_by_points_alpaca_formatted_test_no_response.jsonl', llama3_prompt_template_with_points, False)
format_data_alpaca(df_answered_before_2024, 'data/reply_by_points_alpaca_formatted_train.jsonl', llama3_prompt_template_with_points)

  0%|          | 0/304 [00:00<?, ?it/s]

  0%|          | 0/1785 [00:00<?, ?it/s]

In [53]:
from datasets import load_dataset                   

squad = load_dataset("squad", split="train")        
squad.to_json("squad.json")            

data_files = {"train": "squad.json"}
re_squad = load_dataset("json", data_files=data_files, split="train")

Downloading readme:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Creating json from Arrow format:   0%|          | 0/88 [00:00<?, ?ba/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [55]:
alpaca = load_dataset("json", data_files={"train": "data/reply_by_points_alpaca_formatted_train.jsonl"}, split="train")

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [56]:
alpaca

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 1785
})