In [1]:
import json

In [2]:
with open('docs/sitting_2024-05-08.json', 'r') as f:
    data = json.load(f)

In [12]:
wa_list = [d for d in data['takesSectionVOList'] if d['sectionType'].lower().startswith('wa')]

In [None]:
wa_list

In [15]:
wa_list[0]['content'].split('</p><p><strong>')

['<p>14 <strong>Ms See Jinli Jean</strong> asked the Minister for Education (a) for the past three years, what is the breakdown by sector for the (i) percentage of workforce utilising the Skills Development Fund (SDF) and (ii) average ratio of the SDF utilised to the Skills Development Levy (SDL) collected, for full-time, part-time and casual workers respectively; and (b) how does the Ministry expect the average ratio of the utilisation of the SDF to the SDL for the different sectors and workers to change in view of broad-based technology-intensification.</p><p>15 <strong>Ms See Jinli Jean</strong> asked the Minister for Education in respect of Skills Development Fund usage, whether the Ministry will consider (i) enhancing course fee subsidy level for individual sponsored learners and (ii) lowering the minimum age for them to qualify for course fee subsidy.',
 'Mr Chan Chun Sing</strong>:&nbsp;The Skills Development Fund (SDF) is a consolidated fund that supports local workforce develo

In [41]:
contents = wa_list[0]['content']
from bs4 import BeautifulSoup
soup_list = [BeautifulSoup(c, "html.parser") for c in contents.split('</p><p><strong>')]

In [42]:
[s.text for s in soup_list]

['14 Ms See Jinli Jean asked the Minister for Education (a) for the past three years, what is the breakdown by sector for the (i) percentage of workforce utilising the Skills Development Fund (SDF) and (ii) average ratio of the SDF utilised to the Skills Development Levy (SDL) collected, for full-time, part-time and casual workers respectively; and (b) how does the Ministry expect the average ratio of the utilisation of the SDF to the SDL for the different sectors and workers to change in view of broad-based technology-intensification.15 Ms See Jinli Jean asked the Minister for Education in respect of Skills Development Fund usage, whether the Ministry will consider (i) enhancing course fee subsidy level for individual sponsored learners and (ii) lowering the minimum age for them to qualify for course fee subsidy.',
 "Mr Chan Chun Sing:\xa0The Skills Development Fund (SDF) is a consolidated fund that supports local workforce development. It is made up of the Skills Development Levy (SD

In [23]:
import re

In [43]:
contents.split('<p>')

['',
 '14 <strong>Ms See Jinli Jean</strong> asked the Minister for Education (a) for the past three years, what is the breakdown by sector for the (i) percentage of workforce utilising the Skills Development Fund (SDF) and (ii) average ratio of the SDF utilised to the Skills Development Levy (SDL) collected, for full-time, part-time and casual workers respectively; and (b) how does the Ministry expect the average ratio of the utilisation of the SDF to the SDL for the different sectors and workers to change in view of broad-based technology-intensification.</p>',
 '15 <strong>Ms See Jinli Jean</strong> asked the Minister for Education in respect of Skills Development Fund usage, whether the Ministry will consider (i) enhancing course fee subsidy level for individual sponsored learners and (ii) lowering the minimum age for them to qualify for course fee subsidy.</p>',
 '<strong>Mr Chan Chun Sing</strong>:&nbsp;The Skills Development Fund (SDF) is a consolidated fund that supports local 

In [30]:
re.match(r"<p.*>(.*)</p>", wa_list[0]['content'])

<re.Match object; span=(0, 3270), match='<p>14 <strong>Ms See Jinli Jean</strong> asked th>

In [40]:
parse_content(wa_list[0]['content'])

UnboundLocalError: local variable 'html_text' referenced before assignment

In [205]:
from typing import List
from utils.parse_hansard import HansardInfo, convert_lang, rm_em_italics, rm_strong, preprocess_htmlFullContent
import logging
from pathlib import Path

logger = logging.getLogger('extract-speech')


def convert_to_question_answer_pairs(input_path: str):
    """For 2020 hansard format, where the text is in field takesSectionVOList

    Args:
        data (dict): hansard json read as dictionary
        hansard_info_list (List(HansardInfo)): list of HansardInfo extracted to append to
        filename (str): Name of the hansard file
    Returns:
        hansard_info_list
    """
    
    filename = Path(input_path).stem
    
    
    with open(input_path, 'r') as f:
        data = json.load(f)
    
    all_dialogues = []
    
    for idx, article in enumerate(data['takesSectionVOList']):
        title = article['title']
        subtitle = article['subTitle']
        section = article['sectionType']
        if not section.lower().startswith('wa'):
            continue
            
        # print the before and after processing
        logger.debug("Filename: %s", filename)
        logger.debug("%s\nORIGINAL %s:\n%s\nREMOVED %s:\n%s", title, idx, article['content'], idx , convert_lang(rm_em_italics(rm_strong(preprocess_htmlFullContent(article['content'])))))
        content = BeautifulSoup(convert_lang(rm_em_italics(rm_strong(preprocess_htmlFullContent(article['content'])))), "html.parser")

        # all ps in current article
        iter_content = content.find_all(['p'])
        article_dialogues = []
        
        # tracker for current speaker
        cur_speaker_content = ""
        cur_speaker = ""

        # while iter_content:
        for ic in iter_content:
            speakers = ic.find_all(['strong'])
            if speakers:
                speaker = speakers[0].text #assuming one speaker per para
                content_no_speaker = ic.text[ic.text.index(speaker):]
            
            else:
                speaker = ""
                content_no_speaker = ic.text
                
            
            
            if speaker and speaker != cur_speaker:
                article_dialogues.append({'speaker': cur_speaker, 'content': cur_speaker_content, 'title': title, 'subtitle': subtitle, 'section': section, 'filename': filename})
                
                cur_speaker = speaker
                cur_speaker_content = content_no_speaker
            else:                
                cur_speaker_content += "\n" + content_no_speaker
                
        # add the last one
        article_dialogues.append({'speaker': cur_speaker, 'content': content_no_speaker, 'title': title, 'subtitle': subtitle, 'section': section, 'filename': filename})
    
        all_dialogues.append(article_dialogues[1:])
        
    consolidated_dialogues = []
    for dialogue in all_dialogues:
        if len(dialogue) != 2:
            print('Number of speakers is not 2', 'skipping')
            # print(dialogue)
            continue

        consolidated_dialogues.append({'title': dialogue[0]['title'], 
                            'subtitle':  dialogue[0]['subtitle'], 
                            'question_speaker': dialogue[0]['speaker'], 
                            'answer_speaker': dialogue[1]['speaker'], 
                            'question': dialogue[0]['content'], 
                            'answer': dialogue[1]['content'],
                            'filename': dialogue[0]['filename']
                            })
        
    return consolidated_dialogues                              

In [177]:
consolidated_dialogues = convert_to_question_answer_pairs("docs/sitting_2024-05-08.json")

Number of speakers is not 2 skipping
Number of speakers is not 2 skipping
Number of speakers is not 2 skipping


In [None]:
consolidated_dialogues[:5]

In [206]:
from pathlib import Path
input_path = Path('docs/')

In [207]:
hansard_2020_paths = input_path.glob('sitting_202*.json')

In [208]:
consolidated_dialogues = []
for fp in hansard_2020_paths:
    consolidated_dialogues += convert_to_question_answer_pairs(fp)

Number of speakers is not 2 skipping
Number of speakers is not 2 skipping
Number of speakers is not 2 skipping
Number of speakers is not 2 skipping
Number of speakers is not 2 skipping
Number of speakers is not 2 skipping
Number of speakers is not 2 skipping
Number of speakers is not 2 skipping
Number of speakers is not 2 skipping
Number of speakers is not 2 skipping
Number of speakers is not 2 skipping
Number of speakers is not 2 skipping
Number of speakers is not 2 skipping
Number of speakers is not 2 skipping
Number of speakers is not 2 skipping
Number of speakers is not 2 skipping
Number of speakers is not 2 skipping
Number of speakers is not 2 skipping
Number of speakers is not 2 skipping
Number of speakers is not 2 skipping
Number of speakers is not 2 skipping
Number of speakers is not 2 skipping
Number of speakers is not 2 skipping
Number of speakers is not 2 skipping
Number of speakers is not 2 skipping
Number of speakers is not 2 skipping
Number of speakers is not 2 skipping
N

In [211]:
with open('./written_question_answers.jsonl', 'w') as f:
    for entry in consolidated_dialogues:
          f.write(json.dumps(entry)+'\n')

In [182]:
import pandas as pd

In [185]:
df = pd.DataFrame(consolidated_dialogues)

In [187]:
!pip install tiktoken --upgrade

Defaulting to user installation because normal site-packages is not writeable
Collecting tiktoken
  Downloading tiktoken-0.7.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: tiktoken
  Attempting uninstall: tiktoken
    Found existing installation: tiktoken 0.6.0
    Uninstalling tiktoken-0.6.0:
      Successfully uninstalled tiktoken-0.6.0
Successfully installed tiktoken-0.7.0


In [188]:
import tiktoken
encoder = tiktoken.encoding_for_model("gpt-4o")

In [190]:
df['answer_length'] = df['answer'].apply(lambda x: len(encoder.encode(x)))

In [191]:
df['answer_length'].describe()

count    7011.000000
mean       64.163457
std        37.111016
min         0.000000
25%        37.000000
50%        60.000000
75%        87.000000
max       271.000000
Name: answer_length, dtype: float64

In [204]:
for _, row in df.sample(20).iterrows():
    print('Q:\t',row['question'])
    print('\n')
    print('A:\t', row['answer'])
    print('-------------------------')

Q:	 Mr Christopher de Souza asked the Minister for Health how is Singapore collaborating with the global scientific community to create a COVID-19 vaccine.


A:	 Apart from vaccine development, Singapore has also been involved in the research and development of therapeutics to improve the treatment of COVID-19 patients. For example, the National Centre for Infectious Diseases (NCID) and public hospitals are participating in multi-national clinical trials involving the experimental drug, Remdesivir, which has shown to improve the recovery of COVID-19 patients in some studies.
-------------------------
Q:	 Mr Chong Kee Hiong asked the Minister for Manpower (a) what is the progress of the Tripartite Advisory on reasonable accommodations for persons with disabilities; (b) what is the target date for the advisory to be ready; (c) whether the Ministry will consider including the advisory's guidelines as part of the Workplace Fairness Legislation; and (d) how will the Ministry measure whether