In [2]:
import os
import getpass


# Azure OpenAI connection settings, exported as environment variables for the
# chat client created in the next cell.
# The API key is asked for interactively so it is never stored in the notebook.
# The prompt is labelled, and skipped when the key is already present in the
# environment, so re-running the cell does not ask for the key again.
if not os.environ.get("AZURE_OPENAI_API_KEY"):
    os.environ["AZURE_OPENAI_API_KEY"] = getpass.getpass("Azure OpenAI API key: ")
os.environ["AZURE_OPENAI_ENDPOINT"] = "https://ai-capdev-oai-eastus-gcc2.openai.azure.com/"
os.environ["AZURE_OPENAI_API_VERSION"] = "2024-05-01-preview"
os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"] = "gpt-4o"

In [3]:
from langchain_core.messages import HumanMessage
from langchain_openai import AzureChatOpenAI

# Build the chat client from the API version and deployment name stored in the
# environment. The API key and endpoint are not passed explicitly here;
# presumably the client picks them up from the AZURE_OPENAI_* environment
# variables set earlier -- confirm against the langchain_openai documentation.
azure_chat_settings = {
    "openai_api_version": os.environ["AZURE_OPENAI_API_VERSION"],
    "azure_deployment": os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"],
}
model = AzureChatOpenAI(**azure_chat_settings)

In [4]:
# Smoke test: send a single translation request to confirm the deployment responds.
translation_request = "Translate this sentence from English to French. I love programming."
message = HumanMessage(content=translation_request)
model.invoke([message])

AIMessage(content="J'adore la programmation.", response_metadata={'token_usage': {'completion_tokens': 5, 'prompt_tokens': 19, 'total_tokens': 24}, 'model_name': 'gpt-4o-2024-05-13', 'system_fingerprint': 'fp_abc28019ad', 'prompt_filter_results': [{'prompt_index': 0, 'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 'safe'}}}], 'finish_reason': 'stop', 'logprobs': None, 'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 'safe'}}}, id='run-19ea6cd5-c52d-44c3-8a86-f5241d217403-0', usage_metadata={'input_tokens': 19, 'output_tokens': 5, 'total_tokens': 24})

In [5]:
def prompt_model(prompt):
    """Send one prompt to the chat model and return the text of its reply.

    Args:
        prompt: The prompt to send. It is passed to ``model.invoke`` as the
            only element of a list.

    Returns:
        The ``content`` attribute of the response returned by ``model.invoke``.
    """
    response = model.invoke([prompt])
    return response.content

In [19]:
# Instruction template sent to the model for every question/answer record.
# It is filled with str.format(question=..., answer=...), so any literal curly
# braces added to this text must be doubled.
# "summary_points" is requested as a list because the batch cell joins it with
# '\n\n'.join(...); a bare string there would be joined character by character.
prompt_template = """Given the following question and answer pair, do the following:
1. Check if the answer addresses the question. If the answer says something like "This question has been asked before, please refer to another answer.", the status will be "not_answered" and the reference_answer will be the extracted answer link or document id.
2. If the answer addresses the question, the status will be "answered".
3. If the answer addresses the question, summarise the answer in succinct and concise point form in relation to the question.

Give your answer as a JSON output with the following keys:

"status": "answered" or "not_answered",
"reference_answer": reference_link_or_id or "",
"summary_points": a list of summarised point strings, or [] if the question is not answered

This is the question:
{question}

And this is the answer:
{answer}
"""

In [20]:
import json
# Load the question/answer records; each non-empty line of the file is parsed
# as one JSON document.
# The encoding is requested explicitly so non-ASCII text decodes the same way
# on every platform (assumes the file is UTF-8 -- confirm against how it was
# written). Blank lines, such as a trailing newline at the end of the file,
# are skipped instead of crashing json.loads.
with open('data/written_question_answers.jsonl', 'r', encoding='utf-8') as f:
    lines = f.readlines()
json_lines = [json.loads(line) for line in lines if line.strip()]

In [21]:
# Peek at the first parsed record to see which fields each entry carries.
json_lines[0]

{'title': 'Size of Job Market for Information and Communications Sector',
 'subtitle': None,
 'question_speaker': 'Mr Gerald Giam Yean Song',
 'answer_speaker': 'Dr Tan See Leng',
 'question': 'Mr Gerald Giam Yean Song asked the Minister for Manpower (a) whether the job market in the information and communications sector is shrinking; and (b) if so, whether the Ministry has correspondingly reduced the number of employment passes and S-passes issued to foreigners seeking jobs in this sector. ',
 'answer': 'In summary, MOM does not and should not micro-manage how the job market functions by directly and arbitrarily adjusting the number of EPs and S Passes issued in response to business fluctuations. Adopting such a protectionist measure would cause significant business uncertainty, and undermine Singapore’s reputation as a transparent, competitive, and reliable location for businesses. It would go against how we have thrived over the decades by being an open economy and a business hub. I

In [22]:
# Render the prompt for the first record as a formatting sanity check.
# The answer slot is filled from the record's 'answer' field (it was previously
# filled with the question text a second time).
test_prompt = prompt_template.format(question=json_lines[0]['question'], answer=json_lines[0]['answer'])

In [24]:
# Run the prompt end-to-end on the second record and display the raw model reply.
sample_record = json_lines[1]
sample_prompt = prompt_template.format(
    question=sample_record['question'],
    answer=sample_record['answer'],
)
output = prompt_model(sample_prompt)
output

'```json\n{\n  "status": "answered",\n  "reference_answer": "",\n  "summary_points": [\n    "An average of 25 officers in the Civil Service utilised unpaid leave for parents with multiple or pre-term babies each year in the past five years."\n  ]\n}\n```'

In [25]:
def clean_json_output(output):
    """Normalise a raw model reply into a compact JSON string.

    The reply is stripped of surrounding whitespace and of a Markdown code
    fence (either ```` ```json ```` or a bare ```` ``` ````), parsed as JSON,
    scrubbed of placeholder strings, and serialised again.

    Args:
        output: Raw text returned by the model.

    Returns:
        A JSON string in which every string value equal (case-insensitively)
        to "unknown", "na" or "null" has been replaced by "". Non-ASCII
        characters are kept as-is. If the text cannot be parsed as JSON, the
        error is logged and the fence-stripped text is returned unchanged.
    """
    # Imported locally so the error path works on a fresh kernel: no cell of
    # the notebook imports ``logging`` at the top level.
    import logging

    cleaned_output = output.strip()

    # Drop the opening fence. The tagged form is tried first so that the
    # "json" language tag is removed together with the backticks.
    for fence in ("```json", "```"):
        if cleaned_output.startswith(fence):
            cleaned_output = cleaned_output[len(fence):]
            break
    if cleaned_output.endswith("```"):
        cleaned_output = cleaned_output[:-3]
    cleaned_output = cleaned_output.strip()

    try:
        json_data = json.loads(cleaned_output)
    except json.JSONDecodeError as e:
        logging.error(f"JSON decoding error: {e}")
        return cleaned_output

    placeholders = ("unknown", "na", "null")

    def clean_json(data):
        """Recursively replace placeholder strings with "" in nested data."""
        if isinstance(data, dict):
            return {key: clean_json(value) for key, value in data.items()}
        if isinstance(data, list):
            return [clean_json(item) for item in data]
        if isinstance(data, str):
            return "" if data.lower() in placeholders else data
        return data

    # ensure_ascii=False keeps non-ASCII text readable instead of \u-escaped.
    return json.dumps(clean_json(json_data), ensure_ascii=False)

In [26]:
# Run the cleaner on the raw model reply held in `output` and display the result.
clean_json_output(output)

'{"status": "answered", "reference_answer": "", "summary_points": ["An average of 25 officers in the Civil Service utilised unpaid leave for parents with multiple or pre-term babies each year in the past five years."]}'

In [34]:
# Imported in this cell so it runs on a fresh kernel: no other cell of the
# notebook imports ``logging``.
import logging

from tqdm.auto import tqdm

# Process only the first three records as a trial run.
sample_lines = json_lines[:3]

# Processed records are kept in memory as well as written to disk.
all_data = []

# NOTE: the output file is opened in append mode, so re-running this cell adds
# the same records to the file again.
with open('data/written_question_answers_processed.jsonl', 'a', encoding='utf-8') as f:
    for jl in tqdm(sample_lines):
        prompt = prompt_template.format(question=jl['question'], answer=jl['answer'])
        json_out = clean_json_output(prompt_model(prompt))
        try:
            json_data = json.loads(json_out)
        except json.JSONDecodeError as e:
            logging.error(f"JSON decoding error: {e}")
            continue

        # The model may return valid JSON that is not an object (e.g. a list);
        # such a reply cannot carry the expected keys, so skip it.
        if not isinstance(json_data, dict):
            logging.error(f"Expected a JSON object but got {type(json_data).__name__}")
            continue

        # "summary_points" may be missing, empty, a single string, or a list.
        # Normalise to a list so a lone string is not joined character by character.
        summary_points = json_data.get("summary_points") or []
        if isinstance(summary_points, str):
            summary_points = [summary_points]
        json_data["points"] = '\n\n'.join(str(point) for point in summary_points)

        # Copy the source record's fields across without overwriting keys the
        # model output already provides.
        for k, v in jl.items():
            json_data.setdefault(k, v)

        all_data.append(json_data)
        f.write(json.dumps(json_data) + '\n')

  0%|          | 0/3 [00:00<?, ?it/s]

7011