In [16]:
#!pip install openai
#!pip install python-dotenv
#!pip install tenacity

#### Instructions to save openai API key

1. Create a .env file in the same folder as your script (save a .txt file in the .env format).
2. The .env file should contain the following:
- OPENAI_API_KEY = "exampleapikey123"

In [174]:
import os
import openai
from dotenv import load_dotenv, find_dotenv
import pandas as pd
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff (to overcome rate limit)

import json

In [132]:
_ = load_dotenv(find_dotenv()) # read local .env file

openai.api_key  = os.environ['OPENAI_API_KEY']

In [44]:
speeches = pd.read_csv("~/Documents/Work/dsaid-hackathon23-illuminati/data/preprocessing/website/MTI/MTI_speeches_PQs_scraped.csv")

In [157]:
#Exponential backoff decorator
@retry(wait=wait_random_exponential(min=40, max=80), stop=stop_after_attempt(10))
def completion_with_backoff(**kwargs):
    return openai.Completion.create(**kwargs)

In [158]:
#Helper function to get returns from gpt
def get_message_completion(messages, 
                     model="gpt-3.5-turbo", 
                     temperature=0, 
                     max_tokens=500,
                     num_pairings=100
                    ):

    response = completion_with_backoff(
        model=model,
        messages=messages,
        temperature=temperature, # this is the degree of randomness of the model's output
        max_tokens=max_tokens, # the maximum number of tokens the model can ouptut 
    )
    return response.choices[0].message["content"]

### Step 1. Summarise relevant lines

This is to create a more focused corpus out of documents that contain other unrelated info. 

Note: This section will take some time as we have deliberately slowed down the process to overcome Openai's rate limit.

In [159]:
extracts = {}

In [160]:
for i in range(len(speeches['Content'])):
    
    try:
        #Set messages for chatgpt
        messages =  [  
        {'role':'system', 
         'content':f"""Please refer to the speech provided: {speeches['Content'][i].strip()}"""},    
        {'role':'user', 
         'content':f"""Summarise the most relevant lines related to solar energy or solar panels. \
         Make sure to retain key statistics or figures."""},  
        ] 
        
        #Obtain gpt response
        response = get_completion_from_messages(messages, max_tokens = 500)
        
        #Save to dictionary
        extracts[speeches['Title'][i]] = response
    except Exception as e:
        print(f"'{speeches['Title'][i]}' was not summarised. Error message: {e}'")

'Speech by Minister Gan Kim Yong at the Joint Segment on the Singapore Green Plan Committee of Supply Debate 2022' was not summarised. Error message: Rate limit reached for default-gpt-3.5-turbo in organization org-SHvsJN8OOskXue0vz7OnYqdY on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method.'
'Speech by MOS Low Yen Ling at the Opening Ceremony of Sembcorp Tuas Solar Farm' was not summarised. Error message: Rate limit reached for default-gpt-3.5-turbo in organization org-SHvsJN8OOskXue0vz7OnYqdY on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/ac

'Oral reply to PQ on energy and electricity' was not summarised. Error message: Rate limit reached for default-gpt-3.5-turbo in organization org-SHvsJN8OOskXue0vz7OnYqdY on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method.'
'Written reply to PQs on Sun Cable' was not summarised. Error message: Rate limit reached for default-gpt-3.5-turbo in organization org-SHvsJN8OOskXue0vz7OnYqdY on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method.'


In [161]:
len(extracts)

11

In [220]:
extracts_df = pd.DataFrame(
    [(k, val) for k, val in extracts.items()], 
    columns=['Title', 'Summary']
)

extracts_df.to_csv("MTI_speeches_PQs_summaries_sample.csv")

### Step 2. Generate question and answer pairings from summaries

Note: This section will take some time as we have deliberately slowed down the process to overcome Openai's rate limit.

In [213]:
qna = {}
num_qna = 10 #Set number of Q&As

In [214]:
for t, s in extracts.items():
    
    try:
        #Set messages for chatgpt
        messages =  [  
        {'role':'system', 
         'content':f"""Please refer to the content provided: {s.strip()}"""},    
        {'role':'user', 
         'content':f"""# Create a JSON of {num_qna} pairs of questions and answers based on this summary. \
         The key value pairs should be the question and answer."""},  
        ] 
        
        #Obtain gpt response
        response = get_completion_from_messages(messages, max_tokens = 1000)
        
        #Convert response to JSON and then dictionary
        qna_dict = dict(json.loads(response))
        
        #Create list of tuples based on q&a pairings and save to dictionary
        qna[t] = [(qna_dict[k]['question'], qna_dict[k]['answer']) for k in qna_dict.keys()] 
        
    except Exception as e:
        print(f"Did not generate Q&As for'{t}'. Error message: {e}'")
        print(response)

Did not generate Q&As for'Speech by 2M Tan See Leng at the Asia Clean Energy Summit'. Error message: The server had an error while processing your request. Sorry about that!'
{
  "1": {
    "question": "What is Singapore's most viable form of renewable energy?",
    "answer": "Solar energy is Singapore's most viable form of renewable energy."
  },
  "2": {
    "question": "What are the constraints for harnessing solar energy in Singapore?",
    "answer": "Singapore has heavy cloud cover and limited land available to harness solar energy."
  },
  "3": {
    "question": "What measures has Singapore put in place to reduce the power sector's carbon footprint?",
    "answer": "Singapore has put in place measures to enhance the energy efficiency of existing power generation plants."
  },
  "4": {
    "question": "What is the primary source of electricity generation in Singapore?",
    "answer": "Currently, about 95% of Singapore's electricity is generated using natural gas, the cleanest foss

In [215]:
len(qna)

10

In [217]:
#Create df and save to csv
qna_df = pd.DataFrame(qna)
qna_melt = pd.melt(qna_df, value_vars = [k for k in qna.keys()], var_name = "Title")
qna_melt['Questions'], qna_melt['answers'] = zip(*qna_melt['value'])
qna_melt = qna_melt.drop('value', axis = 1)


qna_melt.to_csv("MTI_speeches_PQ_qna_sample.csv")

### To-do / issues:

1. Quite a bit of duplicate content. Should we remove those? Possible to remove using Chatgpt/langchain?
2. Rate limit of 3/min and max tokens (4097) is very low and limiting. Could try to batch requests? Exponential backoff method didn't quite work.
3. Other errors as above (e.g. server error)