In [1]:
#!pip install openai
#!pip install python-dotenv
#!pip install tenacity

#### Instructions to save openai API key

1. Create a .env file in the same folder as your script.
2. The .env file should contain the following:
- OPENAI_API_KEY = "exampleapikey123"

In [2]:
import os
import openai
from dotenv import load_dotenv, find_dotenv
import pandas as pd
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff (to overcome rate limit)

import json
import time

In [3]:
_ = load_dotenv(find_dotenv()) # read local .env file

openai.api_key  = os.environ['OPENAI_API_KEY']

In [4]:
speeches = pd.read_csv("~/Documents/Work/dsaid-hackathon23-illuminati/data/preprocessing/website/MTI/MTI_speeches_PQs_scraped.csv")

In [18]:
#Exponential backoff decorator
@retry(wait=wait_random_exponential(min=10, max=80), stop=stop_after_attempt(10))
def completion_with_backoff(**kwargs):
    return openai.ChatCompletion.create(**kwargs)

In [19]:
#Helper function to get returns from gpt
def get_message_completion(messages, 
                     model="gpt-3.5-turbo", 
                     temperature=0, 
                     max_tokens=500,
                     num_pairings=100
                    ):

    response = completion_with_backoff(
        model=model,
        messages=messages,
        temperature=temperature, # this is the degree of randomness of the model's output
        max_tokens=max_tokens, # the maximum number of tokens the model can ouptut 
    )
    return response.choices[0].message.content

### Step 1. Summarise relevant lines

This is to create a more focused corpus out of documents that contain other unrelated info. 

Note: This section will take some time as we have deliberately slowed down the process to overcome Openai's rate limit.

In [22]:
extracts = {}

In [23]:
for i in range(len(speeches['Content'])):
    
    try:
        #Set messages for chatgpt
        messages =  [  
        {'role':'system', 
         'content':f"""Please refer to the speech provided: {speeches['Content'][i].strip()}"""},    
        {'role':'user', 
         'content':f"""Summarise the most relevant lines related to solar energy or solar panels. \
         Make sure to retain key statistics or figures."""},  
        ] 
        
        #Obtain gpt response
        response = get_message_completion(messages, max_tokens = 500)
        
        #Save to dictionary
        extracts[speeches['Title'][i]] = response
    except Exception as e:
        print(f"'{speeches['Title'][i]}' was not summarised. Error message: {e}'")
    
    #time.sleep(20)

'Speech by Minister Gan Kim Yong at the Joint Segment on the Singapore Green Plan Committee of Supply Debate 2022' was not summarised. Error message: RetryError[<Future at 0x203330a5d90 state=finished raised InvalidRequestError>]'
'Speech by Second Minister for Trade and Industry Tan See Leng at Ministry of Trade and Industry (MTI)'s Committee of Supply Debate 2023' was not summarised. Error message: RetryError[<Future at 0x20333094fa0 state=finished raised InvalidRequestError>]'
'Ministerial Statement by Second Minister for Trade and Industry Dr Tan See Leng for Parliament Sitting on 4 April 2022' was not summarised. Error message: RetryError[<Future at 0x203331c2790 state=finished raised InvalidRequestError>]'


In [25]:
len(extracts)

27

In [26]:
extracts_df = pd.DataFrame(
    [(k, val) for k, val in extracts.items()], 
    columns=['Title', 'Summary']
)

extracts_df.to_csv("MTI_speeches_PQs_summaries_sample.csv")

### Step 2. Generate question and answer pairings from summaries

Note: This section will take some time as we have deliberately slowed down the process to overcome Openai's rate limit.

In [27]:
qna = {}
num_qna = 10 #Set number of Q&As

In [29]:
for t, s in extracts.items():
    
    try:
        #Set messages for chatgpt
        messages =  [  
        {'role':'system', 
         'content':f"""Please refer to the content provided: {s.strip()}"""},    
        {'role':'user', 
         'content':f"""# Create a JSON of {num_qna} pairs of questions and answers based on this summary. \
         The key value pairs should be the question and answer."""},  
        ] 
        
        #Obtain gpt response
        response = get_message_completion(messages, max_tokens = 1000)
        
        #Convert response to JSON and then dictionary
        qna_dict = dict(json.loads(response))
        
        #Create list of tuples based on q&a pairings and save to dictionary
        qna[t] = [(qna_dict[k]['question'], qna_dict[k]['answer']) for k in qna_dict.keys()] 
        
    except Exception as e:
        print(f"Did not generate Q&As for'{t}'. Error message: {e}'")
    
    #time.sleep(20)

In [30]:
len(qna)

27

In [31]:
#Create df and save to csv
qna_df = pd.DataFrame(qna)
qna_melt = pd.melt(qna_df, value_vars = [k for k in qna.keys()], var_name = "Title")
qna_melt['Questions'], qna_melt['answers'] = zip(*qna_melt['value'])
qna_melt = qna_melt.drop('value', axis = 1)


qna_melt.to_csv("MTI_speeches_PQ_qna_sample.csv")

### To-do / issues:

1. Quite a bit of duplicate content. Should we remove those? Possible to remove using Chatgpt/langchain?
2. Rate limit of 3/min and max tokens (4097) is very low and limiting. Tried exponential backoff method + time.sleep but script runs very slowly as a result.
3. InvalidREquestError for 3 documents - max tokens exceeded.