In [None]:
#!pip install llama-index

In [1]:
from dotenv import load_dotenv, find_dotenv, dotenv_values
import openai
import pandas as pd
import numpy as np

import os
from pathlib import Path
import json

from datetime import datetime, timedelta

In [2]:
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.indices.postprocessor import (
    FixedRecencyPostprocessor,
    EmbeddingRecencyPostprocessor
)
from llama_index.node_parser import SimpleNodeParser
from llama_index.storage.docstore import SimpleDocumentStore
from llama_index.response.notebook_utils import display_response
from llama_index.storage.storage_context import StorageContext

from llama_index import ListIndex

In [3]:
#import custom node processor
from custom_node_processor import CustomSolarPostprocessor 

In [4]:
""" LOAD OPENAI_API_KEY FROM ENV """
# Populate the process environment from the `apikey` env file; usecwd=True starts
# the file search from the current working directory.
# (The result is deliberately not assigned to `_`, which IPython uses for the last output.)
load_dotenv(find_dotenv(filename='apikey', usecwd=True))

# Fail early with an actionable message instead of a bare KeyError('OPENAI_API_KEY').
if not os.environ.get('OPENAI_API_KEY'):
    raise KeyError(
        "OPENAI_API_KEY is not set; add it to the 'apikey' env file "
        "or export it before running this notebook."
    )
openai.api_key = os.environ['OPENAI_API_KEY']

In [None]:
#TO-DO
#load in docs from directory and mapping table
#Set metadata

In [5]:
# Build the test mapping table: one record per source file with its date and category
mapping_columns = ['file_path', 'date', 'category']
mapping_records = [
    {'file_path': 'test_data/2021_PQ.txt', 'date': '2021-11-01', 'category': 'Minister'},
    {'file_path': 'test_data/2022_EMA.txt', 'date': '2022-03-31', 'category': 'EMA'},
    {'file_path': 'test_data/2023_speech.txt', 'date': '2023-02-02', 'category': 'Minister'},
]

test_mapping = pd.DataFrame(mapping_records, columns=mapping_columns)
test_mapping

Unnamed: 0,file_path,date,category
0,test_data/2021_PQ.txt,2021-11-01,Minister
1,test_data/2022_EMA.txt,2022-03-31,EMA
2,test_data/2023_speech.txt,2023-02-02,Minister


In [6]:
# Load documents and add metadata
# Start from the distinct file paths listed in the mapping table.
files_lst = test_mapping['file_path'].unique().tolist()

def get_file_metadata(file_name: str, mapping = None):
    """Get file metadata.

    Looks up `file_name` in the `file_path` column of `mapping` and returns the
    `date` and `category` values of the first matching row.

    Args:
        file_name: File path exactly as it appears in the `file_path` column.
        mapping: DataFrame with `file_path`, `date` and `category` columns.
            When omitted, the notebook-level `test_mapping` is used; it is
            resolved at call time so a re-run of the mapping cell is picked up.

    Returns:
        dict with keys 'date' and 'category'.

    Raises:
        IndexError: if no row of `mapping` has `file_path == file_name`.
    """
    if mapping is None:
        # Resolve lazily: a default bound in the signature would freeze whatever
        # table existed when this cell was last executed.
        mapping = test_mapping

    # Filter once and reuse the result for both fields.
    matches = mapping[mapping['file_path'] == file_name]
    if matches.empty:
        # Same exception type the bare `.values[0]` lookup raises, but with context.
        raise IndexError(f"No metadata row found for file path {file_name!r}")

    return {
        'date': matches['date'].values[0],
        'category': matches['category'].values[0],
    }
    
# Load each file on its own so that every Document it yields is tagged with that
# file's metadata. Pairing documents with mapping rows by position would silently
# mislabel documents if a file produced more than one Document, if the mapping
# table contained a repeated file path, or if the two orders ever diverged.
documents = []
for file_path in files_lst:
    file_documents = SimpleDirectoryReader(
        input_files = [file_path],
    ).load_data()

    file_metadata = get_file_metadata(file_path)
    for doc in file_documents:
        # Separate dict per document so editing one document's metadata later
        # cannot leak into another.
        doc.extra_info = dict(file_metadata)

    documents.extend(file_documents)
    
# Service context created from defaults, with chunk_size set to 512
service_context = ServiceContext.from_defaults(chunk_size=512)

# Parse the loaded documents into nodes using the service context's node parser
node_parser = service_context.node_parser
nodes = node_parser.get_nodes_from_documents(documents)

# Put the nodes into a document store and wrap that store in a storage context
docstore = SimpleDocumentStore()
docstore.add_documents(nodes)
storage_context = StorageContext.from_defaults(docstore=docstore)

#print(documents[2].get_text())

In [7]:
# Build the vector store index over the parsed nodes, using the prepared storage context
index = VectorStoreIndex(
    nodes,
    storage_context=storage_context,
)

In [8]:
# Custom postprocessor under test, configured with top_k_recency=1 and top_k_min=2
node_postprocessor = CustomSolarPostprocessor(
    service_context=service_context,
    top_k_recency=1,
    top_k_min=2,
)

### High-level API

#### Test 1. Non-stats query - prioritise Minister

Relevant files:
1. 2022_EMA: Inserted "Singapore is on track to achieving our solar panel deployment target of at least 1.5 gigawatt-peak (GWp) by 2030. Solar installed capacity increased over eight-fold from 2015 to over 500 megawatt-peak (MWp) in the second quarter of 2021." --> test modification: the target in the inserted text was changed from 2 gigawatt-peak to 1.5 gigawatt-peak
2. 2021_PQ: "Singapore is on track to achieving our solar panel deployment target of at least 2 gigawatt-peak (GWp) by 2030. Solar installed capacity increased over eight-fold from 2015 to over 500 megawatt-peak (MWp) in the second quarter of 2021." (original text)

Desired output:
This is not a stats query, hence even though the EMA file is more recent, we want the Minister's answer, i.e. at least 2 gigawatt-peak (GWp) by 2030.

In [9]:
# Test 1: retrieve the 3 most similar nodes, then apply the custom node postprocessor
query_engine = index.as_query_engine(similarity_top_k=3, node_postprocessors=[node_postprocessor])
response = query_engine.query("What is the solar panel deployment target by 2030?")
print(response)

Prediction on stats status: False

The solar panel deployment target by 2030 is at least 2 gigawatt-peak (GWp).


In [11]:
# Test 1 baseline: same query, but with only the fixed recency filter (top_k=2)
node_postprocessor_recency = FixedRecencyPostprocessor(
    service_context=service_context,
    top_k=2,
)

query_engine = index.as_query_engine(similarity_top_k=3, node_postprocessors=[node_postprocessor_recency])
response = query_engine.query("What is the solar panel deployment target by 2030?")
print(response)


The solar panel deployment target by 2030 is at least 1.5 gigawatt-peak (GWp).


#### Test 2. Stats query - take most recent file (should be same result as recency filter)

Relevant files:
1. 2023_speech: "After achieving our solar target of 350 megawatt-peak or MWp in 2020, we have since doubled our capacity to more than 700 MWp of solar installed today."
2. 2022_EMA: "Grid-connected installed solar capacity grew significantly from 125.0 MWp in 2016 to 670.0 MWp as at end Q1 2022."
3. 2021_PQ: "Solar installed capacity increased over eight-fold from 2015 to over 500 megawatt-peak (MWp) in the second quarter of 2021."

Desired output:
This is a stats query, hence we want the most recent document, i.e. more than 700 MWp.

In [12]:
# Test 2: retrieve the 3 most similar nodes, then apply the custom node postprocessor
query_engine = index.as_query_engine(similarity_top_k=3, node_postprocessors=[node_postprocessor])
response = query_engine.query("What is Singapore's solar capacity today?")
print(response)

Prediction on stats status: True

Singapore's solar capacity today is more than 700 MWp.


In [13]:
# Test 2 baseline: same query, but with only the fixed recency filter (top_k=1)
node_postprocessor_recency = FixedRecencyPostprocessor(
    service_context=service_context,
    top_k=1,
)

query_engine = index.as_query_engine(similarity_top_k=3, node_postprocessors=[node_postprocessor_recency])
response = query_engine.query("What is Singapore's solar capacity today?")
print(response)


Singapore's solar capacity today is more than 700 MWp.


### Low-level API - could use this to inspect nodes but otherwise could do without

In [53]:
# Step 1: run retrieval only (response_mode="no_text") and keep the underlying nodes
query_str = "What is Singapore's solar capacity today?"
query_engine = index.as_query_engine(similarity_top_k=3, response_mode="no_text")
init_response = query_engine.query(query_str)

resp_nodes = []
for scored_node in init_response.source_nodes:
    # Each source node wraps the node in its `.node` attribute
    resp_nodes.append(scored_node.node)

In [54]:
# Step 2: wrap the retrieved nodes in a ListIndex and query it through the custom postprocessor
# NOTE(review): the recorded answer for this path ("over 500 MWp") differs from the
# high-level API result for the same question ("more than 700 MWp") — confirm whether
# the postprocessor behaves as intended when driven through ListIndex.
list_index = ListIndex(resp_nodes)
query_engine = list_index.as_query_engine(node_postprocessors=[node_postprocessor])
response = query_engine.query(query_str)

Prediction on stats status: True


In [55]:
# Inspect the nodes extracted from the retrieval-only query's source_nodes
resp_nodes

[Node(text='\nSingapore is on track to achieving our solar panel deployment target of at least 1.5 gigawatt-peak (GWp) by 2030. Solar installed capacity increased over eight-fold from 2015 to over 500 megawatt-peak (MWp) in the second quarter of 2021.\n\nGrid-connected installed solar capacity grew significantly from 125.0 MWp in 2016 to 670.0 MWp as at end Q1 2022.\n\nThe majority of solar PV capacity as at end Q1 2022 was accounted for by non-residential private sector (61.7% of total installed capacity or 413.1 MWp), followed by town councils and grassroots (28.3% or 189.6 MWp). Installations from public service agencies (6.7% or 45.2 MWp) and residential installations (3.3% or 22.1 MWp) contributed to the remaining solar PV capacity.\n\nThere was a total of 5,455 solar PV installations in Singapore as at end Q1 2022. Town councils and grassroots units accounted for the majority (48.9% or 2,668) of total installations, followed by the residential (34.7% or 1,891 installations) and n

In [56]:
# Show the answer from the most recent query (the ListIndex engine when cells are run in order)
print(response)


Singapore's solar capacity today is over 500 megawatt-peak (MWp).
