In [3]:
import dotenv

# Load OpenAI API key from .env file
dotenv.load_dotenv()

True

In [84]:
PDF_PATH = "../data/train/SASOL Sustainability Report 2023 20-09_0.pdf"

## Indexing

In [85]:
# Load

from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader(PDF_PATH)
pages = loader.load_and_split()

print(f'Number of pages: {len(pages)}')

Number of pages: 109


In [86]:
# Split

from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500, chunk_overlap=100, add_start_index=True
)
all_splits = text_splitter.split_documents(pages)

print(f'Number of splits: {len(all_splits)}')

Number of splits: 778


In [87]:
# Store

from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

vectorstore = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings())

## Retrieval and Generation

In [88]:
# Retrieve

retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})

In [89]:
retrieved_docs = retriever.invoke("What is Number of work-related fatalities")
retrieved_docs[4].page_content

'– Employee  – \n– Service provider  – \nLost Workday Case Rate (LWDCR)  0,13  0,10  0,14  0,11 Limited\n– Employee  0,18  0,12  0,16  0,14 \n– Service provider  0,10  0,08  0,11  0,08 \nENERGY\nLost Work Day Case Rate (LWDCR)  0,13  0,10 \n– Employee  0,19  0,12 \n– Service provider  0,09  0,08 \nCHEMICALS\nLost Work Day Case Rate (LWDCR)  0,13  0,15 \n– Employee  0,10  0,19 \n– Service provider  0,18  0,08 \nCORPORATE CENTRE\nLost Work Day Case Rate (LWDCR)  – \n– Employee  – \n– Service provider  – \nEmployee and service provider fatalities  2  5  2  6 Limited\n– Employee  1  4  1  3\n– Service provider  1  1  1  3 \nENERGY\nEmployee and service provider fatalities  2  4\n– Employee  1  4 \n– Service provider  1  – \nCHEMICALS\nEmployee and service provider fatalities  –  1 \n– Employee  –  – \n– Service provider  –  1 \nCORPORATE CENTRE\nEmployee and service provider fatalities  –  – \n– Employee  –  – \n– Service provider  –  – \nEmployee and service provider fatal injury frequenc

In [90]:
retrieved_docs[0].page_content

'CORPORATE CENTRE\nLost Work Day Case Rate (LWDCR)  – \n– Employee  – \n– Service provider  – \nEmployee and service provider fatalities  2  5  2  6 Limited\n– Employee  1  4  1  3\n– Service provider  1  1  1  3 \nENERGY\nEmployee and service provider fatalities  2  4\n– Employee  1  4 \n– Service provider  1  – \nCHEMICALS\nEmployee and service provider fatalities  –  1 \n– Employee  –  – \n– Service provider  –  1 \nCORPORATE CENTRE\nEmployee and service provider fatalities  –  – \n– Employee  –  –'

In [91]:
# Generate

from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

In [123]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import PromptTemplate

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

TEMPLATE = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say 'None', don't try to make up an answer.
Only return the relevant number, without any additional text.

{context}

Question: {question}

Helpful Answer:"""

custom_rag_prompt = PromptTemplate.from_template(TEMPLATE)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | custom_rag_prompt
    | llm
    | StrOutputParser()
)

answer = rag_chain.invoke("What was the B-BBEE (RSA only) - Black-owned spend. Please also provide the unit")

In [124]:
answer

'41 700 (Rands million)'

In [98]:
if answer == "None":
    result = 0.0
else:
    result = answer.replace(" ", "")

print(float(result))

1725.0
