In [172]:
"""Experimenting implementing RAG system with LangChain"""
import re

import dotenv

import pandas as pd
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import PromptTemplate
from langchain_core.documents.base import Document


# Load OpenAI API key from .e|nv file
dotenv.load_dotenv()

True

In [40]:
VECTOR_DATABASE_DIR = "/home/tomw/unifi-pdf-llm/dev/chroma"
"""Directory containing the chroma vector database for each PDF file."""

AMKEY_STANDARD_METRIC_MAPPING_FILE = "/home/tomw/unifi-pdf-llm/data/AMKEY_GoldenStandard.csv"
"""File mapping AMKEY to the standard description of the metric."""

AMKEY_CLIENT_METRIC_MAPPING_FILE = "/home/tomw/unifi-pdf-llm/data/ActivityMetricsSynonyms.csv"
"""File mapping AMKEY to the client description of the metric, if different from the standard description."""

# Load the AMKEY to standard metric mapping
amkey_standard_metric_mapping = pd.read_csv(AMKEY_STANDARD_METRIC_MAPPING_FILE, index_col="AMKEY")

# Load the AMKEY to client metric mapping (index should be AMKEY and Group)
amkey_client_metric_mapping = pd.read_csv(AMKEY_CLIENT_METRIC_MAPPING_FILE, index_col=["AMKEY", "Group"])

In [41]:
COMPANY = "Sasol"
"""Name of the company to be queried. Must match the name in the PDF file."""

PDF_PATH = "/home/tomw/unifi-pdf-llm/data/train/SASOL Sustainability Report 2023 20-09_0.pdf"
"""Path to the PDF file to be queried."""

PDF_FILENAME = PDF_PATH.split("/")[-1]
"""Filename of the PDF file to be queried."""

'Filename of the PDF file to be queried.'

## Indexing

In [42]:
# Load - load the PDF file and split it into pages
loader = PyPDFLoader(PDF_PATH)
pages = loader.load_and_split()

print(f'Number of pages: {len(pages)}')

Number of pages: 109


In [43]:
# Split - create a list of text splits from the pages
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500, chunk_overlap=100, add_start_index=True
)
all_splits = text_splitter.split_documents(pages)

print(f'Number of splits: {len(all_splits)}')

Number of splits: 778


In [44]:
# Store - embed and store the splits in a vector database
vectorstore = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings())

## Retrieval and Generation

In [115]:
AMKEY = 506
"""Activity metric key."""

def get_client_metric(amkey: int, group: str) -> str | None:
    """
    Return the 'ClientMetric' for the given AMKEY and company, if it exists.
    """
    try:
        client_metric = amkey_client_metric_mapping.loc[amkey, group]['ClientMetric']
    except KeyError:
        client_metric = None

    return client_metric

def get_standard_metric(amkey: int) -> str:
    """
    Return the 'StandardMetric' for the given AMKEY.

    Raises
    ------
    ValueError
        If the AMKEY is not found in the standard metric mapping.
    """
    try:
        metric = amkey_standard_metric_mapping.loc[amkey]['ActivityMetric']
    except KeyError:
        raise ValueError(f"AMKEY {amkey} not found in the standard metric mapping.")

    return metric


def get_metric_description(amkey: int, group: str) -> str:
    """
    Return the metric description for the given AMKEY and company.
    """
    client_metric = get_client_metric(amkey, group)
    if client_metric is not None:
        metric_description = client_metric
    else:
        metric_description = get_standard_metric(amkey)

    return metric_description

In [70]:
vector_db_query = f"What is the {get_metric_description(AMKEY, COMPANY)}"

retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})

retrieved_docs = retriever.invoke(vector_db_query)

retrieved_docs[0].page_content

"Industry and Competition's B-BBEE compliance targets for preferential procurement spend \nwith empowering suppliers, spending with 51% black-owned businesses as well as the spend \nwith 30% black women-owned businesses. In the almost two decades in which we have operated in Mozambique, we have driven \nsocio-economic development through our investments and operations. We are committed \nto further developing and using local Mozambican suppliers and labour and have partnered"

In [136]:
get_metric_description(AMKEY, COMPANY)

'Board meeting attendance rate'

In [135]:
# Generate - generate a response using the retrieved splits as context
TEMPLATE = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say 'None', don't try to make up an answer.
Only return the relevant number, without any additional text.

{context}

Question: {question}

Helpful Answer:"""


def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

custom_rag_prompt = PromptTemplate.from_template(TEMPLATE)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | custom_rag_prompt
    | llm
    | StrOutputParser()
)

answer = rag_chain.invoke(f"What was the {get_metric_description(AMKEY, COMPANY)}. Please also provide the unit")

print(answer)

100%


In [54]:
# TODO: Parse the answer from the model

## Validating

In [110]:
TRAIN_DF = pd.read_csv("/home/tomw/unifi-pdf-llm/data/Train.csv")

COMPANY = "Picknpay"
YEAR = 2021

PDF_PATH = "/home/tomw/unifi-pdf-llm/data/test/picknpay-sustainable-living-report-2021.pdf"
"""Path to the PDF file to be queried."""

PDF_FILENAME = PDF_PATH.split("/")[-1]
"""Filename of the PDF file to be queried."""

# Restrict train_df to 'ID' of the form *_<COMPANY>
train_df = TRAIN_DF[TRAIN_DF['ID'].str.contains(f"_{COMPANY}")]

# Restrict to the columns 'ID' and <YEAR>_Value
train_df = train_df[['ID', f"{YEAR}_Value"]]

# Remove rows with NaN values
train_df.dropna(inplace=True)

train_df.reset_index(drop=True, inplace=True)

print(train_df.head())

               ID    2021_Value
0   46_X_Picknpay  4500000000.0
1   52_X_Picknpay         100.0
2  122_X_Picknpay           0.0
3  128_X_Picknpay      123421.0
4  129_X_Picknpay      766174.0


In [216]:
# Index

# Load - load the PDF file and split it into pages
loader = PyPDFLoader(PDF_PATH)
pages = loader.load_and_split()

print(f'Number of pages: {len(pages)}')

# Split - create a list of text splits from the pages
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500, chunk_overlap=100, add_start_index=True
)
all_splits = text_splitter.split_documents(pages)

print(f'Number of splits: {len(all_splits)}')

# Store - embed and store the splits in a vector database
vectorstore.delete_collection()
vectorstore = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings())

Number of pages: 62
Number of splits: 441


In [217]:
type(vectorstore)

langchain_community.vectorstores.chroma.Chroma

In [218]:
vectorstore._collection.count()

441

In [219]:
all_splits

[Document(page_content='PICK N PAY SUSTAINABLE LIVING REPORT 2021\nSUSTAINABLE LIVING \nREPORT 2021picknpay.comPick n Pay and Boxer partnered with the Feed the \nNation Foundation to distribute food hampers to \nchild -headed families across South Africa .', metadata={'source': '/home/tomw/unifi-pdf-llm/data/test/picknpay-sustainable-living-report-2021.pdf', 'page': 0, 'start_index': 0}),
 Document(page_content='01\nABOUT US02\nCHAIRMAN’S MESSAGE03\nCEO’S MESSAGE04\nPEOPLE N PLANET05\nPARTNERING TO SHIFT  \nTHE FOOD SYSTEM06\nUSING LESS,  \nSHARING MORE07\nHELPING TO EMPOWER OUR \nEMPLOYEES AND COMMUNITIES 08\nSTRATEGY, MATERIALITY  \nAND GOVERNANCE09\nESG PERFORMANCE DATA\nCONTENTS01\nAbout us 1\n02\nA message from our Chair 3\n03\nMessage from the T ransformation \nDirector 5\n04\nPeople n Planet: work ing together \nfor a s ustainable future 7\n05\nPartnering to shi ft the  \nfood system 11\n06\nUsing less, sharing more 25\n07', metadata={'source': '/home/tomw/unifi-pdf-llm/data/tes

In [220]:
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 4})

metric_description = "supplier awards"
print(metric_description)

context = clean_text(format_docs(retriever.invoke(metric_description)))
print(textwrap.fill(context))

supplier awards
Western Cape Entrepreneurship Recognition Awards for her business
which currently employs 56 people most of whom are young women from
the local townshipCelebrating local innovation and entrepreneurship
Every year we hold the Pick n Pay small supplier awards celebrating
local success stories and entrepreneur role models The following six
suppliers were declared overall winners for 2020 1 Nutriseed 2 Curated
Beverages 3 Cre8tive Footwear 4 Greenworld Chemicals based Township
Patterns had developed sufficiently though our supplier development
programme to secure alternative contracts mentorship programme This is
delivered through our Small Supplier Mentorship app which we launched
in November 2019 The peer learning platform is designed to provide
small suppliers with access at no cost to expertise within Pick n Pay
and features tips tools and training to assist with the mentoring
process We currently have more than 150 entrepreneurs enrolled in our
ESD programme who are co

In [233]:
import textwrap


TEMPLATE = """Use the following pieces of context to answer the question at the end.
If there are two answers to the question, please provide both of them.
If you don't know the answer, just say 'None', don't try to make up an answer.
Only return the relevant number, without any additional text.

{context}

Question: {question}

Helpful Answer:"""


def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


def clean_text(text):
    # Remove non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)
    # Strip leading/trailing spaces
    text = text.strip()
    return text


def form_question(metric_description: str) -> str:
    return f"What was the {metric_description}?"


def retrieve_and_generate(amkey: int, company: str, vectorstore: Chroma) -> str:
    """
    Generate a response using the retrieved splits as context.
    """
    retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})
    llm = ChatOpenAI(model_name="gpt-4", temperature=0)
    custom_rag_prompt = PromptTemplate.from_template(TEMPLATE)

    metric_description = get_metric_description(amkey, company)

    # Use loguru and make this a debug message
    print(metric_description)
    context = clean_text(format_docs(retriever.invoke(metric_description)))
    print(textwrap.fill(context))

    rag_chain = (
        {"context": retriever | format_docs | clean_text, "question": form_question}
        | custom_rag_prompt
        | llm
        | StrOutputParser()
    )

    metric_description = get_metric_description(amkey, company)

    answer = rag_chain.invoke(metric_description)

    return answer

In [234]:
retrieve_and_generate(796, COMPANY, vectorstore)

Number of non-executive board members
at enhancing our ability to meet the evolving disclosure expectations
of our key stakeholdersCrossfunctional leadership Executive
sustainability steering committeeExecutive leadership Group
executivesSustainability governance Oversight Operational
implementationBoard accountabilityPick n Pay Stores Limited Board 14
directorsAudit risk and compliance committeeCorporate governance
committeeSocial and ethics committee Crossfunctional working group
Percentage of Board m embers who are deemed Historically Disadvantaged
South Africans HDSAPnP Group 36 40 Percentage of Board m embers who
are w omen PnP Group 36 33 Board m eeting attendance PnP Group 100 100
Social and ethics committee m eeting attendance PnP Group 100 50
Labour Number of c orporate employees PnP Group 55 200 54 900
Percentage of e mployees in South Africa PnP Group 85 83 Percentage of
e mployees in rest of Africa PnP Group 15 17 Number of stores in Rest
of Africa PnP Group 163 154 Number 

'10'

In [235]:
# Loop over the rows of train_df and retrieve and generate for each AMKEY
validate_df = train_df.copy(deep=True)

validate_df['Generated'] = None
validate_df['MetricDescription'] = None

for idx, row in train_df.iterrows():
    amkey = int(row['ID'].split("_")[0])
    print(f"AMKEY: {amkey}")
    metric_description = get_metric_description(amkey, COMPANY)
    answer = retrieve_and_generate(amkey, COMPANY, vectorstore)

    # If answer can be converted to a float, do it
    try:
        answer = float(answer)
    except ValueError:
        pass

    print(metric_description)
    print(answer)

    validate_df.loc[idx, 'Generated'] = answer
    validate_df.loc[idx, 'MetricDescription'] = metric_description


AMKEY: 46
BBBEE procurement spend from Exempt Micro Enterprises (EMEs), Qualifyimg Small Enterprises (QSEs) & Empowering Suppliers that are >51 black owned
jobs are BBBEE compliant and ensure that their raw materials are
locally produced manufactured assembled or packaged Participating
suppliers feature a Pick Local brand stamp on their products We
encourage all kinds of small businesses be it a family business local
shop online business farmer small manufacturer or entrepreneur to get
involved In promoting Pick Local we raise levels of customer awareness
that by buying R48bnR52bn R2bn FY21 FY20 FY19Spend on blackowned
businesses R84bnR77bn R4bn FY21 FY20 FY19 Spend on blackowned SMMEs
R27bn R26bn R830m FY21 FY20 FY19Spend on SMMEs R46bn R45bn R15bn FY21
FY20 FY19Spend on BBBEE compliant businesses R52bn R545bnR51bn FY21
FY20 FY19 Big business has a major role and responsibility to play in
building and developing small businesses Transforming the supply chain
from soil to shelf and cre

In [236]:
validate_df

Unnamed: 0,ID,2021_Value,Generated,MetricDescription
0,46_X_Picknpay,4500000000.0,,BBBEE procurement spend from Exempt Micro Ente...
1,52_X_Picknpay,100.0,100.0,Board meeting attendance rate
2,122_X_Picknpay,0.0,,Fatal injury frequency rate (FIFR)
3,128_X_Picknpay,123421.0,123421.0,GHG Scope 1 emissions
4,129_X_Picknpay,766174.0,766 174,GHG Scope 2 emissions
5,130_X_Picknpay,482615.0,482 615,GHG Scope 3 emissions
6,151_X_Picknpay,2.3,,Lost-time injury frequency rate (LTIFR)
7,156_X_Picknpay,102.0,,"Number of Medical Treatment Cases (MTCs, i.e. ..."
8,216_X_Picknpay,0.0,0.0,Number of environmental incidents with a negat...
9,219_X_Picknpay,0.0,0.0,Number of work-related fatalities
