In [89]:
import os
import pprint
from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader
from langchain_core.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain.vectorstores import FAISS

from utils import show_context

# load API keys defined in .env
load_dotenv()

True

In [None]:
file_path = "data/jde-peets-annual-report-2024.pdf"
loader = PyPDFLoader(file_path)
pages = loader.load()

# getting only financial pages because of rate limit on free tier for OpenAIEmbeddings
pages = pages[27:31]

print(f"Grab {len(pages)} pages")

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, length_function = len
)
chunks = text_splitter.split_documents(pages)
print(f"Split into {len(chunks)} chunks")

# This operation consumes CREDITS!!
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
vectorstore = FAISS.from_documents(chunks, embeddings)
pprint.pprint(vectorstore.__dict__)
# save db to avoid creation again, consuming credits
vectorstore.save_local("faiss_index")

retriever = vectorstore.as_retriever(search_kwargs={"k": 2}) # k is how many chunks shoul return

llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

Grab 4 pages
Split into 12 chunks
{'_normalize_L2': False,
 'distance_strategy': <DistanceStrategy.EUCLIDEAN_DISTANCE: 'EUCLIDEAN_DISTANCE'>,
 'docstore': <langchain_community.docstore.in_memory.InMemoryDocstore object at 0x0000020F9CB56420>,
 'embedding_function': GoogleGenerativeAIEmbeddings(client=<google.ai.generativelanguage_v1beta.services.generative_service.client.GenerativeServiceClient object at 0x0000020F9CAB50D0>, model='models/embedding-001', task_type=None, google_api_key=SecretStr('**********'), credentials=None, client_options=None, transport=None, request_options=None),
 'index': <faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x0000020F9CB57180> >,
 'index_to_docstore_id': {0: 'cd431b33-2b53-4ff4-b5b4-46b420043e5d',
                          1: '62b3b15a-6ee7-4fed-8c7c-4f56f1c8b31b',
                          2: 'fe06471c-6a10-4c74-a1f6-9eaacb5e7669',
                          3: 'aa0f4dd6-5b50-489f-ad56-f9f32b9d304c',
       

In [93]:
# TEST retrieving
results = retriever.get_relevant_documents("What is the net debt?")
context_text = "\n".join([doc.page_content for doc in results])
context_text

"Underlying profit - excluding all adjusting items net of tax - decreased by -0.7% to EUR 729 million. This performance \nwas mainly driven by an unfavourable non-tax deductible impact of EUR 154 million from a fair value change in the \nCompany's equity derivatives, due to the decrease in the Company's share price in 2024.\nNet leverage was 2.73x (net debt to adjusted EBITDA), despite currency headwinds, with a net debt of EUR 4.3 billion \non 31 December 2024.\nOur liquidity position remains strong, with total liquidity of EUR 2.7 billion consisting of a cash position of EUR 1.2 \nbillion and an available committed RCF of EUR 1.5 billion. \nIn EUR million, unless otherwise stated:\n2024 2023\nSales 8,837 8,191\nOrganic change  5.3 %\nOperating profit 1,056 685\nFinancial income and expenses (263) (143)\nShare of net profit / (loss) of associates (3) (5)\nIncome tax expense (247) (173)\nNet income 543 364\n \nAdjusted EBIT 1,277 1,128\nOrganic change  10.4 %\nAdjusted EBITDA 1,587 1,4

In [None]:
prompt = PromptTemplate.from_template("""
    Provide an answer based on the information passed in the context
    
    Question: {question}
    Retrieved Context: {context}
""")

eval_chain = (
    prompt 
    | llm 
)

question = "What is the value of the net debt?"

# retrieving relevant context
results = retriever.get_relevant_documents(question)
context_text = "\n".join([doc.page_content for doc in results])

ai_res = eval_chain.invoke({
    "question": question,
    "context": context_text
})