In [109]:
import os
import pprint
from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader
from langchain_core.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain.vectorstores import FAISS

from utils import show_context

# Load the API keys defined in the local .env file into the environment.
# This call is the cell's last expression, so its return value is what the
# notebook displays as the cell output.
load_dotenv()

True

In [111]:
file_path = "data/jde-peets-annual-report-2024.pdf"
loader = PyPDFLoader(file_path)

# Keep only the financial pages (list indices 27-30): embedding the whole
# report would hit the free-tier rate limit of the embeddings API.
pages = loader.load()[27:31]
print(f"Grab {len(pages)} pages")

# Split the selected pages into overlapping chunks; sizes are measured in
# characters because length_function is the built-in len.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)
chunks = text_splitter.split_documents(pages)
print(f"Split into {len(chunks)} chunks")

# Building the index calls the embeddings API and consumes CREDITS, so the
# index is built once, saved to disk and simply reloaded on later runs.
# NOTE: delete the FAISS_INDEX_DIR folder after changing the PDF, the page
# range or the splitter settings; otherwise a stale index is reused.
FAISS_INDEX_DIR = "faiss_index"

embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

if os.path.isdir(FAISS_INDEX_DIR):
    # load_local unpickles the stored docstore. That is acceptable only
    # because the folder was written by this notebook; never point it at an
    # index obtained from an untrusted source.
    vectorstore = FAISS.load_local(
        FAISS_INDEX_DIR,
        embeddings,
        allow_dangerous_deserialization=True,
    )
else:
    vectorstore = FAISS.from_documents(chunks, embeddings)
    # Save the index so the next run does not have to re-embed the chunks.
    vectorstore.save_local(FAISS_INDEX_DIR)

# k is how many chunks the retriever should return for each query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 2})

# Chat model used to answer questions over the retrieved context.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0,
    # No explicit output-token cap and no request timeout are configured.
    max_tokens=None,
    timeout=None,
    # Retry budget passed to the client.
    max_retries=2,
)

# Prompt with two placeholders: the user's question and the text of the
# retrieved chunks.
prompt = PromptTemplate.from_template("""
    Provide an answer based on the information passed in the context
    
    Question: {question}
    Retrieved Context: {context}
""")

# Pipe the filled-in prompt straight into the chat model.
eval_chain = prompt | llm

Grab 4 pages
Split into 12 chunks


In [112]:
# Earlier sanity check: "What is the value of the net debt?" -> expected 4.3 B
question = "Pull key financial values"

# Retrieve the chunks most relevant to the question. `invoke` is the current
# retriever entry point; `get_relevant_documents` is deprecated.
results = retriever.invoke(question)

# Concatenate the retrieved chunks into a single context string.
context_text = "\n".join(doc.page_content for doc in results)

ai_res = eval_chain.invoke({"question": question, "context": context_text})

pprint.pprint(ai_res.content)  # beautiful!!

('Key financial values from the provided context:\n'
 '\n'
 '*   **Sales (2024):** EUR 8,837 million\n'
 '*   **Sales (2023):** EUR 8,191 million\n'
 '*   **Organic Sales Growth:** 5.3%\n'
 '*   **Operating Profit (2024):** EUR 1,056 million\n'
 '*   **Operating Profit (2023):** EUR 685 million\n'
 '*   **Financial Income and Expenses (2024):** EUR (263) million\n'
 '*   **Financial Income and Expenses (2023):** EUR (143) million\n'
 '*   **Net Income (2024):** EUR 543 million\n'
 '*   **Net Income (2023):** EUR 364 million\n'
 '*   **Adjusted EBIT (2024):** EUR 1,277 million\n'
 '*   **Adjusted EBIT (2023):** EUR 1,128 million\n'
 '*   **Organic Adjusted EBIT Growth:** 10.4%\n'
 '*   **Adjusted EBITDA (2024):** EUR 1,587 million\n'
 '*   **Adjusted EBITDA (2023):** EUR 1,426 million\n'
 '*   **Underlying Profit (excluding adjusting items net of tax):** EUR 729 '
 'million (decreased by -0.7%)\n'
 '*   **Net Debt:** EUR 4.3 billion (as of December 31, 2024)\n'
 '*   **Net Leverage:** 2

In [None]:
from pydantic import BaseModel, Field

class FinancialData(BaseModel):
    """Always use this schema to structure your response to the user."""

    # NOTE(review): the class docstring and the Field descriptions below are
    # presumably forwarded to the model as part of the output schema, so they
    # are treated as prompt text and kept verbatim.
    current_sales: float = Field(
        description="Total business sales in the current year",
    )
    previous_sales: float = Field(
        description="Total business sales in the previous year",
    )
    # Author's note: the model makes things up for this field.
    sales_growth: float = Field(
        description="Percentage growth of sales in the current year, that is the ratio of the sales of the current year vs. the previous expressed as a percentage",
    )
    # Author's note: the model makes things up for this field.
    organic_sales_growth: float = Field(
        description="Organic sales growth, excluding the effect of foreign exchange",
    )
    net_income: float = Field(
        description="Net business income in the current year",
    )
    cash_position: float = Field(
        description="Cash position of the bussines",
    )

# Wrap the chat model so its reply is returned as a FinancialData instance
# instead of free text.
llm_struct_out = llm.with_structured_output(FinancialData)

# Same prompt as the free-text chain, feeding the structured-output model.
eval_chain_str = prompt | llm_struct_out

question = "Pull key financial values"

# Retrieve the chunks most relevant to the question. `invoke` is the current
# retriever entry point; `get_relevant_documents` is deprecated.
results = retriever.invoke(question)

# Concatenate the retrieved chunks into a single context string.
context_text = "\n".join(doc.page_content for doc in results)

# The structured chain returns a FinancialData object rather than a message.
ai_res = eval_chain_str.invoke({"question": question, "context": context_text})

pprint.pprint(ai_res)

FinancialData(current_sales=8837000000.0, previous_sales=8191000000.0, sales_growth=7.9, net_income=543000000.0, cash_position=1200000000.0)
