In [209]:
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from openai import OpenAI
import pandas as pd 
import os, json, random, re

#USER INPUT
N_QUESTIONS = 1
PROJECT_NAME = "session_1"
INPUT_DIR = "finance_data"

#CONFIG
metadata_dir = os.path.join(PROJECT_NAME, "metadata.json")
CHUNK_SIZE = 5000
CHUNK_OVERLAP = 100
N_PAGE_SUMMARY = 3
API_KEY = "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"

###### DELETE THIS ON PRODUCTION #########
#Remove session folder and everything in it
if os.path.exists(PROJECT_NAME) :
    import shutil
    shutil.rmtree(PROJECT_NAME)
    print(f"Folder '{PROJECT_NAME}' deleted.")
##########################################
    
#INITIALIZATION
client = OpenAI(api_key=API_KEY, base_url="https://api.perplexity.ai")
# Create folder if it doesn't already exist
CHUNKS_DIR = os.path.join(PROJECT_NAME, "chunks")
if not os.path.exists(PROJECT_NAME) :
    os.makedirs(PROJECT_NAME)
    os.makedirs(CHUNKS_DIR)
    print(f"Folder '{PROJECT_NAME}' created.")
else:
    print(f"Folder '{PROJECT_NAME}' already exists.")

Folder 'session_1' deleted.
Folder 'session_1' created.


In [210]:
def summary_extraction(first_n_pages):
    response = client.chat.completions.create(
    model="sonar",
    temperature=0,
    messages= [{
        "role": "system",
                "content": """
                You are a financial report assistant. 
                I will provide the first few pages of a financial report, and your task is to give a concise, single-sentence summary answering: 
                (1) which company the report is about and 
                (2) what year it covers. 
                Limit the summary to 50 words, with no extra details or formatting.
                """
    },
        {   
            "role": "user",
            "content":  f"""
            The first few pages {first_n_pages}
            Your response: 
            """
            
        },
    ])
    return response.choices[0].message.content.strip()


def generate_questions(summary, chunk):
    response = client.chat.completions.create(
        model="sonar",
        messages=[{
            "role": "system",
            "content": """
            You are a question generator. 
            I will provide a chunk of information along with its PDF context. 
            Your task is to generate one question  with the following requirement
            (1) The question should based solely on the chunk’s content
            (2) The question should include enough context from the summary (company name and year) to make it clear what the question is about.
            (3) Do not add any extra information. 
            (4) If the chunk lacks useful content, respond with an empty string.
            """
        },
        {
            "role": "user",
            "content": f"""
            PDF Summary {summary}. Chunk Text: {chunk}
            Your question:
            """
        }],
    )
    return response.choices[0].message.content.strip()

In [211]:
pdf_list = []

# Save metadata
if os.path.exists(metadata_dir):
    with open(metadata_dir, "r") as f:
        metadata = json.load(f)
else:
    metadata = []
all_filenames = [entry["file_name"] for entry in metadata]

# Loop through PDFs
for file_name in os.listdir(INPUT_DIR):
    if file_name.endswith(".pdf") and file_name not in all_filenames:
        file_path = os.path.join(INPUT_DIR, file_name)
        pdf_list.append(file_path)

        # 1) Load
        loader = PyMuPDFLoader(file_path)
        documents = loader.load()

        # 2) Split
        splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
        chunks = splitter.split_documents(documents)

        # 3) Summary
        first_n_pages = "\n".join([doc.page_content for doc in documents[:N_PAGE_SUMMARY]])
        summary = summary_extraction(first_n_pages)
        
        # Log
        word_count = 0
        for doc in documents:
            word_count += doc.page_content.count(' ') 
        chunk_count = len(chunks)
        print(f"Processed {file_path}, {word_count} words and {chunk_count} chunks.")
        
        # 4) Save metadata
        format_name = file_name.split(".")[0]
        metadata.append({
            "file_name": file_name,
            "format_name": format_name,
            "file_path": file_path,
            "chunk_count": chunk_count,
            "total_word_count": word_count,
            "summary": summary,
        })

        # 5) Save chunks
        chunks_text_list = [chunk.page_content for chunk in chunks]
        file_chunks_dir = os.path.join(CHUNKS_DIR, f"{format_name}.json")
        
        # Save all chunks into one JSON file
        with open(file_chunks_dir, "w") as f:
            json.dump(chunks_text_list, f, indent=2)

# Save metadata to file
with open(metadata_dir, "w") as f:
    json.dump(metadata, f, indent=2)

Processed finance_data/nvidia.pdf, 69704 words and 167 chunks.
Processed finance_data/cisco.pdf, 71074 words and 175 chunks.
Processed finance_data/oracle.pdf, 91978 words and 227 chunks.
Processed finance_data/meta.pdf, 3107 words and 11 chunks.
Processed finance_data/tesla.pdf, 59382 words and 152 chunks.
Processed finance_data/apple.pdf, 1475 words and 3 chunks.
Processed finance_data/netflix.pdf, 86087 words and 387 chunks.
Processed finance_data/reddit.pdf, 67567 words and 146 chunks.
Processed finance_data/google.pdf, 51919 words and 123 chunks.
Processed finance_data/amazon.pdf, 47161 words and 112 chunks.


In [212]:
res = {'file_name': [], 
       'question': [], 
       'format_name': [], 
       'file_path': [], 
       'summary': [], 
       'chunk': [], 
       'chunk_id': []}

for data in metadata:
    file_name, format_name, file_path, summary = data["file_name"], data["format_name"], data["file_path"], data["summary"]
    file_chunks_dir = os.path.join(CHUNKS_DIR, f"{format_name}.json")
    chunks = json.load(open(file_chunks_dir, "r"))
    for i in range(N_QUESTIONS):
        #Pick a random chunk
        chunk_id = random.randint(0, len(chunks) - 1)
        chunk = chunks[chunk_id]
        question = generate_questions(summary, chunk)
        #Save to results
        res['file_name'].append(file_name)
        res['question'].append(question)
        res['format_name'].append(format_name)
        res['file_path'].append(file_path)
        res['summary'].append(summary)
        res['chunk'].append(chunk)
        res['chunk_id'].append(chunk_id)
res = pd.DataFrame(res)

In [213]:
res.head()

Unnamed: 0,file_name,question,format_name,file_path,summary,chunk,chunk_id
0,nvidia.pdf,How could extended payment term arrangements w...,nvidia,finance_data/nvidia.pdf,The report is about NVIDIA Corporation and cov...,Table of Contents\n•\nour extended payment ter...,40
1,cisco.pdf,What are the key details reported by Cisco Sys...,cisco,finance_data/cisco.pdf,"The report is about Cisco Systems, Inc. and co...",UNITED STATES SECURITIES AND EXCHANGE COMMISSI...,18
2,oracle.pdf,How does Oracle Financial Services Software Li...,oracle,finance_data/oracle.pdf,The report is about Oracle Financial Services ...,and industry standards. The solution enables s...,46
3,meta.pdf,How did the operating income for Meta Platform...,meta,finance_data/meta.pdf,"The report is about Meta Platforms, Inc. and c...",Segment Results\nWe report our financial resul...,9
4,tesla.pdf,"What details are provided in Tesla, Inc.'s rep...",tesla,finance_data/tesla.pdf,"The report is about Tesla, Inc. and covers the...",Exhibit\nNumber\nIncorporated by Reference\nFi...,121


In [214]:
res.to_csv(f"{PROJECT_NAME}/questions.csv", index=False)
res['question']

0    How could extended payment term arrangements w...
1    What are the key details reported by Cisco Sys...
2    How does Oracle Financial Services Software Li...
3    How did the operating income for Meta Platform...
4    What details are provided in Tesla, Inc.'s rep...
5    Based on Apple Inc.'s financial report for the...
6    What specific financial metrics or trends did ...
7    What potential risks does Reddit, Inc. face in...
8    How could changes in tax policies, laws, or ra...
9    What information does Amazon's 2024 report pro...
Name: question, dtype: object

# Create dummy testset

In [215]:
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_community.chat_models import ChatPerplexity
from langchain.chains import RetrievalQA
import os

# Settings
PDF_FOLDER = "finance_data"
CHUNK_SIZE = 5000
CHUNK_OVERLAP = 500

# Load all PDFs
all_documents = []
for filename in os.listdir(PDF_FOLDER):
    if filename.endswith(".pdf"):
        file_path = os.path.join(PDF_FOLDER, filename)
        loader = PyMuPDFLoader(file_path)
        documents = loader.load()
        all_documents.extend(documents)

print(f"Loaded {len(all_documents)} total documents.")

# Split
splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
chunks = splitter.split_documents(all_documents)

print(f"Split into {len(chunks)} chunks.")

# Embedding
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}  # False = Euclidean, True = Cosine similarity

hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# Vector Store
vector_store = FAISS.from_documents(chunks, hf)
vector_store.save_local("faiss_index_open")
print("Vector store saved.")

# Retriever
retriever = vector_store.as_retriever(search_kwargs={"k": 3})

# LLM setup
llm = ChatPerplexity(
    model="sonar",
    pplx_api_key="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",
    temperature=0.2
)

# QA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True
)

print("QA chain is ready.")


Loaded 1159 total documents.
Split into 1511 chunks.


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Vector store saved.
QA chain is ready.


In [216]:
dummy_test = {'question': [], 'answer': [], 'top_k_chunk': []} #Top 3

for question in res['question']:
    result = qa_chain.invoke(question)
    dummy_test['question'].append(question)
    dummy_test['answer'].append(result['result'])
    dummy_test['top_k_chunk'].append(result['source_documents'])
dummy_test = pd.DataFrame(dummy_test)

In [217]:
dummy_test

Unnamed: 0,question,answer,top_k_chunk
0,How could extended payment term arrangements w...,Several factors—including extended payment ter...,[page_content='Table of Contents\nNVIDIA Corpo...
1,What are the key details reported by Cisco Sys...,"Cisco Systems, Inc.'s 2024 Annual Report (Form...",[page_content='1\nCisco Annual Report 2024\n2\...
2,How does Oracle Financial Services Software Li...,Oracle Financial Services Software Limited’s p...,[page_content='Oracle Financial Services Softw...
3,How did the operating income for Meta Platform...,"For the twelve months ended December 31, 2024,...","[page_content='META PLATFORMS, INC.\nCONDENSED..."
4,"What details are provided in Tesla, Inc.'s rep...","Tesla, Inc.’s Annual Report on Form 10-K for t...","[page_content='Tesla, Inc.\nConsolidated State..."
5,Based on Apple Inc.'s financial report for the...,Based on the provided financial data for Apple...,[page_content='Apple Inc. \nCONDENSED CONSOLID...
6,What specific financial metrics or trends did ...,"In its fiscal year ending December 31, 2022, N...","[page_content='Table of Contents\nNETFLIX, INC..."
7,"What potential risks does Reddit, Inc. face in...","Reddit, Inc. faces several significant risks i...",[page_content='We may make decisions regarding...
8,"How could changes in tax policies, laws, or ra...","Changes in tax policies, laws, or rates in var...",[page_content='become liable for taxes that ar...
9,What information does Amazon's 2024 report pro...,Amazon's 2024 annual report provides several k...,"[page_content='AMAZON.COM, INC.\nFORM 10-K\nFo..."


In [232]:
# Combine ground truth and generated answer
combined_df = pd.merge(res, dummy_test, on='question', suffixes=('_ground_truth', '_generated'))

In [233]:
combined_df

Unnamed: 0,file_name,question,format_name,file_path,summary,chunk,chunk_id,answer,top_k_chunk
0,nvidia.pdf,How could extended payment term arrangements w...,nvidia,finance_data/nvidia.pdf,The report is about NVIDIA Corporation and cov...,Table of Contents\n•\nour extended payment ter...,40,Several factors—including extended payment ter...,[page_content='Table of Contents\nNVIDIA Corpo...
1,cisco.pdf,What are the key details reported by Cisco Sys...,cisco,finance_data/cisco.pdf,"The report is about Cisco Systems, Inc. and co...",UNITED STATES SECURITIES AND EXCHANGE COMMISSI...,18,"Cisco Systems, Inc.'s 2024 Annual Report (Form...",[page_content='1\nCisco Annual Report 2024\n2\...
2,oracle.pdf,How does Oracle Financial Services Software Li...,oracle,finance_data/oracle.pdf,The report is about Oracle Financial Services ...,and industry standards. The solution enables s...,46,Oracle Financial Services Software Limited’s p...,[page_content='Oracle Financial Services Softw...
3,meta.pdf,How did the operating income for Meta Platform...,meta,finance_data/meta.pdf,"The report is about Meta Platforms, Inc. and c...",Segment Results\nWe report our financial resul...,9,"For the twelve months ended December 31, 2024,...","[page_content='META PLATFORMS, INC.\nCONDENSED..."
4,tesla.pdf,"What details are provided in Tesla, Inc.'s rep...",tesla,finance_data/tesla.pdf,"The report is about Tesla, Inc. and covers the...",Exhibit\nNumber\nIncorporated by Reference\nFi...,121,"Tesla, Inc.’s Annual Report on Form 10-K for t...","[page_content='Tesla, Inc.\nConsolidated State..."
5,apple.pdf,Based on Apple Inc.'s financial report for the...,apple,finance_data/apple.pdf,The financial report is about Apple Inc. and c...,Apple Inc. \nCONDENSED CONSOLIDATED STATEMENTS...,0,Based on the provided financial data for Apple...,[page_content='Apple Inc. \nCONDENSED CONSOLID...
6,netflix.pdf,What specific financial metrics or trends did ...,netflix,finance_data/netflix.pdf,"The report is about Netflix, Inc. and covers t...","""td"",\n""tr"",\n""table"",\n""div"",\n""body"",\n""html...",257,"In its fiscal year ending December 31, 2022, N...","[page_content='Table of Contents\nNETFLIX, INC..."
7,reddit.pdf,"What potential risks does Reddit, Inc. face in...",reddit,finance_data/reddit.pdf,"The report is about Reddit, Inc. and covers th...",We believe that our corporate culture has cont...,42,"Reddit, Inc. faces several significant risks i...",[page_content='We may make decisions regarding...
8,google.pdf,"How could changes in tax policies, laws, or ra...",google,finance_data/google.pdf,The report is about Alphabet Inc. and covers t...,could harm our financial condition and operati...,38,"Changes in tax policies, laws, or rates in var...",[page_content='become liable for taxes that ar...
9,amazon.pdf,What information does Amazon's 2024 report pro...,amazon,finance_data/amazon.pdf,The report is about Amazon and covers the year...,PART II\nItem 5.\nMarket for the Registrant’s ...,38,Amazon's 2024 annual report provides several k...,"[page_content='AMAZON.COM, INC.\nFORM 10-K\nFo..."


In [234]:
dummy_test.to_csv(f"{PROJECT_NAME}/dummy_test.csv", index=False)

# Evaluation Process

In [235]:
prompt = """
    You are a financial data Q&A evaluator.

    You are given:
    - A **question** generated from a document chunk.
    - The **document chunk** (ground truth source).
    - A **model-generated answer** to the question.

    Your job is to score the model’s answer by carefully comparing it to the document chunk.

    Use the following rubric for each category:

    ---
    **Factual Correctness**
    - 5 = All facts are fully correct and consistent with the chunk.
    - 4 = Minor factual inaccuracies but mostly correct.
    - 3 = Some factual inaccuracies, partly correct.
    - 2 = Major factual mistakes, mostly incorrect.
    - 1 = Completely factually wrong.

    ---
    **Completeness**
    - 5 = Fully answers the question with all key details.
    - 4 = Mostly complete, missing minor details.
    - 3 = Partially complete, missing important parts.
    - 2 = Mostly incomplete, only touches on part of the question.
    - 1 = Completely incomplete.

    ---
    3**Clarity**
    - 5 = Clear, precise, and easy to understand.
    - 4 = Mostly clear, with minor awkwardness.
    - 3 = Understandable but somewhat confusing or vague.
    - 2 = Hard to understand or poorly phrased.
    - 1 = Completely unclear or nonsensical.

    ---
    **Response Format**
    Return ONLY this JSON (no extra explanation):
    {
        "factual_correctness_score": [1-5],
        "completeness_score": [1-5],
        "clarity_score": [1-5],
        "comments": "A brief explanation (1-2 sentences) why you assigned these scores."
    }
"""


def evaluate_answer(question, chunk, answer):
    response = client.chat.completions.create(
        model="sonar",
        messages=[
            {"role": "system", "content": prompt},
            {
                "role": "user",
                "content": f"""
                Please evaluate the following answer based on the provided question and document chunk. 
                Return ONLY a valid JSON object.

                Question: {question}

                Document Chunk: {chunk}

                Model Answer: {answer}
                """
            }
        ],
    )

    response_content = response.choices[0].message.content.strip()
    print("LLM Raw Output:", response_content)

    # Remove duplicate keys by keeping only the last occurrence
    cleaned_content = re.sub(
        r'(,\s*")(\w+_score)":\s*\d,\s*"\2":\s*\d',
        lambda m: f',{m.group(2)}": {m.group(0).split(":")[-1]}',
        response_content
    )

    result = json.loads(cleaned_content)
    return result


In [236]:
import pandas as pd

# Prepare a list to collect all processed rows
final_rows = []

for _, row in combined_df.iterrows():
    question = row['question']
    chunk = row['chunk']
    answer = row['answer']

    success = False
    while not success:
        try:
            evaluation = evaluate_answer(question, chunk, answer)
            success = True  # Break loop if successful
        except Exception as e:
            print(f"Retrying for question: {question} due to error: {e}")

    # Build a combined result dictionary
    result_row = {
        'question': question,
        'chunk': chunk,
        'answer': answer
    }
    # Add evaluation results
    for key, value in evaluation.items():
        result_row[f'evaluation_{key}'] = value

    final_rows.append(result_row)

# Convert list of results to DataFrame
final_df = pd.DataFrame(final_rows)

# Save to CSV
final_df.to_csv('final.csv', index=False)
print("Saved final results to final.csv")


LLM Raw Output: ```json
{
    "factual_correctness_score": 4,
    "completeness_score": 4,
    "clarity_score": 5,
    "comments": "The model's answer is mostly factually correct, but includes some additional details not present in the document chunk. It is mostly complete but lacks minor details directly from the document chunk. The clarity is high, as the answer is well-structured and easy to understand."
}
```
Retrying for question: How could extended payment term arrangements with customers, vendor payment requirements, unanticipated environmental liabilities, and changes in financial accounting standards potentially impact NVIDIA Corporation’s financial results, as highlighted in the fiscal year ending January 26, 2025 report? due to error: Expecting value: line 1 column 1 (char 0)
LLM Raw Output: {
  "factual_correctness_score": 5,
  "completeness_score": 5,
  "clarity_score": 5,
  "comments": "The answer accurately reflects the information from the document chunk regarding the i

In [239]:
final_df.head()

Unnamed: 0,question,chunk,answer,evaluation_factual_correctness_score,evaluation_completeness_score,evaluation_clarity_score,evaluation_comments
0,How could extended payment term arrangements w...,Table of Contents\n•\nour extended payment ter...,Several factors—including extended payment ter...,5,5,5,The answer accurately reflects the information...
1,What are the key details reported by Cisco Sys...,UNITED STATES SECURITIES AND EXCHANGE COMMISSI...,"Cisco Systems, Inc.'s 2024 Annual Report (Form...",4,4,5,The model accurately summarizes Cisco's market...
2,How does Oracle Financial Services Software Li...,and industry standards. The solution enables s...,Oracle Financial Services Software Limited’s p...,2,2,3,The answer includes many details about Oracle'...
3,How did the operating income for Meta Platform...,Segment Results\nWe report our financial resul...,"For the twelve months ended December 31, 2024,...",5,5,5,The answer correctly reports the operating inc...
4,"What details are provided in Tesla, Inc.'s rep...",Exhibit\nNumber\nIncorporated by Reference\nFi...,"Tesla, Inc.’s Annual Report on Form 10-K for t...",5,4,5,"The answer is factually correct and clear, but..."


In [245]:
import plotly.express as px
import pandas as pd

# Make sure relevant columns are numeric
numeric_cols = [
    'evaluation_factual_correctness_score', 
    'evaluation_completeness_score', 
    'evaluation_clarity_score'
]

df = final_df.copy()
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')  # convert invalid to NaN

# Calculate overall average score per question
df['overall_score'] = df[numeric_cols].mean(axis=1)

# Melt DataFrame to long format (including overall score)
melted_df = df.melt(
    id_vars=['question'],
    value_vars=numeric_cols + ['overall_score'],
    var_name='Metric',
    value_name='Score'
)

# Clean up metric names for display
melted_df['Metric'] = (
    melted_df['Metric']
    .str.replace('evaluation_', '', regex=False)
    .str.replace('_score', '', regex=False)
    .str.replace('_', ' ')
    .str.title()
)

# Drop rows with missing scores (optional, if needed)
melted_df = melted_df.dropna(subset=['Score'])

# Plot boxplot
fig = px.box(
    melted_df,
    x='Metric',
    y='Score',
    points='all',  # show individual points
    hover_data=['question'],
    title='Score Distributions per Metric (with Overall Score)',
    height=500
)

fig.update_layout(
    yaxis=dict(range=[0, 6], dtick=1),
    xaxis_title='Metric',
    yaxis_title='Score (1-5)'
)

fig.show()


In [249]:
import plotly.figure_factory as ff

# Select relevant columns
metrics = [
    'evaluation_factual_correctness_score',
    'evaluation_completeness_score',
    'evaluation_clarity_score',
    'overall_score'
]

# Calculate correlation matrix
corr_matrix = df[metrics].corr().round(2)

# Create heatmap
fig = ff.create_annotated_heatmap(
    z=corr_matrix.values,
    x=corr_matrix.columns.tolist(),
    y=corr_matrix.index.tolist(),
    annotation_text=corr_matrix.values,
    colorscale='Blues',
    showscale=True
)

fig.update_layout(
    title='Correlation Heatmap of Evaluation Metrics',
    width=600,
    height=600
)

fig.show()


In [255]:
#Worst answer by overall
worst_answer = df.loc[df['overall_score'].idxmin()]
print(f"Worst Answer by Overall Score: {df['overall_score'].min()}")
print(f"Question: {worst_answer['question']}")
# print(f"Answer: {worst_answer['answer']}")
print(f"Reasons: {worst_answer['evaluation_comments']}")  

Worst Answer by Overall Score: 2.3333333333333335
Question: How does Oracle Financial Services Software Limited’s portfolio for the financial year ended March 31, 2024, support innovation and efficiency for retail banking clients, as highlighted by its machine learning capabilities and customer-centric digital solutions?
Reasons: The answer includes many details about Oracle's retail banking portfolio and machine learning capabilities, but much of its content is not supported or mentioned in the provided document chunk. The chunk highlights specific solutions like Oracle Banking Origination, FLEXCUBE for Islamic Banking, Oracle Banking Platform, and Oracle Banking Branch with embedded machine learning, but the answer introduces unreferenced cloud services and AI-driven features not cited in the chunk. The original chunk does not mention generative AI, extensive cloud modular services, or specific customer deals, which the answer attributes to the company. While the answer attempts to b