### Model 2

In [1]:
import pandas as pd

In [2]:
import os
import csv
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import Chroma
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_community.llms import Ollama
from langchain_community.chat_models import ChatPerplexity
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import TokenTextSplitter

# === Settings ===
PDF_FOLDER = "data"
CHUNK_SIZE = 5000
CHUNK_OVERLAP = 500
VECTOR_STORE_DIR = "chroma_index_finance"
LOG_FILE = "qa_log_2.csv"

# === Load all PDFs ===
all_documents = []
for filename in os.listdir(PDF_FOLDER):
    if filename.endswith(".pdf"):
        file_path = os.path.join(PDF_FOLDER, filename)
        loader = PyPDFLoader(file_path)
        documents = loader.load()
        for doc in documents:
            doc.metadata["source"] = filename  # track which PDF it came from
        all_documents.extend(documents)

print(f"Loaded {len(all_documents)} total documents.")

# === Split text into chunks ===
text_splitter = TokenTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
chunks = text_splitter.split_documents(all_documents)

# Add unique chunk IDs and ensure source is in metadata
for i, doc in enumerate(chunks):
    doc.metadata["chunk_id"] = i
    doc.metadata["source"] = doc.metadata.get("source", "unknown")

# === Embedding model ===
model_name = "BAAI/bge-base-en"
hf = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': True}
)

# === Vector Store ===
vector_store = Chroma.from_documents(
    documents=chunks,
    embedding=hf,
    persist_directory=VECTOR_STORE_DIR
)
# vector_store.persist()

# === Prompt Template (only answer output) ===
prompt_template = """
You are a professional financial advisor with expertise in corporate finance, investment analysis, and career development in finance-related roles.

Use only the information provided in the context to answer the user's question.
Do not make assumptions or fabricate any details.

Respond clearly and professionally, as if advising a client on their financial career or investment decisions.

{context}

Question: {question}

If the answer is not explicitly stated in the context, respond with: "I don't know based on the provided document".
"""

PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)

# === Retriever Setup ===
base_retriever = vector_store.as_retriever(search_kwargs={"k": 5})

# Perplexity LLM
perplexity_llm = ChatPerplexity(
    model="sonar",
    pplx_api_key="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",
    temperature=0.2
)

# Compression retriever
compressor = LLMChainExtractor.from_llm(perplexity_llm)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=base_retriever
)

# === QA Chain ===
qa_chain = RetrievalQA.from_chain_type(
    llm=perplexity_llm,
    chain_type="stuff",
    retriever=compression_retriever,
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True
)

# === Process QA + Save to CSV ===



Loaded 1182 total documents.


  hf = HuggingFaceBgeEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm
  perplexity_llm = ChatPerplexity(


In [3]:
import json
import os
import csv

def process_answer(query):
    response = qa_chain({"query": query})
    
    # Handle case where result is a JSON-formatted string
    try:
        result = json.loads(response["result"])
        answer_text = result.get("answer", response["result"])
    except (json.JSONDecodeError, TypeError):
        # Fallback: use the raw string if not JSON
        answer_text = response["result"]

    source_docs = response['source_documents']

    sources_info = []
    for doc in source_docs:
        chunk_id = doc.metadata.get("chunk_id", "N/A")
        source = doc.metadata.get("source", "N/A")
        sources_info.append({"chunk_id": str(chunk_id), "source": source})

    # Prepare data to log
    top_k_chunks = [src["chunk_id"] for src in sources_info if src["chunk_id"] != "N/A"]
    sources_list = [src["source"] for src in sources_info if src["source"] != "N/A"]

    # Write to CSV
    file_exists = os.path.isfile(LOG_FILE)
    with open(LOG_FILE, mode='a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=["question", "answer", "sources", "top_k_chunks"])
        if not file_exists:
            writer.writeheader()
        writer.writerow({
            "question": query,
            "answer": answer_text,
            "sources": "; ".join(list(set(sources_list))),
            "top_k_chunks": "; ".join(list(set(top_k_chunks)))
        })


In [None]:
import pandas as pd
questions= pd.read_csv("session_1/questions.csv")

for index, row in questions.iterrows():
    question = row['question']
    process_answer(question) 

  response = qa_chain({"query": query})


In [5]:
df = pd.read_csv("qa_log_2.csv")

In [6]:
df.head(10)

Unnamed: 0,question,answer,sources,top_k_chunks
0,"How could challenges in hiring, retaining, and...","Challenges in hiring, retaining, and managing ...",2024-nvidia-annual-report-10K.pdf; 2024-amazon...,23; 651
1,What was the total lease liability for Apple I...,I don't know based on the provided document. H...,2024-apple-annual-report-10K.pdf,125; 123
2,What are the key competitive challenges Cisco ...,"Based on the provided search results, Cisco fa...",2024-cisco-full-annual-report.pdf,218; 214; 216
3,Based on the Alphabet Inc. fiscal year ended D...,"For Alphabet Inc., the basic and diluted net i...",2024-google-annual-report-10K.pdf,426; 396; 372
4,What were the key financial results reported b...,"Based strictly on the provided context, here i...",2024-meta-full-annual-report.pdf,449; 448; 442
5,What operational and strategic challenges did ...,"Netflix, Inc. identified several operational a...",2024-netflix-annual-report-10K.pdf,492; 495; 472
6,"Question: \nWhere, as required by law, does N...",I don't know based on the provided document wh...,2024-nvidia-annual-report-10K.pdf,682; 627
7,What highlights or key financial metrics are i...,"Based on the provided context, specific key fi...",2024-oracle-annual-report-10K.pdf,717; 837; 835
8,"How does Reddit, Inc. plan to manage the poten...",I don't know based on the provided document.\n...,2024-reddit-annual-report-10K.pdf,981; 984; 929
9,"Which Tesla, Inc. executives signed the compan...","The executives who signed Tesla, Inc.'s annual...",2024-tsla-annual-report-10K.pdf,1091; 1019; 1172


In [7]:
from openai import OpenAI
import json, random, re

API_KEY = "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"    
#INITIALIZATION
client = OpenAI(api_key=API_KEY, base_url="https://api.perplexity.ai")

In [8]:
prompt = """
    You are a financial data Q&A evaluator.

    You are given:
    - A **question** generated from a document chunk.
    - The **document chunk** (ground truth source).
    - A **model-generated answer** to the question.

    Your job is to score the model’s answer by carefully comparing it to the document chunk.

    Use the following rubric for each category:

    ---
    **Factual Correctness**
    - 5 = All facts are fully correct and consistent with the chunk.
    - 4 = Minor factual inaccuracies but mostly correct.
    - 3 = Some factual inaccuracies, partly correct.
    - 2 = Major factual mistakes, mostly incorrect.
    - 1 = Completely factually wrong.

    ---
    **Completeness**
    - 5 = Fully answers the question with all key details.
    - 4 = Mostly complete, missing minor details.
    - 3 = Partially complete, missing important parts.
    - 2 = Mostly incomplete, only touches on part of the question.
    - 1 = Completely incomplete.

    ---
    3**Clarity**
    - 5 = Clear, precise, and easy to understand.
    - 4 = Mostly clear, with minor awkwardness.
    - 3 = Understandable but somewhat confusing or vague.
    - 2 = Hard to understand or poorly phrased.
    - 1 = Completely unclear or nonsensical.

    ---
    **Response Format**
    Return ONLY this JSON (no extra explanation):
    {
        "factual_correctness_score": [1-5],
        "completeness_score": [1-5],
        "clarity_score": [1-5],
        "comments": "A brief explanation (1-2 sentences) why you assigned these scores."
    }
"""


def evaluate_answer(question, chunk, answer):
    response = client.chat.completions.create(
        model="sonar",
        messages=[
            {"role": "system", "content": prompt},
            {
                "role": "user",
                "content": f"""
                Please evaluate the following answer based on the provided question and document chunk. 
                Return ONLY a valid JSON object.

                Question: {question}

                Document Chunk: {chunk}

                Model Answer: {answer}
                """
            }
        ],
    )

    response_content = response.choices[0].message.content.strip()
    print("LLM Raw Output:", response_content)

    # Remove duplicate keys by keeping only the last occurrence
    cleaned_content = re.sub(
        r'(,\s*")(\w+_score)":\s*\d,\s*"\2":\s*\d',
        lambda m: f',{m.group(2)}": {m.group(0).split(":")[-1]}',
        response_content
    )

    result = json.loads(cleaned_content)
    return result


In [9]:
import pandas as pd

# Prepare a list to collect all processed rows
final_rows = []

for _, row in df.iterrows():
    question = row['question']
    top_k_chunk = row['top_k_chunks']
    answer = row['answer']

    success = False
    while not success:
        try:
            evaluation = evaluate_answer(question, top_k_chunk, answer)
            success = True  # Break loop if successful
        except Exception as e:
            print(f"Retrying for question: {question} due to error: {e}")

    # Build a combined result dictionary
    result_row = {
        'question': question,
        'top_k_chunk': top_k_chunk,
        'answer': answer
    }
    # Add evaluation results
    for key, value in evaluation.items():
        result_row[f'evaluation_{key}'] = value

    final_rows.append(result_row)

# Convert list of results to DataFrame
final_df = pd.DataFrame(final_rows)

# Save to CSV
final_df.to_csv('final.csv', index=False)
print("Saved final results to final.csv")


LLM Raw Output: ```json
{
    "factual_correctness_score": 4,
    "completeness_score": 4,
    "clarity_score": 5,
    "comments": "The model’s answer is mostly factually correct and provides a comprehensive overview of the challenges. However, it lacks specific details from the document chunk, such as automation and return-to-office mandates affecting Amazon. The answer is clear and well-structured."
}
```
Retrying for question: How could challenges in hiring, retaining, and managing qualified personnel, as well as risks related to supplier relationships and commercial agreements, negatively impact Amazon’s business operations and results in 2024? due to error: Expecting value: line 1 column 1 (char 0)
LLM Raw Output: {
  "factual_correctness_score": 5,
  "completeness_score": 5,
  "clarity_score": 5,
  "comments": "The answer thoroughly covers how challenges in personnel management and supplier/commercial risks could negatively impact Amazon's 2024 operations, fully reflecting the de

In [10]:
final_df = pd.read_csv("final.csv")
final_df

Unnamed: 0,question,top_k_chunk,answer,evaluation_factual_correctness_score,evaluation_completeness_score,evaluation_clarity_score,evaluation_comments
0,"How could challenges in hiring, retaining, and...",23; 651,"Challenges in hiring, retaining, and managing ...",5,5,5,The answer thoroughly covers how challenges in...
1,What was the total lease liability for Apple I...,125; 123,I don't know based on the provided document. H...,3,3,4,The answer incorrectly states the $15.8 billio...
2,What are the key competitive challenges Cisco ...,218; 214; 216,"Based on the provided search results, Cisco fa...",5,4,5,The answer accurately summarizes key competiti...
3,Based on the Alphabet Inc. fiscal year ended D...,426; 396; 372,"For Alphabet Inc., the basic and diluted net i...",1,2,4,The answer contains completely incorrect EPS v...
4,What were the key financial results reported b...,449; 448; 442,"Based strictly on the provided context, here i...",5,4,5,The answer is factually correct regarding net ...
5,What operational and strategic challenges did ...,492; 495; 472,"Netflix, Inc. identified several operational a...",5,5,5,The answer accurately captures Netflix's opera...
6,"Question: \nWhere, as required by law, does N...",682; 627,I don't know based on the provided document wh...,4,3,5,The answer correctly states that the document ...
7,What highlights or key financial metrics are i...,717; 837; 835,"Based on the provided context, specific key fi...",2,3,5,The answer lists specific financial metrics bu...
8,"How does Reddit, Inc. plan to manage the poten...",981; 984; 929,I don't know based on the provided document.\n...,1,1,5,The model answer states it does not know based...
9,"Which Tesla, Inc. executives signed the compan...",1091; 1019; 1172,"The executives who signed Tesla, Inc.'s annual...",3,2,4,The answer correctly lists signers and their t...


In [11]:
import plotly.express as px
import pandas as pd

# Make sure relevant columns are numeric
numeric_cols = [
    'evaluation_factual_correctness_score', 
    'evaluation_completeness_score', 
    'evaluation_clarity_score'
]

df = final_df.copy()
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')  # convert invalid to NaN

# Calculate overall average score per question
df['overall_score'] = df[numeric_cols].mean(axis=1)

# Melt DataFrame to long format (including overall score)
melted_df = df.melt(
    id_vars=['question'],
    value_vars=numeric_cols + ['overall_score'],
    var_name='Metric',
    value_name='Score'
)

# Clean up metric names for display
melted_df['Metric'] = (
    melted_df['Metric']
    .str.replace('evaluation_', '', regex=False)
    .str.replace('_score', '', regex=False)
    .str.replace('_', ' ')
    .str.title()
)

# Drop rows with missing scores (optional, if needed)
melted_df = melted_df.dropna(subset=['Score'])

# Plot boxplot
fig = px.box(
    melted_df,
    x='Metric',
    y='Score',
    points='all',  # show individual points
    hover_data=['question'],
    title='Score Distributions per Metric (with Overall Score)',
    height=500
)

fig.update_layout(
    yaxis=dict(range=[0, 6], dtick=1),
    xaxis_title='Metric',
    yaxis_title='Score (1-5)'
)

fig.show()

In [12]:
import plotly.figure_factory as ff

# Select relevant columns
metrics = [
    'evaluation_factual_correctness_score',
    'evaluation_completeness_score',
    'evaluation_clarity_score',
    'overall_score'
]

# Calculate correlation matrix
corr_matrix = df[metrics].corr().round(2)

# Create heatmap
fig = ff.create_annotated_heatmap(
    z=corr_matrix.values,
    x=corr_matrix.columns.tolist(),
    y=corr_matrix.index.tolist(),
    annotation_text=corr_matrix.values,
    colorscale='Blues',
    showscale=True
)

fig.update_layout(
    title='Correlation Heatmap of Evaluation Metrics',
    width=600,
    height=600
)

fig.show()


In [13]:
#Worst answer by overall
worst_answer = df.loc[df['overall_score'].idxmin()]
print(f"Worst Answer by Overall Score: {df['overall_score'].min()}")
print(f"Question: {worst_answer['question']}")
# print(f"Answer: {worst_answer['answer']}")
print(f"Reasons: {worst_answer['evaluation_comments']}")

Worst Answer by Overall Score: 2.3333333333333335
Question: Based on the Alphabet Inc. fiscal year ended December 31, 2024, what were the basic and diluted net income per share amounts for Class A, Class B, and Class C shares, and how did these compare to the previous year ended December 31, 2023?
Reasons: The answer contains completely incorrect EPS values and does not reflect the actual EPS figures from Alphabet's 2023 and 2024 financials; also, the document chunk does not support the breakdown by share class for basic and diluted EPS.
