# RAG System — Apple & Tesla 10-K

Built a RAG pipeline to answer questions from Apple's 2024
and Tesla's 2023 annual SEC filings.

PDFs used:
- Apple 10-K (fiscal year ended Sep 28, 2024)
- Tesla 10-K (fiscal year ended Dec 31, 2023)

In [2]:
# Installing all the libraries
!pip install -q pymupdf
!pip install -q langchain langchain-community langchain-openai
!pip install -q langchain-groq langchain-cohere langchain-chroma
!pip install -q rank_bm25 chromadb

print("All libraries installed!")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.9/24.9 MB[0m [31m56.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m40.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m54.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.7/64.7 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.0/51.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires requests==2.32.4, but you have requests 2.32.5 which is incompatible.[0m[31m


## Step 1: Setup
Installing dependencies and setting API keys for OpenAI
(embeddings), Groq (LLM) and Cohere (reranker).

In [4]:
# System tools + API keys
!apt-get install -y poppler-utils tesseract-ocr -q

import os
os.environ["OPENAI_API_KEY"] = "use_your_key"
os.environ["GROQ_API_KEY"]   = "use_your_key"
os.environ["COHERE_API_KEY"] = "use_your_key"

keys = ["OPENAI_API_KEY", "GROQ_API_KEY", "COHERE_API_KEY"]
for key in keys:
    value = os.environ.get(key, "")
    status = "Set" if value and not value.startswith("your-") else "Missing"
    print(f"{key}: {status}")

Reading package lists...
Building dependency tree...
Reading state information...
tesseract-ocr is already the newest version (4.1.1-2.1build1).
poppler-utils is already the newest version (22.02.0-2ubuntu0.12).
0 upgraded, 0 newly installed, 0 to remove and 2 not upgraded.
OPENAI_API_KEY: Set
GROQ_API_KEY: Set
COHERE_API_KEY: Set


## Step 2: Imports
Loading all required libraries.

In [5]:
# All imports
# Standard library modules plus PyMuPDF, which is imported under the name `fitz`.
import os, json, fitz, time
# NOTE(review): `List` is not referenced in the cells shown here — confirm it is still needed.
from typing import List

# LangChain building blocks: document/message types, embeddings, vector store,
# keyword + ensemble retrievers, chat model, reranker and text splitter.
from langchain_core.documents import Document
from langchain_core.messages import HumanMessage
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_community.retrievers import BM25Retriever
from langchain_classic.retrievers import EnsembleRetriever
from langchain_groq import ChatGroq
from langchain_cohere import CohereRerank
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Print the installed versions so the run can be reproduced later.
import langchain_core, langchain_groq
print("langchain_core:", langchain_core.__version__)
print("langchain_groq:", langchain_groq.__version__)
print("All imports successful!")

langchain_core: 1.2.13
langchain_groq: 1.1.2
All imports successful!


## Step 3: Document Ingestion
Downloading both 10-K PDFs from the companies' investor-relations
sites and parsing them page by page using PyMuPDF. Page numbers
are captured here so they can be used for citations later.

In [6]:
# Download both 10-K PDFs directly from SEC
print("Downloading Apple 10-K...")
!wget -q "https://s2.q4cdn.com/470004039/files/doc_earnings/2024/q4/filing/10-Q4-2024-As-Filed.pdf" -O "apple_10k.pdf"
print("Apple 10-K downloaded!")

print("Downloading Tesla 10-K...")
!wget -q "https://ir.tesla.com/_flysystem/s3/sec/000162828024002390/tsla-20231231-gen.pdf" -O "tesla_10k.pdf"
print("Tesla 10-K downloaded!")

def parse_pdf(file_path, doc_name):
    """Extract the text of every non-blank page of a PDF using PyMuPDF.

    Args:
        file_path: Path of the PDF file to open.
        doc_name: Label stored as each page's "source" and shown in the
            progress messages.

    Returns:
        list[dict]: One entry per page whose text is not only whitespace,
        with keys "text" (the page text), "page" (1-based page number) and
        "source" (``doc_name``).
    """
    print(f"\nParsing {doc_name}...")
    pages = []

    # The context manager closes the document even if extraction fails
    # part-way through, so the file handle is never leaked.
    with fitz.open(file_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            text = page.get_text()
            # Skip pages with no extractable text; page numbers still follow
            # the PDF, so gaps in the output are expected.
            if text.strip():
                pages.append({
                    "text"  : text,
                    "page"  : page_num,
                    "source": doc_name
                })

    print(f"Done! {len(pages)} pages extracted from {doc_name}")
    return pages

# Parse both filings (Apple first, then Tesla) and pool their pages
pdf_inputs = [("apple_10k.pdf", "Apple 10-K"), ("tesla_10k.pdf", "Tesla 10-K")]
apple_pages, tesla_pages = [parse_pdf(pdf_path, label) for pdf_path, label in pdf_inputs]
all_pages = [*apple_pages, *tesla_pages]

print(f"\nTotal pages extracted: {len(all_pages)}")

Downloading Apple 10-K...
Apple 10-K downloaded!
Downloading Tesla 10-K...
Tesla 10-K downloaded!

Parsing Apple 10-K...
Done! 121 pages extracted from Apple 10-K

Parsing Tesla 10-K...
Done! 130 pages extracted from Tesla 10-K

Total pages extracted: 251


## Step 4: Chunking
Splitting each page into 1000-character chunks with a 200-character
overlap so that context isn't lost at chunk boundaries.

In [7]:
# Split pages into overlapping chunks
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)

# One Document per chunk; every chunk carries the source label and page
# number of the page it was cut from so answers can cite them later.
documents = [
    Document(
        page_content=chunk,
        metadata={
            "source": page["source"],
            "page"  : page["page"]
        }
    )
    for page in all_pages
    for chunk in splitter.split_text(page["text"])
]

print(f"Total chunks created: {len(documents)}")

# Show one chunk as a sanity check. The index is clamped so the preview also
# works when fewer than 11 chunks were produced, and is skipped when there are none.
if documents:
    sample = documents[min(10, len(documents) - 1)]
    print(f"\nSample chunk:")
    print(f"  Source : {sample.metadata['source']}")
    print(f"  Page   : {sample.metadata['page']}")
    print(f"  Content: {sample.page_content[:200]}...")

Total chunks created: 1134

Sample chunk:
  Source : Apple 10-K
  Page   : 4
  Content: particular years, quarters, months or periods refer to the Company’s fiscal years ended in September and the associated 
quarters, months and periods of those fiscal years. Each of the terms the “Comp...


## Step 5: Vector Store
Embedding all chunks using OpenAI text-embedding-3-small
and storing in ChromaDB. Loads from disk if already built.

In [8]:
# Create or load vector store
# Settings shared by the "load" and "create" paths so they cannot drift apart.
PERSIST_DIR = "db/chroma_db"
COLLECTION_METADATA = {"hnsw:space": "cosine"}
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

if os.path.exists(PERSIST_DIR):
    print("Loading existing vector store from disk...")
    vectorstore = Chroma(
        persist_directory=PERSIST_DIR,
        embedding_function=embedding_model,
        collection_metadata=COLLECTION_METADATA
    )
    stored_count = vectorstore._collection.count()
    # The directory can exist with nothing in it (for example when an earlier
    # build was interrupted). Retrieval would then silently return no results,
    # so embed the chunks now instead of loading an empty store.
    if stored_count == 0:
        print("Existing vector store is empty, embedding chunks now...")
        vectorstore.add_documents(documents)
        stored_count = vectorstore._collection.count()
    print(f"Loaded! {stored_count} documents")
else:
    print("Creating new vector store...")
    print("This will take 3-5 minutes...")
    vectorstore = Chroma.from_documents(
        documents=documents,
        embedding=embedding_model,
        persist_directory=PERSIST_DIR,
        collection_metadata=COLLECTION_METADATA
    )
    print(f"Done! {vectorstore._collection.count()} documents stored")

Creating new vector store...
This will take 3-5 minutes...
Done! 1134 documents stored


## Step 6: Retrieval Pipeline
Using hybrid search: BM25 for exact keyword matches and
vector search for semantic similarity. The Cohere reranker
then picks the top 5 most relevant chunks.

In [9]:
# Set up retrieval pipeline and LLM

# Tunable settings for the retrieval stack
CANDIDATES_PER_RETRIEVER = 15   # chunks requested from each retriever
ENSEMBLE_WEIGHTS = [0.7, 0.3]   # vector retriever first, BM25 second
RERANK_TOP_N = 5                # chunks kept after reranking

# Semantic search over the vector store
vector_retriever = vectorstore.as_retriever(
    search_kwargs={"k": CANDIDATES_PER_RETRIEVER}
)

# Keyword search over the same chunk list
bm25_retriever = BM25Retriever.from_documents(documents)
bm25_retriever.k = CANDIDATES_PER_RETRIEVER

# Hybrid retriever: weighted combination of the two retrievers above
hybrid_retriever = EnsembleRetriever(
    retrievers=[vector_retriever, bm25_retriever],
    weights=ENSEMBLE_WEIGHTS,
)

# Cohere reranker re-scores the hybrid results and keeps the best few
reranker = CohereRerank(model="rerank-english-v3.0", top_n=RERANK_TOP_N)

# Groq-hosted Llama 3.3 70B; temperature 0 is used for the answers
llm = ChatGroq(model="llama-3.3-70b-versatile", temperature=0)

for status_line in (
    "Vector retriever  : Ready",
    "BM25 retriever    : Ready",
    "Hybrid retriever  : Ready",
    "Cohere reranker   : Ready",
    "Groq Llama 3      : Ready",
    "\nAll components ready!",
):
    print(status_line)

Vector retriever  : Ready
BM25 retriever    : Ready
Hybrid retriever  : Ready
Cohere reranker   : Ready
Groq Llama 3      : Ready

All components ready!


## Step 7: RAG Pipeline
The answer_question() function ties everything together.
Takes a question, retrieves context, and generates a cited
answer using Groq Llama.

In [10]:
# The core RAG function

def build_prompt(query, chunks):
    """Assemble the citation-enforcing LLM prompt for one question.

    Args:
        query: The user question, inserted verbatim into the prompt.
        chunks: Iterable of retrieved documents exposing ``metadata`` (a dict
            that may hold "source" and "page") and ``page_content`` (text).

    Returns:
        str: The full prompt: the rules, one labelled section per chunk, the
        question, and the JSON response format the model must follow.
    """
    sections = []
    for number, doc in enumerate(chunks, start=1):
        meta = doc.metadata
        # Missing metadata falls back to "Unknown" / "?" rather than failing.
        header = (
            f"\n--- Chunk {number} | Source: {meta.get('source', 'Unknown')}"
            f" | Page: {meta.get('page', '?')} ---\n"
        )
        sections.append(header + doc.page_content + "\n")
    context = "".join(sections)

    return f"""You are a financial document analyst. Answer ONLY using the context below.

STRICT RULES:
1. Only use information from the provided context chunks.
2. Always cite your source as a list like: ["Apple 10-K", "Item 8", "p. 28"]
3. If the answer is not in the context, respond with exactly this JSON:
   {{"answer": "Not specified in the document.", "sources": []}}
4. If the question is about anything outside Apple or Tesla 10-K filings
   (e.g. stock price forecasts, current roles as of 2025, building colors),
   respond with exactly this JSON:
   {{"answer": "This question cannot be answered based on the provided documents.", "sources": []}}
5. Never use outside knowledge or make up numbers.

CONTEXT:
{context}

QUESTION: {query}

Respond ONLY with this JSON format, nothing else:
{{
  "answer": "your answer here",
  "sources": ["document name", "section if known", "p. PAGE"]
}}"""


def answer_question(query: str) -> dict:
    """
    Answers a question using the RAG pipeline.

    Runs hybrid retrieval, reranks the candidates, builds the prompt, calls
    the LLM and parses its JSON reply.

    Args:
        query (str): The user question about Apple or Tesla 10-K filings.
    Returns:
        dict: {
            "answer": "Answer text or refusal message",
            "sources": ["Apple 10-K", "Item 8", "p. 28"]
        }
        Any failure (retrieval, reranking, LLM call, or a reply that is not a
        JSON object) is reported as
        {"answer": "Error: <message>", "sources": []} instead of raising.
    """
    try:
        # Step 1: Hybrid retrieval
        raw_chunks      = hybrid_retriever.invoke(query)

        # Step 2: Rerank
        reranked_chunks = reranker.compress_documents(raw_chunks, query)

        # Step 3: Build prompt and call LLM
        prompt   = build_prompt(query, reranked_chunks)
        response = llm.invoke([HumanMessage(content=prompt)])

        # Step 4: Turn the raw reply into a dict
        return _parse_llm_json(response.content.strip())

    except Exception as e:
        # Deliberate best-effort: one failing question must not abort a batch.
        return {"answer": f"Error: {str(e)}", "sources": []}


def _strip_code_fence(text: str) -> str:
    """Return the body of the first Markdown code fence in ``text``, or ``text`` if unfenced."""
    if "```" in text:
        text = text.split("```")[1]
        # Drop the language tag of a ```json fence.
        if text.startswith("json"):
            text = text[4:]
    return text.strip()


def _parse_llm_json(raw_text: str) -> dict:
    """Parse the LLM reply as a JSON object, repairing smart quotes only if needed."""
    cleaned = _strip_code_fence(raw_text)
    try:
        parsed = json.loads(cleaned)
    except json.JSONDecodeError:
        # Only now assume smart quotes were used as JSON delimiters. Replacing
        # them unconditionally would corrupt valid JSON whose string values
        # legitimately contain curly quotes copied from the filings.
        repaired = cleaned.replace("\u2018", "'").replace("\u2019", "'")
        repaired = repaired.replace("\u201c", '"').replace("\u201d", '"')
        parsed = json.loads(repaired)

    # Callers rely on dict methods, so reject lists, strings and numbers.
    if not isinstance(parsed, dict):
        raise ValueError("LLM response is not a JSON object")
    return parsed


# Smoke test: run one known question through the whole pipeline
sample_question = "What was Apple's total revenue for the fiscal year ended September 28, 2024?"

print("Testing answer_question()...\n")
result = answer_question(sample_question)
print(json.dumps(result, indent=2))

Testing answer_question()...

{
  "answer": "$391,035",
  "sources": [
    "Apple 10-K",
    "Item 8",
    "p. 32"
  ]
}


## Step 8: Results
Running all 13 assignment questions, with a 7-second delay between
calls to stay within the Cohere free-tier rate limits.
Questions 11, 12 and 13 are intentionally unanswerable from
the documents; they test out-of-scope refusal.

In [12]:
# Run all 13 assignment questions
questions = [
    {"question_id": 1,  "question": "What was Apple's total revenue for the fiscal year ended September 28, 2024?"},
    {"question_id": 2,  "question": "How many shares of common stock were issued and outstanding as of October 18, 2024?"},
    {"question_id": 3,  "question": "What is the total amount of term debt (current + non-current) reported by Apple as of September 28, 2024?"},
    {"question_id": 4,  "question": "On what date was Apple's 10-K report for 2024 signed and filed with the SEC?"},
    {"question_id": 5,  "question": "Does Apple have any unresolved staff comments from the SEC as of this filing? How do you know?"},
    {"question_id": 6,  "question": "What was Tesla's total revenue for the year ended December 31, 2023?"},
    {"question_id": 7,  "question": "What percentage of Tesla's total revenue in 2023 came from Automotive Sales (excluding Leasing)?"},
    {"question_id": 8,  "question": "What is the primary reason Tesla states for being highly dependent on Elon Musk? Answer in one sentence. Do not use apostrophes or special characters."},
    {"question_id": 9,  "question": "What types of vehicles does Tesla currently produce and deliver?"},
    {"question_id": 10, "question": "What is the purpose of Tesla's lease pass-through fund arrangements?"},
    {"question_id": 11, "question": "What is Tesla's stock price forecast for 2025?"},
    {"question_id": 12, "question": "Who is the CFO of Apple as of 2025?"},
    {"question_id": 13, "question": "What color is Tesla's headquarters painted?"}
]

# Pause between consecutive questions to stay within the reranker's
# free-tier rate limit.
DELAY_SECONDS = 7

final_answers = []

for index, item in enumerate(questions):
    print(f"Answering Q{item['question_id']}...")
    result = answer_question(item["question"])
    final_answers.append({
        "question_id": item["question_id"],
        "answer"     : result.get("answer", "Error"),
        "sources"    : result.get("sources", [])
    })
    # The delay only separates calls, so skip it after the final question.
    if index < len(questions) - 1:
        time.sleep(DELAY_SECONDS)

print("\n" + "="*60)
print("FINAL ANSWERS:")
print("="*60)
print(json.dumps(final_answers, indent=2))

Answering Q1...
Answering Q2...
Answering Q3...
Answering Q4...
Answering Q5...
Answering Q6...
Answering Q7...
Answering Q8...
Answering Q9...
Answering Q10...
Answering Q11...
Answering Q12...
Answering Q13...

FINAL ANSWERS:
[
  {
    "question_id": 1,
    "answer": "$391,035",
    "sources": [
      "Apple 10-K",
      "Item 8",
      "p. 32"
    ]
  },
  {
    "question_id": 2,
    "answer": "15,115,823,000",
    "sources": [
      "Apple 10-K",
      "Page 2"
    ]
  },
  {
    "question_id": 3,
    "answer": "$96,662",
    "sources": [
      "Apple 10-K",
      "Item 8",
      "p. 34"
    ]
  },
  {
    "question_id": 4,
    "answer": "November 1, 2024",
    "sources": [
      "Apple 10-K",
      "Page 60",
      "p. 60"
    ]
  },
  {
    "question_id": 5,
    "answer": "No, as of this filing, there is an Item 1B. Unresolved Staff Comments, but the page for this item is 17 and there is no information in the provided context about any comments.",
    "sources": [
      "Apple 10