In [1]:
%pip install -q llama-index deeplake langchain cohere dotenv


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pickle
from tqdm import tqdm

In [3]:
import os
from dotenv import load_dotenv


# Load credentials from the repository-level .env file, then fail fast (still with an
# AssertionError, as before) while naming every variable that is missing or empty.
load_dotenv("../.env")
_missing_env = [name for name in ("OPENAI_API_KEY", "ACTIVELOOP_TOKEN") if not os.getenv(name)]
assert not _missing_env, f"Missing required environment variable(s): {', '.join(_missing_env)}"

In [4]:
# Paths to the pre-computed pickles. While a path is set (truthy), the matching
# extraction cell below loads that file instead of processing the PDFs/pages with
# your API key. To force re-processing, set the variable to None (keep it defined,
# because the later cells test it with `if not ...`).
preprocessed_texts = "data/financial_analysis/categorized_elements.pkl"
preprocessed_graphs = "data/financial_analysis/graphs_description.pkl"

# Extract Texts/Tables using Unstructured_IO

In [8]:
!curl "https://digitalassets.tesla.com/tesla-contents/image/upload/IR/TSLA-Q1-2023-Update.pdf" -o "data/financial_analysis/TSLA-Q1-2023-Update.pdf"
!curl "https://digitalassets.tesla.com/tesla-contents/image/upload/IR/TSLA-Q2-2023-Update.pdf" -o "data/financial_analysis/TSLA-Q2-2023-Update.pdf"
!curl "https://digitalassets.tesla.com/tesla-contents/image/upload/IR/TSLA-Q3-2023-Update-3.pdf" -o "data/financial_analysis/TSLA-Q3-2023-Update-3.pdf"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 5796k  100 5796k    0     0  6318k      0 --:--:-- --:--:-- --:--:-- 6314k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 6511k  100 6511k    0     0  7761k      0 --:--:-- --:--:-- --:--:-- 7761k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 3374k  100 3374k    0     0  6382k      0 --:--:-- --:--:-- --:--:-- 6379k


In [5]:
# Install system packages via Homebrew
!brew install poppler          # provides pdf utilities similar to poppler-utils
!brew install tesseract        # OCR engine

# Install Python packages via pip
%pip install tensorflow-probability imageio pillow

# Install additional Python packages
%pip install "unstructured[all-docs]" fastapi kaleido uvicorn typing-extensions pydantic


To reinstall 25.09.1, run:
  brew reinstall poppler
To reinstall 5.5.1, run:
  brew reinstall tesseract
Collecting tensorflow-probability
  Using cached tensorflow_probability-0.25.0-py2.py3-none-any.whl.metadata (13 kB)
Collecting imageio
  Using cached imageio-2.37.0-py3-none-any.whl.metadata (5.2 kB)
Collecting absl-py (from tensorflow-probability)
  Using cached absl_py-2.3.1-py3-none-any.whl.metadata (3.3 kB)
Collecting cloudpickle>=1.3 (from tensorflow-probability)
  Using cached cloudpickle-3.1.1-py3-none-any.whl.metadata (7.1 kB)
Collecting gast>=0.3.2 (from tensorflow-probability)
  Using cached gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting dm-tree (from tensorflow-probability)
  Using cached dm_tree-0.1.9-cp312-cp312-macosx_10_13_universal2.whl.metadata (2.4 kB)
Using cached tensorflow_probability-0.25.0-py2.py3-none-any.whl (7.0 MB)
Using cached imageio-2.37.0-py3-none-any.whl (315 kB)
Using cached cloudpickle-3.1.1-py3-none-any.whl (20 kB)
Using cached gast-0.6.0

In [6]:
from pydantic import BaseModel
from typing import Any


# Define data structure: one extracted piece of a report. Instances are created for
# text chunks, tables and graph descriptions, and are pickled/unpickled by later cells.
class Element(BaseModel):
    # Label of the originating report, e.g. "2023 Quarter 1 report".
    source_document: str
    # Kind of content; this notebook uses "table", "text" and "graph".
    type: str
    # The extracted content (the cells in this notebook always pass a str).
    text: Any

In [7]:
# Directory holding the downloaded reports and the page images derived from them.
prefix = "data/financial_analysis"
# Report filenames in quarter order (Q1, Q2, Q3); later cells derive the quarter
# number from each file's position in this list.
files = ["TSLA-Q1-2023-Update.pdf", "TSLA-Q2-2023-Update.pdf", "TSLA-Q3-2023-Update-3.pdf"]

In [8]:
# Produce `categorized_elements`: a flat list of Element objects (type "table" or
# "text") covering all reports, either by partitioning the PDFs or from the cached pickle.
if not preprocessed_texts:
  print("Processing the PDF and extracting text/tables...")
  from unstructured.partition.pdf import partition_pdf

  # One list of partitioned elements per report, in the same order as `files`.
  raw_pdf_elements = []

  for quarter in files:
    # Resolve the bare filename against the data directory. (This line used to assign
    # to a misspelled name, which raised NameError and left `filename` without `prefix`.)
    pdf_path = os.path.join(prefix, quarter)
    raw_pdf_elements.append(partition_pdf(
        filename=pdf_path,
        # Do not extract embedded image blocks
        extract_images_in_pdf=False,
        # Use layout model (YOLOX) to get bounding boxes (for tables) and find titles
        # Titles are any sub-section of the document
        infer_table_structure=True,
        # Post processing to aggregate text once we have the title
        chunking_strategy="by_title",
        # Chunking params to aggregate text blocks
        # Hard max on chunks
        max_characters=4000,
        # Attempt to create a new chunk after 3800 chars
        new_after_n_chars=3800,
        # Attempt to keep chunks > 2000 chars
        combine_text_under_n_chars=2000
    ))

  # Count how many elements of each Python type were produced (diagnostic output only).
  category_counts = {}
  for quarter_elements in raw_pdf_elements:
    for element in quarter_elements:
      category = str(type(element))
      category_counts[category] = category_counts.get(category, 0) + 1
  # Unique_categories will have unique elements
  unique_categories = set(category_counts.keys())
  print(category_counts)

  # Categorize by type; elements of any other type are dropped.
  categorized_elements = []
  for idx, quarter_elements in enumerate(raw_pdf_elements):
    source_label = f"2023 Quarter {idx+1} report"
    for element in quarter_elements:
      element_type = str(type(element))
      if "unstructured.documents.elements.Table" in element_type:
        categorized_elements.append(Element(type="table", text=str(element), source_document=source_label))
      elif "unstructured.documents.elements.CompositeElement" in element_type:
        categorized_elements.append(Element(type="text", text=str(element), source_document=source_label))

  # NOTE(review): the cache is written to the current working directory, while the
  # default `preprocessed_texts` path points under data/financial_analysis/ — confirm
  # which location is intended before relying on the cache.
  with open("categorized_elements.pkl", "wb") as handle:
    pickle.dump(categorized_elements, handle)

else:
  print("Load the pre-proocessed text/tables...")
  # SECURITY: unpickling can execute arbitrary code; only load pickle files you
  # created yourself or otherwise trust.
  with open(preprocessed_texts, "rb") as file:
    categorized_elements = pickle.load(file)

Load the pre-proocessed text/tables...


In [9]:
len(categorized_elements)

80

# Convert PDF to Images

In [10]:
%pip install -q pdf2image


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [11]:
from pdf2image import convert_from_path

In [11]:
# Render every report to page images: one list of images per PDF, in `files` order.
convertors = [
  convert_from_path(os.path.join(prefix, pdf_name))
  for pdf_name in files
]

In [21]:
import os


# Create one output directory per quarter for the rendered pages.
# exist_ok=True keeps the cell re-runnable: without it a second run raises
# FileExistsError because the directories are already there.
for quarter_dir in ("pages/Q1", "pages/Q2", "pages/Q3"):
  os.makedirs(os.path.join(prefix, quarter_dir), exist_ok=True)

In [12]:
# Write each rendered page to pages/Q<quarter>/page-<n>.png (both numbers are 1-based).
for quarter, convertor in enumerate(convertors):
  numbered_pages = enumerate(convertor)
  for page_num, image in numbered_pages:
    target_path = os.path.join(prefix, f"pages/Q{quarter+1}/page-{page_num+1}.png")
    image.save(target_path)

In [12]:
# Collect the PNG paths of every quarter: pages_png[i] lists the page images of quarter i+1.
pages_png = []
for quarter_images_dir in ["pages/Q1", "pages/Q2", "pages/Q3"]:
  # Named `quarter_dir` rather than `dir` so the builtin dir() is not shadowed
  # for the rest of the notebook session.
  quarter_dir = os.path.join(prefix, quarter_images_dir)
  # os.listdir() returns entries in arbitrary order; sorting makes runs reproducible.
  # The order is lexicographic (page-10 sorts before page-2), which is fine here
  # because each page is captioned independently.
  pages_png.append([
    os.path.join(quarter_dir, name)
    for name in sorted(os.listdir(quarter_dir))
    if name.endswith(".png")
  ])

# Captioning using GPT-4V

In [13]:
import json
import copy
import base64
import requests


# HTTP headers for the OpenAI chat-completions request issued by the captioning loop:
# a JSON body plus bearer-token authentication. The key is read from the environment
# when this cell runs, so a missing OPENAI_API_KEY raises KeyError here.
headers = {
  "Content-Type": "application/json",
  "Authorization": "Bearer " + str(os.environ["OPENAI_API_KEY"])
}

# Instruction fragments, sent in this order as separate text parts of one user message.
_caption_prompt_parts = [
  "You are an assistant that find charts, graphs, or diagrams from an image and summarize their information. There could be multiple diagrams in one image, so explain each one of them separately. Ignore tables.",
  'The response must be a JSON in following format {"graphs": [<chart_1>, <chart_2>, <chart_3>]} where <chart_1>, <chart_2>, and <chart_3> placeholders that describe each graph found in the image. Do not append or add anything other than the JSON format response.',
  'If could not find a graph in the image, return an empty list JSON as follows: {"graphs": []}. Do not append or add anything other than the JSON format response. Dont use coding "```" marks or the word json.',
  "Look at the attached image and describe all the graphs inside it in JSON format. Ignore tables and be concise.",
]

# Request template for the captioning call. The captioning loop deep-copies it and
# appends the page image as an extra content part before sending.
_user_message = {
  "role": "user",
  "content": [{"type": "text", "text": part} for part in _caption_prompt_parts],
}
payload = {
  "model": "gpt-4-vision-preview",
  "messages": [_user_message],
  "max_tokens": 1000,
}

In [14]:
# Helper that turns an image file into base64 text
def encode_image(image_path):
  """Read the file at `image_path` in binary mode and return its base64 encoding as a str."""
  with open(image_path, "rb") as image_file:
    raw_bytes = image_file.read()
  encoded_bytes = base64.b64encode(raw_bytes)
  return str(encoded_bytes, "utf-8")

In [15]:
# Produce `graphs_description`: one list of "graph" Element objects per quarter, either
# by captioning every page image through the OpenAI API or from the cached pickle.
if not preprocessed_graphs:
  print("Processing the PDF and extracting graph descriptions...")
  graphs_description = []
  for quarter, page_png in enumerate(pages_png):
    # Descriptions collected for the current quarter.
    tmp = []
    for page in tqdm(page_png):
      # Getting the base64 string
      base64_image = encode_image(page)

      # Copy the shared template so the image part is not accumulated across pages.
      tmp_payload = copy.deepcopy(payload)
      tmp_payload["messages"][0]["content"].append({
        "type": "image_url",
        "image_url": {
          "url": f"data:image/png;base64,{base64_image}"
        }
      })

      try:
        # A timeout keeps one stalled request from hanging the whole loop.
        response = requests.post(
          "https://api.openai.com/v1/chat/completions",
          headers=headers,
          json=tmp_payload,
          timeout=120,
        )
        # Surface HTTP errors (bad key, rate limit, unknown model, ...) with their
        # status code instead of a later, uninformative KeyError on "choices".
        response.raise_for_status()
        response = response.json()
        graph_data = json.loads(response["choices"][0]["message"]["content"])["graphs"]
        # The prompt does not pin down whether each entry is an object or a plain
        # string, so accept both instead of dropping the whole page on a string entry.
        desc = [
          f"{page}\n" + (
            "\n".join(f"{key}: {item[key]}" for key in item.keys())
            if isinstance(item, dict)
            else str(item)
          )
          for item in graph_data
        ]
        tmp.extend(desc)

      except Exception as exc:
        # Best effort: skip the page, but say which one and why. Catching Exception
        # (not a bare except) lets Ctrl-C / kernel interrupt stop the loop.
        print(f"Skipping... error in decoding. ({page}: {exc!r})")

    graphs_description.append([Element(type="graph", text=str(item), source_document=f"2023 Quarter {quarter+1} report") for item in tmp])

  # NOTE(review): the cache is written to the current working directory, while the
  # default `preprocessed_graphs` path points under data/financial_analysis/ — confirm
  # which location is intended before relying on the cache.
  with open("graphs_description.pkl", "wb") as handle:
    pickle.dump(graphs_description, handle)

else:
  print("Load the pre-proocessed graph descriptions...")
  # SECURITY: unpickling can execute arbitrary code; only load pickle files you
  # created yourself or otherwise trust.
  with open(preprocessed_graphs, "rb") as file:
    graphs_description = pickle.load(file)

Load the pre-proocessed graph descriptions...


In [16]:
# Flatten the per-quarter lists into one list of Element objects.
# Entries that are not lists are kept as they are, so running this cell a second time
# is a no-op instead of iterating over the Element objects themselves.
graphs_description = [
  item
  for entry in graphs_description
  for item in (entry if isinstance(entry, list) else [entry])
]

In [17]:
len(categorized_elements), len(graphs_description)

(80, 40)

# Merge Text and Table with Images

In [18]:
all_docs = categorized_elements + graphs_description

In [19]:
len(all_docs)

120

# Store on DeepLake

In [22]:
from llama_index.core import Document

In [23]:
# Wrap each text/table element in a llama-index Document, carrying its category and
# originating report as metadata (graph descriptions are not included here).
documents = []
for element in categorized_elements:
  element_metadata = {"category": element.type, "source_document": element.source_document}
  documents.append(Document(text=element.text, metadata=element_metadata))

In [26]:
%pip install llama-index-vector-stores-deeplake

Collecting llama-index-vector-stores-deeplake
  Using cached llama_index_vector_stores_deeplake-0.4.0-py3-none-any.whl.metadata (446 bytes)
Collecting pyjwt (from llama-index-vector-stores-deeplake)
  Using cached PyJWT-2.10.1-py3-none-any.whl.metadata (4.0 kB)
Using cached llama_index_vector_stores_deeplake-0.4.0-py3-none-any.whl (8.7 kB)
Using cached PyJWT-2.10.1-py3-none-any.whl (22 kB)
Installing collected packages: pyjwt, llama-index-vector-stores-deeplake
Successfully installed llama-index-vector-stores-deeplake-0.4.0 pyjwt-2.10.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
from llama_index.vector_stores.deeplake import DeepLakeVectorStore
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex


# Create an index over the documents - excluding graph data
# (`documents` is built from `categorized_elements` only: text chunks and tables).
# NOTE(review): the dataset path is tied to one Activeloop organisation ("yaroslava");
# presumably other users must substitute their own org id — confirm.
# NOTE(review): with overwrite=False, re-running this cell presumably adds the same
# documents to the existing dataset again — verify against the DeepLake docs.
vector_store = DeepLakeVectorStore(
    dataset_path="hub://yaroslava/tesla_quarterly_2023-nographs",
    runtime={"tensor_db": True},
    overwrite=False
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
# Builds the index from `documents` on top of the DeepLake-backed storage context.
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context
)

# Inference (No Graph)

In [31]:
from llama_index.vector_stores.deeplake import DeepLakeVectorStore
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex

In [32]:
# Re-open the no-graphs dataset and build the index directly from the vector store
# (no documents are passed in here, unlike the creation cell).
# NOTE(review): this rebinds `vector_store`, `storage_context` and `index` — the later
# query cells use whichever index cell ran last.
vector_store = DeepLakeVectorStore(dataset_path="hub://yaroslava/tesla_quarterly_2023-nographs", overwrite=False)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

index = VectorStoreIndex.from_vector_store(
    vector_store, storage_context=storage_context
)

[S3] Failed to get bucket region for URL: snark-hub/protected/yaroslava/tesla_quarterly_2023-nographs/ with error: [S3] INVALID_ACCESS_KEY_ID snark-hub The AWS Access Key Id you provided does not exist in our records. 


In [None]:
query_engine = index.as_query_engine()

In [None]:
response = query_engine.query(
    "What are the trends in vehicle deliveries on quarter 3?"
)

In [None]:
response.response

'Vehicle deliveries in quarter 3 showed a decrease compared to the previous quarter, with Model S/X deliveries declining by 16% and Model 3/Y deliveries decreasing by 6%. Overall, total deliveries in quarter 3 were down by 7% compared to quarter 2.'

![image](attachment:image.png)

# Create Dataset With Graphs

In [None]:
# Build the Documents from the source lists rather than from `all_docs` itself.
# `all_docs` was defined as categorized_elements + graphs_description, so the result is
# the same on the first run, and re-running this cell no longer fails by feeding
# already-converted Document objects (which have no `.type`) back into the comprehension.
all_docs = [
    Document(text=t.text, metadata={"category": t.type, "source_document": t.source_document})
    for t in categorized_elements + graphs_description
]
# Create an index over text, tables and graph descriptions.
# NOTE(review): the dataset path is tied to one Activeloop organisation ("yaroslava");
# presumably other users must substitute their own org id — confirm.
vector_store = DeepLakeVectorStore(
    dataset_path="hub://yaroslava/tesla_quarterly_2023",
    runtime={"tensor_db": True},
    overwrite=False
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    all_docs, storage_context=storage_context
)

[S3] Failed to get bucket region for URL: snark-hub/protected/yaroslava/tesla_quarterly_2023/ with error: [S3] INVALID_ACCESS_KEY_ID snark-hub The AWS Access Key Id you provided does not exist in our records. 
[S3] Failed to get bucket region for URL: snark-hub/protected/yaroslava/tesla_quarterly_2023/ with error: [S3] INVALID_ACCESS_KEY_ID snark-hub The AWS Access Key Id you provided does not exist in our records. 


## Inference (With Graph)

In [40]:
# Re-open the dataset that includes graph descriptions and build the index directly
# from the vector store (no documents are passed in here).
# NOTE(review): this rebinds `vector_store`, `storage_context` and `index`, so the
# query cells that follow run against this dataset.
vector_store = DeepLakeVectorStore(dataset_path="hub://yaroslava/tesla_quarterly_2023", overwrite=False)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_vector_store(
    vector_store, storage_context=storage_context
)

[S3] Failed to get bucket region for URL: snark-hub/protected/yaroslava/tesla_quarterly_2023/ with error: [S3] INVALID_ACCESS_KEY_ID snark-hub The AWS Access Key Id you provided does not exist in our records. 


In [41]:
query_engine = index.as_query_engine()

In [42]:
response = query_engine.query(
    "What are the trends in vehicle deliveries on quarter 3 report?"
)

In [43]:
response.response

'The trends in vehicle deliveries on the quarter 3 report show a consistent increase in the number of units delivered over the quarters from Q4 2020 to Q3 2023.'

# Environment Snapshot

In [26]:
%pip list

Package                                 Version
--------------------------------------- ---------------
absl-py                                 2.3.1
accelerate                              1.10.1
aiofiles                                24.1.0
aiohappyeyeballs                        2.6.1
aiohttp                                 3.12.15
aiosignal                               1.4.0
aiosqlite                               0.21.0
annotated-types                         0.7.0
antlr4-python3-runtime                  4.9.3
anyio                                   4.10.0
appnope                                 0.1.4
asttokens                               3.0.0
attrs                                   25.3.0
backoff                                 2.2.1
banks                                   2.2.0
beautifulsoup4                          4.13.5
cachetools                              5.5.2
certifi                                 2025.8.3
cffi                                    1.17.1
charset-no