<a href="https://colab.research.google.com/github/xmond/lemon/blob/master/sec_filing_reader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download and store SEC filing in vector DB

In [None]:
!pip install -U -q langchain openai chromadb unstructured==0.12.5 instructor tiktoken

# Download 10-K from SEC

In [None]:
from langchain_community.document_loaders import UnstructuredURLLoader

# Address of the 10-K filing (HTML) on SEC EDGAR.
url = "https://www.sec.gov/Archives/edgar/data/1559720/000155972024000006/abnb-20231231.htm"

# The User-Agent below is a placeholder -- presumably it should be replaced
# with your own organisation / contact e-mail before running.
loader = UnstructuredURLLoader(
    urls=[url],
    headers={"User-Agent": "your-org your@org.com"},
)

# Fetch the page and parse it into LangChain documents.
documents = loader.load()

# Chunk and store 10-K in vector DB

In [None]:
import getpass
import os

# Set your OpenAI API key.
# Prompt only when the key is not already configured, so re-running this cell
# (or running the notebook with the variable already exported) neither asks
# again nor overwrites a working key. The input is not echoed to the output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

In [None]:
# Chroma and OpenAIEmbeddings are imported from `langchain_community`, the
# same package the document loader above comes from; the `langchain.*`
# re-exports of these integrations are deprecated shims.
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import TokenTextSplitter

# Naively chunk the SEC filing by tokens: 256-token windows, with 20 tokens of
# overlap between neighbouring chunks.
token_splitter = TokenTextSplitter(chunk_size=256, chunk_overlap=20)
docs = token_splitter.split_documents(documents)

In [None]:
# Embed every chunk with OpenAI's embedding model and index the resulting
# vectors in a Chroma collection.
vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=OpenAIEmbeddings(model="text-embedding-3-large"),
)

# Query vector DB for contextual documents

In [None]:
# Natural-language question used to look up matching chunks in the vector DB.
query = "What was Airbnb's revenue, net income, and cost of revenue?"

In [None]:
# Retrieve the chunks most similar to the query from the vector DB.
k = 1  # how many chunks to fetch
top_k_docs = vectorstore.similarity_search(query, k=k)

# Glue the text of the retrieved chunks together, one chunk per line break.
context = "\n".join(doc.page_content for doc in top_k_docs)

In [None]:
# Display the retrieved text; it is later interpolated into the LLM prompt.
context

' $ 20,645\n\nThe accompanying notes are an integral part of these consolidated financial statements.\n\n67\n\nTable of Contents\n\nAirbnb, Inc.\n\nConsolidated Statements of Operations\n\n(in millions, except per share amounts)\n\nYear Ended December 31, 2021 2022 2023 Revenue $ 5,992 $ 8,399 $ 9,917 Costs and expenses: Cost of revenue 1,156 1,499 1,703 Operations and support 847 1,041 1,186 Product development 1,425 1,502 1,722 Sales and marketing 1,186 1,516 1,763 General and administrative 836 950 2,025 Restructuring charges 113 89 — Total costs and expenses 5,563 6,597 8,399 Income from operations 429 1,802 1,518 Interest income 13 186 721 Interest expense ( 438 ) ( 24 ) ( 83 ) Other income (expense), net ( 304 ) 25 ( 54 ) Income (loss) before income taxes ( 300 ) 1,989 2,102 Provision for (benefit from) income taxes 52 96 ( 2,690 ) Net income (loss) $ ( 352 ) $ 1,893 $ 4,792 Net income (loss) per share attributable'

# Define output schema using Instructor

In [None]:
import instructor
from openai import OpenAI
from pydantic import BaseModel
from pydantic import Field
from enum import Enum
from typing import Optional, Union, List

# Scale in which a reported amount is expressed. Members are plain strings
# (str mixin); the empty string stands for "unit not known".
class UnitSuffix(str, Enum):
    billion = "Billion"
    million = "Million"
    thousand = "Thousand"
    unknown = ""  # no unit could be determined

# Fiscal years an income statement can be tagged with. Members are plain
# strings (str mixin); the empty string stands for "period not known".
class FiscalPeriod(str, Enum):
    fy_2023 = "FY2023"
    fy_2022 = "FY2022"
    fy_2021 = "FY2021"
    fy_2020 = "FY2020"
    unknown = ""  # no period could be determined

# Define our income statement: one fiscal period's line items.
#
# Every line item is a pair of fields:
#   <item>       -- the amount, typed as float or str
#   <item>_unit  -- a UnitSuffix (or None); presumably the scale of the amount
#                   declared just above it
class IncomeStatement(BaseModel):
  # Fiscal year this statement belongs to (or None).
  period: Optional[FiscalPeriod]

  revenue: Union[float, str] = Field(description="Revenue")
  revenue_unit: Optional[UnitSuffix]

  cost_of_revenue: Union[float, str] = Field(description="Cost of revenue")
  cost_of_revenue_unit: Optional[UnitSuffix]

  income_from_operations: Union[float, str] = Field(description="Income from operations")
  income_from_operations_unit: Optional[UnitSuffix]

  # Operating expense line items.
  operations_and_support: Union[float, str] = Field(description="Operations and support")
  operations_and_support_unit: Optional[UnitSuffix]

  product_development: Union[float, str] = Field(description="Product development")
  product_development_unit: Optional[UnitSuffix]

  sales_and_marketing: Union[float, str] = Field(description="Sales and marketing")
  sales_and_marketing_unit: Optional[UnitSuffix]

  general_and_administrative: Union[float, str] = Field(description="General and administrative")
  general_and_administrative_unit: Optional[UnitSuffix]

  # Non-operating line items.
  interest_income: Union[float, str] = Field(description="Interest income")
  interest_income_unit: Optional[UnitSuffix]

  interest_expense: Union[float, str] = Field(description="Interest expense")
  interest_expense_unit: Optional[UnitSuffix]

  other_income: Union[float, str] = Field(description="Other income")
  other_income_unit: Optional[UnitSuffix]

  net_income: Union[float, str] = Field(description="Net income")
  net_income_unit: Optional[UnitSuffix]


# Top-level extraction result: a ticker string plus a list of
# IncomeStatement entries (presumably one per fiscal period found).
class Financials(BaseModel):
  ticker: str
  income_statements: List[IncomeStatement]

# Download Mistral-7B from HuggingFace

In [None]:
# Install the llama.cpp Python bindings and the Hugging Face Hub client used
# below to download and run the local model.
!pip install -U -q llama-cpp-python huggingface-hub

In [None]:
import llama_cpp
from llama_cpp import Llama
from llama_cpp.llama_speculative import LlamaPromptLookupDecoding

import instructor

from pydantic import BaseModel
from typing import List
from rich.console import Console
from huggingface_hub import hf_hub_download

# mixtral_path = "TheBloke/Mixtral-8x7B-v0.1-GGUF"
# mixtral_q4_basename = "mixtral-8x7b-v0.1.Q4_K_M.gguf"

# Hugging Face repo and file name of the quantized (Q4_K_M) GGUF model.
mistral_path = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
mistral_q4_basename = "mistral-7b-instruct-v0.2.Q4_K_M.gguf"

# Fetch the model file from the Hub; the returned local path is handed to Llama.
model_path = hf_hub_download(repo_id=mistral_path, filename=mistral_q4_basename)

llm = Llama(
    model_path=model_path,
    # The number of layers to put on the GPU. The rest will be on the CPU.
    # -1 moves all layers. (This was previously written `--1`, which Python
    # evaluates as +1, i.e. a single layer.)
    n_gpu_layers=-1,
    n_batch=2048,  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
    n_ctx=2048,
    logits_all=False,
)
# Turn off llama.cpp's verbose logging for subsequent calls.
llm.verbose = False

# Use Mistral-7B + Instructor to read Financials

In [None]:
import time

# Wrap llama.cpp's OpenAI-compatible chat endpoint with Instructor so the call
# accepts `response_model` and returns a validated pydantic object. Defining
# it here keeps the cell runnable on a fresh kernel; `create` was otherwise
# never defined anywhere in the notebook.
create = instructor.patch(
    create=llm.create_chat_completion_openai_v1,
    mode=instructor.Mode.JSON_SCHEMA,
)

# perf_counter() is a monotonic clock, so the elapsed time cannot be skewed by
# system clock adjustments during the (long) generation.
start = time.perf_counter()

response = create(
    response_model=instructor.Partial[Financials],
    messages=[
        {
            "role": "user",
            "content": f"Extract Airbnb's income statement from 2023, 2022, and 2021 from following context: {context}",
        },
    ],
)
print(f"Took {time.perf_counter() - start} seconds to complete!")
print(response.model_dump_json(indent=2))

Took 119.98298811912537 seconds to complete!
{
  "income_statements": [
    {
      "period": "FY2021",
      "revenue": 5992.0,
      "revenue_unit": "Million",
      "cost_of_revenue": 1156.0,
      "cost_of_revenue_unit": "Million",
      "income_from_operations": 429.0,
      "income_from_operations_unit": "Million",
      "operations_and_support": 847.0,
      "operations_and_support_unit": "Million",
      "product_development": 1425.0,
      "product_development_unit": "Million",
      "sales_and_marketing": 1186.0,
      "sales_and_marketing_unit": "Million",
      "general_and_administrative": 836.0,
      "general_and_administrative_unit": "Million",
      "interest_income": 13.0,
      "interest_income_unit": "Million",
      "interest_expense": -438.0,
      "interest_expense_unit": "Million",
      "other_income": -304.0,
      "other_income_unit": "Million",
      "net_income": -352.0,
      "net_income_unit": "Million"
    },
    {
      "period": "FY2022",
      "revenu