In [2]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain, HypotheticalDocumentEmbedder
from langchain.llms import OpenAI

from dataclasses import dataclass
import os

In [3]:
from dotenv import load_dotenv

load_dotenv()  # take environment variables from .env.

True

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

recursive_text_splitter = RecursiveCharacterTextSplitter(
    # Target chunk size of 600 with an overlap of 30 between chunks; both are
    # measured with length_function, which is plain len here.
    chunk_size = 600,
    chunk_overlap  = 30,
    length_function = len,
)

In [5]:
SEC_API_KEY = os.environ["SEC_API_KEY"]

In [6]:
%load_ext autoreload
%autoreload 2

In [7]:
from sec_api import ExtractorApi, QueryApi

In [8]:
from enum import Enum

class FilingType(Enum):
    """SEC form types handled by this notebook.

    Used as the key of ``TO_EXTRACT`` to choose which sections are pulled
    from a filing.
    """
    T_10K = 1  # Form 10-K
    T_10Q = 2  # Form 10-Q

In [9]:
query_api = QueryApi(SEC_API_KEY)
extractor_api = ExtractorApi(SEC_API_KEY)

In [10]:
@dataclass
class SECFiling:
    """Represents metadata about an SEC filing for extraction.

    Instances are the values of ``latest_10x``. The index-building loops pass
    ``url`` to ``extractor_api.get_section`` and use ``filing_type`` to look up
    the section identifiers in ``TO_EXTRACT``.
    """
    # URL of the filing document handed to the extractor API.
    url: str
    # Form type; selects the sections to extract via TO_EXTRACT.
    filing_type: FilingType
    # Ticker symbol; mirrors the key the filing is stored under in latest_10x.
    ticker: str

In [11]:
# Filing to index for each company, keyed by ticker symbol (the key is repeated
# in each entry's `ticker` field). All URLs use sec.gov's `ix?doc=` viewer form.
# Presumably these were the most recent 10-K/10-Q filings when the notebook was
# written -- TODO refresh before re-running.
latest_10x = {
    "GOOG": SECFiling(
        url="https://www.sec.gov/ix?doc=/Archives/edgar/data/1652044/000165204423000016/goog-20221231.htm",
        filing_type=FilingType.T_10K,
        ticker="GOOG"),
    "MSFT": SECFiling(
        url="https://www.sec.gov/ix?doc=/Archives/edgar/data/789019/000156459023000733/msft-10q_20221231.htm",
        filing_type=FilingType.T_10Q,
        ticker="MSFT"),
    "AMZN": SECFiling(
        url="https://www.sec.gov/ix?doc=/Archives/edgar/data/1018724/000101872423000004/amzn-20221231.htm",
        filing_type=FilingType.T_10K,
        ticker="AMZN"),
    "AAPL": SECFiling(
        url="https://www.sec.gov/ix?doc=/Archives/edgar/data/320193/000032019323000006/aapl-20221231.htm",
        filing_type=FilingType.T_10Q,
        ticker="AAPL"),
    "NVDA": SECFiling(
        url="https://www.sec.gov/ix?doc=/Archives/edgar/data/1045810/000104581022000166/nvda-20221030.htm",
        filing_type=FilingType.T_10Q,
        ticker="NVDA"),
    "META": SECFiling(
        url="https://www.sec.gov/ix?doc=/Archives/edgar/data/1326801/000132680123000013/meta-20221231.htm",
        filing_type=FilingType.T_10K,
        ticker="META"),
}

# [Reference](https://pypi.org/project/sec-api/#10-k10-q8-k-section-extractor-api)

https://pypi.org/project/sec-api/#10-k10-q8-k-section-extractor-api 

In [12]:
# Section identifiers to request from extractor_api.get_section for each filing
# type (see the sec-api extractor reference linked above for the full list).
TO_EXTRACT = {
    FilingType.T_10K: [
        "7", # presumably Item 7 (MD&A), the 10-K counterpart of the 10-Q section below -- confirm against the sec-api item list
    ],
    FilingType.T_10Q: [
        "part1item2", # Management’s Discussion and Analysis of Financial Condition and Results of Operations
    ],
}

In [13]:
# for f in filings["filings"]:
#     print(f["formType"], f["filedAt"], f["linkToHtml"])
#     google_10k_filings[f["filedAt"][:4]] = f["linkToHtml"]
# google_10k_filings

In [14]:
import pickle
import os

In [15]:
from pathlib import Path
parent_pth = Path.cwd()
parent_pth

PosixPath('/Users/timothywee/projects/10k/10kai_backend_railway')

In [16]:
import re

def clean_text(text):
    """Flatten extracted filing text into continuous prose.

    Steps, in order: rejoin words hyphenated across a line break, delete
    decimal numeric character references (they are removed, not decoded),
    trim surrounding whitespace, and replace line breaks with spaces.

    Args:
        text: Raw section text.

    Returns:
        The cleaned string.
    """
    # "inter-<newline>national" -> "international"
    dehyphenated = re.compile(r"(\w+)-\n(\w+)").sub(r"\1\2", text)

    # Drop references such as "&#160;" entirely.
    without_entities = re.compile(r"&#(\d+);").sub("", dehyphenated)

    trimmed = without_entities.strip()

    # Replace line breaks with a space, except those that belong to a
    # "newline, one whitespace character, newline" separator.
    unwrapped = re.compile(r"(?<!\n\s)\n(?!\s\n)").sub(" ", trimmed)

    # Whatever blank-line separators remain are collapsed to a single space.
    return re.compile(r"\n\s*\n").sub(" ", unwrapped)

In [18]:
# Build and persist one FAISS docsearch per ticker from the sections in TO_EXTRACT.
#
# Every pass over a ticker spends sec-api extractor requests and OpenAI embedding
# calls, so tickers whose artifacts are already on disk are skipped. Set the flag
# below to True to force a full re-extract / re-embed (e.g. after changing the
# splitter settings or the filing list).
REBUILD_INDEXES = False

# The embedding client does not depend on the ticker; create it once.
embeddings = OpenAIEmbeddings()

for ticker, filing in latest_10x.items():
    idx_path = parent_pth / f"{ticker}.idx"
    pkl_path = parent_pth / f"{ticker}.pkl"
    if not REBUILD_INDEXES and idx_path.exists() and pkl_path.exists():
        continue

    all_text = [
        clean_text(extractor_api.get_section(filing.url, section, "text"))
        for section in TO_EXTRACT[filing.filing_type]
    ]
    texts = recursive_text_splitter.split_text(" ".join(all_text))
    docsearch = FAISS.from_texts(texts, embeddings)

    # The index is written to its own file and detached before pickling;
    # load_docsearch re-attaches it from the .idx file after unpickling.
    docsearch.save_local(str(idx_path))
    docsearch.index = None
    with open(pkl_path, "wb") as f:
        pickle.dump(docsearch, f, protocol=5)

In [25]:
# Hypothetical embeddings
#
# Same pipeline as the plain index build, but the vector store is created with a
# HypotheticalDocumentEmbedder wrapped around the OpenAI embeddings. Artifacts
# are written as "<ticker>_hype.idx" / "<ticker>_hype.pkl".
#
# Every pass over a ticker spends sec-api extractor requests plus OpenAI calls,
# so tickers whose artifacts are already on disk are skipped. Set the flag below
# to True to force a full rebuild.
REBUILD_HYDE_INDEXES = False

# Neither client depends on the ticker; create them once.
embeddings = OpenAIEmbeddings()
hyde_embeddings = HypotheticalDocumentEmbedder.from_llm(
    OpenAI(temperature=0), embeddings, "web_search"
)

for ticker, filing in latest_10x.items():
    idx_path = parent_pth / f"{ticker}_hype.idx"
    pkl_path = parent_pth / f"{ticker}_hype.pkl"
    if not REBUILD_HYDE_INDEXES and idx_path.exists() and pkl_path.exists():
        continue

    all_text = [
        clean_text(extractor_api.get_section(filing.url, section, "text"))
        for section in TO_EXTRACT[filing.filing_type]
    ]
    texts = recursive_text_splitter.split_text(" ".join(all_text))
    hype_docsearch = FAISS.from_texts(texts, hyde_embeddings)

    # The index is written to its own file and detached before pickling;
    # load_docsearch re-attaches it from the .idx file after unpickling.
    hype_docsearch.save_local(str(idx_path))
    hype_docsearch.index = None
    with open(pkl_path, "wb") as f:
        pickle.dump(hype_docsearch, f, protocol=5)

Exception: API error: 429 - {"status":429,"error":"You send a lot of requests. We like that. But you exceeded the free query limit of 100 requests. Upgrade your account to get unlimited access. Visit sec-api.io for more."}

# Load docsearch from pickle and test

In [26]:
def load_docsearch(pth, fname):
    """Restore a pickled docsearch object and re-attach its saved index.

    Unpickles ``<pth>/<fname>.pkl``, then calls ``load_local`` on the restored
    object with ``<pth>/<fname>.idx``.

    Args:
        pth: Directory containing both artifacts.
        fname: Base file name without extension (e.g. a ticker).

    Returns:
        The unpickled object after ``load_local`` has been called on it.

    Note:
        ``pickle.load`` can execute arbitrary code; only point this at files
        produced by this notebook.
    """
    pickle_path = os.path.join(pth, f"{fname}.pkl")
    index_path = os.path.join(pth, f"{fname}.idx")

    with open(pickle_path, "rb") as handle:
        restored = pickle.load(handle)

    restored.load_local(index_path)
    return restored

In [27]:
# NOTE(review): despite the variable name, this loads the plain "META" artifacts,
# not "META_hype". The HyDE-building cell above ended in an API 429 error, so the
# "_hype" files may not have been written -- confirm which index is intended.
meta_hype = load_docsearch(str(parent_pth), "META")

In [28]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

In [29]:
# Question-answering chain ("stuff" chain type) shared by every search_doc call.
chain = load_qa_chain(OpenAI(temperature=0), chain_type="stuff")


def search_doc(q, docsearch, **kw_args):
    """Answer a question using the documents most similar to it.

    Args:
        q: The question text; used both for retrieval and as the chain input.
        docsearch: Object exposing ``similarity_search(q)``.
        **kw_args: Forwarded unchanged to the chain call
            (e.g. ``return_only_outputs``).

    Returns:
        Whatever the module-level ``chain`` returns for these inputs.
    """
    matches = docsearch.similarity_search(q)
    chain_inputs = {"input_documents": matches, "question": q}
    return chain(chain_inputs, **kw_args)

In [30]:
search_doc("How is Stories doing?", meta_hype, return_only_outputs=False)

{'input_documents': [Document(page_content='it is not currently monetized at the same rate as our feed or Stories products. We also have seen fluctuations and declines in the size of our active user base in one or more markets from time to time. For example, in connection with the war in Ukraine, access to Facebook and Instagram was restricted in Russia and the services were then prohibited by the Russian government, which adversely affected user growth and engagement in 2022 . These trends adversely affected advertising revenue in 2022 , and we expect will continue to affect our advertising revenue in the foreseeable future.   T able of Contents   The COVID-19 pandemic has also impacted our business and results of operations, with a varied impact on user', lookup_str='', metadata={}, lookup_index=0),
  Document(page_content="we believe will continue to have, an impact on our user growth and engagement and advertising revenue. In particular, we believe advertising budgets have been pre

In [32]:
search_doc("Did Apple affect Facebook's business?", meta_hype, return_only_outputs=False)

{'input_documents': [Document(page_content="to limit the ability of websites and application developers to collect and use these signals to target and measure advertising. For example, in 2021, Apple made certain changes to its products and data use policies in connection with changes to its iOS operating system that reduce our and other iOS developers' ability to target and measure advertising, which has negatively impacted, and we expect will continue to negatively impact, the size of the budgets marketers are willing to commit to us and other advertising platforms.   To mitigate these developments, we are working to evolve our advertising systems to improve the performance of our ad products. We are developing privacy enhancing technologies to", lookup_str='', metadata={}, lookup_index=0),
  Document(page_content='it is not currently monetized at the same rate as our feed or Stories products. We also have seen fluctuations and declines in the size of our active user base in one or m

In [33]:
search_doc("What investments will Meta make for AI?", meta_hype, return_only_outputs=False)

{'input_documents': [Document(page_content='including developing virtual and augmented reality devices, software for social platforms, neural interfaces, and other foundational technologies for the metaverse. Our RL investments include expenses relating to headcount and technology development across these efforts. Many of our RL investments are directed toward long-term, cutting-edge research and development for products for the metaverse that are not on the market today and may only be fully realized in the next decade. Although it is inherently difficult to predict when and how the metaverse ecosystem will develop, we expect our RL segment to continue to operate at a loss for the foreseeable future, and our ability to support our', lookup_str='', metadata={}, lookup_index=0),
  Document(page_content='play in the world.   We anticipate that investments in our data center capacity, servers, network infrastructure, and headcount will continue to drive expense growth in 2023, which will 

In [34]:
search_doc("What will the company focus on this year?", meta_hype, return_only_outputs=False)

{'input_documents': [Document(page_content='with respect to key metrics used by management in operating our business.   Executive Overview of Full Year 2022 Results   Our mission is to give people the power to build community and bring the world closer together. In 2022, we continued to focus on our main revenue growth priorities: (i) helping marketers use our products to connect with consumers and (ii) making our ads more relevant and effective. We also continued to invest in both our family of apps and our metaverse efforts based on our company priorities.   Our financial results and key community metrics for 2022 are set forth below. Our total revenue for 2022 was $116.61 billion, a decrease of 1% compared to 2021, which reflects a $5.96 billion negative', lookup_str='', metadata={}, lookup_index=0),
  Document(page_content="the exact impact that each trend had on our advertising revenue during the periods presented.   Investment Philosophy   In 2022 , we continued to invest based o

In [35]:
search_doc("Did Meta have layoffs this year?", meta_hype, return_only_outputs=False)

{'input_documents': [Document(page_content='Reality Labs   RL revenue in 2022 decreased $115 million, or 5%, compared to 2021. The decrease in RL revenue was driven by a decrease in the volume of Meta Quest sales.   Revenue Seasonality and Customer Concentration   Revenue is traditionally seasonally strong in the fourth quarter of each year due in part to seasonal holiday demand. We believe that this seasonality in both advertising revenue and RL consumer hardware sales affects our quarterly results, which generally reflect significant growth in revenue between the third and fourth quarters and a decline between the fourth and subsequent first quarters. For instance, our total revenue increased 16%, 16%, and 31% between the third and fourth quarters', lookup_str='', metadata={}, lookup_index=0),
  Document(page_content='n 18% year-over-year increase in ad impressions delivered across our Family of Apps.   Income from operations for 2022 was $28.94 billion, a decrease of $17.81 billion,