In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import nest_asyncio
import warnings
from dotenv import load_dotenv, find_dotenv

# Load variables from the nearest .env file (located by find_dotenv) into the
# process environment; the boolean result is discarded.
_ = load_dotenv(find_dotenv())
# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings('ignore')
# Patch asyncio so an already-running event loop can be re-entered
# (presumably needed because the notebook kernel runs its own loop).
nest_asyncio.apply()

In [3]:
from llama_index.core import (
    VectorStoreIndex,
    Document
)
from llama_index.core.tools.tool_spec.base import BaseToolSpec
from llama_index.postprocessor.cohere_rerank import CohereRerank
from llama_index.postprocessor.longllmlingua import LongLLMLinguaPostprocessor

from sec_api import QueryApi
import requests
from bs4 import BeautifulSoup


import sys
sys.path.append("../src")
from llamaindex_config import llm, embed_model, text_splitter

from IPython.display import Markdown, display

In [4]:
# NOTE(review): these self-assignments are no-ops. Each name is re-bound to the
# object it already refers to (imported from llamaindex_config in the cell above).
llm = llm
embed_model = embed_model
text_splitter = text_splitter

In [5]:
from typing import Any, Dict, List, Optional

from llama_index.core.bridge.pydantic import Field, PrivateAttr
from llama_index.core.schema import MetadataMode, NodeWithScore, QueryBundle, TextNode
import torch

# Default value for the post-processor's ``instruction_str`` setting.
DEFAULT_INSTRUCTION_STR = "Given the context, please answer the final question"

# Use the GPU when torch reports one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

class LLMLinguaPostProcessor2(LongLLMLinguaPostprocessor):
    """Node post-processor that compresses context with an LLMLingua model.

    Extends ``LongLLMLinguaPostprocessor`` so the underlying
    ``llmlingua.PromptCompressor`` can be created with ``use_llmlingua2=True``
    (the LLMLingua-2 token-classification compressors), which the parent
    constructor offers no argument for. Node post-processing itself is
    inherited unchanged from the parent class.
    """

    metadata_mode: MetadataMode = Field(
        default=MetadataMode.ALL, description="Metadata mode."
    )
    instruction_str: str = Field(
        default=DEFAULT_INSTRUCTION_STR, description="Instruction string."
    )
    # Kept in sync with the constructor default below, which is the value that
    # actually takes effect because __init__ always passes target_token on.
    target_token: int = Field(
        default=500, description="Target number of compressed tokens."
    )
    rank_method: str = Field(default="longllmlingua", description="Ranking method.")
    additional_compress_kwargs: Dict[str, Any] = Field(
        default_factory=dict, description="Additional compress kwargs."
    )

    _llm_lingua: Any = PrivateAttr()

    def __init__(
        self,
        model_name: str = "microsoft/llmlingua-2-bert-base-multilingual-cased-meetingbank",
        device_map: str = device,
        model_config: Optional[dict] = None,
        open_api_config: Optional[dict] = None,
        metadata_mode: MetadataMode = MetadataMode.ALL,
        instruction_str: str = DEFAULT_INSTRUCTION_STR,
        target_token: int = 500,
        rank_method: str = "longllmlingua",
        additional_compress_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """Build the prompt compressor and initialise the post-processor fields.

        Args:
            model_name: Hugging Face model id handed to ``PromptCompressor``.
                Ids containing ``"llmlingua-2"`` are loaded in LLMLingua-2 mode.
            device_map: Device passed to ``PromptCompressor``; defaults to the
                module-level ``device``.
            model_config: Extra model kwargs for ``PromptCompressor``. For
                models that are not LLMLingua-2, ``revision`` defaults to
                ``"main"`` unless the caller supplies one.
            open_api_config: Passed through to ``PromptCompressor``.
            metadata_mode: Stored on the ``metadata_mode`` field.
            instruction_str: Stored on the ``instruction_str`` field.
            target_token: Stored on the ``target_token`` field.
            rank_method: Stored on the ``rank_method`` field.
            additional_compress_kwargs: Stored on the field of the same name;
                ``None`` becomes an empty dict.
        """
        from llmlingua import PromptCompressor

        # Copy instead of sharing: the previous ``{}`` defaults were single
        # objects reused by every call.
        open_api_config = dict(open_api_config or {})
        additional_compress_kwargs = additional_compress_kwargs or {}
        model_config = dict(model_config or {})

        # Any published LLMLingua-2 checkpoint needs the LLMLingua-2 code path,
        # not only the default model id.
        use_llmlingua2 = "llmlingua-2" in model_name.lower()
        if not use_llmlingua2:
            # Keep the previous default revision without discarding the
            # caller's own model_config entries.
            model_config.setdefault("revision", "main")

        # Deliberately skip LongLLMLinguaPostprocessor.__init__ and initialise
        # the pydantic fields through the next class in the MRO.
        # NOTE(review): this assumes the parent constructor builds its own
        # PromptCompressor from its default model_name/device_map, which would
        # replace the compressor configured here -- confirm against the
        # installed llama-index-postprocessor-longllmlingua version.
        super(LongLLMLinguaPostprocessor, self).__init__(
            metadata_mode=metadata_mode,
            instruction_str=instruction_str,
            target_token=target_token,
            rank_method=rank_method,
            additional_compress_kwargs=additional_compress_kwargs,
        )
        # Assign the private attribute only after the model is initialised so
        # the assignment does not depend on pydantic's pre-init behaviour.
        self._llm_lingua = PromptCompressor(
            model_name=model_name,
            device_map=device_map,
            model_config=model_config,
            open_api_config=open_api_config,
            use_llmlingua2=use_llmlingua2,
        )

## Define tool

In [8]:
class SECTool(BaseToolSpec):
    """Tool spec that answers questions from a stock's latest 10-Q or 10-K.

    ``search_10q_10k`` is the only function exposed to an agent (see
    ``spec_functions``). It looks up the newest filing through ``QueryApi``,
    downloads the filing HTML, indexes its text in an in-memory
    ``VectorStoreIndex``, retrieves the closest chunks, reranks them with
    ``CohereRerank`` and compresses them with ``LongLLMLinguaPostprocessor``.
    """

    spec_functions=[
        "search_10q_10k",
    ]

    def __init__(self,
                 sec_api_key = None,
                 cohere_api_key = None
                 ):
        """Initialize SEC tool.

        Args:
            sec_api_key: Key for ``QueryApi``; falls back to the
                ``SEC_API_KEY`` environment variable when omitted.
            cohere_api_key: Key for ``CohereRerank``; falls back to the
                ``COHERE_API_KEY`` environment variable when omitted.

        Raises:
            ValueError: If a key is neither passed in nor set in the environment.
        """
        # Read the environment at call time. A signature default of
        # ``os.getenv(...)`` is evaluated once, when the class is defined, and
        # misses keys that are loaded or changed afterwards.
        if sec_api_key is None:
            sec_api_key = os.getenv('SEC_API_KEY')
        if cohere_api_key is None:
            cohere_api_key = os.getenv('COHERE_API_KEY')
        self.sec_api_key = sec_api_key
        self.cohere_api_key = cohere_api_key
        if self.sec_api_key is None:
            raise ValueError("SEC API key cannot be none")
        if self.cohere_api_key is None:
            raise ValueError("Cohere API key cannot be none")
        self.queryApi = QueryApi(api_key=self.sec_api_key)
        # Keep only the 4 best of the retrieved chunks.
        self.reranker = CohereRerank(top_n = 4, api_key = self.cohere_api_key)
        self.prompt_compressor = LongLLMLinguaPostprocessor(
            instruction_str = "Given the context, please answer the final question",
            target_token = 300,
            rank_method = "longllmlingua",
            additional_compress_kwargs = {
                "condition_compare": True,
                "condition_in_question": "after",
                "context_budget": "+100",
                "reorder_context": "sort", #enables document reorder
                "dynamic_context_compression_ratio": 0.3,
            },
            model_name = "gpt2"
        )

    @staticmethod
    def _download_form_html(url: str, timeout: float = 30.0) -> str:
        """Download a filing page and return its body as text.

        Args:
            url: Address of the filing document.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The decoded response body.

        Raises:
            requests.RequestException: On connection errors, timeouts, or a
                4xx/5xx response.
        """
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            # 'br' is intentionally not advertised: requests can only decode
            # brotli when an optional brotli package is installed, otherwise
            # response.text would be undecoded bytes.
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-US,en;q=0.9,pt-BR;q=0.8,pt;q=0.7',
            'Cache-Control': 'max-age=0',
            'Dnt': '1',
            'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120"',
            'Sec-Ch-Ua-Mobile': '?0',
            'Sec-Ch-Ua-Platform': '"macOS"',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        }
        # A timeout prevents the tool call from hanging on a stalled connection.
        response = requests.get(url, headers=headers, timeout=timeout)
        # Fail loudly rather than indexing an HTTP error page as filing text.
        response.raise_for_status()
        return response.text

    def get_retriever_from_url(self, url: str, embed_model=embed_model):
        """Creates an in-memory retriever from a URL.

        Args:
            url: Address of the filing document to index.
            embed_model: Embedding model for the index; defaults to the
                module-level ``embed_model``.

        Returns:
            A retriever over the page text that returns the 10 most similar
            chunks per query.
        """
        text = self._download_form_html(url=url)
        soup = BeautifulSoup(text, 'html.parser')
        # Strip the markup and keep only the visible text.
        texts = soup.get_text()
        nodes = text_splitter.get_nodes_from_documents([Document(text=texts)])
        return VectorStoreIndex(nodes, embed_model=embed_model).as_retriever(
            similarity_top_k = 10
        )

    def return_contexts(self, url: str, question: str) -> str:
        """Retrieves, reranks and compresses nodes for a question about a URL.

        Args:
            url: Address of the filing document.
            question: Query used for retrieval, reranking and compression.

        Returns:
            The content of the compressed nodes joined by blank lines.
        """
        retriever = self.get_retriever_from_url(url = url)
        nodes = retriever.retrieve(question)
        reranked_nodes = self.reranker.postprocess_nodes(
            nodes = nodes,
            query_str = question)
        refined_nodes = self.prompt_compressor.postprocess_nodes(
            nodes = reranked_nodes,
            query_str = question
        )
        return "\n\n".join([n.get_content() for n in refined_nodes])

    def search_10q_10k(
        self,
        ticker: str,
        question: str,
        tenq: bool = True) -> str:
        """
        Useful to search information from the latest 10-Q or 10-K forms of a
        given stock.
        args:
            ticker (str): ticker of interest
            question (str): the question of interest
            tenq (bool): Whether or not to search the 10-Q form; when false
                the 10-K form is searched instead

        """
        # Truthiness instead of ``is True`` so values such as 1 still select
        # the 10-Q form.
        form_type = "10-Q" if tenq else "10-K"
        # Most recent filing of the requested type for this ticker.
        query = {
            "query": {
                "query_string": {
                    "query": f'ticker:{ticker} AND formType:"{form_type}"'
                }
            },
            "from": "0",
            "size": "1",
            "sort": [{ "filedAt": { "order": "desc" }}]
        }
        # A response without a 'filings' key is treated like an empty result.
        filings = self.queryApi.get_filings(query).get('filings') or []
        if not filings:
            return "Sorry I couldn't find any filing for this stock, check if ticker is correct"
        link = filings[0]['linkToFilingDetails']
        try:
            return self.return_contexts(url=link, question=question)
        except requests.RequestException as exc:
            # Report download problems to the agent as text, like the
            # no-filing case above, instead of aborting the agent run.
            return f"Sorry I couldn't download the filing at {link}: {exc}"

In [7]:
# Build the tool spec with its default arguments; SECTool takes its API keys
# from the SEC_API_KEY / COHERE_API_KEY environment variables by default.
secTool = SECTool()
# Convert the spec into the list of tools that is handed to the agent below.
SECToolList = secTool.to_tool_list()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
from llama_index.core.agent import (
    FunctionCallingAgentWorker,
    AgentRunner
)
# Function-calling worker that may call the SEC tools, driven by the shared llm.
# verbose = True presumably produces the "=== Calling Function ===" trace
# shown in the chat output below.
agent_worker = FunctionCallingAgentWorker.from_tools(
    tools = SECToolList,
    llm = llm,
    verbose = True)
# Runner that wraps the worker and exposes the chat() interface.
agent = AgentRunner(agent_worker=agent_worker)

In [13]:
# Example question for the agent.
query = "What are the risks that Illumina faces?"
response = agent.chat(query)

# Render the agent's answer as Markdown wrapped in a <b> tag.
display(Markdown(f"<b>{response}</b>"))

Added user message to memory: What are the risks that Illumina faces?
=== Calling Function ===
Calling function: search_10q_10k with args: {"ticker": "ILMN", "question": "What are the risks that Illumina faces?", "tenq": true}


Token indices sequence length is longer than the specified maximum sequence length for this model (1027 > 1024). Running this sequence through the model will result in indexing errors


=== Function Output ===
On 20, we action the Court EC Div. The Decision, any order decision the FTC any other or to which Illumina is required to GRAIL (an FTC Decision), and-app applicable and our have imposed in the and future significant additional on legal, financial, other additional may in loss other on condition operations Such effects could include divIL terms that materially which weIL., we not or financial suchment which result in negative consequences. For we to be G sale and so would tax received and tax in ( basis is and $ million In will likely inparty contracts agreements including contingent rightsthe) us as of the may unable to discharge with divest, and divestiture. extent that the following a divest, may more estimating the future liabilities anyVR G., G may be by such which the The Divment Decision to that G has to to5 of operations- We expect funding1, from G’s sheet Inter Measures, the Order Div an FTC Div Decision other order any governmental or-app the applicabl

<b>Based on the information from Illumina's 10-Q filings, some of the key risks that Illumina faces include:

1. Regulatory risks: Illumina faces significant regulatory risks related to its proposed acquisition of GRAIL. The FTC has filed an administrative complaint to block the acquisition, and Illumina may be required to divest GRAIL or face other significant legal, financial, and operational consequences if the acquisition is not approved. This could have material negative impacts on Illumina's business.

2. Financial and operational impacts: A divestiture of GRAIL or other adverse regulatory outcomes could result in significant financial and operational challenges for Illumina, including loss of revenue, tax implications, and disruption to existing contracts and agreements.

3. Uncertainty and unpredictability: Illumina is unable to predict the full extent of the adverse consequences it may face from regulatory actions related to the GRAIL acquisition. This uncertainty poses a significant risk to the company's future business and financial performance.

4. Potential fines and penalties: Illumina may face additional fines or penalties from regulatory authorities in the U.S. and Europe related to the GRAIL acquisition, further straining the company's financial position.

In summary, the key risks center around the regulatory challenges Illumina faces with the GRAIL acquisition, which could have wide-ranging negative impacts on the company's finances, operations, and overall business outlook.</b>

Prompt compression works and latency is improved!

## Export

In [None]:
%%writefile ../tools/sec_tools.py

#%%
import os
from dotenv import load_dotenv, find_dotenv
# Load variables from the nearest .env file into the environment before the
# API keys are read further down in this module.
_ = load_dotenv(find_dotenv())

from llama_index.core import (
    VectorStoreIndex,
    Document
)
from llama_index.core.tools.tool_spec.base import BaseToolSpec
from llama_index.postprocessor.cohere_rerank import CohereRerank
from llama_index.postprocessor.longllmlingua import LongLLMLinguaPostprocessor

from sec_api import QueryApi
import requests
from bs4 import BeautifulSoup

import sys
# Make the project's ``src`` directory importable so ``llamaindex_config``
# can be found. This module is written to ``../tools/`` and the notebook
# imports from ``../src``, so ``src`` sits next to this file's directory.
__curdir__ = os.getcwd()

try:
    # Preferred: resolve relative to this file, independent of the working
    # directory the importing process happens to run in.
    _module_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # __file__ is not defined when the code is executed cell by cell (#%%).
    _module_dir = None

if _module_dir is not None:
    _src_dir = os.path.join(_module_dir, os.pardir, "src")
elif "tools" in __curdir__:
    # Fallback heuristic: assume the working directory is the tools folder.
    _src_dir = os.path.join(__curdir__, os.pardir, "src")
else:
    # Fallback heuristic: assume the working directory is the project root.
    _src_dir = os.path.join(__curdir__, "src")

# Store an absolute, normalised path so a later chdir cannot invalidate it.
sys.path.append(os.path.normpath(os.path.abspath(_src_dir)))

from llamaindex_config import llm, embed_model, text_splitter

# NOTE(review): these self-assignments are no-ops. Each name is re-bound to the
# object already imported from llamaindex_config on the line above.
llm = llm
embed_model = embed_model
text_splitter = text_splitter
#%%
class SECTool(BaseToolSpec):
    """Tool spec that answers questions from a stock's latest 10-Q or 10-K.

    ``search_10q_10k`` is the only function exposed to an agent (see
    ``spec_functions``). It looks up the newest filing through ``QueryApi``,
    downloads the filing HTML, indexes its text in an in-memory
    ``VectorStoreIndex``, retrieves the closest chunks, reranks them with
    ``CohereRerank`` and compresses them with ``LongLLMLinguaPostprocessor``.
    """

    spec_functions=[
        "search_10q_10k",
    ]

    def __init__(self,
                 sec_api_key = None,
                 cohere_api_key = None
                 ):
        """Initialize SEC tool.

        Args:
            sec_api_key: Key for ``QueryApi``; falls back to the
                ``SEC_API_KEY`` environment variable when omitted.
            cohere_api_key: Key for ``CohereRerank``; falls back to the
                ``COHERE_API_KEY`` environment variable when omitted.

        Raises:
            ValueError: If a key is neither passed in nor set in the environment.
        """
        # Read the environment at call time. A signature default of
        # ``os.getenv(...)`` is evaluated once, when the class is defined, and
        # misses keys that are loaded or changed afterwards.
        if sec_api_key is None:
            sec_api_key = os.getenv('SEC_API_KEY')
        if cohere_api_key is None:
            cohere_api_key = os.getenv('COHERE_API_KEY')
        self.sec_api_key = sec_api_key
        self.cohere_api_key = cohere_api_key
        if self.sec_api_key is None:
            raise ValueError("SEC API key cannot be none")
        if self.cohere_api_key is None:
            raise ValueError("Cohere API key cannot be none")
        self.queryApi = QueryApi(api_key=self.sec_api_key)
        # Keep only the 4 best of the retrieved chunks.
        self.reranker = CohereRerank(top_n = 4, api_key = self.cohere_api_key)
        self.prompt_compressor = LongLLMLinguaPostprocessor(
            instruction_str = "Given the context, please answer the final question",
            target_token = 300,
            rank_method = "longllmlingua",
            additional_compress_kwargs = {
                "condition_compare": True,
                "condition_in_question": "after",
                "context_budget": "+100",
                "reorder_context": "sort", #enables document reorder
                "dynamic_context_compression_ratio": 0.3,
            },
            model_name = "gpt2"
        )

    @staticmethod
    def _download_form_html(url: str, timeout: float = 30.0) -> str:
        """Function to download text from SEC website.

        Args:
            url: Address of the filing document.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The decoded response body.

        Raises:
            requests.RequestException: On connection errors, timeouts, or a
                4xx/5xx response.
        """
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            # 'br' is intentionally not advertised: requests can only decode
            # brotli when an optional brotli package is installed, otherwise
            # response.text would be undecoded bytes.
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-US,en;q=0.9,pt-BR;q=0.8,pt;q=0.7',
            'Cache-Control': 'max-age=0',
            'Dnt': '1',
            'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120"',
            'Sec-Ch-Ua-Mobile': '?0',
            'Sec-Ch-Ua-Platform': '"macOS"',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        }
        # A timeout prevents the tool call from hanging on a stalled connection.
        response = requests.get(url, headers=headers, timeout=timeout)
        # Fail loudly rather than indexing an HTTP error page as filing text.
        response.raise_for_status()
        return response.text

    def get_retriever_from_url(self, url: str, embed_model=embed_model):
        """Creates an in-memory retriever from a URL.

        Args:
            url: Address of the filing document to index.
            embed_model: Embedding model for the index; defaults to the
                module-level ``embed_model``.

        Returns:
            A retriever over the page text that returns the 10 most similar
            chunks per query.
        """
        text = self._download_form_html(url=url)
        soup = BeautifulSoup(text, 'html.parser')
        # Strip the markup and keep only the visible text.
        texts = soup.get_text()
        nodes = text_splitter.get_nodes_from_documents([Document(text=texts)])
        return VectorStoreIndex(nodes, embed_model=embed_model).as_retriever(
            similarity_top_k = 10
        )

    def return_contexts(self, url: str, question: str) -> str:
        """Retrieves, reranks and compresses nodes for a question about a URL.

        Args:
            url: Address of the filing document.
            question: Query used for retrieval, reranking and compression.

        Returns:
            The content of the compressed nodes joined by blank lines.
        """
        retriever = self.get_retriever_from_url(url = url)
        nodes = retriever.retrieve(question)
        reranked_nodes = self.reranker.postprocess_nodes(
            nodes = nodes,
            query_str = question)
        refined_nodes = self.prompt_compressor.postprocess_nodes(
            nodes = reranked_nodes,
            query_str = question
        )
        return "\n\n".join([n.get_content() for n in refined_nodes])

    def search_10q_10k(
        self,
        ticker: str,
        question: str,
        tenq: bool = True) -> str:
        """
        Useful to search information from the latest 10-Q or 10-K forms of a
        given stock.
        args:
            ticker (str): ticker of interest
            question (str): the question of interest
            tenq (bool): Whether or not to search the 10-Q form; when false
                the 10-K form is searched instead
        """
        # Truthiness instead of ``is True`` so values such as 1 still select
        # the 10-Q form.
        form_type = "10-Q" if tenq else "10-K"
        # Most recent filing of the requested type for this ticker.
        query = {
            "query": {
                "query_string": {
                    "query": f'ticker:{ticker} AND formType:"{form_type}"'
                }
            },
            "from": "0",
            "size": "1",
            "sort": [{ "filedAt": { "order": "desc" }}]
        }
        # A response without a 'filings' key is treated like an empty result.
        filings = self.queryApi.get_filings(query).get('filings') or []
        if not filings:
            return "Sorry I couldn't find any filing for this stock, check if ticker is correct"
        link = filings[0]['linkToFilingDetails']
        try:
            return self.return_contexts(url=link, question=question)
        except requests.RequestException as exc:
            # Report download problems to the agent as text, like the
            # no-filing case above, instead of aborting the agent run.
            return f"Sorry I couldn't download the filing at {link}: {exc}"

def get_sec_tool():
    """Create an ``SECTool`` with default arguments and return its tools.

    Returns:
        The result of ``SECTool().to_tool_list()``, i.e. the tool objects
        built from the spec's ``spec_functions``.
    """
    return SECTool().to_tool_list()
