In [0]:
%pip install langchain==0.1.17 databricks-vectorsearch==0.33 mlflow==2.12.1 lark==1.1.9
# Restart the Python process so the pinned versions installed above are the ones
# picked up by the import cell that follows.
dbutils.library.restartPython()

[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m
Collecting langchain==0.1.17
  Obtaining dependency information for langchain==0.1.17 from https://files.pythonhosted.org/packages/c8/bc/607cd3254800a26b60da9e2ca6b10785e60170db7e85dc3d0328b5ab3a9c/langchain-0.1.17-py3-none-any.whl.metadata
  Downloading langchain-0.1.17-py3-none-any.whl.metadata (13 kB)
Collecting databricks-vectorsearch==0.33
  Obtaining dependency information for databricks-vectorsearch==0.33 from https://files.pythonhosted.org/packages/93/22/5c0c2eea1e5d7c6789d67ec562bb180841d4e7ef5f349efc2783adac2518/databricks_vectorsearch-0.33-py3-none-any.whl.metadata
  Downloading databricks_vectorsearch-0.33-py3-none-any.whl.metadata (2.8 kB)
Collecting mlflow==2.12.1
  Obtaining dependency information for mlflow==2.12.1 from https://files.pythonhosted.org/packages/54/93/0cd6533717c9e0b590d97887c0f9b0cea6185546e993e25098c09c13fa23/mlflow-2.12.1-py3-none-any.whl.met

In [0]:
from databricks.vector_search.client import VectorSearchClient
from langchain_community.vectorstores import DatabricksVectorSearch
from langchain_community.embeddings import DatabricksEmbeddings
from langchain_community.chat_models import ChatDatabricks
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_core.runnables import RunnablePassthrough
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

In [0]:
# Vector Search endpoint and fully qualified index holding the EDGAR filing chunks.
vs_endpoint_name = "edgar_vs_endpoint"
vs_index_fullname = "llm_hackathon.default.edgar_form_vs_index"

# Authenticates with the notebook token by default (see the notice it prints).
vsc = VectorSearchClient()

# Still defined so any later cell that references it keeps working, but it is
# deliberately NOT handed to the vector store below: the index is a delta-sync
# index with Databricks-managed embeddings, so LangChain ignores a client-side
# embedding model and only logs a warning when one is supplied.
embedding_model = DatabricksEmbeddings(endpoint="databricks-bge-large-en")

vs_index = vsc.get_index(
    endpoint_name=vs_endpoint_name,
    index_name=vs_index_fullname,
)

# `columns` lists the metadata columns returned with every hit; they are the
# same fields the self-query retriever is allowed to filter on.
vectorstore = DatabricksVectorSearch(
    vs_index,
    text_column="content",
    columns=['name', 'tickers', 'exchanges', 'form', 'filing_date', 'industry'],
)

# temperature=0 to keep query construction and answers as repeatable as possible.
chat_model = ChatDatabricks(endpoint="databricks-dbrx-instruct", temperature=0)

[NOTICE] Using a notebook authentication token. Recommended for development only. For improved performance, please use Service Principal based authentication. To disable this message, pass disable_notice=True to VectorSearchClient().


embedding model is not used in delta-sync index with Databricks-managed embeddings.


In [0]:
# Metadata columns the LLM may turn into structured filters, given as
# (column name, description shown to the LLM, declared type).
metadata_field_info = [
    AttributeInfo(name=field, description=summary, type=kind)
    for field, summary, kind in (
        ("name", "The name of the company", "string"),
        ("tickers", "The ticker symbols of the company", "string"),
        ("exchanges", "The stock exchange where the stock is traded", "string"),
        ("form", "The filing forms (10-K or 10-Q)", "string"),
        ("filing_date", "The filing date of the form", "date"),
        ("industry", "The industry of the company", "string"),
    )
]

# One-line description of the indexed text, used when building the query-constructor prompt.
document_content_description = "The sec filing of financial statements and report of the company."

# Self-query retriever: the chat model rewrites the user question into a
# semantic query plus metadata filters; `k` is forwarded to the vector store search.
retriever = SelfQueryRetriever.from_llm(
    llm=chat_model,
    vectorstore=vectorstore,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info,
    search_kwargs={"k": 10},
)

In [0]:
# Sanity-check the retriever on its own before it is wired into the RAG chain.
retriever.invoke("what was alphabet's revenue?")

[Document(page_content='securities, cash flows from operations and financing activities to continue to be sufficient to fund our operating activities and cash commitments for investing and financing activities for at least the next 12 months and thereafter for the foreseeable future. Capital Expenditures and LeasesWe make investments in land and buildings for data centers and offices and information technology assets through purchases of property and equipment and lease arrangements to provide capacity for the growth of our services and products. Capital ExpendituresOur capital investments in property and equipment consist primarily of the following major categories:•technical infrastructure, which consists of our investments in servers and network equipment for computing, storage, and networking requirements for ongoing business activities, including AI, (collectively referred to as our information technology assets) and data center land and building construction; and•office facilitie

In [0]:
# Prompt for the answer-generation step. `{context}` receives the retrieved
# documents and `{question}` the user's question; the model is told to stay on
# finance topics and to admit when it does not know the answer.
RAG_TEMPLATE = """\
You are an assistant for financial analyst. You are answering finance question about company's news, stock and financial reports (10-K,  10-Q forms). If the question is not related to one of these topics, kindly decline to answer. If you don't know the answer, just say that you don't know, don't try to make up an answer.

Use the following pieces of context to answer the question at the end:
{context}

Question: {question}
Answer:
"""

rag_prompt = ChatPromptTemplate.from_template(RAG_TEMPLATE)

In [0]:
# RAG chain: expects {"question": str} and returns {"response": <chat model message>,
# "context": <retrieved documents>}.
self_query_retrieval_chain = (
    # Step 1: fetch documents for the question and carry the question forward.
    {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
    # Step 2: re-assigns "context" to itself, so the data is unchanged. Do not delete it
    # without replacing it: it is the Runnable that sits between the two dict literals.
    # With it gone, `dict | dict` would be evaluated by Python (3.9+) as a plain dict
    # merge instead of building a LangChain sequence.
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # Step 3: answer from the prompt + chat model, and return the context alongside it.
    | {"response": rag_prompt | chat_model, "context": itemgetter("context")}
)

In [0]:
# Run the full chain; the result holds the model's "response" and the retrieved "context".
self_query_retrieval_chain.invoke({'question': "what was alphabet's revenue?"})

{'response': AIMessage(content="I don't have real-time data access, but based on the provided documents, I couldn't find the specific information about Alphabet's revenue for the period you're interested in. The documents provided seem to focus on other financial aspects such as income taxes payable, purchase commitments, capital expenditures, and stock-based award activities. I recommend checking Alphabet's official financial reports or other trusted financial sources for the most accurate and up-to-date revenue information.", response_metadata={'prompt_tokens': 3706, 'completion_tokens': 87, 'total_tokens': 3793}, id='run-a3a26ebb-cdf6-4dc2-8cea-b83fe76565e2-0'),
 'context': [Document(page_content='of\xa0March\xa031, 2024, we had income taxes payable of $4.2 billion, of which $2.1\xa0billion was short-term, related to a one-time transition tax payable incurred as a result of the U.S. Tax Cuts and Jobs Act ("Tax Act"). As permitted by the Tax Act, we will pay the transition tax in ann

In [0]:
# NOTE(review): same input as the previous cell, yet the saved outputs differ (the earlier
# run found no revenue figure, this one reports it). Presumably this was re-run after an
# upstream change, or retrieval is not deterministic; confirm and keep only one of the
# two cells so the notebook reproduces under Restart & Run All.
self_query_retrieval_chain.invoke({'question': "what was alphabet's revenue?"})

{'response': AIMessage(content="Based on the provided documents, Alphabet Inc.'s revenue for the three months ended March 31, 2024 was $80,539 million.", response_metadata={'prompt_tokens': 5400, 'completion_tokens': 30, 'total_tokens': 5430}, id='run-043b6d36-7788-45a7-91b6-5efb9e4c2c23-0'),
 'context': [Document(page_content='|  |  |  |  |  |  |  |  |  |  |  |  |  | | |  | | |  | | |  | | |\n| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |\n|  | | | Three Months Ended | | | | | | | | |  | | |  | | |\n|  | | | March 31, | | | | | | | | |  | | |  | | |\n|  | | | 2023 | | |  | | | 2024 | | |  | | |  | | |  | | |  | | |\n| Research and development expenses | | | $ | 11,468 |  |  | | | $ | 11,903 |  |  | | |  | | |  | | |  | | |\n| Research and development expenses as a percentage of revenues | | | 16 | | % |  | | | 15 | | % |  | | |  | | |  | | |  | | |  \nR&D expenses increased $435 million 

In [0]:
# Broader question: checks that the chain can summarise across several retrieved passages.
self_query_retrieval_chain.invoke({'question': "how did alphabet perform financially?"})

{'response': AIMessage(content="Based on the provided documents, Alphabet Inc.'s financial performance for the three months ended March 31, 2024, showed an increase in revenues compared to the same period in 2023. The company's total revenues for the three months ended March 31, 2024, were $80,539 million, up from $69,787 million in the same period of 2023. The increase in revenues was primarily driven by growth in Google Search & other revenues, YouTube ads revenues, and Google Cloud revenues. Research and development expenses also increased from $11,468 million in the three months ended March 31, 2023, to $11,903 million in the same period of 2024. The company's financial performance may be affected by various factors, including economic and political conditions, regulatory scrutiny, and changes in laws and public policies.", response_metadata={'prompt_tokens': 5109, 'completion_tokens': 175, 'total_tokens': 5284}, id='run-f408c9d5-1fdb-454b-8281-915c99e3fc85-0'),
 'context': [Docume

In [0]:
# Open-ended question that the filings may not answer directly; checks how the chain responds.
self_query_retrieval_chain.invoke({'question': "what is alphabet's next big bet?"})

{'response': AIMessage(content='Based on the provided documents, Alphabet has not announced any new big bets or major investments in a specific area or project. However, the company has initiated a cash dividend program, with the first dividend of $0.20 per share to be paid on June 17, 2024, to stockholders of record as of June 10, 2024. Additionally, Alphabet has repurchased significant amounts of its Class A and Class C shares, with $3.4 billion and $12.7 billion repurchased, respectively, in the first quarter of 2024. The company has also authorized the repurchase of up to an additional $70.0 billion of its Class A and Class C shares.', response_metadata={'prompt_tokens': 5696, 'completion_tokens': 143, 'total_tokens': 5839}, id='run-abbeec1a-ac90-4add-b62e-5f0b1d1dd880-0'),
 'context': [Document(page_content='---  \nOther Information•On April 25, 2024, the Board of Directors of Alphabet approved the initiation of a cash dividend program, and declared a cash dividend of $0.20 per sh