#### Steps
#### 1. Create VectorDB
#### 2. Design and register tools to access VectorDB
#### 3. Use AutoGen to design roles (user proxy and tool proxy) for accessing information

In [1]:
from langchain_chroma import Chroma
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings

# Embedding function shared by the Chroma collections opened in this notebook.
emb_fn = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Open the SEC-filings collection from its on-disk persist directory.
sec_filings_md_db = Chroma(
    collection_name="sec_filings_md",
    embedding_function=emb_fn,
    persist_directory="./sec-filings-md-db",
)

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Sanity check: run an unfiltered similarity search against the SEC-filings collection
# and print whatever documents come back.
query = "What are the risk factors differences bewteen AMD and NVDA"
docs = sec_filings_md_db.similarity_search(query)
print(docs)

[Document(page_content="The risks and uncertainties described below are not the only ones we face. If any of the following risks actually occurs, our business, financial condition or results of operations could be materially adversely affected. In addition, you should consider the interrelationship and compounding effects of two or more risks occurring *simultaneously.* Risk Factors **Summary** The following is a summary of the principal risks that could adversely affect our business, financial condition and results of operations. Economic and Strategic **Risks**\n- Intel Corporation's dominance of the microprocessor market and its aggressive business practices may limit our ability to compete effectively on a level playing field.  \n- The semiconductor industry is highly cyclical and has experienced severe downturns. - The demand for our products depends in part on the market conditions in the industries into which they are sold. There may be fluctuations in demand for our products or

In [3]:
# Show the metadata of the first returned document; the SEC tool defined later filters on its 'filing_type' key.
print(docs[0].metadata)

{'Header 1': 'Advanced Micro Devices, Inc. (Exact Name Of Registrant As Specified In Its Charter)', 'Header 2': 'Item 1A. Risk Factors', 'filing_type': '10-Q1', 'ticker': 'AMD', 'year': '2024'}


In [4]:
# Open the earnings-call collection, reusing the same embedding function.
earnings_call_db = Chroma(
    collection_name="earnings_call",
    embedding_function=emb_fn,
    persist_directory="./earnings-call-db",
)

In [5]:
# Fetch every entry of the earnings-call collection, requesting only the metadata payload.
# NOTE(review): printing the whole result dumps every id in the collection; consider trimming this output.
entries = earnings_call_db.get(include=["metadatas"])
print(entries)


{'ids': ['0086d9c4-7d6d-49a0-98cf-5a2bb06d7a19', '018c03f7-3012-488e-8f57-bb661fae4f77', '01d5c802-339b-49ed-aeac-77426b9d1089', '02af2301-db2d-4d2b-b3aa-64fb854611f7', '03c37994-f402-48be-9415-bc87bc448fcd', '0579d78a-787e-4df6-b935-eda5ff923baf', '064be7a3-3e2d-4f16-b086-3719bc4eee4d', '06893258-1bc1-443c-bda0-fe423cd50f64', '06e7d243-691f-4b48-ab79-e2715a32bc85', '06e8edbc-8ecd-402b-b35c-b6eefcb47a4e', '06ecf69d-1e19-4a08-82aa-18d0bd184f59', '0827e26b-8285-482a-a2b8-2ac5fda7054f', '08477bbf-b205-41a0-996e-3eb592595651', '08c991c4-f9d5-4196-ace7-d7508cb93fa4', '0ac1882b-e52a-47ea-8ab8-49d89400413a', '0b09bd0d-8d30-4648-a425-3cffb8f5136b', '0cb6d63e-cd70-4fde-b348-2bd0663d8059', '0d1efbcb-e37e-4967-ad34-e118a93f9d47', '0dcbfe2d-bdb3-4911-91bc-0c984b5a572f', '0ea8f2b9-f54f-4693-abdf-1b98dc65fd86', '0eb6a3f0-c423-4c0f-8194-6aec638219a5', '0eba0db5-c436-4c0b-ac6b-3db94e2eab1a', '0f6dedf5-b015-4309-88ce-90854d7a4c29', '1078fc6f-a5ce-4527-b1e5-c40b0cdf2e8d', '10c2eb93-0a3a-4a42-91dd-486a44

In [6]:
# Collect the distinct speakers recorded for each quarter in the earnings-call metadata.
# A speaker is stored once per quarter (first-seen order) even when many metadata
# entries name them, so the lists stay small when they are later used in a filter.
speakers_list_1 = []
speakers_list_2 = []
speakers_list_3 = []
speakers_list_4 = []

_speakers_by_quarter = {
    "Q1": speakers_list_1,
    "Q2": speakers_list_2,
    "Q3": speakers_list_3,
    "Q4": speakers_list_4,
}

for meta in entries['metadatas']:
    # Skip entries that carry no metadata at all.
    if not meta:
        continue
    # Entries without a quarter, or with an unrecognised one, are ignored.
    quarter_speakers = _speakers_by_quarter.get(meta.get("quarter"))
    if quarter_speakers is None:
        continue
    speaker = meta['speaker']
    if speaker not in quarter_speakers:
        quarter_speakers.append(speaker)


In [7]:
# Lookup table from quarter label to the speaker list collected for that quarter.
quarter_speaker_dict = dict(
    zip(
        ("Q1", "Q2", "Q3", "Q4"),
        (speakers_list_1, speakers_list_2, speakers_list_3, speakers_list_4),
    )
)

In [8]:
# SEC form names accepted by the SEC tool: the yearly 10-K followed by the quarterly forms, Q4 down to Q1.
sec_form_names = ["10-K"] + [f"10-Q{quarter_number}" for quarter_number in (4, 3, 2, 1)]

##### Design tools

In [9]:

# Module-level switch read by query_database_sec: when true, SEC questions are
# answered from the markdown SEC-filings collection.
FROM_MARKDOWN = True


def query_database_earnings_call(
        question: str,
        quarter: str
    )->str:
        """Retrieve earnings-call passages relevant to a question for one financial quarter.

        Speakers recorded for the quarter whose name appears in the question
        (compared case-insensitively) restrict the search; when no speaker is
        mentioned, every speaker of that quarter is searched. The five most
        similar passages are requested (k=5) and grouped by speaker.

        Args:
            question (str): Question to query the database for relevant documents.
            quarter (str): Financial quarter discussed in the question; one of Q1, Q2, Q3, Q4.

        Returns:
            str: One "<speaker>: <text>" paragraph per speaker, each followed by a
            blank line. Passages by the same speaker are joined with a single
            space. Empty string when nothing is retrieved.

        Raises:
            AssertionError: If ``quarter`` is not one of Q1, Q2, Q3, Q4.
        """
        assert quarter in ["Q1", "Q2", "Q3", "Q4"], "The quarter should be from Q1, Q2, Q3, Q4"

        # The per-quarter list may repeat a speaker; keep first-seen order, drop repeats.
        quarter_speaker_list = list(dict.fromkeys(quarter_speaker_dict[quarter]))

        # Match speaker names against the question without regard to case.
        question_lower = question.lower()
        req_speaker_list = [sl for sl in quarter_speaker_list if sl.lower() in question_lower]
        if not req_speaker_list:
            req_speaker_list = quarter_speaker_list

        quarter_filter = {"quarter": {"$eq": quarter}}
        if req_speaker_list:
            search_filter = {
                "$and": [
                    quarter_filter,
                    {"speaker": {"$in": req_speaker_list}},
                ]
            }
        else:
            # No speaker is known for this quarter. Chroma rejects an empty `$in`
            # list, so fall back to filtering on the quarter alone.
            search_filter = quarter_filter

        relevant_docs = earnings_call_db.similarity_search(
            question,
            k=5,
            filter=search_filter,
        )

        # Group passages by speaker, preserving the order in which speakers first appear.
        speaker_relevant_dict = {}
        for doc in relevant_docs:
            speaker_relevant_dict.setdefault(doc.metadata['speaker'], []).append(doc.page_content)

        return "".join(
            speaker + ": " + " ".join(texts) + "\n\n"
            for speaker, texts in speaker_relevant_dict.items()
        )




def query_database_markdown_sec(
            question: str,
            sec_form_name: str,
            k: int = 3
    )->str:
    """Retrieve passages from the markdown SEC-filings collection for one form type.

    Args:
        question (str): Question used for the similarity search.
        sec_form_name (str): Form to search; must be one of ``sec_form_names``.
            It is matched against the ``filing_type`` metadata key.
        k (int): Number of passages to request. Defaults to 3, the value that
            was previously hard-coded; lower it to shrink the returned context.

    Returns:
        str: The retrieved passages in the order returned by the search, each
        followed by a blank line. Empty string when nothing is retrieved.

    Raises:
        AssertionError: If ``sec_form_name`` is not in ``sec_form_names``.
    """
    assert sec_form_name in sec_form_names, f'The search form type should be in {sec_form_names}'

    relevant_docs = sec_filings_md_db.similarity_search(
        question,
        k=k,
        filter={
            "filing_type": {"$eq": sec_form_name}
        },
    )

    return "".join(doc.page_content + "\n\n" for doc in relevant_docs)

In [10]:
def query_database_sec(
            question: str,
            sec_form_name: str
    )->str:
        """Look up SEC-filing passages that are relevant to a question.

        Delegates to ``query_database_markdown_sec`` while the module-level
        ``FROM_MARKDOWN`` switch is on; otherwise no data source is available.

        Args:
            question (str): Question to query the database for relevant documents.
            sec_form_name (str): SEC form the question is about, e.g. 10-K for
                yearly data or 10-Q2 for the second quarter (similarly for the
                other quarters).

        Returns:
            str: Relevant context for the question from the SEC filings, or the
            text "No data available" when ``FROM_MARKDOWN`` is off.
        """
        if FROM_MARKDOWN:
            return query_database_markdown_sec(question, sec_form_name)
        return "No data available"

In [11]:
# Human-readable list of the valid SEC form names for the prompt,
# e.g. "10-K for yearly data, 10-Q4 for Q4 data, ...". Names that are neither
# 10-K nor a 10-Q variant are left out.
sec_form_system_msg = ", ".join(
    "10-K for yearly data" if sec_form == "10-K" else f"{sec_form} for Q{sec_form[-1]} data"
    for sec_form in sec_form_names
    if sec_form == "10-K" or "10-Q" in sec_form
)

# Quarter labels accepted by the earnings-call tool.
earnings_call_system_message = ", ".join(["Q1", "Q2", "Q3", "Q4"])

# System prompt for the planning agent. It only advertises the two tools this
# notebook defines, and uses their real parameter names, so the model is not
# steered towards calling a function that does not exist.
system_msg = f"""You are a helpful financial assistant and your task is to select the sec_filings or earnings_call database to best answer the question.
You can use query_database_sec(question,sec_form_name) by passing question and relevant sec_form names like {sec_form_system_msg}
or you can use query_database_earnings_call(question,quarter) by passing question and relevant quarter names with possible values {earnings_call_system_message}. When you are ready to end the conversation, reply TERMINATE"""

#### Start using AutoGen below

In [13]:
from autogen import ConversableAgent
import os
import json
# Read the API key from a local config file kept outside the notebook.
with open('../../config.json', 'r') as file:
    config = json.load(file)


llm_config = {"model":"gpt-4-turbo"}

os.environ['OPENAI_API_KEY'] = config['NLP_KEY']

# LLM-backed agent that reads the question and proposes which tool to call.
user_proxy = ConversableAgent(
    name = "Planner Admin",
    system_message=system_msg,
    code_execution_config=False,
    llm_config=llm_config,
    human_input_mode="NEVER",
    # A message's content can be missing or None (e.g. a message that only
    # carries tool calls), so normalise it to "" before the substring test
    # instead of raising on `in None`.
    is_termination_msg=lambda msg: "TERMINATE" in (msg.get("content") or ""),
)

# Agent without an LLM that executes the proposed tool calls.
# NOTE(review): with llm_config=False the system_message below is presumably never sent to a model - confirm.
# human_input_mode="ALWAYS" prompts for input on every turn; when none is given
# the default_auto_reply is sent instead.
tool_proxy = ConversableAgent(
  name="Tool Proxy",
  system_message="Analyze the response from user proxy and decide whether the suggested database is suitable "
  ". Answer in simple yes or no",
  llm_config=False,
  default_auto_reply="Please select the right database.",
  human_input_mode="ALWAYS",
  )

# Tool key -> [callable, description passed on when the tool is registered].
tools_dict = {
        "sec":[query_database_sec,"Tool to query SEC filings database"],
        "earnings_call": [query_database_earnings_call, "Tool to query earnings call transcripts database"],
    }

##### Register functions

In [14]:
from autogen import register_function

# Register every tool in tools_dict: user_proxy is the caller that proposes the
# call, tool_proxy is the executor that runs it.
for tool_fn, tool_description in tools_dict.values():
    register_function(
        tool_fn,
        caller=user_proxy,
        executor=tool_proxy,
        name=tool_fn.__name__,
        description=tool_description,
    )

##### Start querying the agents with task

In [15]:
# Start a conversation (at most 10 turns) in which user_proxy sends the question to tool_proxy.
input_text = "What is the strategy of Google for artificial intelligence?"
chat_result = user_proxy.initiate_chat(recipient=tool_proxy, message=input_text, max_turns=10)

[33mPlanner Admin[0m (to Tool Proxy):

What is the strategy of Google for artificial intelligence?

--------------------------------------------------------------------------------
[31m
>>>>>>>> NO HUMAN INPUT RECEIVED.[0m
[31m
>>>>>>>> USING AUTO REPLY...[0m
[33mTool Proxy[0m (to Planner Admin):

Please select the right database.

--------------------------------------------------------------------------------
[33mPlanner Admin[0m (to Tool Proxy):

I will query the database of earnings call transcripts for the most recent discussions about Google's strategy in artificial intelligence. This should provide the most current perspective on their AI strategy as discussed by their executives.

Let's go ahead with the query for the most recent information.
[32m***** Suggested tool call (call_OHN7eb0hhb6DLtHPITNZdrRS): query_random_financial_database_earnings_call *****[0m
Arguments: 
{"question":"What is Google's strategy for artificial intelligence?","quarter":"Q4"}
[32m*******

In [21]:
# Start a second conversation (at most 10 turns), this time with a comparison question.
input_text = "Can you list the competitors of NVIDIA and compare the risk factors between NVIDIA and the others?"
chat_result = user_proxy.initiate_chat(recipient=tool_proxy, message=input_text, max_turns=10)

[33mPlanner Admin[0m (to Tool Proxy):

Can you list the competitors of NVIDIA and compare the risk factors between NVIDIA and the others?

--------------------------------------------------------------------------------
[31m
>>>>>>>> NO HUMAN INPUT RECEIVED.[0m
[31m
>>>>>>>> USING AUTO REPLY...[0m
[33mTool Proxy[0m (to Planner Admin):

Please select the right database.

--------------------------------------------------------------------------------
[33mPlanner Admin[0m (to Tool Proxy):

[32m***** Suggested tool call (call_fk6CrPQvwtuaTXwRIPSgtvBT): query_database_sec *****[0m
Arguments: 
{"question":"competitors of NVIDIA","sec_form_name":"10-K"}
[32m***********************************************************************************[0m

--------------------------------------------------------------------------------
[31m
>>>>>>>> NO HUMAN INPUT RECEIVED.[0m
[31m
>>>>>>>> USING AUTO REPLY...[0m
[35m
>>>>>>>> EXECUTING FUNCTION query_database_sec...[0m
[33mTool Prox

RateLimitError: Error code: 429 - {'error': {'message': 'Request too large for gpt-4-turbo-preview in organization org-eqcEEyGEnRp8STRHrOjzEdAg on tokens per min (TPM): Limit 30000, Requested 43625. The input or output tokens must be reduced in order to run successfully. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}