#### Steps
#### 1. Create VectorDB
#### 2. Design and register tools to access VectorDB
#### 3. Use Autogen to design roles (userproxy and toolproxy) for accessing infomation

In [1]:
from langchain_chroma import Chroma
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings

emb_fn = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")


sec_filings_md_db = Chroma(persist_directory="./sec-filings-md-db_v2",
                                embedding_function=emb_fn,
                                collection_name="sec_filings_md_v2")

  from tqdm.autonotebook import tqdm, trange


In [2]:
query = "What are the risk factors differences bewteen AMD and NVDA"
docs = sec_filings_md_db.similarity_search(query)
print(docs)

[Document(page_content="AMD's ability to access capital markets; AMD's ability to obtain sufficient external financing on favorable terms, or at *all;*", metadata={'filing_type': '10-Q1', 'is_table': False, 'ticker': 'AMD', 'year': '2024'}), Document(page_content='Other than the risk factors listed below, there have been no material changes from the risk factors previously described under Item 1A of our Annual Report on Form 10-K for the fiscal year ended January 28, 2024. Purchasing or owning NVIDIA common stock involves investment risks including, but not limited to, the risks described in Item 1A of our Annual Report on Form 10-K for the fiscal year ended January 28, 2024, and below', metadata={'filing_type': '10-Q2', 'is_table': False, 'ticker': 'NVDA', 'year': '2024'}), Document(page_content='© 2024 NVIDIA Corporation. All rights reserved. The following discussion and analysis of our financial condition and results of operations should be read in conjunction with the risk factors 

In [3]:
print(docs[0].metadata)

{'filing_type': '10-Q1', 'is_table': False, 'ticker': 'AMD', 'year': '2024'}


In [4]:
earnings_call_db = Chroma(persist_directory="./earnings-call-db_v2", embedding_function=emb_fn, collection_name="earnings_call_v2")

In [5]:
# Retrieve all entries (this may vary depending on your actual ChromaDB implementation)
entries = earnings_call_db.get(include=["metadatas"])
print(entries)


{'ids': ['00139f04-2f6b-4319-abd2-315eee1132de', '004ccb02-52d0-447e-a7fb-c17d59356890', '00608134-0bef-4cb0-a4f1-3939317e8378', '00a52f24-f699-4acc-bd35-ed5adc05ac2e', '00bb4211-22ac-4e33-a78d-241b7dba6b34', '00e2b3c0-83b4-4d2f-a82c-c851e0269dad', '00f0044d-ce67-46e0-8c93-7917589fe824', '010eaae7-5e27-4b48-ac4b-cf65e5363794', '01288552-f62f-4890-85b8-ea1825f714fc', '01755cb3-1468-414f-bc9c-eed4e1befc3b', '017fcc97-566b-43ac-80b3-6247251247e6', '018bd295-4abb-4386-8b7a-a5d05db08376', '01bbfb9d-ddac-47a2-9b65-8f70cde575ad', '01cde09a-9366-45f1-8d8d-8f654d94e62a', '01da3671-3d73-4f2b-a3f3-466d33fb4040', '022536bc-51e6-4b80-885c-99680edfd519', '022df573-8228-4d7f-a2ed-75074e5a29b2', '028e649b-7b40-47ac-9acd-6c42814b5dc5', '02caf4f6-c6b0-43a1-9365-dab32dce11e8', '02d35f7d-145c-43bb-ab34-dc5fe8d56015', '031822bd-b217-47f7-8bf2-de3ba3cc5891', '031d7bc0-e231-4124-a9f6-9db6173f6c15', '03454596-d869-4e9a-b135-9129450f45c0', '03791b01-fc51-4221-a726-ff42ae871a66', '03a10c10-2902-42c2-9b31-4a1d57

In [6]:
speakers_list_1 = []
speakers_list_2 = []
speakers_list_3 = []
speakers_list_4 = []

for meta in entries['metadatas']:
    for key, value in meta.items():
        if key == "quarter" and value == "Q1":
            speakers_list_1.append(meta['speaker'])
        elif key == "quarter" and value == "Q2":
            speakers_list_2.append(meta['speaker'])
        elif key == "quarter" and value == "Q3":
            speakers_list_3.append(meta['speaker'])
        elif key == "quarter" and value == "Q4":
            speakers_list_4.append(meta['speaker'])


In [7]:
quarter_speaker_dict = {
        "Q1":speakers_list_1,
        "Q2":speakers_list_2,
        "Q3":speakers_list_3,
        "Q4":speakers_list_4
    }

In [8]:
sec_form_names = ['10-K', '10-Q4', '10-Q3', '10-Q2', '10-Q1']

##### Design tools

In [9]:

global FROM_MARKDOWN
FROM_MARKDOWN = True


def query_database_earnings_call(
        question: str,
        quarter: str
    )->str:
        """This tool will query the earnings call transcripts database for a given question and quarter and it will retrieve
        the relevant text along from the earnings call and the speaker who addressed the relevant documents. This tool helps in answering questions
        from the earnings call transcripts.

        Args:
            question (str): _description_. Question to query the database for relevant documents.
            quarter (str): _description_. the financial quarter that is discussed in the question and possible options are Q1, Q2, Q3, Q4

        Returns:
            str: relevant text along from the earnings call and the speaker who addressed the relevant documents
        """
        assert quarter in ["Q1", "Q2", "Q3", "Q4"], "The quarter should be from Q1, Q2, Q3, Q4"

        req_speaker_list = []
        quarter_speaker_list = quarter_speaker_dict[quarter]

        for sl in quarter_speaker_list:
            if sl in question or sl.lower() in question:
                req_speaker_list.append(sl)
        if len(req_speaker_list) == 0:
            req_speaker_list = quarter_speaker_list

        relevant_docs = earnings_call_db.similarity_search(
            question,
            k=5,
            filter={
                "$and":[
                    {
                        "quarter":{"$eq":quarter}
                    },
                    {
                        "speaker":{"$in":req_speaker_list}
                    }
                ]
            }
        )

        speaker_releavnt_dict = {}
        for doc in relevant_docs:
            speaker = doc.metadata['speaker']
            speaker_text = doc.page_content
            if speaker not in speaker_releavnt_dict:
                speaker_releavnt_dict[speaker] = speaker_text
            else:
                speaker_releavnt_dict[speaker] += " "+speaker_text

        relevant_speaker_text = ""
        for speaker, text in speaker_releavnt_dict.items():
            relevant_speaker_text += speaker + ": "
            relevant_speaker_text += text + "\n\n"

        return relevant_speaker_text




def query_database_markdown_sec(
            question: str,
            sec_form_name: str
    )->str:
  assert sec_form_name in sec_form_names, f'The search form type should be in {sec_form_names}'

  relevant_docs = sec_filings_md_db.similarity_search(
      question,
      k=3,
      filter={
          "filing_type":{"$eq":sec_form_name}
      }
  )
  # print(relevant_docs)
  relevant_section_text = ""
  for relevant_text in relevant_docs:
      relevant_section_text += relevant_text.page_content + "\n\n"

  return relevant_section_text

In [10]:
def query_database_sec(
            question: str,
            sec_form_name: str
    )->str:
        """This tool will query the SEC Filings database for a given question and form name, and it will retrieve
        the relevant text along from the SEC filings and the section names. This tool helps in answering questions
        from the sec filings.

        Args:
            question (str): _description_. Question to query the database for relevant documents
            sec_form_name (str): _description_. SEC FORM NAME that the question is talking about. It can be 10-K for yearly data and 10-Q for quarterly data. For quarterly data, it can be 10-Q2 to represent Quarter 2 and similarly for other quarters.

        Returns:
            str: Relevant context for the question from the sec filings
        """
        if not FROM_MARKDOWN:
          return "No data available"
        elif FROM_MARKDOWN:
          return query_database_markdown_sec(question,sec_form_name)

In [11]:
sec_form_system_msg = ""
for sec_form in sec_form_names:
    if sec_form == "10-K":
        sec_form_system_msg+= "10-K for yearly data, "
    elif "10-Q" in sec_form:
        quarter = sec_form[-1]
        sec_form_system_msg+= f"{sec_form} for Q{quarter} data, "
sec_form_system_msg = sec_form_system_msg[:-2]

earnings_call_system_message = ", ".join(["Q1", "Q2", "Q3", "Q4"])

system_msg = f"""You are a helpful financial assistant and your task is to select the sec_filings or earnings_call or financial_books to best answer the question.
You can use query_database_sec(question,sec_form) by passing question and relevant sec_form names like {sec_form_system_msg}
or you can use query_database_earnings_call(question,quarter) by passing question and relevant quarter names with possible values {earnings_call_system_message}
or you can use query_database_books(question) to get relevant documents from financial textbooks about valuation and investing philosophies. When you are ready to end the coversation, reply TERMINATE"""

#### Start using AutoGen below

In [18]:
from autogen import ConversableAgent
import os
import json
with open('../../config.json', 'r') as file:
    config = json.load(file)


llm_config = {"model":"gpt-3.5-turbo"}

os.environ['OPENAI_API_KEY'] = config['NLP_Project_KEY']
user_proxy = ConversableAgent(
    name = "Planner Admin",
    system_message=system_msg,
    code_execution_config=False,
    llm_config=llm_config,
    human_input_mode="NEVER",
    is_termination_msg=lambda msg: "TERMINATE" in msg["content"],
)
tool_proxy = ConversableAgent(
  name="Tool Proxy",
  system_message="Analyze the response from user proxy and decide whether the suggested database is suitable "
  ". Answer in simple yes or no",
  llm_config=False,
  # is_termination_msg=lambda msg: "exit" in msg.get("content",""),
  default_auto_reply="Please select the right database.",
  human_input_mode="ALWAYS",
  )

tools_dict = {
        "sec":[query_database_sec,"Tool to query SEC filings database"],
        "earnings_call": [query_database_earnings_call, "Tool to query earnings call transcripts database"],
    }

##### register functions

In [19]:
from autogen import register_function

for tool_name,tool in tools_dict.items():
  register_function(
      tool[0],
      caller=user_proxy,
      executor=tool_proxy,
      name = tool[0].__name__,
      description=tool[1]
  )

##### Start querying the agents with task

In [24]:
input_text = "What is the Net Revenue for AMD on March 30, 2024?"
chat_result = user_proxy.initiate_chat(
        recipient=tool_proxy,
        message=input_text,
        max_turns=10
    )

[33mPlanner Admin[0m (to Tool Proxy):

What is the Net Revenue for AMD on March 30, 2024?

--------------------------------------------------------------------------------
[31m
>>>>>>>> NO HUMAN INPUT RECEIVED.[0m
[31m
>>>>>>>> USING AUTO REPLY...[0m
[33mTool Proxy[0m (to Planner Admin):

Please select the right database.

--------------------------------------------------------------------------------
[33mPlanner Admin[0m (to Tool Proxy):

Could you please specify the specific document or database you would like me to search for the Net Revenue of AMD on March 30, 2024? You can choose from SEC filings, earnings call transcripts, or financial textbooks.

--------------------------------------------------------------------------------
[31m
>>>>>>>> NO HUMAN INPUT RECEIVED.[0m
[31m
>>>>>>>> USING AUTO REPLY...[0m
[33mTool Proxy[0m (to Planner Admin):

Please select the right database.

--------------------------------------------------------------------------------
[33mP

In [21]:
input_text = "Can you list the competitors of NVIDIA and compare the risk factors between NVIDIA and the others?"
chat_result = user_proxy.initiate_chat(
        recipient=tool_proxy,
        message=input_text,
        max_turns=10
    )

[33mPlanner Admin[0m (to Tool Proxy):

Can you list the competitors of NVIDIA and compare the risk factors between NVIDIA and the others?

--------------------------------------------------------------------------------
[31m
>>>>>>>> NO HUMAN INPUT RECEIVED.[0m
[31m
>>>>>>>> USING AUTO REPLY...[0m
[33mTool Proxy[0m (to Planner Admin):

Please select the right database.

--------------------------------------------------------------------------------
[33mPlanner Admin[0m (to Tool Proxy):

I will need to query the SEC filings database to retrieve information about NVIDIA and its competitors' risk factors. Let me do that for you.
[32m***** Suggested tool call (call_o8lEujCfifPABWYmgwtmA8gW): query_database_sec *****[0m
Arguments: 
{"question":"List the risk factors for NVIDIA and its competitors","sec_form_name":"10-K"}
[32m***********************************************************************************[0m

-----------------------------------------------------------------