In [88]:
from dotenv import load_dotenv
import os
import cohere
from qdrant_client import QdrantClient
from langchain_qdrant import QdrantVectorStore
from langchain_cohere import CohereEmbeddings
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.documents import Document
from json import loads
from langchain_core.output_parsers import JsonOutputParser
from typing import Dict, List, Optional, Any, Tuple
from pydantic import BaseModel, Field

In [87]:
# --- Configuration: API clients, model handles and shared notebook state ---
# Secrets are read from the environment (populated from a local .env file).
load_dotenv()
COHERE_API_KEY = os.getenv("COHERE_API_KEY")
cohere_client = cohere.ClientV2(COHERE_API_KEY)
cohere_model = "embed-english-v3.0"

openai_api_key = os.getenv("OPENAI_API_KEY")
llm = ChatOpenAI(
    temperature=0, model="gpt-4o", api_key=openai_api_key
)

qdrant_client = QdrantClient(url="http://localhost:6333")

# Collections the router cell may choose from; the first entry is the
# fallback when the LLM's answer cannot be matched.
collection_names = [
    "financial_news",
    "earnings_calls",
    "aapl_10k_10q_forms",
]

# Shared state: filled in by the entity-extraction cell.
# This used to be `Field(default_factory=list, ...)`, but `Field` only has
# meaning inside a pydantic model; at module level it leaves a FieldInfo
# object here, which is truthy and not iterable as a list of entities.
financial_entities: List[Dict] = []

# The parser is only used for `.parse()` on the raw LLM text, so no schema is
# required. `pydantic_object` expects a model class, not a FieldInfo.
output_parser = JsonOutputParser()

In [38]:
# The user question driving the whole pipeline. The leading and trailing
# newlines are part of the value and are kept exactly as before.
query = "\nwhat is the ticker symbol of this company?\n"

In [39]:
# Ask the LLM which Qdrant collection is the best match for the query.
prompt = ChatPromptTemplate.from_template(
                """You are a financial data assistant that helps determine which data source to query.
                
                Available collections:
                - financial_news: Recent financial news articles
                - aapl_10k_10q_forms: SEC filings including 10-K and 10-Q forms
                - earnings_calls: Transcripts from company earnings calls
                
                User query: {query}
                
                Based on this query, which ONE collection should I search to provide the most relevant information?
                Reply with ONLY ONE of: "financial_news", "aapl_10k_10q_forms", or "earnings_calls".
                """
            )

chain = prompt | llm | StrOutputParser()
collection = chain.invoke({"query": query})

# Show the raw model reply so routing decisions can be inspected.
print(collection)

# Normalise before validating: the prompt lists the names in double quotes,
# so the model may echo them quoted or padded with whitespace. Without this,
# a correct-but-quoted answer would silently fall through to the default.
collection = collection.strip().strip("\"'").strip()

# Validate the collection name
if collection not in collection_names:
    collection = collection_names[0]  # Default to first collection

# Update state
collection_choice = collection

aapl_10k_10q_forms


In [113]:
# Embed the query with Cohere and fetch the nearest chunks from the chosen
# Qdrant collection.
query_embeddings = cohere_client.embed(
    texts=[query],
    model=cohere_model,
    input_type="search_query",
    embedding_types=["float"],
)

response = qdrant_client.query_points(
    collection_name=collection_choice,
    query=query_embeddings.embeddings.float_[0],
    limit=10,
    with_payload=True,
    with_vectors=False,
).points

# Convert each hit into a LangChain Document: the "document" payload field is
# the text, every other payload field becomes metadata.
docs = []
for point in response:
    # A point's payload is optional in Qdrant; treat a missing one as empty
    # instead of failing on `None.keys()`.
    payload = point.payload or {}
    print(payload.keys())
    # `or ""` also covers a payload that stores an explicit null document.
    content = payload.get("document") or ""
    metadata = {k: v for k, v in payload.items() if k != "document"}
    docs.append(Document(page_content=content, metadata=metadata))

context = docs
source_documents = [
    {
        "content": doc.page_content or "No content available",
        "metadata": doc.metadata,
    }
    for doc in docs
]

# List the source file of every hit. Only one collection's payload schema is
# visible in this notebook, so do not assume "file_name" is always present.
for source in source_documents:
    print(source["metadata"].get("file_name", "<unknown file>"))

dict_keys(['document', 'file_name', 'chunk_index'])
dict_keys(['document', 'file_name', 'chunk_index'])
dict_keys(['document', 'file_name', 'chunk_index'])
dict_keys(['document', 'file_name', 'chunk_index'])
dict_keys(['document', 'file_name', 'chunk_index'])
dict_keys(['document', 'file_name', 'chunk_index'])
dict_keys(['document', 'file_name', 'chunk_index'])
dict_keys(['document', 'file_name', 'chunk_index'])
dict_keys(['document', 'file_name', 'chunk_index'])
dict_keys(['document', 'file_name', 'chunk_index'])
000032019324000123-aapl-20240928.txt
000032019324000069-aapl-20240330.txt
000032019324000123-aapl-20240928.txt
000032019324000081-aapl-20240629.txt
000032019323000106-aapl-20230930.txt
000032019324000081-aapl-20240629.txt
000032019324000069-aapl-20240330.txt
000032019324000069-aapl-20240330.txt
000032019323000106-aapl-20230930.txt
000032019324000123-aapl-20240928.txt


In [63]:
# Preview the retrieved chunks as one text blob, separated by blank lines.
context_text = "\n\n".join(doc.page_content for doc in context)
print(context_text)

umber, including area code)
Securities registered pursuant to Section 12(b) of the Act:
Title of each class
Trading symbol(s)
Name of each exchange on which registered
Common Stock, $0.00001 par value per share
AAPL
The Nasdaq Stock Market LLC
0.000% Notes due 2025
—
The Nasdaq Stock Market LLC
0.875% Notes due 2025
—
The Nasdaq Stock Market LLC
1.625% Notes due 2026
—
The Nasdaq Stock Market LLC
2.000% Notes due 2027
—
The Nasdaq Stock Market LLC
1.375% Notes due 2029
—
The Nasdaq Stock Market LLC
3.050% Notes due 2029
—
The Nasdaq Stock Market LLC
0.500% Notes due 2031
—
The Nasdaq Stock Market LLC
3.600% Notes due 2042
—
The Nasdaq Stock Market LLC
Securities registered pursuant to Section 12(g) of the Act:  None
Indicate by check mark if the Registrant is a well-known seasoned issuer, as defined in Rule 405 of the Securities Act.
Yes
☒
No
☐
Indicate by check mark if the Registrant is not required to file reports pursuant to Section 13 or Section 15(d) of the Act.
Yes
☐
No
☒
Indicat

In [89]:
# Extract structured financial entities from the retrieved chunks.
context_text = "\n\n".join(doc.page_content for doc in context)

prompt = ChatPromptTemplate.from_template(
    """You are a financial entity extraction specialist.
    
    Extract key financial entities from the following financial text:
    
    {context}
    
    Extract and return a JSON array of objects with the following properties:
    - entity_type: The type of entity (e.g., company, metric, stock_symbol, financial_term, person, date)
    - entity_name: The name of the entity
    - value: Any associated value or metric (if applicable)
    If null, return as "".
    
    Format your response as a valid JSON array, nothing else.
    """
)

chain = prompt | llm | StrOutputParser()
result = chain.invoke({"context": context_text})
entities = output_parser.parse(result)

# The answer-generation cell calls `.get(...)` on every entity, so make sure
# the state is always a list of dicts even when the model deviates from the
# requested shape (a single object, or stray non-object items in the array).
if isinstance(entities, dict):
    entities = [entities]
elif not isinstance(entities, list):
    entities = []
entities = [entity for entity in entities if isinstance(entity, dict)]

# Update state
financial_entities = entities

In [109]:
# Inspect the entities stored by the extraction cell.
print(financial_entities)

[{'entity_type': 'company', 'entity_name': 'Apple Inc.', 'value': ''}, {'entity_type': 'stock_symbol', 'entity_name': 'AAPL', 'value': ''}, {'entity_type': 'exchange', 'entity_name': 'The Nasdaq Stock Market LLC', 'value': ''}, {'entity_type': 'financial_term', 'entity_name': 'Common Stock', 'value': '$0.00001 par value per share'}, {'entity_type': 'financial_term', 'entity_name': '0.000% Notes due 2025', 'value': ''}, {'entity_type': 'financial_term', 'entity_name': '0.875% Notes due 2025', 'value': ''}, {'entity_type': 'financial_term', 'entity_name': '1.625% Notes due 2026', 'value': ''}, {'entity_type': 'financial_term', 'entity_name': '2.000% Notes due 2027', 'value': ''}, {'entity_type': 'financial_term', 'entity_name': '1.375% Notes due 2029', 'value': ''}, {'entity_type': 'financial_term', 'entity_name': '3.050% Notes due 2029', 'value': ''}, {'entity_type': 'financial_term', 'entity_name': '0.500% Notes due 2031', 'value': ''}, {'entity_type': 'financial_term', 'entity_name': 

In [117]:
# Preview each retrieved chunk labelled with its source file name.
context_snippets = [
    f"Filename {doc.metadata['file_name']}:\n{doc.page_content}\n"
    for doc in context
]

context_snippets

['Filename 000032019324000123-aapl-20240928.txt:\number, including area code)\nSecurities registered pursuant to Section 12(b) of the Act:\nTitle of each class\nTrading symbol(s)\nName of each exchange on which registered\nCommon Stock, $0.00001 par value per share\nAAPL\nThe Nasdaq Stock Market LLC\n0.000% Notes due 2025\n—\nThe Nasdaq Stock Market LLC\n0.875% Notes due 2025\n—\nThe Nasdaq Stock Market LLC\n1.625% Notes due 2026\n—\nThe Nasdaq Stock Market LLC\n2.000% Notes due 2027\n—\nThe Nasdaq Stock Market LLC\n1.375% Notes due 2029\n—\nThe Nasdaq Stock Market LLC\n3.050% Notes due 2029\n—\nThe Nasdaq Stock Market LLC\n0.500% Notes due 2031\n—\nThe Nasdaq Stock Market LLC\n3.600% Notes due 2042\n—\nThe Nasdaq Stock Market LLC\nSecurities registered pursuant to Section 12(g) of the Act:  None\nIndicate by check mark if the Registrant is a well-known seasoned issuer, as defined in Rule 405 of the Securities Act.\nYes\n☒\nNo\n☐\nIndicate by check mark if the Registrant is not require

In [119]:
# Inspect the content/metadata dicts built by the retrieval cell.
print(source_documents)

[{'content': 'umber, including area code)\nSecurities registered pursuant to Section 12(b) of the Act:\nTitle of each class\nTrading symbol(s)\nName of each exchange on which registered\nCommon Stock, $0.00001 par value per share\nAAPL\nThe Nasdaq Stock Market LLC\n0.000% Notes due 2025\n—\nThe Nasdaq Stock Market LLC\n0.875% Notes due 2025\n—\nThe Nasdaq Stock Market LLC\n1.625% Notes due 2026\n—\nThe Nasdaq Stock Market LLC\n2.000% Notes due 2027\n—\nThe Nasdaq Stock Market LLC\n1.375% Notes due 2029\n—\nThe Nasdaq Stock Market LLC\n3.050% Notes due 2029\n—\nThe Nasdaq Stock Market LLC\n0.500% Notes due 2031\n—\nThe Nasdaq Stock Market LLC\n3.600% Notes due 2042\n—\nThe Nasdaq Stock Market LLC\nSecurities registered pursuant to Section 12(g) of the Act:  None\nIndicate by check mark if the Registrant is a well-known seasoned issuer, as defined in Rule 405 of the Securities Act.\nYes\n☒\nNo\n☐\nIndicate by check mark if the Registrant is not required to file reports pursuant to Sectio

In [118]:
def _describe_entity(entity):
    """Render one extracted entity as a bullet line (without trailing newline)."""
    line = f"- {entity.get('entity_name', 'Unknown')} ({entity.get('entity_type', 'Unknown')})"
    value = entity.get('value')
    if value:
        line += f": {value}"
    sentiment = entity.get('sentiment')
    if sentiment:
        line += f" [{sentiment}]"
    return line


# Label every retrieved chunk with its source file and merge into one context.
context_snippets = [
    f"File {doc.metadata['file_name']}:\n{doc.page_content}\n"
    for doc in context
]
context_text = "\n".join(context_snippets)

# Format entity information: a header followed by one bullet per entity,
# or an empty string when nothing was extracted.
entity_info = ""
if financial_entities:
    entity_info = "Key entities identified:\n"
    for entity in financial_entities:
        entity_info = entity_info + _describe_entity(entity) + "\n"

prompt = ChatPromptTemplate.from_template(
    """You are a financial analysis assistant that provides accurate information based on the retrieved documents.
    
    User query: {query}

    Collection choice: {collection_choice}
    
    Retrieved information:
    {context}
    
    {entity_info}
    
    Based on the retrieved information, provide a comprehensive response to the user's query.
    Be specific and cite information from the documents where appropriate.
    If the information is not sufficient to answer the query completely, acknowledge the limitations.
    
    Response:
    """
)

# Run the prompt through the LLM and keep the plain-text answer.
chain = prompt | llm | StrOutputParser()
prompt_inputs = {
    "query": query,
    "collection_choice": collection_choice,
    "context": context_text,
    "entity_info": entity_info,
}
response = chain.invoke(prompt_inputs)

# Update state
rag_response = response

print(rag_response)

The ticker symbol for Apple Inc. is "AAPL." This information is found in the retrieved documents, specifically in the sections listing securities registered pursuant to Section 12(b) of the Securities Exchange Act. The common stock of Apple Inc., with a par value of $0.00001 per share, is traded under the symbol "AAPL" on The Nasdaq Stock Market LLC.
