In [None]:
!docker run -p "6333:6333" -p "6334:6334" --name "dsa4265-qdrant" --rm -d qdrant/qdrant:latest

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
import cohere
from dotenv import load_dotenv
import os
import nltk
nltk.download('punkt')  # Download the tokenizer models

from nltk.tokenize import word_tokenize
import os
import openai
from openai import OpenAI
from dotenv import load_dotenv


  from .autonotebook import tqdm as notebook_tqdm


In [None]:

# Load environment variables from .env file
load_dotenv()

# Retrieve the API key from the environment
COHERE_API_KEY = os.getenv("COHERE_API_KEY")
load_dotenv()

# Retrieve the API key from the environment
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")


client_gpt = OpenAI(
  api_key=OPENAI_API_KEY,  # this is also the default, it can be omitted
)

In [None]:
co = cohere.ClientV2(COHERE_API_KEY)
client = QdrantClient(url="http://localhost:6333")
client.get_collections()

In [4]:
client.create_collection(
    collection_name="summarizer_test3",
    vectors_config=VectorParams(size=1024, distance=Distance.DOT),
)

True

In [None]:
file_path = '/home/sarahgohrazer/Projects/Stocks-MultiAgent/aapl_10k_forms/apple_filings_text/000032019324000123-aapl-20240928.txt' 
with open(file_path, 'r') as file:
    text = file.read()

# Divide the text into chunks
chunk_size = 3000  # Adjust this size as needed
chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

In [7]:
model="embed-english-v3.0"

doc_embeddings = co.embed(texts=chunks,
                          model=model,
                          input_type="search_document",
                          embedding_types=['float'])

In [8]:
query ="""
what is the ticker symbol of this company?
"""

In [22]:
points = []
for idx, (embedding, doc) in enumerate(zip(doc_embeddings.embeddings.float_, chunks)):
    point = PointStruct(
        id=idx,
        vector=embedding,
        payload={"document": doc}
    )
    points.append(point)

In [11]:
operation_info = client.upsert(
    collection_name="summarizer_test3",
    points=points
)

In [9]:
query_embeddings = co.embed(texts=[query],
                          model=model,
                          input_type="search_query",
                          embedding_types=['float'])

In [10]:
search_result = client.query_points(
    collection_name="summarizer_test3", query=query_embeddings.embeddings.float_[0], limit=10
).points

In [None]:
for hit in search_result:
    print(f"ID: {hit.id}, Document: {hit.payload['document']}, Score: {hit.score}")

ID: 5, Document: umber, including area code)
Securities registered pursuant to Section 12(b) of the Act:
Title of each class
Trading symbol(s)
Name of each exchange on which registered
Common Stock, $0.00001 par value per share
AAPL
The Nasdaq Stock Market LLC
0.000% Notes due 2025
—
The Nasdaq Stock Market LLC
0.875% Notes due 2025
—
The Nasdaq Stock Market LLC
1.625% Notes due 2026
—
The Nasdaq Stock Market LLC
2.000% Notes due 2027
—
The Nasdaq Stock Market LLC
1.375% Notes due 2029
—
The Nasdaq Stock Market LLC
3.050% Notes due 2029
—
The Nasdaq Stock Market LLC
0.500% Notes due 2031
—
The Nasdaq Stock Market LLC
3.600% Notes due 2042
—
The Nasdaq Stock Market LLC
Securities registered pursuant to Section 12(g) of the Act:  None
Indicate by check mark if the Registrant is a well-known seasoned issuer, as defined in Rule 405 of the Securities Act.
Yes
☒
No
☐
Indicate by check mark if the Registrant is not required to file reports pursuant to Section 13 or Section 15(d) of the Act.
Y

In [12]:
document_list = [point.payload['document'] for point in search_result]

document_list

['umber, including area code)\nSecurities registered pursuant to Section 12(b) of the Act:\nTitle of each class\nTrading symbol(s)\nName of each exchange on which registered\nCommon Stock, $0.00001 par value per share\nAAPL\nThe Nasdaq Stock Market LLC\n0.000% Notes due 2025\n—\nThe Nasdaq Stock Market LLC\n0.875% Notes due 2025\n—\nThe Nasdaq Stock Market LLC\n1.625% Notes due 2026\n—\nThe Nasdaq Stock Market LLC\n2.000% Notes due 2027\n—\nThe Nasdaq Stock Market LLC\n1.375% Notes due 2029\n—\nThe Nasdaq Stock Market LLC\n3.050% Notes due 2029\n—\nThe Nasdaq Stock Market LLC\n0.500% Notes due 2031\n—\nThe Nasdaq Stock Market LLC\n3.600% Notes due 2042\n—\nThe Nasdaq Stock Market LLC\nSecurities registered pursuant to Section 12(g) of the Act:  None\nIndicate by check mark if the Registrant is a well-known seasoned issuer, as defined in Rule 405 of the Securities Act.\nYes\n☒\nNo\n☐\nIndicate by check mark if the Registrant is not required to file reports pursuant to Section 13 or Sect

checking token limit, max 4096 tokens in each document

In [None]:
# Assuming document_list contains text-based documents (strings)
tokens_per_document = [len(word_tokenize(doc)) for doc in document_list]

# Printing the number of tokens for each document
print(tokens_per_document)

## max 4096 tokens in each document 

[493, 525, 506, 491, 497, 511, 498, 509, 484, 488]


[nltk_data] Downloading package punkt to
[nltk_data]     /home/sarahgohrazer/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
rerank_results = co.rerank(
    model="rerank-english-v3.0",
    query=query,
    documents=document_list,
    top_n=3,
    return_documents=True
)

In [17]:
# print(dir(rerank_results))
top_documents = [item.document.text for item in rerank_results.results]
print(top_documents)
print(len(top_documents))

['any has opposed, is pending before the California District Court. On September 30, 2024, the Company filed a motion with the California District Court to narrow or vacate the injunction. The Company believes it has substantial defenses and intends to vigorously defend itself.\nOther Legal Proceedings\nThe Company is subject to other legal proceedings and claims that have not been fully resolved and that have arisen in the ordinary course of business. The Company settled certain matters during the fourth quarter of 2024 that did not individually or in the aggregate have a material impact on the Company’s financial condition or operating results. The outcome of litigation is inherently uncertain. If one or more legal matters were resolved against the Company in a reporting period for amounts above management’s expectations, the Company’s financial condition and operating results for that reporting period could be materially adversely affected.\nItem 4.\xa0\xa0\xa0\xa0Mine Safety Disclo

send to open ai for query

In [20]:
# Combine the top documents into a single prompt
combined_documents = "\n\n".join(top_documents[:3])  # Use top 3 or however many you want

# Prepare the prompt for the combined documents
prompt = f"Query: {query}\n\nDocuments: {combined_documents}\n\nAnswer:"

# Generate a response using the OpenAI GPT model
response = client_gpt.responses.create(
    model="gpt-3.5-turbo",  # Use the desired OpenAI model
    input=prompt,  # The prompt for the LLM
)

# Output the answer
print(response.output_text.strip())


The ticker symbol of the company mentioned in the document is AAPL.
