In [None]:
# ! gdown 1yVbhJWh4L1unDbDT4APOusTXlwic7aE9
# ! gdown 1-F1DO6UNkz3ndjSV4kSzu7zcaQJtf5De
# ! gdown 1LouWfzlgol2J2EdqFy0YVIPh93uSq2ON

In [1]:
import os 
from dotenv import load_dotenv # python -m pip install python-dotenv
load_dotenv()

True

In [2]:
from langchain.document_loaders import PyPDFLoader 
# pip install --upgrade langchain
# pip install -U langchain-community
# pip install pypdf

In [3]:
loaders = [
    PyPDFLoader("2022_Annual_Report.pdf"),
    # PyPDFLoader("SQL Revision Notes.pdf"),
    PyPDFLoader("2022_Annual_Report.pdf"),
]

docs = []
for loader in loaders:
    docs.extend(loader.load())

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=1500,
    chunk_overlap=150,
)

In [5]:
splits = text_splitter.split_documents(docs)

In [7]:
# Embeddings

from langchain_community.embeddings import HuggingFaceBgeEmbeddings
# pip install -U sentence-transformers

model_name = "BAAI/bge-small-en"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}
embedding = HuggingFaceBgeEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

# from langchain_openai import AzureOpenAIEmbeddings

# embedding = AzureOpenAIEmbeddings(
#     api_key=os.getenv("AZURE_OPENAI_API_KEY"),
#     azure_endpoint=os.getenv("AZURE_BASE_URL"),
#     api_version = "2023-03-15-preview",
#     model="text-embedding-3-small",
#     )

  from tqdm.autonotebook import tqdm, trange


In [None]:
from langchain_community.vectorstores import FAISS # pip install faiss-cpu

db = FAISS.from_documents(
    splits,
    embedding
)

In [None]:
query = "what is discussed about metrics "
docs = db.similarity_search_with_score(query)

In [None]:
docs

In [None]:
docs_mmr = db.max_marginal_relevance_search(query, k=4)
docs_mmr

In [9]:
loaders = [
    PyPDFLoader("PA - Consolidated lecture notes.pdf"),
    PyPDFLoader("SQL Revision Notes.pdf"),
    PyPDFLoader("2022_Annual_Report.pdf")
]

docs = []
for loader in loaders:
    docs.extend(loader.load())

splits = text_splitter.split_documents(docs)

split_texts = [str(doc.page_content) for doc in splits]

In [10]:
len(split_texts)

332

In [12]:
import chromadb
from chromadb.utils.embedding_functions import create_langchain_embedding

ef = create_langchain_embedding(embedding)

In [13]:
chroma_client = chromadb.Client()
client = chromadb.PersistentClient(path="database")

chroma_collection = chroma_client.create_collection("chroma_check_1", embedding_function=ef)
# This method creates a new collection within ChromaDB.
# Any document added to this collection will be processed using the ef function to generate its embedding.

In [14]:
ids = [str(i) for i in range(len(split_texts))] #creating ids for all splits

chroma_collection.add(ids=ids, documents=split_texts)
chroma_collection.count() # verify that all the documents were added successfully.


332

In [15]:
query = "What was the total revenue?"

results = chroma_collection.query(query_texts=[query], n_results=5)
retrieved_documents = results['documents'][0]

for document in retrieved_documents:
    print((document))
    print('\n')

47 FINANCIAL STATEMENTS AND SUPPLEMENTARY DATA  
INCOME STATEMENTS  
  
(In millions, except per share amounts)          
        
Year Ended June  30, 2022  2021  2020  
        
Revenue:        
Product  $    72,732   $    71,074   $    68,041   
Service and other   125,538    97,014    74,974         
Total revenue   198,270    168,088    143,015         
Cost of revenue:        
Product   19,064    18,219    16,017   
Service and other   43,586    34,013    30,061         
Total cost of revenue   62,650    52,232    46,078         
Gross margin   135,620    115,856    96,937   
Research and development   24,512    20,716    19,269   
Sales and marketing   21,825    20,117    19,598   
General and administrative   5,900    5,107    5,111         
Operating income   83,383    69,916    52,959   
Other income, net   333   1,186    77        
Income before income taxes   83,716    71,102    53,036   
Provision for income taxes   10,978    9,831    8,755         
Net income  $ 72,738   

In [16]:
## Using open source llama 3 model from TogetherAI 

from langchain_together import ChatTogether

llm = ChatTogether(
    api_key = os.getenv("TOGETHER_API_KEY"),
    temperature=0.0, 
    model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo"
)

In [17]:
def rag(query, retrieved_documents, llm=llm):
    information = "\n\n".join(retrieved_documents)

    messages = [
        {
            "role": "system",
            "content": "You are a helpful expert financial research assistant. Your users are asking questions about information contained in an annual report."
            "You will be shown the user's question, and the relevant information from the annual report. Answer the user's question using only this information."
        },
        {"role": "user", "content": f"Question: {query}. \n Information: {information}"}
    ]

    response = llm.invoke(messages)

    content = response.content
    return content

In [18]:
output = rag(query=query, retrieved_documents=retrieved_documents)

print((output))

The total revenue for the year ended June 30, 2022 was $198,270 million.


In [20]:
# Cross-Encoder, Re-Ranking

query = "What all areas are explored for AI"

results = chroma_collection.query(
    query_texts=query,
    n_results=8,
    include=["documents", "embeddings"])

retrieved_documents = results["documents"][0]

for document in retrieved_documents:
    print(document)
    print("")

______________________________________________________________________________

4   
Our commitment to responsibly develop and use technologies like AI is core to who we are. We put our commitment into 
practice, not only within Microsoft but by empowering our custom ers and partners to do the same and by advocating for 
policy change. We released our Responsible AI Standard, which outlines 17 goals aligned to our six AI principles and 
includes tools and practices to support them. And we share our open -source tools, inc luding the new Responsible AI 
Dashboard, to help developers building AI technologies identify and mitigate issues before deployment.  
Finally, we provide clear reporting and information on how we run our business and how we work with customers and 
partners , delivering the transparency that is central to trust. Our annual Impact Summary shares more about our progress 
and learnings across these four commitments, and our Reports Hub  provides detailed reports on our

In [21]:
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction 
from sentence_transformers import CrossEncoder # ! pip install -U sentence-transformers

cross_encoder = CrossEncoder("BAAI/bge-reranker-v2-m3")

In [22]:
pairs = [[query, d_i] for d_i in retrieved_documents]
scores = cross_encoder.predict(pairs)
print("Scores:")
for score in scores:
    print(score)

Scores:
0.000108078624
0.00054811133
1.6217e-05
0.00055333
0.00016040505
0.0022564156
0.00022766506
1.62869e-05


In [23]:
import numpy as np
print("New Ordering:")
new_ord=[]
for o in np.argsort(scores)[::-1]:
    print(o)
    new_ord.append(o)

New Ordering:
5
3
1
6
4
0
7
2


In [24]:
information=""
i=0
for n in new_ord:
    print(n)
    information = "\n\n".join(retrieved_documents[n])
    i+=1
    if(i==4):
        break #taking the first 4 out of 8

5
3
1
6


In [25]:
messages = [
        {
            "role": "system",
            "content": "You are a helpful expert AI research assistant. Your users are asking questions about information contained in an annual report."
            "You will be shown the user's question, and the relevant information from the annual report. Answer the user's question using only this information."
        },
        {"role": "user", "content": f"Question: {query}. \n Information: {information}"}
    ]

response = llm.invoke(messages).content

In [27]:
print(response)

The areas explored for AI include:

1. Offering insights into user behavior in the app, such as user spending time on help center articles.
2. Proposing the setup of A/B testing to compare user behavior and satisfaction between users with and without access to the new feature.
3. Agreeing to work on making data-driven decisions and collecting relevant data.
4. Judgment Criteria & General Framework, including:
   - Keeping this in mind when addressing business acumens questions.
   - Judgment Criteria for interviewers, including:
     • Structure - Demonstrate a systematic approach.
     • Comprehensive - Covers all important aspects.
     • Feasibility - Practical enough that it could be implemented realistically.
5. General Framework to keep in mind, including:
   - Clarify
   - Plan
   - Conclude


In [28]:
# Query Expansion
def augment_multiple_query(query, llm=llm):
    messages = [
            {
                "role": "system",
                "content": "You are a helpful expert financial research assistant. Your users are asking questions about an annual report. "
                "Suggest up to four additional related questions to help them find the information they need, for the provided question. "
                "Suggest only short questions without compound sentences. Suggest a variety of questions that cover different aspects of the topic."
                "Make sure they are complete questions, and that they are related to the original question."
                "Output one question per line. Do not number the questions."
            },
            {"role": "user", "content": query}
        ]

    response = llm.invoke(messages).content
    response = response.split("\n")
    return response


In [63]:
# original_query = "What has been the investment in AI research?"
original_query = "What all areas are explored for AI"
augmented_queries = augment_multiple_query(original_query)

for query in augmented_queries:
    print(query)

What are the company's AI research and development expenses?
Are there any AI-related partnerships or collaborations mentioned?
How does the company plan to integrate AI into its products or services?
What are the potential risks or challenges associated with the company's AI initiatives?


In [54]:
queries = [original_query] + augmented_queries

len(queries)

5

In [64]:
retrieved_documents = []
for query in queries:
    results = chroma_collection.query(query_texts=query, n_results=6, include=['documents', 'embeddings'])
    retrieved_documents.extend(results['documents'][0])

len(retrieved_documents)

30

In [65]:
pairs = [[original_query, d_i] for d_i in retrieved_documents]
scores = cross_encoder.predict(pairs)
print("Scores:")
for score in scores:
    print(score)

Scores:
0.004051035
0.0006815926
0.0022564137
0.02120277
2.78124e-05
0.00054810924
1.7655413e-05
1.8816341e-05
1.6695913e-05
0.00014322968
4.0857547e-05
2.048527e-05
2.78124e-05
1.7259448e-05
1.8152707e-05
2.1308628e-05
1.900034e-05
1.707421e-05
0.0005533279
0.00054810924
1.715824e-05
0.0022564137
0.0020823614
8.428457e-05
1.7259448e-05
2.78124e-05
1.609909e-05
1.6921736e-05
1.7714308e-05
1.7210403e-05


In [66]:
import numpy as np
print("New Ordering:")
new_ord=[]
for o in np.argsort(scores)[::-1]:
    print(o)
    new_ord.append(o)


New Ordering:
3
0
21
2
22
1
18
19
5
9
23
10
12
25
4
15
11
16
7
14
28
6
13
24
29
20
17
27
8
26


In [70]:
information=""
i=0
for n in new_ord:
  print(n)
  information = "\n\n".join(retrieved_documents[n])
  i+=1
  if(i==20):
    break #taking the first 4 out of 8

3
0
21
2
22
1
18
19
5
9
23
10
12
25
4
15
11
16
7
14


In [71]:
messages = [
        {
            "role": "system",
            "content": "You are a helpful expert AI research assistant. Your users are asking questions about information contained in an annual report."
            "You will be shown the user's question, and the relevant information from the annual report. Answer the user's question using only this information."
        },
        {"role": "user", "content": f"Question: {original_query}. \n Information: {information}"}
    ]

response = llm.invoke(messages).content

In [72]:
print(response)

The areas explored for AI include:

1. Technology-based
2. Customer-related
3. Marketing-related
4. Contract-based
