# UTILS:

In [None]:
! pip install pypdf langchain sentence-transformers chromadb openai umap-learn -q

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab-only API from google.colab).
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from IPython.display import HTML, display

def set_css(*_args, **_kwargs):
  """Emit a CSS rule that makes ``<pre>`` blocks wrap long lines.

  Registered below as a ``pre_run_cell`` callback, so the style is displayed
  before each cell runs. IPython calls ``pre_run_cell`` callbacks with an
  execution-info argument; any arguments are accepted and ignored so the hook
  works whether or not the running IPython passes one.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

In [None]:
import umap
import numpy as np
from pprint import pprint

In [None]:
from pypdf import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction


def load_chroma(pdf_file, collection_name, embedding_function):
    """Build a Chroma collection from the text of a PDF.

    The text of every page is extracted and joined, split with a recursive
    character splitter (chunk_size=1000, no overlap), re-split so each chunk
    fits 256 tokens, and stored in a newly created collection on the default
    Chroma client.

    Args:
        pdf_file: PDF source handed directly to ``PdfReader``.
        collection_name: Name of the collection to create.
        embedding_function: Embedding function attached to the collection.

    Returns:
        The populated collection; chunk ids are "0", "1", ... in chunk order.

    Raises:
        ValueError: If the PDF produced no text chunks to index.
    """
    reader = PdfReader(pdf_file)
    pdf_texts = [page.extract_text().strip() for page in reader.pages]

    character_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", " ", ""],
        chunk_size=1000,
        chunk_overlap=0
    )
    character_split_texts = character_splitter.split_text('\n\n'.join(pdf_texts))

    token_splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=0, tokens_per_chunk=256)
    token_split_texts = [
        chunk
        for text in character_split_texts
        for chunk in token_splitter.split_text(text)
    ]

    # Validate before touching the database: failing later inside add() would
    # leave an empty collection behind, and create_collection would then refuse
    # the same name on a retry.
    if not token_split_texts:
        raise ValueError(f"No text could be extracted from {pdf_file!r}; nothing to index.")

    chroma_client = chromadb.Client()  # Default chroma client
    chroma_collection = chroma_client.create_collection(collection_name,
                                                        embedding_function=embedding_function)

    ids = [str(i) for i in range(len(token_split_texts))]  # one id per text chunk
    chroma_collection.add(ids=ids, documents=token_split_texts)

    return chroma_collection

# 1. INSTANTIATE LLM:

In [None]:
import os
from google.colab import userdata
from openai import OpenAI

# Copy the secrets fetched through Colab's userdata into environment variables.
# Note the Hugging Face secret is looked up as 'HF_TOKEN' but exported under
# the name HUGGINGFACE_API_KEY.
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')
os.environ["HUGGINGFACE_API_KEY"] = userdata.get('HF_TOKEN')

# No key is passed explicitly; presumably the client picks up OPENAI_API_KEY
# from the environment set above (confirm against the openai package docs).
openai_client = OpenAI()

# 2. CREATE CHROMA DB:

In [None]:
# Absolute path under the '/content/drive' mount point, so the location does
# not depend on the notebook's current working directory.
data_path = '/content/drive/My Drive/DATA/advanced_RAG/'

In [None]:
# Embedding function built with its default settings.
embedding_function = SentenceTransformerEmbeddingFunction()

# Index the annual report PDF into a collection named after it.
chroma_collection = load_chroma(
    pdf_file=data_path + 'microsoft_annual_report_2022.pdf',
    collection_name='microsoft_annual_report_2022',
    embedding_function=embedding_function,
)

# Number of chunks stored in the collection.
chroma_collection.count()

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

349

# 3. EXAMINING THE RETRIEVED DOCS:

In [None]:
query = "What has been the investment in research and development?"
k = 3  # how many of the retrieved documents to print

# Ask for the 10 closest chunks; documents and embeddings are both requested.
results = chroma_collection.query(
    query_texts=query,
    n_results=10,
    include=['documents', 'embeddings'],
)

# Index 0 selects the hits for the (only) query that was sent.
retrieved_documents = results['documents'][0]

for document in retrieved_documents[:k]:
    pprint(document)
    print()

('• operating expenses increased $ 1. 5 billion or 14 % driven by investments '
 'in gaming, search and news advertising, and windows marketing. operating '
 'expenses research and development ( in millions, except percentages ) 2022 '
 '2021 percentage change research and development $ 24, 512 $ 20, 716 18 % as '
 'a percent of revenue 12 % 12 % 0ppt research and development expenses '
 'include payroll, employee benefits, stock - based compensation expense, and '
 'other headcount - related expenses associated with product development. '
 'research and development expenses also include third - party development and '
 'programming costs, localization costs incurred to translate software for '
 'international markets, and the amortization of purchased software code and '
 'services content. research and development expenses increased $ 3. 8 billion '
 'or 18 % driven by investments in cloud engineering, gaming, and linkedin. '
 'sales and marketing')

('competitive in local markets an

# 4. CROSS ENCODER RE-RANKING:

In [None]:
from sentence_transformers import CrossEncoder

# Instantiate the cross-encoder by model name; it is used further down to
# score (query, document) pairs through predict().
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# Pair the query with every retrieved candidate and score each pair.
pairs = [[query, candidate] for candidate in retrieved_documents]
scores = cross_encoder.predict(pairs)

# Header followed by one score per line.
print("Scores:", *scores, sep="\n")

Scores:
0.9869343
2.644576
-0.26802915
-10.731591
-7.7066054
-5.6469965
-4.2970366
-10.933233
-7.038429
-7.324693


In [None]:
# Candidate indices sorted from highest to lowest score, one per line.
descending_indices = np.argsort(scores)[::-1]
print("New Ordering:", *descending_indices, sep="\n")

New Ordering:
1
0
2
6
5
8
9
4
3
7


# 5. RE-RANKING WITH QUERY EXPANSION:

In [None]:
# The question that every retrieved document is later scored against.
original_query = "What were the most important factors that contributed to increases in revenue?"
# Hard-coded related questions sent alongside the original one to widen
# retrieval (query expansion); they are literals, not produced by a call here.
generated_queries = [
    "What were the major drivers of revenue growth?",
    "Were there any new product launches that contributed to the increase in revenue?",
    "Did any changes in pricing or promotions impact the revenue growth?",
    "What were the key market trends that facilitated the increase in revenue?",
    "Did any acquisitions or partnerships contribute to the revenue growth?"
]

In [None]:
# Send the original question together with its expansions in a single query call.
queries = [original_query, *generated_queries]

results = chroma_collection.query(
    query_texts=queries,
    n_results=10,
    include=['documents', 'embeddings'],
)

# Kept un-indexed here: the deduplication step iterates it as a list of
# per-query document lists.
retrieved_documents = results['documents']

In [None]:
# Deduplicate the retrieved documents, keeping first-seen order.
# A plain set() would give an arbitrary order that can change between kernel
# sessions, so the positional indices printed by the re-ranking step would not
# be reproducible. dict keys keep insertion order, which makes them stable.
unique_documents = list(dict.fromkeys(
    document
    for documents in retrieved_documents
    for document in documents
))

In [None]:
# Every unique document is paired with the original (un-expanded) question.
pairs = [[original_query, candidate] for candidate in unique_documents]

In [None]:
# Cross-encoder scores for the (original_query, document) pairs; presumably
# aligned index-for-index with `pairs`, which the ordering printed later relies on.
scores = cross_encoder.predict(pairs)

In [None]:
# Header followed by one score per line.
print("Scores:", *scores, sep="\n")

Scores:
-9.80788
-3.7948625
-9.357723
-7.9171767
-7.4906545
-10.711212
-4.818485
-6.9020905
-1.1369967
-11.0792675
-4.341767
-4.6518927
-11.041576
-10.0839405
-7.754099
-10.148884
-3.7681541
-9.768025
-9.918428
-8.505107
-5.1418314
-5.2747507
-10.042843
-10.000139


In [None]:
# Document indices sorted from highest to lowest score, one per line.
descending_indices = np.argsort(scores)[::-1]
print("New Ordering:", *descending_indices, sep="\n")

New Ordering:
8
16
1
10
11
6
20
21
7
4
14
3
19
2
17
0
18
23
22
13
15
5
12
9
