<a href="https://colab.research.google.com/github/winterForestStump/thesis/blob/main/retrieval/Retrievals_approaches.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Running on CPU

In [2]:
!pip install langchain langchain-core langchain-community --quiet
!pip install sentence_transformers FlagEmbedding chromadb --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m973.5/973.5 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m308.5/308.5 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m51.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.8/122.8 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.0/53.0 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.3/49.3 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m142.5/142.5 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.7/224.7 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━

In [3]:
import pandas as pd
from langchain.vectorstores import Chroma
from langchain.retrievers import ParentDocumentRetriever
import chromadb
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.storage import InMemoryStore
from langchain.storage._lc_store import create_kv_docstore
from langchain.storage.file_system import LocalFileStore
from tqdm import tqdm
import os

In [4]:
# Mount Google Drive at /content/drive; the database, document store and result
# files used by the following cells all live under /content/drive/MyDrive/Thesis.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Identifier of the embedding model handed to HuggingFaceBgeEmbeddings below
model_name = "BAAI/bge-small-en-v1.5"
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity

# Embedding function configured to run on CPU; it is passed to the Chroma
# vector store inside compute_approaches.
bge_embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs={'device': 'cpu'},
    encode_kwargs=encode_kwargs
)

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
# Chroma client pointed at the database directory on Google Drive
persistent_client = chromadb.PersistentClient('/content/drive/MyDrive/Thesis/chromadb')
# File-system store wrapped as a key-value docstore; compute_approaches passes it
# to ParentDocumentRetriever as `docstore`.
fs = LocalFileStore('/content/drive/MyDrive/Thesis/reports_store_location')
store = create_kv_docstore(fs)

In [7]:
# Defaults preserved from the original hard-coded locations.
_QUESTIONS_URL = "https://raw.githubusercontent.com/winterForestStump/thesis/main/questions/questions_ver2.txt"
_RESULTS_DIR = '/content/drive/MyDrive/Thesis/retrievals'


def _run_queries(retriever, queries, query_column: str, result_column: str) -> pd.DataFrame:
  '''
  Invoke the retriever once per query and stack all results into a single frame.

  Every query contributes one row per returned document: `query_column` holds the
  query text and `result_column` holds the retrieved document.
  An empty frame with both columns is returned when there are no queries.
  '''
  frames = [
      pd.DataFrame({query_column: query, result_column: retriever.invoke(query)})
      for query in tqdm(queries)
  ]
  if not frames:
    # pd.concat raises on an empty list, so build the empty result explicitly
    return pd.DataFrame(columns=[query_column, result_column])
  return pd.concat(frames, ignore_index=True)


def compute_approaches(company: str, distance: str, num: int, *,
                       questions_url: str = _QUESTIONS_URL,
                       output_dir: str = _RESULTS_DIR) -> None:
  '''
  Retrieve parent chunks for every question and save the results as JSON.

  Two result files are written to `output_dir`:
    results_approach_{company}_{distance}_1.json - questions asked as-is
    results_approach_{company}_{distance}_2.json - questions with the word
                                                   'company' replaced by `company`

  Parameters:
    company: company name; used as the metadata filter, as the replacement text in
             the questions and as part of the output file names.
    distance: distance / similarity metric name; selects the collection
              `reports_{distance}` and names the result column.
    num: number of chunks requested from the vector store (search kwarg `k`).
    questions_url: location of the fixed-width text file with one question per line.
    output_dir: directory the result files are written to (created if missing).
  '''

  # Get questions
  questions = pd.read_fwf(questions_url, names=['question'])

  # Create new column with the company name in the question.
  # regex=False: the company name is literal text, not a regex replacement template.
  questions['question_name'] = questions['question'].str.replace('company', company, regex=False)

  # Initialize collection and ChromaDB vectorstore
  collection_name = f'reports_{distance}'
  persistent_client.get_or_create_collection(collection_name)  # make sure the collection exists
  # No explicit persist() call: this function only reads from the store, and the
  # manual persistence method is deprecated for persistent Chroma clients.
  vectorstore = Chroma(client=persistent_client, collection_name=collection_name, embedding_function=bge_embeddings,
                       persist_directory='/content/drive/MyDrive/Thesis/chromadb')

  # Initialize a retriever
  parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
  child_splitter = RecursiveCharacterTextSplitter(chunk_size=256)
  big_chunks_retriever = ParentDocumentRetriever(vectorstore=vectorstore, docstore=store,
                                                 child_splitter=child_splitter, parent_splitter=parent_splitter,
                                                 search_kwargs={'filter': {'company': company}, 'k': num})

  os.makedirs(output_dir, exist_ok=True)

  # Approach 1: question without the company name; approach 2: with the company name
  for query_column, variant in (('question', 1), ('question_name', 2)):
    # Built outside the query loop so it is defined even when there are no questions
    approach = f'approach_{company}_{distance}_{variant}'
    results = _run_queries(big_chunks_retriever, questions[query_column], query_column, distance)
    results.to_json(os.path.join(output_dir, f'results_{approach}.json'))

In [8]:
# Distance metrics; each one selects the collection `reports_{distance}`
distances = ['cosine', 'ip', 'l2']
# Company names used as the metadata filter and substituted into the questions
companies = ['COCA COLA CO', 'AMAZON COM INC', 'PayPal Holdings, Inc.', 'GENERAL MILLS INC', 'Walmart Inc.', 'PEPSICO INC',
             'Kraft Heinz Co', 'Amcor plc', 'Square, Inc.', '3M CO', 'MICROSOFT CORP', 'Ulta Beauty, Inc.', 'AES CORP']
# Number of chunks requested from the retriever per question (the `num` argument)
NUM_CHUNKS = 2

# Run both retrieval approaches for every company / distance combination
for comp in companies:
  for d in distances:
    compute_approaches(comp, d, NUM_CHUNKS)

  warn_deprecated(
100%|██████████| 35/35 [07:07<00:00, 12.21s/it]
100%|██████████| 35/35 [02:30<00:00,  4.29s/it]
100%|██████████| 35/35 [02:02<00:00,  3.51s/it]
100%|██████████| 35/35 [01:34<00:00,  2.70s/it]
100%|██████████| 35/35 [03:33<00:00,  6.09s/it]
100%|██████████| 35/35 [02:12<00:00,  3.79s/it]
100%|██████████| 35/35 [02:09<00:00,  3.71s/it]
100%|██████████| 35/35 [01:35<00:00,  2.74s/it]
100%|██████████| 35/35 [01:23<00:00,  2.39s/it]
100%|██████████| 35/35 [01:17<00:00,  2.20s/it]
100%|██████████| 35/35 [01:19<00:00,  2.27s/it]
100%|██████████| 35/35 [01:21<00:00,  2.34s/it]
100%|██████████| 35/35 [01:54<00:00,  3.26s/it]
100%|██████████| 35/35 [01:24<00:00,  2.41s/it]
100%|██████████| 35/35 [01:19<00:00,  2.27s/it]
100%|██████████| 35/35 [01:12<00:00,  2.07s/it]
100%|██████████| 35/35 [01:20<00:00,  2.30s/it]
100%|██████████| 35/35 [01:24<00:00,  2.41s/it]
100%|██████████| 35/35 [02:18<00:00,  3.96s/it]
100%|██████████| 35/35 [01:47<00:00,  3.07s/it]
100%|██████████| 35/3

In [9]:
folder_path = '/content/drive/MyDrive/Thesis/retrievals/'

# NOTE: despite its name, `dataframes` holds the JSON file names, not loaded frames.
# os.listdir returns entries in arbitrary order, so sort for a reproducible listing.
dataframes = sorted(
    file_name for file_name in os.listdir(folder_path)
    if file_name.endswith('.json')
)

In [10]:
# Number of result files found; 13 companies x 3 distances x 2 approaches gives 78
len(dataframes)

78