<a href="https://colab.research.google.com/github/winterForestStump/thesis/blob/main/retrieval/Retrievals_l2_with_reranker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Running on GPU: a later cell loads the embedding model with device 'cuda',
# so a CUDA-capable runtime has to be active before the notebook is run.

In [None]:
!pip install langchain langchain-core langchain-community --quiet
!pip install sentence_transformers FlagEmbedding chromadb --quiet

In [None]:
import pandas as pd
from langchain.vectorstores import Chroma
from langchain.retrievers import ParentDocumentRetriever
import chromadb
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.storage import InMemoryStore
from langchain.storage._lc_store import create_kv_docstore
from langchain.storage.file_system import LocalFileStore
from FlagEmbedding import FlagReranker
from tqdm import tqdm
import os

In [None]:
# Mounting Google Drive
# The Chroma index, the parent-document store and the result files used in the
# cells below are all read from / written to /content/drive/MyDrive/Thesis.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Embedding model handed to the Chroma vector store as its embedding function.
model_name = "BAAI/bge-small-en-v1.5"

# Normalised embeddings: set True to compute cosine similarity.
encode_kwargs = dict(normalize_embeddings=True)

# The model is placed on the GPU ('cuda').
bge_embeddings = HuggingFaceBgeEmbeddings(model_name=model_name,
                                          model_kwargs=dict(device='cuda'),
                                          encode_kwargs=encode_kwargs)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# On-disk Chroma index and on-disk parent-document store, both kept on Google Drive.
chroma_dir = '/content/drive/MyDrive/Thesis/chromadb'
persistent_client = chromadb.PersistentClient(chroma_dir)
fs = LocalFileStore('/content/drive/MyDrive/Thesis/reports_store_location')
store = create_kv_docstore(fs)

# Make sure the collection exists before wrapping it with LangChain.
collection = persistent_client.get_or_create_collection('reports_l2')

# The wrapper is given the already-configured client, so it does not need its
# own persist_directory. No manual persist() call is made either: a
# PersistentClient writes changes to disk on its own, and LangChain marks
# Chroma.persist() as deprecated for that reason.
vectorstore = Chroma(client=persistent_client, collection_name='reports_l2',
                     embedding_function=bge_embeddings)

  warn_deprecated(


In [None]:
# Reranking model; compute_reranker uses it to score [question, chunk text] pairs.
reranker = FlagReranker('BAAI/bge-reranker-large', use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation

In [None]:
# Splitters handed to ParentDocumentRetriever in compute_reranker:
# parent chunks of up to 2000 characters, child chunks of up to 256 characters.
# NOTE(review): presumably these have to mirror the sizes used when the
# 'reports_l2' collection was indexed — confirm against the indexing notebook.
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
child_splitter = RecursiveCharacterTextSplitter(chunk_size=256)

In [None]:
def compute_reranker(company: str, num_parent: int, num_rerank: int) -> None:

  '''
  Retrieve parent chunks for one company, rerank them with bge-reranker and
  save the best-scoring ones.

  For every question in the questions file the retriever is queried with a
  metadata filter on ``company``; each [question, chunk text] pair is scored
  with the module-level ``reranker`` and the ``num_rerank`` highest-scoring
  pairs are kept.

  Args:
    company: value the ``company`` metadata field must equal.
    num_parent: ``k`` passed to the vector-store search before reranking.
    num_rerank: number of top-scoring pairs kept per question.

  Side effects:
    Writes ``results_reranked_{company}.json`` into the ``reranked`` folder on
    Google Drive (the folder is created if it is missing). Each row holds the
    question and a ``context`` list of ``[question, chunk text]`` pairs,
    ordered from highest to lowest score.
  '''

  output_dir = '/content/drive/MyDrive/Thesis/retrievals/reranked'

  # Defined before the loop: the label does not depend on the question and is
  # needed for the output file name even when there are no questions at all.
  approach = f'reranked_{company}'

  # Get questions
  questions = pd.read_fwf("https://raw.githubusercontent.com/winterForestStump/thesis/main/questions/questions_ver2.txt", names=['question'])

  # Initialize a retriever
  big_chunks_retriever = ParentDocumentRetriever(vectorstore=vectorstore, docstore=store,
                                                 child_splitter=child_splitter, parent_splitter=parent_splitter,
                                                 search_kwargs={'filter': {'company': company}, 'k': num_parent})

  # Invoke retriever without company name in the question
  records = []

  for query in tqdm(questions['question'].tolist()):
    response = big_chunks_retriever.invoke(query)
    pairs = [[query, doc.page_content] for doc in response]

    if pairs:
      scores = reranker.compute_score(pairs)
      # A single pair may come back as a bare number instead of a list;
      # wrap it so that it can be zipped with the pairs.
      if not hasattr(scores, '__len__'):
        scores = [scores]
      ranked = sorted(zip(pairs, scores), key=lambda item: item[1], reverse=True)
      top_texts = [pair for pair, _ in ranked[:num_rerank]]
    else:
      # Nothing was retrieved for this question: keep the row, with no context.
      top_texts = []

    records.append({'question': query, 'context': top_texts})

  # Build the frame once from the collected rows instead of concatenating
  # one single-row frame per question.
  results = pd.DataFrame(records, columns=['question', 'context'])

  os.makedirs(output_dir, exist_ok=True)
  results.to_json(f'{output_dir}/results_{approach}.json')

In [None]:
# Company names exactly as stored in the 'company' metadata field used for filtering.
companies = [
    'COCA COLA CO',
    'AMAZON COM INC',
    'PayPal Holdings, Inc.',
    'GENERAL MILLS INC',
    'Walmart Inc.',
    'PEPSICO INC',
    'Kraft Heinz Co',
    'Amcor plc',
    'Square, Inc.',
    '3M CO',
    'MICROSOFT CORP',
    'Ulta Beauty, Inc.',
    'AES CORP',
]

# One result file per company: search with k=10, keep the 2 best reranked pairs.
for comp in companies:
  compute_reranker(company=comp, num_parent=10, num_rerank=2)

100%|██████████| 35/35 [02:59<00:00,  5.14s/it]
100%|██████████| 35/35 [02:01<00:00,  3.46s/it]
100%|██████████| 35/35 [01:52<00:00,  3.21s/it]
100%|██████████| 35/35 [02:17<00:00,  3.94s/it]
100%|██████████| 35/35 [02:21<00:00,  4.06s/it]
100%|██████████| 35/35 [02:17<00:00,  3.92s/it]
100%|██████████| 35/35 [01:52<00:00,  3.21s/it]
100%|██████████| 35/35 [02:30<00:00,  4.30s/it]
100%|██████████| 35/35 [02:18<00:00,  3.96s/it]
100%|██████████| 35/35 [02:38<00:00,  4.52s/it]
100%|██████████| 35/35 [02:08<00:00,  3.66s/it]
100%|██████████| 35/35 [01:42<00:00,  2.94s/it]
100%|██████████| 35/35 [01:54<00:00,  3.27s/it]


In [None]:
# Collect the names of the JSON result files written by compute_reranker.
folder_path = '/content/drive/MyDrive/Thesis/retrievals/reranked/'

# Despite its name, `dataframes` holds file names, not loaded frames.
dataframes = [entry for entry in os.listdir(folder_path) if entry.endswith('.json')]

In [None]:
# Number of JSON result files found; the run above covers 13 companies,
# one file each.
len(dataframes)

13