## `Checking whether my runtime has a GPU.`

In [None]:
!pip install GPUtil chromadb -q

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m611.1/611.1 kB[0m [31m42.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m88.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.2/284.2 kB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.9/94.9 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m85.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m

In [None]:
import GPUtil
# Ask GPUtil which NVIDIA GPUs are visible, then choose the compute device.
gpus = GPUtil.getGPUs()
if not gpus:
    # Nothing was reported: fall back to the CPU.
    print("No NVIDIA GPU detected. Using CPU.")
    device = "cpu"
else:
    # Report every detected card, then select the first CUDA device.
    for gpu in gpus:
        print(f"GPU ID: {gpu.id}, Name: {gpu.name}, Load: {gpu.load*100}%, Free Memory: {gpu.memoryFree}MB")
    device = "cuda:0"

GPU ID: 0, Name: Tesla T4, Load: 0.0%, Free Memory: 14860.0MB


## `Testing whether my runtime is using the GPU.`

In [None]:
import torch
from sentence_transformers import SentenceTransformer

# Load the model on the device chosen by the GPU-detection cell rather than a
# hard-coded "cuda:0", so this smoke test also runs on a CPU-only runtime.
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

sentences = ["This is an example sentence", "Each sentence is converted"]
# convert_to_tensor=True makes encode() return a tensor, which exposes `.device`.
embeddings = model.encode(sentences, convert_to_tensor=True)

print(embeddings.device)  # expected to match `device`: cuda:0 on a GPU runtime, cpu otherwise

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

cuda:0


## `Embeddings Generation and Querying.`

In [None]:
import pandas as pd
import chromadb
from chromadb.utils import embedding_functions
import time

In [None]:
# Sentence-transformers model identifier shared by the indexing and query steps:
# store_embeddings_chroma reads this global, and the example cells pass it to
# retrieve_and_display_results.
model_name = "all-MiniLM-L6-v2"

In [None]:
# Location of the extracted subtitle chunks (Parquet file under /content/drive).
PATH = r'/content/drive/MyDrive/search_engine/files/subtitles_extracted.parquet'

# Sampling configuration, named so the subset size and seed are easy to tune.
SAMPLE_FRACTION = 0.3  # share of rows kept for indexing
RANDOM_STATE = 42      # fixed seed so the same rows are drawn on every run

# Load the full dataframe.
df = pd.read_parquet(PATH)

# Draw a reproducible random subset of the rows.
df_sampled = df.sample(frac=SAMPLE_FRACTION, random_state=RANDOM_STATE)

# Show row count, dtypes and the deep memory footprint of the sample.
df_sampled.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 466282 entries, 1320107 to 1294123
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   chunk     466282 non-null  object
 1   metadata  466282 non-null  object
dtypes: object(2)
memory usage: 338.8 MB


In [None]:
def generate_embeddings(texts, model_name, batch_size=64, device="cuda:0"):
    """Encode ``texts`` with a freshly loaded sentence-transformers model.

    Args:
        texts: Collection of strings to embed.
        model_name: Model identifier handed to ``SentenceTransformer``.
        batch_size: Batch size forwarded to ``encode``.
        device: Device string the model is loaded on.

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for ``texts``.
    """
    encoder = SentenceTransformer(model_name, device=device)
    # The progress bar is always shown.
    return encoder.encode(texts, show_progress_bar=True, batch_size=batch_size)

In [None]:
def store_embeddings_chroma(chunked_df, collection_name="subtitle_chunks", persist_directory="./chroma_db", batch_size=64, device=None, embedding_model_name=None):
    """Rebuild a persistent ChromaDB collection from a dataframe of text chunks.

    Any existing collection called ``collection_name`` is deleted first, so the
    returned collection contains only the rows of ``chunked_df``.

    Args:
        chunked_df: DataFrame with a "chunk" column (the documents) and a
            "metadata" column (one metadata entry per row). The index values,
            converted to strings, become the document ids.
        collection_name: Name of the collection to (re)create.
        persist_directory: Directory handed to ``chromadb.PersistentClient``.
        batch_size: Number of rows sent to ``collection.add`` per call.
        device: Device for the embedding model. ``None`` (default) selects
            "cuda:0" when torch reports CUDA is available, otherwise "cpu".
        embedding_model_name: Sentence-transformers model to embed with.
            ``None`` (default) falls back to the notebook-level ``model_name``.

    Returns:
        The populated Chroma collection.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    start_time = time.perf_counter()  # monotonic clock for elapsed-time measurement

    if device is None:
        # Chroma's SentenceTransformerEmbeddingFunction defaults to the CPU, so
        # the device must be passed explicitly for indexing to use a GPU.
        import torch
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
    if embedding_model_name is None:
        embedding_model_name = model_name  # notebook-level default

    client = chromadb.PersistentClient(path=persist_directory)
    embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=embedding_model_name,
        device=device,
    )

    # Start from a clean slate; a missing collection is not an error here.
    try:
        client.delete_collection(name=collection_name)
    except ValueError:
        pass  # Collection does not exist

    collection = client.get_or_create_collection(name=collection_name, embedding_function=embedding_function)

    # Add the rows in slices; Chroma embeds each batch of documents itself.
    for start in range(0, len(chunked_df), batch_size):
        batch_df = chunked_df.iloc[start:start + batch_size]
        collection.add(
            documents=batch_df["chunk"].tolist(),
            metadatas=batch_df["metadata"].tolist(),
            ids=[str(idx) for idx in batch_df.index.tolist()],
        )

    elapsed = time.perf_counter() - start_time
    print(f"Time taken to store embeddings: {elapsed} seconds")
    return collection

In [None]:
# Loaded query encoders keyed by (model_name, device), so repeated searches
# reuse one model instance instead of re-instantiating it on every call.
_QUERY_MODEL_CACHE = {}


def retrieve_and_display_results(query, collection, df, n_results=5, model_name="all-MiniLM-L6-v2", device="cuda:0"):
    """Embed ``query``, fetch the closest entries from ``collection`` and print them.

    Args:
        query: Free-text search string.
        collection: Chroma collection to search.
        df: Unused; kept so existing calls that pass it keep working.
        n_results: Number of hits requested from the collection.
        model_name: Sentence-transformers model used to embed the query.
        device: Device string the query model is loaded on.

    Returns:
        None. For each hit, the upper-cased subtitle name and its
        opensubtitles.org URL are printed.

    Raises:
        KeyError: If a hit's metadata lacks "subtitle_name" or "subtitle_id".
    """
    cache_key = (model_name, device)
    model = _QUERY_MODEL_CACHE.get(cache_key)
    if model is None:
        # First use of this model/device pair: load once and remember it.
        model = SentenceTransformer(model_name, device=device)
        _QUERY_MODEL_CACHE[cache_key] = model

    query_embedding = model.encode([query], show_progress_bar=False).tolist()
    # Only metadata is needed to render the result list.
    results = collection.query(query_embeddings=query_embedding, n_results=n_results, include=["metadatas"])

    # results["metadatas"] holds one list per query; a single query was sent.
    for rank, metadata in enumerate(results["metadatas"][0], start=1):
        subtitle_name = metadata["subtitle_name"]
        subtitle_id = metadata["subtitle_id"]
        url = f"https://www.opensubtitles.org/en/subtitles/{subtitle_id}"
        print(f"Result {rank}:")
        print(f"  Subtitle Name: {subtitle_name.upper()}")
        print(f"  URL: {url}")
        print("-" * 20)

In [None]:
# Example usage: rebuild the collection in the Drive folder, then run a first search.
persist_directory = r"/content/drive/MyDrive/search_engine/db/"
collection = store_embeddings_chroma(
    df_sampled,
    persist_directory=persist_directory,
)
query = "What is the meaning of life?"
# `device` is the value selected by the GPU-detection cell at the top.
retrieve_and_display_results(
    query=query,
    collection=collection,
    df=df_sampled,
    model_name=model_name,
    device=device,
)

Time taken to store embeddings: 2205.2088961601257 seconds
Result 1:
  Subtitle Name: EPICA.OMEGA.ALIVE.(2021).ENG.1CD
  URL: https://www.opensubtitles.org/en/subtitles/9187986
--------------------
Result 2:
  Subtitle Name: SNAKE.REVENGE.SNAKE.ISLAND.HORROR.(2022).ENG.1CD
  URL: https://www.opensubtitles.org/en/subtitles/9232865
--------------------
Result 3:
  Subtitle Name: VAN.HELSING.S01.E02.SEEN.YOU.(2016).ENG.1CD
  URL: https://www.opensubtitles.org/en/subtitles/9215792
--------------------
Result 4:
  Subtitle Name: KILLING.SEASON.(2013).ENG.1CD
  URL: https://www.opensubtitles.org/en/subtitles/9431693
--------------------
Result 5:
  Subtitle Name: FAMILY.GUY.S08.E17.BRIAN.STEWIE.(2010).ENG.1CD
  URL: https://www.opensubtitles.org/en/subtitles/9420983
--------------------


## `Testing the search mechanism again by querying a basic statement.`

In [None]:
# Second search against the same collection, this time with a short phrase.
query = "I hate u!"
retrieve_and_display_results(
    query=query,
    collection=collection,
    df=df_sampled,
    model_name=model_name,
    device=device,
)

Result 1:
  Subtitle Name: HOT.IN.CLEVELAND.S06.E21.SAY.YES.TO.THE.MESS.(2015).ENG.1CD
  URL: https://www.opensubtitles.org/en/subtitles/9417389
--------------------
Result 2:
  Subtitle Name: WELCOME.TO.DEMONSCHOOL.IRUMAKUN.S01.E18.WHAT.I.WANT.MORE.THAN.ANYTHING.(2020).ENG.1CD
  URL: https://www.opensubtitles.org/en/subtitles/9267932
--------------------
Result 3:
  Subtitle Name: LEHIYOT.ITA.S02.E04.IN.A.WEDDING.DRESS.(2017).ENG.1CD
  URL: https://www.opensubtitles.org/en/subtitles/9417228
--------------------
Result 4:
  Subtitle Name: PARKER.LEWIS.CANT.LOSE.S03.E22.THE.LAST.SUPPER.(1993).ENG.1CD
  URL: https://www.opensubtitles.org/en/subtitles/9452984
--------------------
Result 5:
  Subtitle Name: THE.GOLDBERGS.S10.E03.JENKINTOWN.AFTER.DARK.(2022).ENG.1CD
  URL: https://www.opensubtitles.org/en/subtitles/9265462
--------------------


# `Thank you — at last, it is done.`