In [None]:
!pip install gputil

Collecting gputil
  Downloading GPUtil-1.4.0.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gputil
  Building wheel for gputil (setup.py) ... [?25l[?25hdone
  Created wheel for gputil: filename=GPUtil-1.4.0-py3-none-any.whl size=7392 sha256=12314a5b6a72722d08a622625ea76c36d3b8b408a894f010d7135e50262b1e8d
  Stored in directory: /root/.cache/pip/wheels/2b/4d/8f/55fb4f7b9b591891e8d3f72977c4ec6c7763b39c19f0861595
Successfully built gputil
Installing collected packages: gputil
Successfully installed gputil-1.4.0


In [None]:
import GPUtil

# Ask GPUtil for the list of GPUs it can see.
gpus = GPUtil.getGPUs()

# Report the "nothing found" case first; the loop below is a no-op for an empty list.
if not gpus:
    print("No NVIDIA GPU detected.")

for gpu in gpus:
    print(f"GPU ID: {gpu.id}, Name: {gpu.name}, Load: {gpu.load*100}%, Free Memory: {gpu.memoryFree}MB")

GPU ID: 0, Name: Tesla T4, Load: 0.0%, Free Memory: 15095.0MB


In [None]:
!pip install chromadb sentence_transformers -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m611.1/611.1 kB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m44.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.2/284.2 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.9/94.9 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m22.7 MB/s[0m eta [36m0:00:00

In [None]:
import pandas as pd
import chromadb
from chromadb.utils import embedding_functions
from sentence_transformers import SentenceTransformer
from chromadb.config import Settings
from chromadb.utils import embedding_functions

In [None]:
# Parquet file holding the extracted subtitle chunks (path under /content/drive).
PATH = "/content/drive/MyDrive/search_engine/files/subtitles_extracted.parquet"

# Read the whole file into a DataFrame.
df = pd.read_parquet(path=PATH)

In [None]:
# Draw a reproducible 30% random subset of the rows (fixed seed).
df_sampled = df.sample(
    frac=0.30,
    random_state=42,
)

In [None]:
# Column/dtype summary; "deep" asks pandas to measure object columns' real memory use.
df_sampled.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Index: 466282 entries, 1320107 to 1294123
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   chunk     466282 non-null  object
 1   metadata  466282 non-null  object
dtypes: object(2)
memory usage: 338.8 MB


In [None]:
# Sentence-Transformers checkpoint name passed to the helpers below.
model_name = 'all-MiniLM-L6-v2'

In [None]:
def generate_embeddings(texts, model_name, batch_size=32, device='cuda:0'):
    """Encode ``texts`` with a freshly loaded Sentence-Transformers model.

    Args:
        texts: Texts to embed; passed unchanged to ``encode``.
        model_name: Model name or path handed to ``SentenceTransformer``.
        batch_size: Batch size forwarded to ``encode``.
        device: Device string the model is loaded onto.

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for ``texts``.
    """
    encoder = SentenceTransformer(model_name, device=device)
    # The progress bar is always enabled for this helper.
    encode_options = {"batch_size": batch_size, "show_progress_bar": True}
    return encoder.encode(texts, **encode_options)

In [None]:
def store_embeddings_chroma(chunked_df, collection_name="subtitle_chunks", persist_directory="./chroma_db", batch_size=32,
                            model_name="all-MiniLM-L6-v2", device=None):
    """Embed and store the rows of ``chunked_df`` in a persistent ChromaDB collection.

    Each row's ``"chunk"`` value is added as a document, its ``"metadata"`` value
    as the document's metadata, and the stringified DataFrame index label as its id.
    Embeddings are computed by the collection's embedding function during ``add``.

    Args:
        chunked_df: DataFrame with ``"chunk"`` and ``"metadata"`` columns.
            Index labels must be unique because they become the document ids.
        collection_name: Name of the collection to create or reuse.
        persist_directory: Directory handed to ``chromadb.PersistentClient``.
        batch_size: Number of rows sent per ``collection.add`` call.
        model_name: Sentence-Transformers model used by the embedding function.
        device: Device for the embedding model. ``None`` (default) picks
            ``"cuda"`` when PyTorch reports a usable GPU and ``"cpu"`` otherwise.

    Returns:
        The ChromaDB collection the rows were added to.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # Fail before touching the database: a negative step would otherwise make
    # the loop below silently store nothing.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    if device is None:
        # Without an explicit device the Chroma embedding function uses its own
        # default (CPU per the Chroma docs), which makes large ingests
        # impractically slow when a GPU is available.
        try:
            import torch
            device = "cuda" if torch.cuda.is_available() else "cpu"
        except ImportError:
            device = "cpu"

    # Initialize ChromaDB persistent client
    client = chromadb.PersistentClient(path=persist_directory)
    # Define embedding function
    embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=model_name, device=device
    )
    # Create or retrieve collection
    collection = client.get_or_create_collection(name=collection_name, embedding_function=embedding_function)

    # Process data in batches
    for start in range(0, len(chunked_df), batch_size):
        batch_df = chunked_df.iloc[start:start + batch_size]
        collection.add(
            documents=batch_df["chunk"].tolist(),
            metadatas=batch_df["metadata"].tolist(),
            # Chroma ids must be strings; the index label is used as the id.
            ids=[str(idx) for idx in batch_df.index.tolist()],
        )

    return collection

In [None]:
def retrieve_and_display_results(query, collection, df, n_results=5, model_name=None, device='cuda:0'):
    """Embed ``query``, search ``collection`` and print the top matches with URLs.

    Args:
        query: Free-text search query.
        collection: ChromaDB collection to search.
        df: Unused; kept so existing calls keep working.
        n_results: Maximum number of matches to request.
        model_name: Sentence-Transformers model used to embed the query.
            ``None`` falls back to ``"all-MiniLM-L6-v2"``, the default used when
            the collection is built by ``store_embeddings_chroma``. It must be
            the model the collection was indexed with, otherwise the query
            vector is not comparable to the stored ones.
        device: Device the query model is loaded onto.

    Returns:
        None. Results are printed; each stored metadata entry must provide
        ``"subtitle_name"`` and ``"subtitle_id"``.
    """
    # SentenceTransformer(None) does not load the intended model, so resolve
    # the default explicitly instead of passing None through.
    if model_name is None:
        model_name = "all-MiniLM-L6-v2"

    # Initialize model
    model = SentenceTransformer(model_name, device=device)
    # Generate embedding for the query
    query_embedding = model.encode([query], show_progress_bar=False)
    # Perform similarity search in the collection
    results = collection.query(query_embeddings=query_embedding.tolist(), n_results=n_results, include=["metadatas"])

    # One query was sent, so only the first result list is relevant.
    metadatas = results["metadatas"]
    hits = metadatas[0] if metadatas else []
    if not hits:
        print("No results found.")
        return

    # Display results with URLs
    for rank, metadata in enumerate(hits, start=1):
        subtitle_name = metadata["subtitle_name"]
        subtitle_id = metadata["subtitle_id"]
        url = f"https://www.opensubtitles.org/en/subtitles/{subtitle_id}"
        print(f"Result {rank}:")
        print(f"  Subtitle Name: {subtitle_name.upper()}")
        print(f"  URL: {url}")
        print("-" * 20)

In [None]:
# Example Usage:
persist_directory = r"/content/drive/MyDrive/search_engine/db/"  # directory where ChromaDB will save the data.

# Index the 30% sample built above (df_sampled) rather than the full frame:
# the sample was created for this run but was never used, and embedding every
# row of `df` is what made this cell run long enough to be interrupted.
collection = store_embeddings_chroma(df_sampled, persist_directory=persist_directory)

query = "What is the meaning of life?"
# `df` is accepted by the helper but not used; the result fields come from the stored metadata.
retrieve_and_display_results(query=query, collection=collection, df=df_sampled, model_name=model_name)

# Loading the database from another device or location:
# Load the persistent client
# loaded_client = chromadb.PersistentClient(path=persist_directory)
# Get the collection
# loaded_collection = loaded_client.get_collection(name="subtitle_chunks")
# Use the collection for queries (pass the same model the collection was indexed with).
# retrieve_and_display_results(query, loaded_collection, df_sampled, model_name=model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

KeyboardInterrupt: 