In [None]:
# Check whether the runtime has a GPU.

In [1]:
import torch
# Query CUDA once and reuse the answer for both report lines.
gpu_available = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if gpu_available else "No GPU detected"
print("GPU Available:", gpu_available)
print("GPU Name:", gpu_name)


GPU Available: True
GPU Name: Tesla T4


In [6]:
!pip install sentence_transformers -q

In [4]:
!pip install chromadb



In [5]:
# Install ChromaDB and sentence-transformers for embedding generation and storage.

!pip install chromadb sentence-transformers



In [7]:
# Mount Google Drive at /content/drive; the dataset and the ChromaDB directory
# used later in this notebook both live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
import pandas as pd
# Load the extracted subtitle table from Drive; the preview below shows it has
# a `chunk` column and a `metadata` column.
df=pd.read_parquet(r"/content/drive/MyDrive/search_engine/files/subtitles_extracted.parquet")


In [9]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,chunk,metadata
0,Watch any video online with OpenSUBTITLES Free...,"{'original_index': 0, 'subtitle_id': 9251120, ..."
1,on Keep dancing Whatever Im kidding Dont get m...,"{'original_index': 0, 'subtitle_id': 9251120, ..."
2,And you Douche Handsome Conceited Just like yo...,"{'original_index': 0, 'subtitle_id': 9251120, ..."
3,that so Yes How long will this program run If ...,"{'original_index': 0, 'subtitle_id': 9251120, ..."
4,her Gramps Uncle Erning Aunt Elma this is Tep ...,"{'original_index': 0, 'subtitle_id': 9251120, ..."


In [10]:
# Take a 30% random sample of the rows; random_state fixes the seed so the same
# rows are drawn on every run.
df_sampled = df.sample(frac=0.30, random_state=42)

In [11]:
# Report column dtypes, non-null counts and deep memory usage of the sample.
df_sampled.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 466282 entries, 1320107 to 1294123
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   chunk     466282 non-null  object
 1   metadata  466282 non-null  object
dtypes: object(2)
memory usage: 338.8 MB


In [12]:
# (rows, columns) of the sample.
df_sampled.shape

(466282, 2)

In [13]:
# SentenceTransformer model name. store_embeddings_chroma reads this as a
# global when indexing, and the query cells pass it to
# retrieve_and_display_results.
model_name = "all-MiniLM-L6-v2"

In [14]:
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.utils import embedding_functions
import time
import torch

In [15]:

import torch
from sentence_transformers import SentenceTransformer

# Smoke test: load the model on the first CUDA device and encode two sentences.
model = SentenceTransformer('all-MiniLM-L6-v2', device='cuda:0')

sentences = ["This is an example sentence", "Each sentence is converted"]
# convert_to_tensor=True is requested so the result exposes a `.device` to inspect.
embeddings = model.encode(sentences, convert_to_tensor=True)

print(embeddings.device) # should print cuda:0

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


cuda:0


In [16]:
def generate_embeddings(texts, model_name, batch_size=64, device="cuda:0"):
    """Encode ``texts`` with a freshly loaded SentenceTransformer.

    Args:
        texts: Sequence of strings to embed.
        model_name: Name of the SentenceTransformer model to load.
        batch_size: Number of texts encoded per forward pass.
        device: Device the model is loaded on.

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for ``texts``; a
        progress bar is shown while encoding.
    """
    encoder = SentenceTransformer(model_name, device=device)
    return encoder.encode(texts, batch_size=batch_size, show_progress_bar=True)

In [17]:
def store_embeddings_chroma(chunked_df, collection_name="subtitle_chunks", persist_directory="./chroma_db", batch_size=64, device=None):
    """Embed subtitle chunks and store them in a persistent ChromaDB collection.

    Any existing collection named ``collection_name`` is deleted first, so every
    call rebuilds the collection from scratch.

    Args:
        chunked_df: DataFrame with a ``chunk`` column (text to embed) and a
            ``metadata`` column (one metadata object per row; assumed to be a
            mapping ChromaDB accepts -- TODO confirm). The index values are
            stringified and used as document ids.
        collection_name: Name of the collection to (re)create.
        persist_directory: Directory where ChromaDB persists its data.
        batch_size: Number of rows sent to ``collection.add`` per call.
        device: Device for the embedding model. ``None`` (default) selects
            ``"cuda"`` when ``torch.cuda.is_available()`` and ``"cpu"``
            otherwise. Previously the embedding function was created without a
            device, so it fell back to the wrapper's own default.

    Returns:
        The populated ChromaDB collection.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    Note:
        The model is taken from the notebook-level ``model_name`` variable,
        which must be defined before this function is called.
    """
    if batch_size < 1:
        # A negative step would make the range below empty and silently store nothing.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # perf_counter is monotonic, so the elapsed time is immune to clock changes.
    start_time = time.perf_counter()

    # Create or connect to the persistent ChromaDB client
    client = chromadb.PersistentClient(path=persist_directory)

    # Embedding function Chroma calls for every added document.
    embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=model_name,
        device=device,
    )

    try:
        # Drop the previous collection so stale documents never survive a rebuild.
        client.delete_collection(name=collection_name)
    except ValueError:
        # Collection doesn't exist yet, nothing to delete.
        # NOTE(review): other chromadb releases may signal a missing collection
        # with a different exception type -- confirm against the installed version.
        pass

    collection = client.get_or_create_collection(name=collection_name, embedding_function=embedding_function)

    # Feed the DataFrame to Chroma in slices of `batch_size` rows.
    for start in range(0, len(chunked_df), batch_size):
        batch_df = chunked_df.iloc[start:start + batch_size]
        collection.add(
            documents=batch_df["chunk"].tolist(),
            metadatas=batch_df["metadata"].tolist(),
            ids=[str(idx) for idx in batch_df.index.tolist()],
        )

    elapsed = time.perf_counter() - start_time
    print(f"Time taken to store embeddings: {elapsed} seconds")

    return collection


In [18]:
# Loaded SentenceTransformer models keyed by (model_name, device), so repeated
# queries reuse the weights instead of reloading them on every call.
_QUERY_MODEL_CACHE = {}


def retrieve_and_display_results(query, collection, df, n_results=5, model_name="all-MiniLM-L6-v2", device="cuda:0"):
    """Print the top matches for ``query`` with their OpenSubtitles URLs.

    Args:
        query: Free-text search query.
        collection: ChromaDB collection to search; it is queried with a
            precomputed embedding, so ``model_name`` should match the model
            used to build the collection.
        df: Unused; kept so existing callers that pass it keep working.
        n_results: Maximum number of hits to print.
        model_name: SentenceTransformer model used to embed the query.
        device: Device the query model is loaded on.

    Returns:
        None. Results are printed to stdout.
    """
    cache_key = (model_name, str(device))
    model = _QUERY_MODEL_CACHE.get(cache_key)
    if model is None:
        model = SentenceTransformer(model_name, device=device)
        _QUERY_MODEL_CACHE[cache_key] = model

    # Embed the single query; .tolist() gives the nested-list form passed to Chroma.
    query_embedding = model.encode([query], show_progress_bar=False).tolist()

    results = collection.query(query_embeddings=query_embedding, n_results=n_results, include=["metadatas"])

    # One list of metadatas per query embedding; only one query was sent.
    metadatas = (results["metadatas"] or [[]])[0]

    for rank, metadata in enumerate(metadatas, start=1):
        # A hit stored without metadata comes back as None; treat it as empty.
        metadata = metadata or {}

        subtitle_name = metadata.get("subtitle_name")
        if subtitle_name is None:
            subtitle_name = "Unknown Subtitle"
        subtitle_id = metadata.get("subtitle_id", "Unknown ID")

        url = f"https://www.opensubtitles.org/en/subtitles/{subtitle_id}"

        print(f"Result {rank}:")
        # str() guards against non-string metadata values before upper-casing.
        print(f"  Subtitle Name: {str(subtitle_name).upper()}")
        print(f"  URL: {url}")
        print("-" * 20)


In [19]:
import warnings
# Silence every Python warning from here on.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries used below.
warnings.filterwarnings("ignore")


In [21]:
# Example usage
# Keep the ChromaDB files on Google Drive.
persist_directory = r"/content/drive/MyDrive/search_engine/db/"
# Build the collection from the sampled chunks. store_embeddings_chroma deletes
# any existing "subtitle_chunks" collection in this directory before adding
# every row again.
collection = store_embeddings_chroma(df_sampled, persist_directory=persist_directory)



In [23]:
import torch

# Pick the device used for query-time encoding: "cuda" when a GPU is
# available, otherwise "cpu".
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Using device: {device}")


Using device: cuda


In [26]:
# Define your query
query = "what is the plan today?"
# Retrieve and display the top matches for the query.
retrieve_and_display_results(query=query, collection=collection, df=df_sampled, model_name=model_name, device=device) # uses the `device` selected in the cell just above

Result 1:
  Subtitle Name: THE.MOSQUITO.COAST.S02.E03.TALK.ABOUT.THE.WEATHER.(2022).ENG.1CD
  URL: https://www.opensubtitles.org/en/subtitles/9317090
--------------------
Result 2:
  Subtitle Name: GRAND.CREW.S01.E07.WINE.HEADLINES.(2022).ENG.1CD
  URL: https://www.opensubtitles.org/en/subtitles/9454011
--------------------
Result 3:
  Subtitle Name: TULSA.KING.S01.E04.VISITATION.PLACE.(2022).ENG.1CD
  URL: https://www.opensubtitles.org/en/subtitles/9340748
--------------------
Result 4:
  Subtitle Name: I.AM.VANESSA.GUILLEN.(2022).ENG.1CD
  URL: https://www.opensubtitles.org/en/subtitles/9316091
--------------------
Result 5:
  Subtitle Name: THE.LONG.WAY.HOME.(1997).ENG.1CD
  URL: https://www.opensubtitles.org/en/subtitles/9431169
--------------------


In [25]:
# Second example query against the same collection.
query = "How is your day going"
retrieve_and_display_results(query=query, collection=collection, df=df_sampled, model_name=model_name, device=device)

Result 1:
  Subtitle Name: BACHELORETTES.DEGREE.(2013).ENG.1CD
  URL: https://www.opensubtitles.org/en/subtitles/9189871
--------------------
Result 2:
  Subtitle Name: THE.SILENT.TWINS.(2022).ENG.1CD
  URL: https://www.opensubtitles.org/en/subtitles/9262310
--------------------
Result 3:
  Subtitle Name: ONE.TREE.HILL.S08.E14.HOLDING.OUT.FOR.A.HERO.(2011).ENG.1CD
  URL: https://www.opensubtitles.org/en/subtitles/9316729
--------------------
Result 4:
  Subtitle Name: THIS.IS.US.S04.E15.CLOUDS.(2020).ENG.1CD
  URL: https://www.opensubtitles.org/en/subtitles/9221613
--------------------
Result 5:
  Subtitle Name: A.MAGICAL.CHRISTMAS.VILLAGE.(2022).ENG.1CD
  URL: https://www.opensubtitles.org/en/subtitles/9301852
--------------------
