In [1]:
#Step 1: Install Required Libraries

!pip install torch pandas chromadb sentence-transformers openai-whisper ffmpeg scipy soundfile

Collecting chromadb
  Downloading chromadb-0.6.3-py3-none-any.whl.metadata (6.8 kB)
Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting ffmpeg
  Downloading ffmpeg-1.4.tar.gz (5.1 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-non

In [2]:
# Mount Google Drive into the Colab filesystem; the mount is made at
# /content/drive for the rest of the session.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
#Step 2: Import Libraries

import torch
import whisper
import numpy as np
import pandas as pd
import chromadb
import scipy.spatial.distance as distance
from sentence_transformers import SentenceTransformer
import soundfile as sf


In [4]:
# Step 3: Load the Whisper speech-to-text model.
# The "base" checkpoint is used here for fast transcription.
whisper_model = whisper.load_model(name="base")


100%|████████████████████████████████████████| 139M/139M [00:01<00:00, 139MiB/s]


In [5]:
# Step 4: Upload an audio file for testing.

from google.colab import files

# files.upload() opens a file picker and returns a mapping keyed by the name
# of each uploaded file. It is empty when the dialog is cancelled.
uploaded = files.upload()
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and choose an audio file.")

# Only the first uploaded file is used as the query clip.
audio_file = next(iter(uploaded))
print(f"Uploaded File: {audio_file}")


Saving 2secondsection-59838.mp3 to 2secondsection-59838.mp3
Uploaded File: 2secondsection-59838.mp3


In [6]:
# Step 5: Transcribe the uploaded audio file with Whisper.
result = whisper_model.transcribe(audio_file)
query_text = result["text"]

# A silent or unintelligible clip produces an empty transcript; embedding and
# searching on that would return meaningless matches, so stop here instead.
if not query_text.strip():
    raise ValueError(f"Whisper produced an empty transcript for {audio_file!r}.")

print(f"Transcribed Query: {query_text}")


Transcribed Query:  I can't just walk out of there. You just did. He's still my boss. I know you don't work here anymore. I know you too had your disagreements or whatever. Disagreements? I happen to like what I do and I'm not taking sides. So you're going to go back in there and watch a man you know to be a fraud and a harasser and a cheat and at the very least negligent I get that your heart. This is not about hurt. It's not? It's about me not being able to stand by while it's okay. You can go back inside okay? Just give me a minute. I'm going to hang out with these creepy leprechauns. Why are there leprechauns? Who knows. Who knows. I mean is there a sign or something? A sign? A sign that tells about this art. The Jinx Room murals. Jinx Room is right. I know I shouldn't have come tonight. What you shouldn't have done is drawn out of there. Do you think you saw me? Of course he saw you. But whatever. He's harmless. You honestly still think that. It's not just the work studies didn't.

In [9]:
# Step 6: Open the persisted ChromaDB collection and the subtitle metadata.

# Locations on the mounted Google Drive (adjust these if your layout differs).
CHROMA_DB_PATH = "/content/drive/MyDrive/search_engine/db"
SUBTITLES_PARQUET_PATH = "/content/drive/MyDrive/search_engine/files/subtitles_extracted.parquet"

# Connect to the on-disk store and fetch the existing subtitle collection.
chroma_client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
collection = chroma_client.get_collection(name="subtitle_chunks")

# Subtitle metadata table read from the parquet export.
df = pd.read_parquet(SUBTITLES_PARQUET_PATH)

In [10]:
# Step 7: Generate an embedding for the transcribed query.

# Run the encoder on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the sentence-transformer model onto the chosen device.
embedder = SentenceTransformer("all-MiniLM-L6-v2", device=device)

# Encode a one-item batch and keep its only row as the query vector.
query_batch = [query_text]
query_embedding = embedder.encode(query_batch)[0]


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [14]:
# Retrieve the most similar subtitle chunks from ChromaDB.
TOP_K = 5  # number of matches to return

# The query API takes a batch of embeddings; send a batch holding one vector.
query_vectors = [query_embedding.tolist()]
results = collection.query(
    query_embeddings=query_vectors,
    n_results=TOP_K,
    include=["metadatas"],  # only the stored metadata is needed for display
)

# Display the retrieved results.
# results["metadatas"] holds one list per query embedding; a single query was
# sent, so its matches are at index 0.
for rank, metadata in enumerate(results["metadatas"][0], start=1):
    # Guard against an entry with no metadata (may come back as None), which
    # would otherwise fail on `.get`.
    metadata = metadata or {}

    subtitle_name = metadata.get("subtitle_name", "Unknown Episode")
    subtitle_index = metadata.get("original_index", "Unknown Index")
    subtitle_id = metadata.get("subtitle_id")

    # Print the result details.
    print(f"{rank}. 🎥 **{subtitle_name}**")
    print(f"📌 {subtitle_index}")
    if subtitle_id is None:
        # Without an ID the URL would point nowhere, so say so explicitly.
        print("🔗 Link unavailable (no subtitle ID in metadata)")
    else:
        print(f"🔗 [Link to Subtitle](https://www.opensubtitles.org/en/subtitles/{subtitle_id})")
    print("-" * 50)


1. 🎥 **lili.(1953).eng.1cd**
📌 21687
🔗 [Link to Subtitle](https://www.opensubtitles.org/en/subtitles/9380899)
--------------------------------------------------
2. 🎥 **murdoch.mysteries.s16.e06.clean.hands.(2022).eng.1cd**
📌 4192
🔗 [Link to Subtitle](https://www.opensubtitles.org/en/subtitles/9279567)
--------------------------------------------------
3. 🎥 **step.dave.s02.e02.mr.popular.(2015).eng.1cd**
📌 8175
🔗 [Link to Subtitle](https://www.opensubtitles.org/en/subtitles/9443259)
--------------------------------------------------
4. 🎥 **gate.(2018).eng.1cd**
📌 10018
🔗 [Link to Subtitle](https://www.opensubtitles.org/en/subtitles/9499391)
--------------------------------------------------
5. 🎥 **kotigobba.3.(2021).eng.1cd**
📌 21515
🔗 [Link to Subtitle](https://www.opensubtitles.org/en/subtitles/9419527)
--------------------------------------------------
