# **Chunking:**

In [1]:
import pandas as pd
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
path = r'/kaggle/input/cleaned-subtitles/cleaned_subtitles.parquet'
df = pd.read_parquet(path)
df.shape

(82498, 3)

In [3]:
df = df.sample(frac=0.30, random_state=42)
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Index: 24749 entries, 17262 to 67859
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   num      24749 non-null  int64 
 1   name     24749 non-null  object
 2   content  24749 non-null  object
dtypes: int64(1), object(2)
memory usage: 612.5 MB


In [4]:
df.shape

(24749, 3)

In [5]:
df.head(5)

Unnamed: 0,num,name,content
17262,9251120,maybe.this.time.(2014).eng.1cd,Watch any video online with OpenSUBTITLES Free...
7294,9211589,down.the.shore.s01.e10.and.justice.for.all.(19...,Oh I know that its getting late but I dont wan...
47707,9380845,uncontrollably.fond.s01.e07.heartache.(2016).e...,Timing and Subtitles by The Uncontrollable Lov...
29914,9301436,screen.two.s13.e04.the.precious.blood.(1996).e...,ethereal music apiOpenSubtitlesorg is deprecat...
54266,9408707,battlebots.(2015).eng.1cd,Chris Oh no not the Minibots yelling Oh You le...


In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def chunk_documents_langchain(df, chunk_size=512, chunk_overlap=100):
    # Chunks documents using Langchain and returns a new DataFrame with metadata.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size= chunk_size, chunk_overlap= chunk_overlap)

    chunks = []
    metadatas = []
    for index, row in df.iterrows():
        doc_chunks = text_splitter.split_text(row["content"])
        chunks.extend(doc_chunks)
        # Store subtitle_id and subtitle_name in the metadata
        metadatas.extend([{"original_index": index,"subtitle_id": row["num"],"subtitle_name": row["name"]}] * len(doc_chunks))

    chunk_df = pd.DataFrame({"chunk": chunks, "metadata": metadatas})
    return chunk_df

In [7]:
chunked_df = chunk_documents_langchain(df)
print(chunked_df.head(2))

                                               chunk  \
0  Watch any video online with OpenSUBTITLES Free...   
1  on Keep dancing Whatever Im kidding Dont get m...   

                                            metadata  
0  {'original_index': 17262, 'subtitle_id': 92511...  
1  {'original_index': 17262, 'subtitle_id': 92511...  


In [8]:
chunked_df.head()

Unnamed: 0,chunk,metadata
0,Watch any video online with OpenSUBTITLES Free...,"{'original_index': 17262, 'subtitle_id': 92511..."
1,on Keep dancing Whatever Im kidding Dont get m...,"{'original_index': 17262, 'subtitle_id': 92511..."
2,And you Douche Handsome Conceited Just like yo...,"{'original_index': 17262, 'subtitle_id': 92511..."
3,that so Yes How long will this program run If ...,"{'original_index': 17262, 'subtitle_id': 92511..."
4,her Gramps Uncle Erning Aunt Elma this is Tep ...,"{'original_index': 17262, 'subtitle_id': 92511..."


In [9]:
import os

path = "/kaggle/working/"

# Check if the directory exists, create it if it doesn't
if not os.path.exists(path):
    os.makedirs(path)
    print(f"Directory created: {path}")
else:
    print(f"Directory already exists: {path}")

# Save the DataFrame as a Parquet file
try:
    output_path = os.path.join(path, "subtitles_extracted.parquet")
    chunked_df.to_parquet(output_path, engine="pyarrow", index=False)
    print("Successfully saved the dataset to:", output_path)
except Exception as e:
    print("Error:", e)

Directory already exists: /kaggle/working/
Successfully saved the dataset to: /kaggle/working/subtitles_extracted.parquet


# **GPU-Accelerated Embeddings :**

In [20]:
!pip install chromadb sentence-transformers
!pip install sentence_transformers -q
!pip install chromadb

Collecting chromadb
  Downloading chromadb-0.6.3-py3-none-any.whl.metadata (6.8 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.23.0-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.21.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.31.1-py

In [21]:
import torch
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.utils import embedding_functions
import time
import os


import warnings
warnings.filterwarnings("ignore")

In [23]:
import torch
print("GPU Available:", torch.cuda.is_available())
print("GPU Name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU detected")

GPU Available: True
GPU Name: Tesla T4


In [24]:
df = pd.read_parquet(r"/kaggle/working/subtitles_extracted.parquet")
df.shape

(1554274, 2)

In [25]:
df.head()

Unnamed: 0,chunk,metadata
0,Watch any video online with OpenSUBTITLES Free...,"{'original_index': 17262, 'subtitle_id': 92511..."
1,on Keep dancing Whatever Im kidding Dont get m...,"{'original_index': 17262, 'subtitle_id': 92511..."
2,And you Douche Handsome Conceited Just like yo...,"{'original_index': 17262, 'subtitle_id': 92511..."
3,that so Yes How long will this program run If ...,"{'original_index': 17262, 'subtitle_id': 92511..."
4,her Gramps Uncle Erning Aunt Elma this is Tep ...,"{'original_index': 17262, 'subtitle_id': 92511..."


In [26]:
df_sampled = df.sample(frac=0.30, random_state=42)
df_sampled.shape

(466282, 2)

In [27]:
df_sampled.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 466282 entries, 1320107 to 1294123
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   chunk     466282 non-null  object
 1   metadata  466282 non-null  object
dtypes: object(2)
memory usage: 360.1 MB


In [28]:
model = SentenceTransformer("all-MiniLM-L6-v2", device="cuda")

# Function to compute embeddings in batches
def compute_embeddings(texts, batch_size=128):
    embeddings = model.encode(texts, batch_size=batch_size, convert_to_tensor=True)
    return embeddings.half().cpu().numpy()  # Convert to fp16 for memory efficiency

# Compute embeddings
df_sampled["embedding"] = list(compute_embeddings(df_sampled["chunk"].tolist()))

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/3643 [00:00<?, ?it/s]

In [29]:
# Save embeddings
np.save("/kaggle/working/embeddings.npy", np.vstack(df_sampled["embedding"].values))
df_sampled.to_parquet("/kaggle/working/embedded_subtitles.parquet", engine="pyarrow", index=False)
print("Saved embeddings.")

Saved embeddings.


## **ChromaDB Storage :**

In [31]:
# Load processed data
df = pd.read_parquet("/kaggle/working/embedded_subtitles.parquet")
embeddings = np.load("/kaggle/working/embeddings.npy")

In [35]:
embeddings

array([[ 0.01602 , -0.0632  ,  0.0677  , ...,  0.0636  , -0.0644  ,
        -0.02686 ],
       [-0.00881 ,  0.045   ,  0.02475 , ..., -0.04843 , -0.04456 ,
         0.012886],
       [-0.1261  ,  0.06793 ,  0.0874  , ...,  0.04004 ,  0.02153 ,
        -0.0643  ],
       ...,
       [-0.03735 , -0.0752  , -0.0631  , ...,  0.08984 , -0.04996 ,
        -0.03735 ],
       [-0.06274 ,  0.01458 ,  0.0907  , ..., -0.007504, -0.03726 ,
        -0.01078 ],
       [-0.02106 ,  0.05716 , -0.0317  , ...,  0.0758  ,  0.01198 ,
        -0.01066 ]], dtype=float16)

In [33]:
df.head()

Unnamed: 0,chunk,metadata,embedding
0,looked so pretty for Mr OFinn Dialogue Dialogu...,"{'original_index': 78775, 'subtitle_id': 95068...","[0.01602, -0.0632, 0.0677, -0.0263, -0.03888, ..."
1,it tomorrow I will fight it next week and next...,"{'original_index': 12125, 'subtitle_id': 92314...","[-0.00881, 0.045, 0.02475, -0.02104, 0.0617, 0..."
2,Connor No See This is not going to go well for...,"{'original_index': 70828, 'subtitle_id': 94728...","[-0.1261, 0.06793, 0.0874, -0.0403, 0.0654, 0...."
3,with the Pakleds You four stood strong in an a...,"{'original_index': 15149, 'subtitle_id': 92443...","[0.05502, 0.03296, -0.02739, -0.0562, 0.03656,..."
4,your wallet now Come on Give me your wallet Gi...,"{'original_index': 39546, 'subtitle_id': 93435...","[-0.0826, 0.02043, 0.04205, -0.0425, -0.05814,..."


In [36]:
# Initialize ChromaDB
persist_directory = "/kaggle/working/chroma_db"
os.makedirs(persist_directory, exist_ok=True)

client = chromadb.PersistentClient(path= persist_directory)
embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

# Create or get collection
collection_name = "subtitle_chunks"
try:
    client.delete_collection(name=collection_name)
except ValueError:
    pass  # Collection doesn't exist, continue

collection = client.get_or_create_collection(name=collection_name, embedding_function=embedding_function)

# Batch insert embeddings
batch_size = 512
for i in range(0, len(df), batch_size):
    batch_df = df.iloc[i:i+batch_size]
    collection.add(
        ids=[str(idx) for idx in batch_df.index],
        documents=batch_df["chunk"].tolist(),
        embeddings=embeddings[i:i+batch_size].tolist(),
        metadatas=batch_df["metadata"].tolist())

print(f"ChromaDB collection created with {collection.count()} documents.")

ChromaDB collection created with 466282 documents.


In [37]:
import chromadb
from sentence_transformers import SentenceTransformer

# Load ChromaDB
persist_directory = "/kaggle/working/chroma_db"
client = chromadb.PersistentClient(path=persist_directory)
collection = client.get_collection(name="subtitle_chunks")

# Load model
model = SentenceTransformer("all-MiniLM-L6-v2", device="cuda")

# Function to query
def search_subtitles(query, n_results=5):
    query_embedding = model.encode([query], convert_to_tensor=True).half().cpu().numpy().tolist()
    results = collection.query(query_embeddings=query_embedding, n_results=n_results, include=["metadatas"])

    for i, metadata in enumerate(results["metadatas"][0]):
        subtitle_id = metadata.get("subtitle_id", "Unknown ID")
        subtitle_name = metadata.get("subtitle_name", "Unknown Subtitle")
        url = f"https://www.opensubtitles.org/en/subtitles/{subtitle_id}"
        print(f"Result {i+1}: {subtitle_name}\nURL: {url}\n")

# Example query
search_subtitles('Life is like a box of chocolates')


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Result 1: the.real.housewives.of.beverly.hills.s12.e19.we.need.to.talk.about.kathy.().eng.1cd
URL: https://www.opensubtitles.org/en/subtitles/9265204

Result 2: penn.teller.fool.us.s08.e03.happy.birthday.(2021).eng.1cd
URL: https://www.opensubtitles.org/en/subtitles/9189307

Result 3: all.creatures.great.and.small.s02.e01.where.the.heart.is.(2021).eng.1cd
URL: https://www.opensubtitles.org/en/subtitles/9379828

Result 4: mindwalk.(1990).eng.1cd
URL: https://www.opensubtitles.org/en/subtitles/9380709

Result 5: family.guy.s01.e01.death.has.a.shadow.(1999).eng.1cd
URL: https://www.opensubtitles.org/en/subtitles/9408326



In [38]:
search_subtitles("Let the robot carnage begin!")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Result 1: futurama.s05.e01.crimes.of.the.hot.(2002).eng.1cd
URL: https://www.opensubtitles.org/en/subtitles/9438922

Result 2: futurama.s01.e05.fear.of.a.bot.planet.(1999).eng.1cd
URL: https://www.opensubtitles.org/en/subtitles/9429794

Result 3: futurama.s09.e12.31st.century.fox.(2012).eng.1cd
URL: https://www.opensubtitles.org/en/subtitles/9440082

Result 4: nova.s43.e08.rise.of.the.robots.(2016).eng.1cd
URL: https://www.opensubtitles.org/en/subtitles/9259578

Result 5: nova.s43.e08.rise.of.the.robots.(2016).eng.1cd
URL: https://www.opensubtitles.org/en/subtitles/9259578



In [39]:
search_subtitles("Why so serious?")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Result 1: scrubs.s08.e13.my.full.moon.(2009).eng.1cd
URL: https://www.opensubtitles.org/en/subtitles/9274648

Result 2: mischievous.kiss.love.in.tokyo.s02.e03.hello.baby.(2014).eng.1cd
URL: https://www.opensubtitles.org/en/subtitles/9265356

Result 3: do.badan.(1966).eng.1cd
URL: https://www.opensubtitles.org/en/subtitles/9290962

Result 4: a.mulher.do.meu.marido.(2019).eng.1cd
URL: https://www.opensubtitles.org/en/subtitles/9330222

Result 5: inside.job.s02.e07.project.reboot.(2022).eng.1cd
URL: https://www.opensubtitles.org/en/subtitles/9317333



In [43]:
search_subtitles("you're really... such a hopeless idiot")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Result 1: zeroman.s01.e04.artificial.intolerance.(2004).eng.1cd
URL: https://www.opensubtitles.org/en/subtitles/9424496

Result 2: the.glory.s01.e15.episode.1.15.(2023).eng.1cd
URL: https://www.opensubtitles.org/en/subtitles/9463872

Result 3: empire.s04.e09.slave.to.memory.(2017).eng.1cd
URL: https://www.opensubtitles.org/en/subtitles/9199242

Result 4: chudy.i.inni.(1967).eng.1cd
URL: https://www.opensubtitles.org/en/subtitles/9271158

Result 5: machan.(2008).eng.1cd
URL: https://www.opensubtitles.org/en/subtitles/9365929

