# **Extracting & Cleaning Subtitles :**

In [1]:
# !pip install chromadb

In [9]:
import pandas as pd
import numpy as np
import sqlite3
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
import chromadb

### **Reading the Data from Database :**

In [10]:
# Connect to the database
db_path = "/kaggle/input/eng-subtitles-database/eng_subtitles_database.db"
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

### **Reading the Tables from Database file :**

In [11]:
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()
print("Available Tables:", tables)

Available Tables: [('zipfiles',)]


In [12]:
cursor.execute("PRAGMA table_info('zipfiles')")
cols = cursor.fetchall()
for col in cols:
    print(col[1])

num
name
content


In [13]:
df = pd.read_sql_query("""SELECT * FROM zipfiles""", conn)
df.head()

Unnamed: 0,num,name,content
0,9180533,the.message.(1976).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x1c\xa9\x...
1,9180583,here.comes.the.grump.s01.e09.joltin.jack.in.bo...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x17\xb9\x...
2,9180592,yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00L\xb9\x99V...
3,9180594,yumis.cells.s02.e14.episode.2.14.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00U\xa9\x99V...
4,9180600,broker.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x001\xa9\x99V...


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82498 entries, 0 to 82497
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   num      82498 non-null  int64 
 1   name     82498 non-null  object
 2   content  82498 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.9+ MB


In [15]:
df.duplicated().value_counts()

False    82498
Name: count, dtype: int64

In [16]:
print(df.iloc[0, 2])

b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x1c\xa9\x99V\x9fx\x96\xf0\x8c\x9e\x00\x00\x86\x9b\x01\x00;\x00\x00\x00The.Message.1976.REMASTERED.1080p.BluRay.x264-PiGNUS.EN.srt\xad\xbdm\x93\xdc\xc6\x91.\xfa\x9d\x11\xfc\x0f-}\xe1=\x11-\x9d\x06P\x85\x17\x9d\x8d\xd5%%[\xa4-Y>&u\x15>\xdf\xd0\xd3\x98\x19x\xfae\x0cts<\xfe\xf57\x9f\'\xb3\n\xd9\xa4\xbc\xbb\xf7\xc6Fl\xacELW\xa2\xaa\x90\x95\x95\xafO\x16/_l6\xdf\xe0\xff\xea\xf5f\xb3Y}\xf5\xd5\xbf\xaf\xf4AQ\xae7Mx\xf9\xe2\xd7\xfe|s\xbf\xea\x8f\xcf\xab\x8f\xe3n8\xadN\xc7\xfdx\x1cVO\xe3\xf9~\xf5\xf3\xe3p\xfc\xea\xfd/o>\xbc\xfb\xf0\xe3\xef\xde\xbf|\xf1\xfbi\x18Vo\xa6\xd3\xd3<L\xab\xe1\x1f\xe7\xe18\x8f\xa7\xe37\xab\xd3\xbc\xdb~-\xc3\x1e\xfe\xa7<|\xf9\xe2\xe5\x8bR_[~S\xd6\xeb\xa2k\xf3k\xe5A\xb7\xeeb\xf5\xf2\xc5\xbb\xe3\xea|?\xac\x8e\xfdaX\x9dnW?\x9cvk>8\x9c\xe6\xf3\xean\xeao\xc6\xd3ev\x8f~\x1a\xa6\x9b\xf1\xf6\xb2\xff\x1a\xe4\xabD\xbe*d\x11\xa5#_U\xeb\xaa\xd9`\xa6\xa7\xc3\xea\xa7\xcb}\x7f8\xf4F\xf9\xa7a\x9e\x87\xe3\x9d\xcc\\\xdf\x07B!\x13\xaa\xd61n<!\xd9\xaf\xd0\

### **Unzipping the content :**

In [18]:
import zipfile
import io

binary_data = df.iloc[0, 2]

# Decompress the binary data using the zipfile module
with io.BytesIO(binary_data) as f:
    with zipfile.ZipFile(f, 'r') as zip_file:
        # Reading only one file in the ZIP archive
        subtitle_content = zip_file.read(zip_file.namelist()[0])

# Now 'subtitle_content' should contain the extracted subtitle content
print(subtitle_content.decode('latin-1'))  # Assuming the content is latin-1 encoded text

1
00:00:06,000 --> 00:00:12,074
Watch any video online with Open-SUBTITLES
Free Browser extension: osdb.link/ext

2
00:02:26,198 --> 00:02:29,953
In the name of God, the most gracious, the most Merciful.

3
00:02:31,072 --> 00:02:33,370
From Muhammad, the Messenger of God

4
00:02:33,550 --> 00:02:36,047
to Heraclius, the emperor of Byzantium.

5
00:02:36,407 --> 00:02:39,464
greetings to him who is the
follower of righteous guidance.

6
00:02:39,783 --> 00:02:42,591
I bid you to hear the divine call.

7
00:02:43,160 --> 00:02:45,817
I am the messenger of God to the people;

8
00:02:46,337 --> 00:02:48,784
accept Islam for your salvation.

9
00:02:52,231 --> 00:02:54,709
He speaks of a new prophet in Arabia.

10
00:02:55,068 --> 00:02:57,825
Was it like this when John, the Baptist
came to king Herod

11
00:02:58,145 --> 00:03:01,272
out of the desert, crying about salvation?

12
00:03:26,136 --> 00:03:28,903
To Muqawqis, Patriarch of Alexandria.

13
00:03:42,400 --> 00:03:44,638
Kisra,

### **Applying the above Function on the Entire Data :**

In [19]:
import zipfile
import io

count = 0

def decode_method(binary_data):
    global count
    count += 1
    with io.BytesIO(binary_data) as f:
        with zipfile.ZipFile(f, 'r') as zip_file:
            subtitle_content = zip_file.read(zip_file.namelist()[0])

    return subtitle_content.decode('latin-1')

In [20]:
df.shape

(82498, 3)

In [21]:
df2 = df.copy()
df2.shape

(82498, 3)

In [22]:
df2.head()

Unnamed: 0,num,name,content
0,9180533,the.message.(1976).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x1c\xa9\x...
1,9180583,here.comes.the.grump.s01.e09.joltin.jack.in.bo...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x17\xb9\x...
2,9180592,yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00L\xb9\x99V...
3,9180594,yumis.cells.s02.e14.episode.2.14.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00U\xa9\x99V...
4,9180600,broker.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x001\xa9\x99V...


In [25]:
# sample_df = df2.head(50000)
# sample_df.shape

In [26]:
df2['file_content'] = df2['content'].apply(decode_method)
df2.head()

Unnamed: 0,num,name,content,file_content
0,9180533,the.message.(1976).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x1c\xa9\x...,"1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch an..."
1,9180583,here.comes.the.grump.s01.e09.joltin.jack.in.bo...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x17\xb9\x...,"1\r\n00:00:29,359 --> 00:00:32,048\r\nAh! Ther..."
2,9180592,yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00L\xb9\x99V...,"1\r\n00:00:53,200 --> 00:00:56,030\r\n<i>Yumi'..."
3,9180594,yumis.cells.s02.e14.episode.2.14.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00U\xa9\x99V...,"1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch an..."
4,9180600,broker.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x001\xa9\x99V...,"ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch..."


In [27]:
df2.shape

(82498, 4)

In [28]:
df2 = df2.drop('content', axis=1)
df2.head()

Unnamed: 0,num,name,file_content
0,9180533,the.message.(1976).eng.1cd,"1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch an..."
1,9180583,here.comes.the.grump.s01.e09.joltin.jack.in.bo...,"1\r\n00:00:29,359 --> 00:00:32,048\r\nAh! Ther..."
2,9180592,yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd,"1\r\n00:00:53,200 --> 00:00:56,030\r\n<i>Yumi'..."
3,9180594,yumis.cells.s02.e14.episode.2.14.(2022).eng.1cd,"1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch an..."
4,9180600,broker.(2022).eng.1cd,"ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch..."


In [35]:
df2.shape

(82498, 4)

### **Cleaning text :**

In [30]:
import re

def clean_text(text):
    text = re.sub(r'\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}\r\n', '', text)
    text = re.sub(r'\r\n', ' ', text)
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text

In [None]:
df2["cleaned_subtitles"] = df2["file_content"].apply(clean_text)

In [37]:
df3 = df2.copy()
df3.shape

(82498, 4)

In [38]:
df3.head()

Unnamed: 0,num,name,file_content,cleaned_subtitles
0,9180533,the.message.(1976).eng.1cd,"1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch an...",Watch any video online with OpenSUBTITLES Free...
1,9180583,here.comes.the.grump.s01.e09.joltin.jack.in.bo...,"1\r\n00:00:29,359 --> 00:00:32,048\r\nAh! Ther...",Ah Theres Princess Dawn and Terry with the Blo...
2,9180592,yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd,"1\r\n00:00:53,200 --> 00:00:56,030\r\n<i>Yumi'...",Yumis Cells Episode Extremely Polite Yumi Yumi...
3,9180594,yumis.cells.s02.e14.episode.2.14.(2022).eng.1cd,"1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch an...",Watch any video online with OpenSUBTITLES Free...
4,9180600,broker.(2022).eng.1cd,"ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch...",Watch any video online with OpenSUBTITLES Free...


In [39]:
df3 = df3.drop('file_content',axis=1)
df3.head()

Unnamed: 0,num,name,cleaned_subtitles
0,9180533,the.message.(1976).eng.1cd,Watch any video online with OpenSUBTITLES Free...
1,9180583,here.comes.the.grump.s01.e09.joltin.jack.in.bo...,Ah Theres Princess Dawn and Terry with the Blo...
2,9180592,yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd,Yumis Cells Episode Extremely Polite Yumi Yumi...
3,9180594,yumis.cells.s02.e14.episode.2.14.(2022).eng.1cd,Watch any video online with OpenSUBTITLES Free...
4,9180600,broker.(2022).eng.1cd,Watch any video online with OpenSUBTITLES Free...


In [40]:
# Rename columns
df3 = df3.rename(columns={'cleaned_subtitles': 'content'})

In [32]:
df3.shape

(82498, 3)

In [41]:
print(df3['content'][0])

Watch any video online with OpenSUBTITLES Free Browser extension osdblinkext In the name of God the most gracious the most Merciful From Muhammad the Messenger of God to Heraclius the emperor of Byzantium greetings to him who is the follower of righteous guidance I bid you to hear the divine call I am the messenger of God to the people accept Islam for your salvation He speaks of a new prophet in Arabia Was it like this when John the Baptist came to king Herod out of the desert crying about salvation To Muqawqis Patriarch of Alexandria Kisra emperor of Persia Muhammad calls you with the call of God Accept Islam for your salvation embrace Islam You come out of the desert smelling of camel and goat To tell Persia where he should kneel Muhammad Messenger of God Who gave him this authority God sent Muhammad as a mercy to mankind The Scholars and Historians of Islam The University of AlAzhar in Cairo The High Islamic Congress of the Shiat in Lebanon The makers of this film honour the Islami

In [42]:
# saving in kaggle
df3.to_parquet("/kaggle/working/cleaned_subtitles.parquet", engine="pyarrow", index=False)

# **Text Tokenization / Chunking :**

In [45]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [43]:
# Load the dataset
path = r'/kaggle/working/cleaned_subtitles.parquet'
df = pd.read_parquet(path)

In [44]:
df.head()

Unnamed: 0,num,name,content
0,9180533,the.message.(1976).eng.1cd,Watch any video online with OpenSUBTITLES Free...
1,9180583,here.comes.the.grump.s01.e09.joltin.jack.in.bo...,Ah Theres Princess Dawn and Terry with the Blo...
2,9180592,yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd,Yumis Cells Episode Extremely Polite Yumi Yumi...
3,9180594,yumis.cells.s02.e14.episode.2.14.(2022).eng.1cd,Watch any video online with OpenSUBTITLES Free...
4,9180600,broker.(2022).eng.1cd,Watch any video online with OpenSUBTITLES Free...


In [61]:
sample_df = df.sample(frac=0.40, random_state=42)
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32999 entries, 17262 to 53660
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   num      32999 non-null  int64 
 1   name     32999 non-null  object
 2   content  32999 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.0+ MB


In [62]:
sample_df.shape

(32999, 3)

In [56]:
sample_df.head()

Unnamed: 0,num,name,content
17262,9251120,maybe.this.time.(2014).eng.1cd,Watch any video online with OpenSUBTITLES Free...
7294,9211589,down.the.shore.s01.e10.and.justice.for.all.(19...,Oh I know that its getting late but I dont wan...
47707,9380845,uncontrollably.fond.s01.e07.heartache.(2016).e...,Timing and Subtitles by The Uncontrollable Lov...
29914,9301436,screen.two.s13.e04.the.precious.blood.(1996).e...,ethereal music apiOpenSubtitlesorg is deprecat...
54266,9408707,battlebots.(2015).eng.1cd,Chris Oh no not the Minibots yelling Oh You le...


## **Implement Recursive Chunking :**

In [60]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def chunking(df, chunk_size=500, chunk_overlap=100):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size= chunk_size, chunk_overlap= chunk_overlap)
    
    chunks = []
    metadatas = []
    
    for index, row in df.iterrows():
        doc_chunks = text_splitter.split_text(row["content"])  
        chunks.extend(doc_chunks)  
        
        # Store subtitle ID & name in metadata
        metadatas.extend([{"original_index": index, "subtitle_id": row["num"], "subtitle_name": row["name"]}] * len(doc_chunks))

    # Create new dataframe with chunked text
    chunked_df = pd.DataFrame({"chunk": chunks, "metadata": metadatas})
    return chunked_df

In [63]:
# Apply chunking
chunked_df = chunking(sample_df)

In [64]:
# Save chunked dataset
chunked_df.to_parquet("chunked_subtitles.parquet", engine="pyarrow", index=False)
print("Chunking completed and saved!")

Chunking completed and saved!
