In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import sqlite3
import pandas as pd

# Path to the SQLite database file
db_file_path = '/content/drive/MyDrive/eng_subtitles_database.db'

# Connect to the database
conn = sqlite3.connect(db_file_path)

# Query to select the 'num', 'name', and 'content' columns from the 'zipfiles' table
query = 'SELECT num, name FROM zipfiles'

# Create a DataFrame from the query result
df = pd.read_sql_query(query, conn)

# Close the connection
conn.close()

# Display the first few rows of the DataFrame
print(df.head())


       num                                               name
0  9180533                         the.message.(1976).eng.1cd
1  9180583  here.comes.the.grump.s01.e09.joltin.jack.in.bo...
2  9180592    yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd
3  9180594    yumis.cells.s02.e14.episode.2.14.(2022).eng.1cd
4  9180600                              broker.(2022).eng.1cd


In [None]:
df['index'] = df.index


In [None]:
df.head()

Unnamed: 0,num,name,index
0,9180533,the.message.(1976).eng.1cd,0
1,9180583,here.comes.the.grump.s01.e09.joltin.jack.in.bo...,1
2,9180592,yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd,2
3,9180594,yumis.cells.s02.e14.episode.2.14.(2022).eng.1cd,3
4,9180600,broker.(2022).eng.1cd,4


In [None]:
df = df.reindex(columns=['index', 'num', 'name'])

In [None]:
df.head()

Unnamed: 0,index,num,name
0,0,9180533,the.message.(1976).eng.1cd
1,1,9180583,here.comes.the.grump.s01.e09.joltin.jack.in.bo...
2,2,9180592,yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd
3,3,9180594,yumis.cells.s02.e14.episode.2.14.(2022).eng.1cd
4,4,9180600,broker.(2022).eng.1cd


In [None]:
print(df[['index', 'num', 'name']])


       index      num                                               name
0          0  9180533                         the.message.(1976).eng.1cd
1          1  9180583  here.comes.the.grump.s01.e09.joltin.jack.in.bo...
2          2  9180592    yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd
3          3  9180594    yumis.cells.s02.e14.episode.2.14.(2022).eng.1cd
4          4  9180600                              broker.(2022).eng.1cd
...      ...      ...                                                ...
82493  82493  9521935                   the.prophets.game.(2000).eng.1cd
82494  82494  9521937                         west.beirut.(1998).eng.1cd
82495  82495  9521938         frankenstein.the.true.story.(1973).eng.1cd
82496  82496  9521940         frankenstein.the.true.story.(1973).eng.1cd
82497  82497  9521941              zombie.island.massacre.(1984).eng.1cd

[82498 rows x 3 columns]


In [None]:
import os

# Path to the subtitles folder in your Google Drive
subtitles_folder_path = '/content/drive/MyDrive/subtitles'

# List all files in the subtitles folder
srt_files = [f for f in os.listdir(subtitles_folder_path) if f.endswith('.srt')]

# Create a DataFrame with the list of .srt files
df_subtitles = pd.DataFrame({'filename': srt_files})

# Display the DataFrame
print(df_subtitles)


                 filename
0      subtitle_81037.srt
1      subtitle_80807.srt
2      subtitle_81033.srt
3      subtitle_81189.srt
4      subtitle_80967.srt
...                   ...
68835   subtitle_1648.srt
68836   subtitle_1246.srt
68837   subtitle_1458.srt
68838   subtitle_1402.srt
68839   subtitle_1744.srt

[68840 rows x 1 columns]


In [None]:
# Extract the index from the filename
df_subtitles['index'] = df_subtitles['filename'].str.extract(r'subtitle_(\d+)\.srt').astype(int)

# Display the DataFrame
print(df_subtitles)


                 filename  index
0      subtitle_81037.srt  81037
1      subtitle_80807.srt  80807
2      subtitle_81033.srt  81033
3      subtitle_81189.srt  81189
4      subtitle_80967.srt  80967
...                   ...    ...
68835   subtitle_1648.srt   1648
68836   subtitle_1246.srt   1246
68837   subtitle_1458.srt   1458
68838   subtitle_1402.srt   1402
68839   subtitle_1744.srt   1744

[68840 rows x 2 columns]


In [None]:
import numpy as np

# Sample 30% of indices from the database
sampled_indices = np.random.choice(df['index'], size=int(0.3*len(df)), replace=False)

# Filter the database DataFrame based on the sampled indices
df_sampled_db = df[df['index'].isin(sampled_indices)]

# Filter the .srt files DataFrame based on the sampled indices
df_sampled_srt = df_subtitles[df_subtitles['index'].isin(sampled_indices)]

# Display the sampled DataFrames
print("Sampled database data:")
print(df_sampled_db)

print("\nSampled .srt files data:")
print(df_sampled_srt)


Sampled database data:
       index      num                                               name
0          0  9180533                         the.message.(1976).eng.1cd
2          2  9180592    yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd
5          5  9180607                            the.myth.(2005).eng.1cd
6          6  9180608                    the.great.beauty.(2013).eng.1cd
9          9  9180694  rudrabinar.obhishaap.s02.e03.anandagarher.akhh...
...      ...      ...                                                ...
82482  82482  9521866  vanished.with.beth.holloway.s01.e08.fleischman...
82489  82489  9521930                         the.fearway.(2023).eng.1cd
82490  82490  9521931                           immanence.(2022).eng.1cd
82492  82492  9521933              star.trek.insurrection.(1998).eng.1cd
82495  82495  9521938         frankenstein.the.true.story.(1973).eng.1cd

[24749 rows x 3 columns]

Sampled .srt files data:
                 filename  index
2      subtitle_

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
import os
import re
from tqdm import tqdm

# Function to clean the contents of an .srt file
def clean_srt_content(content):
    # Remove timestamps
    content = re.sub(r'\d+:\d+:\d+,\d+ --> \d+:\d+:\d+,\d+\n', '', content)
    # Remove extra spaces
    content = re.sub(r'\s+', ' ', content)
    # Remove special characters
    content = re.sub(r'[^a-zA-Z\s]', '', content)
    # Convert to lowercase
    content = content.lower()
    # Remove empty lines
    content = os.linesep.join([s for s in content.splitlines() if s.strip()])
    return content

# Function to process a single file and extract index
def process_file(file_path):
    with open(file_path, 'r') as file:
        file_content = file.read()
        cleaned_content = clean_srt_content(file_content)
        index = int(re.search(r'subtitle_(\d+)\.srt', file_path).group(1))
    return index, cleaned_content

# Path to the subtitles folder in your Google Drive
subtitles_folder_path = '/content/drive/MyDrive/subtitles'

# Use filenames from df_sampled_srt DataFrame
srt_files = df_sampled_srt['filename'].tolist()

# Initialize tqdm to track progress
with tqdm(total=len(srt_files)) as pbar:
    # Initialize an empty list to store tuples of (index, cleaned_content)
    cleaned_contents = []
    # Process files using ThreadPoolExecutor for parallel processing
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_file, os.path.join(subtitles_folder_path, f)) for f in srt_files]
        # Use as_completed to iterate over completed futures
        for future in as_completed(futures):
            pbar.update(1)  # Update progress bar for each completed file
            index, cleaned_content = future.result()
            cleaned_contents.append((index, cleaned_content))

# Create a DataFrame with the list of .srt files, cleaned contents, and extracted indices
df_sampled_srt = pd.DataFrame(cleaned_contents, columns=['index', 'content'])

# Display the DataFrame with 'index' and 'content' columns
print(df_sampled_srt)


100%|██████████| 20663/20663 [20:50<00:00, 16.52it/s]

       index                                            content
0      81033   subtitles by danielsangeo  welcome to rosss g...
1      81126   subtitles by danielsangeo  okay act calm act ...
2      81302  script info title default file scripttype v wr...
3      81189   subtitles by danielsangeo  yeah im feeling pr...
4      80998   theme music playing  support us and become vi...
...      ...                                                ...
20658   1327   iyou take everything from mei  iand then you ...
20659   1652   michael im just saying  in the caribbean they...
20660   1647   ipreviously oni nikita  i know who killed you...
20661   1648   bell rings  door closes  do you have haggis h...
20662   1154   support us and become vip member to remove al...

[20663 rows x 2 columns]





In [None]:
# Merge the dataframes based on the 'index' column
merged_df = pd.merge(df_sampled_db, df_sampled_srt, on='index', how='inner')

# Display the merged dataframe
print(merged_df)


       index      num                                               name  \
0          5  9180607                            the.myth.(2005).eng.1cd   
1          6  9180608                    the.great.beauty.(2013).eng.1cd   
2          9  9180694  rudrabinar.obhishaap.s02.e03.anandagarher.akhh...   
3         10  9180695  rudrabinar.obhishaap.s02.e04.udara.(2022).eng.1cd   
4         11  9180696  rudrabinar.obhishaap.s02.e05.saat.surer.mejaj....   
...      ...      ...                                                ...   
20658  82482  9521866  vanished.with.beth.holloway.s01.e08.fleischman...   
20659  82489  9521930                         the.fearway.(2023).eng.1cd   
20660  82490  9521931                           immanence.(2022).eng.1cd   
20661  82492  9521933              star.trek.insurrection.(1998).eng.1cd   
20662  82495  9521938         frankenstein.the.true.story.(1973).eng.1cd   

                                                 content  
0       general the princess

In [None]:
# Rename the index column
merged_df = merged_df.rename(columns={'num': 'subtitle_id'})

merged_df = merged_df.rename(columns={'index': 'SubtitleID_index'})
# Display the updated merged DataFrame
merged_df.head()


Unnamed: 0,SubtitleID_index,subtitle_id,name,content
0,5,9180607,the.myth.(2005).eng.1cd,general the princesss convoy has entered our ...
1,6,9180608,the.great.beauty.(2013).eng.1cd,apiopensubtitlesorg is deprecated please impl...
2,9,9180694,rudrabinar.obhishaap.s02.e03.anandagarher.akhh...,so youre assuming that my grandma mumtaz is ...
3,10,9180695,rudrabinar.obhishaap.s02.e04.udara.(2022).eng.1cd,you know that naads have less patience who s...
4,11,9180696,rudrabinar.obhishaap.s02.e05.saat.surer.mejaj....,use the free code joinnow at wwwplayshipseu ...


In [None]:
merged_df.shape

(20663, 4)

In [None]:
# Download the WordNet corpus if not already downloaded
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:

stemmer = PorterStemmer()
# Initialize the WordNet Lemmatizer
lemmatizer = WordNetLemmatizer()

def preprocess(raw_text, flag):
    # Removing special characters and digits
    sentence = re.sub("[^a-zA-Z]", " ", raw_text)

    # Change sentence to lowercase
    sentence = sentence.lower()

    # Tokenize into words
    tokens = sentence.split()

    # Lemmatization
    clean_tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return pd.Series([" ".join(clean_tokens), len(clean_tokens)])

In [None]:
!pip install tqdm



In [None]:
from tqdm import tqdm, tqdm_notebook

In [None]:
tqdm.pandas()

In [None]:
temp_df = merged_df['content'].progress_apply(lambda x: preprocess(x, 'lemma'))

temp_df.head()

100%|██████████| 20663/20663 [07:04<00:00, 48.71it/s]


Unnamed: 0,0,1
0,general the princess convoy ha entered our rea...,4005
1,apiopensubtitlesorg is deprecated please imple...,9208
2,so youre assuming that my grandma mumtaz is ab...,1178
3,you know that naads have le patience who smugg...,1359
4,use the free code joinnow at wwwplayshipseu ge...,1017


In [None]:
temp_df.columns = ['clean_text_lemma', 'text_length_lemma']
temp_df.head()

Unnamed: 0,clean_text_lemma,text_length_lemma
0,general the princess convoy ha entered our rea...,4005
1,apiopensubtitlesorg is deprecated please imple...,9208
2,so youre assuming that my grandma mumtaz is ab...,1178
3,you know that naads have le patience who smugg...,1359
4,use the free code joinnow at wwwplayshipseu ge...,1017


In [None]:
cleaned_df = pd.concat([merged_df, temp_df], axis=1)
cleaned_df.head()

Unnamed: 0,SubtitleID_index,subtitle_id,name,content,clean_text_lemma,text_length_lemma
0,5,9180607,the.myth.(2005).eng.1cd,general the princesss convoy has entered our ...,general the princess convoy ha entered our rea...,4005
1,6,9180608,the.great.beauty.(2013).eng.1cd,apiopensubtitlesorg is deprecated please impl...,apiopensubtitlesorg is deprecated please imple...,9208
2,9,9180694,rudrabinar.obhishaap.s02.e03.anandagarher.akhh...,so youre assuming that my grandma mumtaz is ...,so youre assuming that my grandma mumtaz is ab...,1178
3,10,9180695,rudrabinar.obhishaap.s02.e04.udara.(2022).eng.1cd,you know that naads have less patience who s...,you know that naads have le patience who smugg...,1359
4,11,9180696,rudrabinar.obhishaap.s02.e05.saat.surer.mejaj....,use the free code joinnow at wwwplayshipseu ...,use the free code joinnow at wwwplayshipseu ge...,1017


In [None]:
cleaned_df.shape

(20663, 6)

In [None]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.6.1-py3-none-any.whl (163 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.3/163.3 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m37.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14

In [None]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from tqdm import tqdm

# Load pre-trained BERT model and tokenizer
print("Loading pre-trained BERT model and tokenizer...")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Example text data
texts = cleaned_df['clean_text_lemma'].tolist()

# Set batch size
batch_size = 32

# List to store the embeddings
embeddings = []

# Process in batches
total_batches = len(texts) // batch_size + 1
with tqdm(total=total_batches, desc="Processing batches") as pbar:
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        inputs = tokenizer(batch_texts, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
        inputs = {key: val.to(device) for key, val in inputs.items()}
        inputs['attention_mask'] = inputs['input_ids'].gt(0)
        inputs.pop('token_type_ids', None)  # BERT base model does not use segment_ids
        with torch.no_grad():
            output = model(**inputs)
            last_hidden_state = output.last_hidden_state
            mean_pooled = torch.mean(last_hidden_state, dim=1)
            embeddings.append(mean_pooled.cpu().numpy())

        # Update progress bar
        pbar.update(1)

# Concatenate embeddings
final_embeddings = np.concatenate(embeddings, axis=0)
print("Embeddings generated successfully!")
print("Final embeddings shape:", final_embeddings.shape)


Loading pre-trained BERT model and tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Processing batches: 100%|██████████| 646/646 [37:32<00:00,  3.49s/it]

Embeddings generated successfully!
Final embeddings shape: (20663, 768)





In [None]:
# Add the final embeddings to the cleaned_df DataFrame
cleaned_df['final_embeddings'] = final_embeddings.tolist()

# Display the updated DataFrame
print(cleaned_df.head())


   SubtitleID_index  subtitle_id  \
0                 5      9180607   
1                 6      9180608   
2                 9      9180694   
3                10      9180695   
4                11      9180696   

                                                name  \
0                            the.myth.(2005).eng.1cd   
1                    the.great.beauty.(2013).eng.1cd   
2  rudrabinar.obhishaap.s02.e03.anandagarher.akhh...   
3  rudrabinar.obhishaap.s02.e04.udara.(2022).eng.1cd   
4  rudrabinar.obhishaap.s02.e05.saat.surer.mejaj....   

                                             content  \
0   general the princesss convoy has entered our ...   
1   apiopensubtitlesorg is deprecated please impl...   
2   so youre assuming that my grandma mumtaz  is ...   
3   you know that naads have less patience  who s...   
4   use the free code joinnow at wwwplayshipseu  ...   

                                    clean_text_lemma  text_length_lemma  \
0  general the princess convoy ha 

In [None]:
pip install pandas



In [None]:
cleaned_df.head()

Unnamed: 0,SubtitleID_index,subtitle_id,name,content,clean_text_lemma,text_length_lemma,final_embeddings
0,5,9180607,the.myth.(2005).eng.1cd,general the princesss convoy has entered our ...,general the princess convoy ha entered our rea...,4005,"[-0.1895383894443512, 0.1699831336736679, 0.49..."
1,6,9180608,the.great.beauty.(2013).eng.1cd,apiopensubtitlesorg is deprecated please impl...,apiopensubtitlesorg is deprecated please imple...,9208,"[-0.13504022359848022, 0.17723196744918823, 0...."
2,9,9180694,rudrabinar.obhishaap.s02.e03.anandagarher.akhh...,so youre assuming that my grandma mumtaz is ...,so youre assuming that my grandma mumtaz is ab...,1178,"[-0.13107889890670776, 0.1428164541721344, 0.2..."
3,10,9180695,rudrabinar.obhishaap.s02.e04.udara.(2022).eng.1cd,you know that naads have less patience who s...,you know that naads have le patience who smugg...,1359,"[-0.08960047364234924, 0.04438433051109314, 0...."
4,11,9180696,rudrabinar.obhishaap.s02.e05.saat.surer.mejaj....,use the free code joinnow at wwwplayshipseu ...,use the free code joinnow at wwwplayshipseu ge...,1017,"[-0.11988365650177002, 0.22249145805835724, 0...."


In [None]:
cleaned_df.columns

Index(['SubtitleID_index', 'subtitle_id', 'name', 'content',
       'clean_text_lemma', 'text_length_lemma', 'final_embeddings'],
      dtype='object')

In [None]:
cleaned_df.shape

(20663, 7)

In [None]:
cleaned_df.dtypes

SubtitleID_index      int64
subtitle_id           int64
name                 object
content              object
clean_text_lemma     object
text_length_lemma     int64
final_embeddings     object
dtype: object

In [None]:
def chunk_document(text, chunk_size=500, overlap=50):
    """Chunk a document into smaller parts with specified size and overlap."""
    if isinstance(text, float):
        text = str(text)
    chunks = []
    start_idx = 0
    end_idx = min(chunk_size, len(text))
    while start_idx < len(text):
        chunks.append(text[start_idx:end_idx])
        start_idx += chunk_size - overlap
        end_idx = min(start_idx + chunk_size, len(text))
    return chunks

def chunk_documents(df, text_column='content', chunk_size=500, overlap=50):
    """Chunk the documents in a DataFrame."""
    df['chunks'] = df[text_column].apply(lambda x: chunk_document(x, chunk_size, overlap))
    return df

# Chunk the documents in the DataFrame
chunk_size = 500  # Set your desired chunk size
overlap = 50  # Set the overlap size
chunked_df = chunk_documents(cleaned_df, text_column='clean_text_lemma', chunk_size=chunk_size, overlap=overlap)

# Display the first few rows of the chunked DataFrame
print(chunked_df.head())

   SubtitleID_index  subtitle_id  \
0                 5      9180607   
1                 6      9180608   
2                 9      9180694   
3                10      9180695   
4                11      9180696   

                                                name  \
0                            the.myth.(2005).eng.1cd   
1                    the.great.beauty.(2013).eng.1cd   
2  rudrabinar.obhishaap.s02.e03.anandagarher.akhh...   
3  rudrabinar.obhishaap.s02.e04.udara.(2022).eng.1cd   
4  rudrabinar.obhishaap.s02.e05.saat.surer.mejaj....   

                                             content  \
0   general the princesss convoy has entered our ...   
1   apiopensubtitlesorg is deprecated please impl...   
2   so youre assuming that my grandma mumtaz  is ...   
3   you know that naads have less patience  who s...   
4   use the free code joinnow at wwwplayshipseu  ...   

                                    clean_text_lemma  text_length_lemma  \
0  general the princess convoy ha 

In [None]:
def chunk_document(text, chunk_size=500, overlap=50):
    """Chunk a document into smaller parts with specified size and overlap."""
    if isinstance(text, float):
        text = str(text)
    chunks = []
    start_idx = 0
    end_idx = min(chunk_size, len(text))
    while start_idx < len(text):
        chunks.append(text[start_idx:end_idx])
        start_idx += chunk_size - overlap
        end_idx = min(start_idx + chunk_size, len(text))
    return chunks

def chunk_documents(df, text_column='content', chunk_size=500, overlap=50):
    """Chunk the documents in a DataFrame."""
    df['chunks'] = df[text_column].apply(lambda x: chunk_document(x, chunk_size, overlap))
    return df

# Chunk the documents in the DataFrame
chunk_size = 500  # Set your desired chunk size
overlap = 50  # Set the overlap size
chunked_df = chunk_documents(cleaned_df, text_column='clean_text_lemma', chunk_size=chunk_size, overlap=overlap)

# Display the first few rows of the chunked DataFrame
print(chunked_df.head())


   SubtitleID_index  subtitle_id  \
0                 5      9180607   
1                 6      9180608   
2                 9      9180694   
3                10      9180695   
4                11      9180696   

                                                name  \
0                            the.myth.(2005).eng.1cd   
1                    the.great.beauty.(2013).eng.1cd   
2  rudrabinar.obhishaap.s02.e03.anandagarher.akhh...   
3  rudrabinar.obhishaap.s02.e04.udara.(2022).eng.1cd   
4  rudrabinar.obhishaap.s02.e05.saat.surer.mejaj....   

                                             content  \
0   general the princesss convoy has entered our ...   
1   apiopensubtitlesorg is deprecated please impl...   
2   so youre assuming that my grandma mumtaz  is ...   
3   you know that naads have less patience  who s...   
4   use the free code joinnow at wwwplayshipseu  ...   

                                    clean_text_lemma  text_length_lemma  \
0  general the princess convoy ha 

In [None]:
chunked_df.head()

Unnamed: 0,SubtitleID_index,subtitle_id,name,content,clean_text_lemma,text_length_lemma,final_embeddings,chunks
0,10,9180695,rudrabinar.obhishaap.s02.e04.udara.(2022).eng.1cd,you know that naads have less patience who s...,you know that naads have le patience who smugg...,1359,"[-0.08960023522377014, 0.04438420385122299, 0....",[you know that naads have le patience who smug...
1,18,9181571,trying.s02.e02.the.sun.on.your.back.(2021).eng...,nikki sighs i dont even know why were doing ...,nikki sigh i dont even know why were doing thi...,3651,"[-0.05432497709989548, 0.16868433356285095, 0....",[nikki sigh i dont even know why were doing th...
2,20,9181573,trying.s02.e05.maddest.sweetest.thing.(2021).e...,jason bit of wildlife yes have you fastforwa...,jason bit of wildlife yes have you fastforward...,3611,"[0.1584937870502472, 0.2894313633441925, 0.541...",[jason bit of wildlife yes have you fastforwar...
3,23,9181722,aurelia.steiner.(vancouver).(1979).eng.1cd,support us and become vip member to remove al...,support u and become vip member to remove all ...,2813,"[-0.19465576112270355, -0.06905709207057953, 0...",[support u and become vip member to remove all...
4,26,9181818,physical.s02.e08.dont.you.run.and.hide.(2022)....,sheila ieyes forward face calm you deserve to...,sheila ieyes forward face calm you deserve to ...,3816,"[-0.11413861066102982, 0.1996752917766571, 0.4...",[sheila ieyes forward face calm you deserve to...


In [None]:
chunked_df.shape

(20663, 8)