In [None]:
# Required imports
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Load the dataset
# Reads the subtitles parquet file into `df`; the chunking cell further down
# passes `df` to chunk_documents_langchain.
# NOTE: `path` is reassigned to a directory string by the save cell at the end.
path = r'/content/drive/MyDrive/search_engine/files/eng_subtitles.parquet'
df = pd.read_parquet(path)

In [None]:
# Preview the first two rows to check which columns were loaded.
df.head(2)

Unnamed: 0,num,name,content
0,9251120,maybe.this.time.(2014).eng.1cd,Watch any video online with OpenSUBTITLES Free...
1,9211589,down.the.shore.s01.e10.and.justice.for.all.(19...,Oh I know that its getting late but I dont wan...


In [None]:
def chunk_documents_langchain(df, chunk_size=512, chunk_overlap=100, text_splitter=None):
    """Chunks documents using Langchain and returns a new DataFrame with metadata.

    Parameters
    ----------
    df : pandas.DataFrame
        Source documents. Must provide the columns ``"content"`` (text to
        split), ``"num"`` and ``"name"``.
    chunk_size : int, default 512
        Forwarded to ``RecursiveCharacterTextSplitter`` as ``chunk_size``.
    chunk_overlap : int, default 100
        Forwarded to ``RecursiveCharacterTextSplitter`` as ``chunk_overlap``.
    text_splitter : object, optional
        Any object exposing ``split_text(text)`` that returns a list of
        strings. When given it is used as-is and ``chunk_size`` /
        ``chunk_overlap`` are ignored; when omitted a
        ``RecursiveCharacterTextSplitter`` is built from those two values.

    Returns
    -------
    pandas.DataFrame
        One row per chunk with two columns: ``"chunk"`` (the chunk text) and
        ``"metadata"`` (a dict with ``original_index``, ``subtitle_id`` and
        ``subtitle_name`` of the row the chunk came from). Each row owns its
        own metadata dict.

    Raises
    ------
    KeyError
        If ``df`` lacks any of the ``"content"``, ``"num"`` or ``"name"``
        columns.
    """
    if text_splitter is None:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )

    chunks = []
    metadatas = []
    # Walk only the three columns that are needed; this avoids materialising a
    # Series for every row the way iterrows() does.
    for index, subtitle_id, subtitle_name, content in zip(
        df.index, df["num"], df["name"], df["content"]
    ):
        # Rows with missing content (None / NaN) have nothing to split and
        # would otherwise fail inside the splitter.
        if not isinstance(content, str):
            continue

        doc_chunks = text_splitter.split_text(content)
        chunks.extend(doc_chunks)
        # Build a fresh dict per chunk: `[meta] * n` would store the same
        # object n times, so editing one chunk's metadata would edit them all.
        metadatas.extend(
            {
                "original_index": index,
                "subtitle_id": subtitle_id,
                "subtitle_name": subtitle_name,
            }
            for _ in doc_chunks
        )

    chunk_df = pd.DataFrame({"chunk": chunks, "metadata": metadatas})
    return chunk_df

In [None]:
# Example usage: chunk every row of `df` with the default chunk_size and
# chunk_overlap (`df` must have 'content', 'num' and 'name' columns):
chunked_df = chunk_documents_langchain(df)

# Print the first two rows of the resulting DataFrame:
print(chunked_df.head(2))

                                               chunk  \
0  Watch any video online with OpenSUBTITLES Free...   
1  on Keep dancing Whatever Im kidding Dont get m...   

                                            metadata  
0  {'original_index': 0, 'subtitle_id': 9251120, ...  
1  {'original_index': 0, 'subtitle_id': 9251120, ...  


In [None]:
# Display the first rows of the chunked DataFrame (one row per chunk).
chunked_df.head()

Unnamed: 0,chunk,metadata
0,Watch any video online with OpenSUBTITLES Free...,"{'original_index': 0, 'subtitle_id': 9251120, ..."
1,on Keep dancing Whatever Im kidding Dont get m...,"{'original_index': 0, 'subtitle_id': 9251120, ..."
2,And you Douche Handsome Conceited Just like yo...,"{'original_index': 0, 'subtitle_id': 9251120, ..."
3,that so Yes How long will this program run If ...,"{'original_index': 0, 'subtitle_id': 9251120, ..."
4,her Gramps Uncle Erning Aunt Elma this is Tep ...,"{'original_index': 0, 'subtitle_id': 9251120, ..."


In [None]:
# Now save the chunked dataset
path = r'/content/drive/MyDrive/search_engine/files/'
output_file = f"{path}subtitles_extracted.parquet"
try:
    chunked_df.to_parquet(output_file)
except Exception as e:
    # Say which file could not be written, then re-raise so the cell fails
    # visibly instead of letting "Run All" continue as if the save had worked.
    print(f"Failed to save the dataset to {output_file}: {e}")
    raise
else:
    print("Successfully saved the dataset.")

Successfully saved the dataset.
