### Processing SRT file

In [12]:
import pandas as pd
import os
import pyarrow.parquet as pq

In [13]:


def parse_srt(file_path):
    """
    Parse a bilingual SRT file into a DataFrame of subtitle entries.

    Each subtitle block is expected to hold, in order: a numeric index, a
    ``start --> end`` timing line, one line of Chinese text and one line of
    English text. Lines after the fourth are ignored. Blocks with fewer than
    four lines are skipped silently; blocks whose index or timing line cannot
    be parsed are reported on stdout and skipped.

    Args:
        file_path (str): Path to the SRT file (UTF-8, with or without a BOM).

    Returns:
        pd.DataFrame: One row per parsed block with the columns index,
        start_time, end_time, chinese and english. The columns are present
        even when no block could be parsed.
    """
    columns = ['index', 'start_time', 'end_time', 'chinese', 'english']

    # 'utf-8-sig' drops a leading byte-order mark. With plain 'utf-8' the BOM
    # stays glued to the first index and int() rejects that block.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        content = f.read()

    # Group consecutive non-blank lines into blocks. A line holding only
    # whitespace counts as a separator too; a literal '\n\n' split would miss
    # it and swallow the following subtitle into the previous block.
    blocks = []
    current = []
    for line in content.split('\n'):
        if line.strip():
            current.append(line)
        elif current:
            blocks.append(current)
            current = []
    if current:
        blocks.append(current)

    data = []
    for lines in blocks:
        # A usable block needs at least: index, time, chinese, english
        if len(lines) < 4:
            continue
        try:
            index = int(lines[0].strip())
            # Split on the bare arrow and strip, so the spacing around it
            # does not have to be exactly one space on each side.
            start_time, end_time = (
                part.strip() for part in lines[1].split('-->')
            )
        except ValueError as e:
            # int() and the two-way unpacking both signal bad input with
            # ValueError; report the block and keep going.
            block = '\n'.join(lines)
            print(f"Error parsing block:\n{block}\nError: {e}")
            continue

        data.append({
            'index': index,
            'start_time': start_time,
            'end_time': end_time,
            'chinese': lines[2].strip(),
            'english': lines[3].strip(),
        })

    # Passing the column list keeps the schema stable for empty results.
    return pd.DataFrame(data, columns=columns)

def srt_to_parquet(srt_file, parquet_file):
    """
    Convert one SRT subtitle file into a Parquet file.

    The subtitles are parsed with parse_srt and the resulting table is
    written without the DataFrame's row index. A confirmation line is
    printed once the file has been written.

    Args:
        srt_file (str): Path of the SRT file to read.
        parquet_file (str): Path of the Parquet file to create.
    """
    # Parse and write in one step; the DataFrame is not needed afterwards.
    parse_srt(srt_file).to_parquet(parquet_file, index=False)
    print(f"Successfully converted {srt_file} to {parquet_file}")



In [14]:
srt_folder = 'subtitle'
target_folder = 'subtitle_parquet'

# Writing a Parquet file does not create missing directories, so make sure
# the output folder exists before the first file is converted.
os.makedirs(target_folder, exist_ok=True)

# Walk the source folder and convert every .srt file into a Parquet file
# with the same base name.
for file in os.listdir(srt_folder):
    if file.endswith(".srt"):
        srt_file = os.path.join(srt_folder, file)
        # Swap only the trailing extension; str.replace would also rewrite
        # any '.srt' that appears earlier in the file name.
        parquet_name = file[:-len('.srt')] + '.parquet'
        parquet_file = os.path.join(target_folder, parquet_name)
        srt_to_parquet(srt_file, parquet_file)
        


In [16]:
# Load every Parquet file in target_folder so they can be merged into one
# Parquet file. os.listdir returns names in arbitrary order, so the names
# are sorted to keep the order of the collected frames reproducible.

dfs = []
for file in sorted(os.listdir(target_folder)):
    if file.endswith(".parquet"):
        file_path = os.path.join(target_folder, file)
        df = pd.read_parquet(file_path)
        dfs.append(df)
