### Processing SRT file

In [6]:
import pandas as pd
import os
import pyarrow.parquet as pq

In [11]:
import os
import re
import pandas as pd

def is_chinese(text):
    """
    Checks if the text contains any Chinese characters.

    Args:
        text (str): Text to check.

    Returns:
        bool: True if Chinese characters are found, False otherwise.
    """
    return bool(re.search(r'[\u4e00-\u9fff]', text))

def is_english(text):
    """
    Checks if the text contains predominantly English characters.

    Args:
        text (str): Text to check.

    Returns:
        bool: True if mostly English characters are found, False otherwise.
    """
    # This regex checks for the presence of English letters and common punctuation.
    return bool(re.fullmatch(r'[A-Za-z0-9\s.,!?:\'\"-]*', text))

def parse_srt(file_path):
    """
    Parses an SRT file and extracts subtitle information.

    Args:
        file_path (str): Path to the SRT file.

    Returns:
        pd.DataFrame: DataFrame containing chinese, and english columns.
    """
    try:
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            content = f.read()
    except UnicodeDecodeError as e:
        print(f"Encoding error in file {file_path}: {e}")
        return pd.DataFrame()  # Return empty DataFrame on error

    # Split the content into blocks separated by two newlines
    blocks = content.strip().split('\n\n')
    data = []

    for block in blocks:
        lines = block.strip().split('\n')

        # Ensure the block has at least 4 lines: index, time, chinese, english
        if len(lines) >= 4:
            try:
                chinese = lines[2].strip()
                english = lines[3].strip()
                
                # Language Checks
                chinese_valid = is_chinese(chinese)
                english_valid = is_english(english)
                
                
                # skip blocks where language checks fail
                if not (chinese_valid and english_valid):
                    print(f"Skipping block due to language mismatch:\n{block}")
                    continue
                
                data.append({
                    'chinese': chinese,
                    'english': english
                })
            except ValueError as ve:
                print(f"Value error parsing block:\n{block}\nError: {ve}")
            except Exception as e:
                print(f"Unexpected error parsing block:\n{block}\nError: {e}")
        else:
            print(f"Skipping incomplete block:\n{block}")

    return pd.DataFrame(data)

def srt_to_parquet(srt_file, parquet_file):
    """
    Converts an SRT file to a Parquet file with matching Chinese and English subtitles.

    Args:
        srt_file (str): Path to the input SRT file.
        parquet_file (str): Path to the output Parquet file.
    """
    df = parse_srt(srt_file)
    if not df.empty:
        try:
            df.to_parquet(parquet_file, index=False)
            print(f"Successfully converted {srt_file} to {parquet_file}")
        except Exception as e:
            print(f"Error saving Parquet file {parquet_file}: {e}")
    else:
        print(f"No data to save for file {srt_file}")


In [None]:
import os

srt_folder = 'subtitle_srt'
target_folder = 'subtitle_parquet'

# Create target folder if it doesn't exist
os.makedirs(target_folder, exist_ok=True)

# Process all .srt files in the srt_folder
for file in os.listdir(srt_folder):
    if file.endswith(".srt"):
        print(f"Processing {file}")
        srt_file = os.path.join(srt_folder, file)
        parquet_file = os.path.join(target_folder, file.replace('.srt', '.parquet'))
        try:
            srt_to_parquet(srt_file, parquet_file)
        except Exception as e:
            print(f"Error processing {srt_file}: {e}")



Skipping incomplete block:
1
00:01:31,120 --> 00:01:33,330
（回波探头）
Skipping incomplete block:
2
00:01:36,580 --> 00:01:39,200
（MU/TH/UR 9001苏醒中）
Skipping incomplete block:
3
00:01:41,790 --> 00:01:42,540
（2142年2月9日）
Skipping incomplete block:
4
00:01:42,620 --> 00:01:43,540
（位置：网罟座泽塔二星系统）
Skipping incomplete block:
5
00:01:43,790 --> 00:01:45,950
（任务目标 范围内）
Skipping incomplete block:
6
00:02:14,950 --> 00:02:18,120
（拦截倒数）
Skipping incomplete block:
7
00:02:33,000 --> 00:02:34,870
（威兰企业）
Skipping incomplete block:
8
00:02:48,410 --> 00:02:49,450
（目标已取得）
Skipping incomplete block:
9
00:02:49,700 --> 00:02:50,950
（返回程序已启动）
Skipping incomplete block:
10
00:04:36,660 --> 00:04:43,660
{\an8}《异形：罗穆路斯》
Skipping incomplete block:
11
00:04:36,660 --> 00:04:43,660
本WEB版外挂字幕由  风吹来的那片云  双语合并
Skipping block due to language mismatch:
15
00:05:29,080 --> 00:05:32,330
如果出现任何症状 如发烧 咳嗽
If you are experiencing any symptoms, such as fever, cough…
Skipping block due to language mismatch:
16
00:05:32,500 --> 

### Merge all parquet files in target_folder into one parquet file

In [13]:


dfs = []
for file in os.listdir(target_folder):
    if file.endswith(".parquet"):
        file_path = os.path.join(target_folder, file)
        df = pd.read_parquet(file_path)
        dfs.append(df)
        
merged_df = pd.concat(dfs, ignore_index=True)
merged_file = os.path.join('merged_dataSet.parquet')
merged_df.to_parquet(merged_file, index=False)


### Read the merged parquet file and get the first 5 rows

In [10]:
### read the merged parquet file and get the first 5 rows
df = pd.read_parquet(merged_file)
print(f"length of the dataset: {len(df)}")
df.head()



length of the dataset: 12842


Unnamed: 0,chinese,english
0,全体员工注意…,Attention all workers. Attention all workers.
1,日班将在15分钟后开始,Day shift starting in T-minus 15 minutes.
2,农民立刻至食堂报到,Farmers to report to mess hall immediately.
3,请至医疗区检查,proceed to the medical bay for examination.
4,殖民地安全与福祉是威兰的第一考量,The safety and well-being of our colony is Wey...
