In [17]:
from pathlib import Path
import json
import os

In [18]:
# find the target segments that start close to a given time
def find_segments(lst: list, time: int, interval: int = 100) -> list:
    """Return the text of every event in ``lst`` that starts near ``time``.

    Args:
        lst: Subtitle events. Each event is a dict with a ``"tStartMs"`` start
            time and, normally, a ``"segs"`` list whose items hold their text
            under the ``"utf8"`` key.
        time: Reference start time, in the same unit as ``"tStartMs"``.
        interval: Largest absolute difference between an event's start time
            and ``time`` that still counts as a match (inclusive).

    Returns:
        The first segment's text of each matching event, in input order.
        Matching events that carry no text (missing or empty ``"segs"``, or a
        first segment without ``"utf8"``) are skipped instead of raising.
    """
    result = []
    for seg in lst:
        if abs(seg["tStartMs"] - time) > interval:
            continue
        # Some events carry no text segments; there is nothing to pair
        # with, so skip them instead of failing on the lookup.
        segs = seg.get("segs")
        if not segs:
            continue
        text = segs[0].get("utf8")
        if text is None:
            continue
        result.append(text)
    return result

def create_translation_pairs(source_path, target_path, overlap_time_window: int = 100) -> list:
    """Align two subtitle files by start time and return the matched pairs.

    Both files are JSON documents with a top-level ``"events"`` list. For each
    source event, the target events whose ``"tStartMs"`` lies within
    ``overlap_time_window`` of the source event's start are collected with
    ``find_segments``.

    Args:
        source_path: Path of the source subtitle file; its text is stored
            under the ``"th"`` key of each pair.
        target_path: Path of the target subtitle file; its matching texts are
            stored as a list under the ``"en"`` key of each pair.
        overlap_time_window: Largest start-time difference that still counts
            as a match, passed through to ``find_segments``.

    Returns:
        A list of ``{"en": [target texts], "th": source text}`` dicts, one per
        source event that has at least one matching target event.
    """
    # Context managers close the files promptly. JSON text is UTF-8, so the
    # encoding is stated explicitly rather than left to the platform default.
    with open(source_path, "r", encoding="utf-8") as source_file:
        source_events = json.load(source_file)["events"]
    with open(target_path, "r", encoding="utf-8") as target_file:
        target_events = json.load(target_file)["events"]

    translation_pairs = []
    for event in source_events:
        target_text = find_segments(target_events, event["tStartMs"], overlap_time_window)
        if target_text:
            translation_pairs.append({
                "en": target_text,
                "th": event["segs"][0]["utf8"],
            })
    return translation_pairs


In [19]:
# build the pairs for a single video as a quick sample
result = create_translation_pairs(
    source_path="data/th-en-subtitles/@Bearhugsk/D_PB8geeQRE/th.json3",
    target_path="data/th-en-subtitles/@Bearhugsk/D_PB8geeQRE/en.json3",
    overlap_time_window=500,
)
len(result)

224

In [20]:
# keep only the sentence pairs that matched more than one translation
multi_translation_num = 0
multi_translation = [pair for pair in result if len(pair["en"]) > 1]

multi_translation

[{'en': ['Intense flavors.',
   '- Really intense, but tasty.\n- Intense flavors of Ko Bua.'],
  'th': '- เข้มข้นมาก รสชาติที่นี่ร้านกอบัว \n- เข้มข้นมาก แต่อร่อยนะ'},
 {'en': ['[She keeps ordering non-stop.]',
   'Let’s try.\n[She keeps ordering non-stop.]'],
  'th': 'ลองดูเนอะ'}]

# Run on multiple files

In [33]:
# Each video lives in data/th-en-subtitles/<channel>/<video id>/.
subtitle_path = Path("data/th-en-subtitles")
# glob("*/*") also matches stray files and yields entries in no guaranteed
# order; keep directories only and sort so every run sees the same sequence.
video_ids = sorted(path for path in subtitle_path.glob("*/*") if path.is_dir())
len(video_ids)

528

In [31]:
# Build translation pairs for every video folder.
# If en.json3 is not available, the English track may exist as en-US etc.
en_substitutes = ["en-US", "en-GB"]
translation_pairs = []
for video_id in video_ids:
    th_file_path = video_id.joinpath("th.json3")
    # Prefer en.json3; otherwise take the first substitute that exists.
    en_candidates = [video_id.joinpath(f"{lang}.json3") for lang in ["en", *en_substitutes]]
    en_file_path = next((path for path in en_candidates if os.path.isfile(path)), None)
    # Without both subtitle files there is nothing to align, so skip the
    # video instead of aborting the whole run on a missing file.
    if en_file_path is None or not os.path.isfile(th_file_path):
        continue

    translation_pairs += create_translation_pairs(th_file_path, en_file_path, 300)

len(translation_pairs)

37592

In [36]:
# show the first ten collected pairs
translation_pairs[:10]

[{'en': ["We're left abandoned in the middle of the desert"],
  'th': 'เราถูกทิ้งกลางทะเลทราย'},
 {'en': ['sand,'], 'th': 'ทราย!!'},
 {'en': ['and SAND everywhere!'], 'th': 'แล้วก็ทรายรอบตัว'},
 {'en': ["It's blazing hot in the afternoon,"],
  'th': 'กลางวันโคตรร้อนถึง 50 องศา'},
 {'en': ['and extremely cold at night!'], 'th': 'กลางคืนก็หนาวสุด ๆ'},
 {'en': ["Let's see if we can survive this extreme survival in the desert challenge"],
  'th': 'มาดูกันว่าเราจะรอดพ้น'},
 {'en': ["Before we start, don't forget to subscribe to our channel!"],
  'th': 'และก่อนเริ่มอย่าลืมกดไลก์ กดติดตาม'},
 {'en': ['We gotta choose where to set up the camp'],
  'th': 'เราต้องเลือกว่าเราจะตั้งแคมป์ที่ไหน'},
 {'en': ['Yesterday when we play Last-to-Leave-the-Circle Challenge'],
  'th': 'เมื่อวานตอนที่เราแข่งอยู่ในกรอบ'},
 {'en': ['we were on top of the hill.'], 'th': 'เราอยู่ข้างบนเนิน'}]

In [37]:
# keep only the sentence pairs that matched more than one translation
multi_translation_num = 0
multi_translation = list(
    filter(lambda pair: len(pair["en"]) > 1, translation_pairs)
)

len(multi_translation)

35

In [39]:
# show the first ten pairs that have more than one "en" entry
multi_translation[:10]

[{'en': ['What?', 'I just saw something run downhill.'],
  'th': 'ผมเห็นมีตัวอะไรวิ่งไปเมื่อกี้เนี่ย'},
 {'en': ['(laughs)', 'What was that?'], 'th': 'อะไรวะ'},
 {'en': ['(laughs)', 'We found him!'], 'th': '- เจอแล้ว\n- ขาอ่อนเลย'},
 {'en': ['Ah. Right.',
   '- Or how to make it represent BEARHOUSE, how to make it beautiful,\n- Ah.'],
  'th': 'ให้มันเป็น BEARHOUSE ให้มันสวย'},
 {'en': ['Aha.',
   "Let's go to that area instead. From the farthest right to the farthest left area."],
  'th': 'เราไปมุมนู้นก่อนไหม'},
 {'en': ['(chuckles)', "I'm scared"], 'th': 'ไม่ กูกลัว'},
 {'en': ['Yeah.', "Yeah. Let's go there."], 'th': '- ไป\n- ไปศาลาก่อน ไปจูน'},
 {'en': ['Yes', 'So, there are spicy food, fresh vegetables'], 'th': 'ค่ะ'},
 {'en': ['(laughs)', 'What a copycat.'], 'th': 'ขี้ลอกว่ะ ทำก่อนเรา'},
 {'en': ['What?', 'Servo'], 'th': 'อะไรนะ'}]