In [None]:
import xml.etree.ElementTree as ET
import os

def parse_tmx(file):
    """Yield (en-US, zh-CN) segment pairs from a TMX file.

    Args:
        file: Path or file object accepted by ``xml.etree.ElementTree.parse``.

    Yields:
        Tuples ``(english_text, chinese_text)``, both stripped of surrounding
        whitespace, for every ``<tu>`` that has usable ``en-US`` and ``zh-CN``
        variants.

    Raises:
        ValueError: If the document has no ``<body>`` element directly under
            the root. Because this is a generator, the error surfaces on the
            first iteration, not at call time.
    """
    # Parse the XML document.
    tree = ET.parse(file)
    root = tree.getroot()

    # Locate the <body> section of the TMX document.
    body = root.find("body")
    if body is None:
        raise ValueError("Invalid TMX file: missing <body> section.")

    # "xml:lang" is exposed by ElementTree under the XML namespace URI.
    lang_key = "{http://www.w3.org/XML/1998/namespace}lang"

    # Walk every translation unit (<tu>).
    for tu in body.findall("tu"):
        translations = {}
        # Collect the text of each language variant (<tuv>) in this unit.
        for tuv in tu.findall("tuv"):
            lang = tuv.attrib.get(lang_key)
            seg = tuv.find("seg")
            # A <tuv> without a <seg> child makes find() return None; skip it
            # instead of crashing on ``seg.text``.
            if seg is None:
                continue
            if lang and seg.text is not None:
                translations[lang] = seg.text.strip()

        # Only emit a pair when both source and target languages are present.
        if "en-US" in translations and "zh-CN" in translations:
            yield translations["en-US"], translations["zh-CN"]


def list_tmx_files(folder_path, initials=("A", "B", "C")):
    """Yield paths of ``.tmx`` files found under ``folder_path``.

    The folder is walked recursively with ``os.walk``. A file is yielded when
    its name ends with ``.tmx`` (case-sensitive) and its first character,
    upper-cased, is one of ``initials``.

    Args:
        folder_path: Directory to search, including all sub-directories.
        initials: Iterable of single characters; matching is case-insensitive.
            Defaults to A, B and C. Pass ``None`` to disable the first-letter
            filter and yield every ``.tmx`` file.

    Yields:
        The path of each matching file, built as ``os.path.join(root, name)``.
    """
    allowed = None if initials is None else {ch.upper() for ch in initials}

    # Walk the folder and all of its sub-folders.
    for root, _, files in os.walk(folder_path):
        for file in files:
            # Only .tmx files are of interest.
            if not file.endswith('.tmx'):
                continue
            if allowed is not None and file[0].upper() not in allowed:
                continue
            yield os.path.join(root, file)

def write_lines_to_txt(file_path, lines):
    """Write each string in ``lines`` to ``file_path``, one per line.

    The file is opened in write mode as UTF-8, so existing content is
    replaced. A newline is appended after every item.
    """
    with open(file_path, 'w', encoding='utf-8') as out:
        # Terminate every entry with a newline as it is streamed out.
        out.writelines(entry + '\n' for entry in lines)



In [None]:
# Export the English side of every selected TMX file to "<name>_en.txt".
folder_path = "path/to/your/tmx/files"
for tmx_file in list_tmx_files(folder_path):
    # Keep only the English text of each (en, zh) pair.
    tmp = [en for en, _ in parse_tmx(tmx_file)]
    if not tmp:
        # No usable pairs in this file: do not create an output file.
        continue
    # The first segment is dropped before writing.
    # NOTE(review): presumably a header/title entry — confirm this is intended.
    tmp.pop(0)
    # Derive the base name with os.path so it also works where os.path.join
    # produced backslash-separated paths (splitting on '/' would not).
    base_name = os.path.splitext(os.path.basename(tmx_file))[0]
    write_lines_to_txt(
        os.path.join("path/to/your/output/folder/", base_name + "_en.txt"),
        tmp,
    )
    