In [1]:
!pip install boto3 beautifulsoup4



In [2]:
import xml.etree.ElementTree as ET

import boto3
from bs4 import BeautifulSoup, Comment, Doctype, NavigableString, Stylesheet

In [3]:
# Prompt template (written in Chinese) for the translation request.
# It contains two literal placeholder tokens, `{$TARGET_LANGUAGE}` and
# `{$HTML_CONTENT}`, which are filled in with plain `str.replace` calls
# (not `str.format`), so the braces and `$` signs are ordinary characters.
# The template asks the model to translate only the text inside the numbered
# <aN> tags and to answer inside a <translated_content> element.
PROMPT_BASE = """
你是一个专业的{$TARGET_LANGUAGE}翻译AI助手，现在需要你完成一项富文本翻译任务。这个任务涉及翻译HTML内容，要求你只翻译其中的自然语言文本，而保持代码和特殊格式不变。

请按照以下规则进行翻译：

1. 只翻译编号 <an> 中的自然语言文本，目标语言是：{$TARGET_LANGUAGE}
2. 不要翻译或修改任何变量名，例如 $variable #if #set 这种格式的变量名内容应保持原样。
3. 确保翻译后的文本语义准确，符合目标语言的语言习惯。

请将你的翻译结果放在 <translated_content> 标签内。

下面是一个简单的示例，展示了如何处理内容的翻译（假设目标语言是中文）：

输入：
<content>
<a0>Hello, $USERNAME! Welcome to our website.</a0>
</content>

输出：
<translated_content>
<a0>你好，$USERNAME！欢迎访问我们的网站。</a0>
</translated_content>

以下是需要翻译的内容：
{$HTML_CONTENT}

现在，请开始翻译上面提供的<content>内容。记住，只翻译自然语言文本，保持代码和特殊格式不变。
"""

# Language to translate into; substituted for `{$TARGET_LANGUAGE}` in the
# template above ("中文" means "Chinese").
TARGET_LANGUAGE = "中文"

In [4]:

def paring_readable_content(html_file, replace_dict=None):
    """Extract, or substitute, the readable text nodes of an HTML file.

    The file is parsed with BeautifulSoup and every translatable text node is
    numbered in document order with a key ``"a0"``, ``"a1"``, ... . Because the
    numbering depends only on the file, a second call on the same file yields
    the same keys, which is how translations are mapped back onto their nodes.

    Skipped nodes (they do not consume a key): comments, stylesheets, doctype
    declarations, whitespace-only strings, strings whose parent has the exact
    attribute ``style="display:none;"``, and strings whose parent is one of
    ``script``, ``style``, ``head``, ``title``, ``meta`` or the document root.

    Args:
        html_file: Path of the UTF-8 encoded HTML file to read.
        replace_dict: ``None`` (default) to extract text, or a mapping of
            ``"aN"`` keys to replacement strings to substitute text. Keys that
            are missing, or whose value is ``None``, leave the original text
            in place.

    Returns:
        In extraction mode, a dict mapping ``"aN"`` keys to the original text
        (plain ``str``). In substitution mode, the modified document rendered
        with ``soup.prettify()``; this holds even for an empty mapping, so the
        caller always gets HTML back when it asked for a substitution.
    """
    # Read and parse the original HTML file.
    with open(html_file, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file.read(), "html.parser")

    skipped_parents = {"script", "style", "head", "title", "meta", "[document]"}
    # The mode is decided by whether a mapping was supplied, not by whether it
    # is empty: an empty translation result must still produce HTML.
    extract_only = replace_dict is None

    ret_document = {}
    counter = 0

    # `string=True` is the current spelling of the deprecated `text=True`.
    for element in soup.find_all(string=True):
        if isinstance(element, (Comment, Stylesheet, Doctype)):
            # Not user-visible text.
            continue

        parent = element.parent
        if parent.get("style") == "display:none;":
            continue
        if not element.strip() or parent.name in skipped_parents:
            continue

        key = f"a{counter}"
        counter += 1

        replacement = None if extract_only else replace_dict.get(key)
        if replacement is None:
            # Store a plain str so the result does not keep the parse tree alive.
            ret_document[key] = str(element)
        else:
            element.replace_with(replacement)

    if extract_only:
        return ret_document
    # Render the document with the substituted text.
    return soup.prettify()

In [5]:
def translate_part(
    content,
    model_id="anthropic.claude-3-haiku-20240307-v1:0",
    region_name="us-west-2",
):
    """Translate one batch of text snippets through the Bedrock Converse API.

    The batch is serialised as ``<content><a0>...</a0>...</content>``, embedded
    in ``PROMPT_BASE`` and sent together with a prefilled assistant turn that
    opens the ``<translated_content>`` element the prompt asks for. The reply
    is parsed as XML and flattened back into a dict.

    Args:
        content: Mapping of element keys (used as XML tag names, e.g. ``"a0"``)
            to the text to translate.
        model_id: Bedrock model identifier passed to ``converse``.
        region_name: AWS region of the ``bedrock-runtime`` client.

    Returns:
        Dict mapping each child tag of the reply to its text (``None`` for an
        empty element).

    Raises:
        xml.etree.ElementTree.ParseError: If the model reply is not
            well-formed XML.
    """
    # Build <content><aN>text</aN>...</content>; ElementTree escapes &, < and >.
    root = ET.Element("content")
    for idx, text in content.items():
        ET.SubElement(root, str(idx)).text = text
    ET.indent(root, space="", level=0)
    # encoding="unicode" keeps non-ASCII characters literal; the default
    # (US-ASCII bytes) would turn them into &#NNNN; character references.
    xml_payload = ET.tostring(root, encoding="unicode")

    # Fill in the language first so that page text which happens to contain
    # the literal "{$TARGET_LANGUAGE}" token is not rewritten.
    prompts = PROMPT_BASE.replace("{$TARGET_LANGUAGE}", TARGET_LANGUAGE)
    prompts = prompts.replace("{$HTML_CONTENT}", xml_payload)

    # The prefill must open the same tag the prompt tells the model to use,
    # otherwise the opening and closing tags of the reply do not match.
    prefill = "<translated_content>"
    closing = "</translated_content>"

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "text": prompts,
                },
            ],
        },
        {
            "role": "assistant",
            "content": [
                {"text": prefill},
            ],
        },
    ]

    bedrock_runtime = boto3.client(
        service_name="bedrock-runtime",
        region_name=region_name,
    )
    response = bedrock_runtime.converse(
        messages=messages,
        inferenceConfig={
            "temperature": 0,
            "maxTokens": 4096,
        },
        modelId=model_id,
    )
    result = prefill + response["output"]["message"]["content"][0]["text"]

    # Drop anything the model wrote after the closing tag; trailing text would
    # make the reply invalid XML.
    end = result.find(closing)
    if end != -1:
        result = result[: end + len(closing)]

    parsed = ET.fromstring(result)
    return {child.tag: child.text for child in parsed}


def translate(html_content, max_length=2000):
    """Translate all extracted snippets, batching them by character count.

    Snippets are grouped in iteration order into batches whose combined length
    stays within ``max_length`` characters, and each batch is sent to
    ``translate_part``. A single snippet longer than ``max_length`` is sent in
    a batch of its own. Empty batches are never sent, so an empty input makes
    no model call at all.

    Args:
        html_content: Mapping of element keys to the text to translate.
        max_length: Maximum combined character count of one batch.

    Returns:
        Dict merging the results of every ``translate_part`` call.
    """
    translated = {}
    batch = {}
    batch_length = 0
    print(f"Total {len(html_content)} elements")

    for idx, text in html_content.items():
        # Flush *before* adding a snippet that would push the batch over the
        # budget, and only when there is something to flush.
        if batch and batch_length + len(text) > max_length:
            translated.update(translate_part(batch))
            batch = {}
            batch_length = 0
        batch[idx] = text
        # Count the snippet that opens the new batch as well.
        batch_length += len(text)

    # Send whatever is left over.
    if batch:
        translated.update(translate_part(batch))
    return translated


In [6]:
html_file = "aws.html"
output_file = html_file.replace(".html", "_translate.html")

# Pass 1: collect the translatable text nodes, keyed "a0", "a1", ...
html_content = paring_readable_content(html_file)
# Pass 2: translate the collected text in batches.
translated = translate(html_content)
# Pass 3: re-parse the same file and put the translations back in place.
translated_html = paring_readable_content(html_file, translated)

# Write explicitly as UTF-8 (the input is read as UTF-8 and the output holds
# translated, typically non-ASCII, text) and close the file deterministically.
with open(output_file, "w", encoding="utf-8") as output:
    output.write(translated_html)
print(f"Translated html saved in {output_file}")

  for element in soup.find_all(text=True):


translated html saved in aws_translate.html
