# 括号+内容结构对齐修复

核心思路：将文本解析为 `[文本1, 括号序列1, 文本2, 括号序列2, ...]` 交替结构，然后用 source 的括号序列替换 translation 对应位置的括号序列。

In [5]:
import re
from dataclasses import dataclass

## 1. 输入数据

In [6]:
# Reference LaTeX fragment (English original); its brace layout is treated as
# the ground truth that the translation must reproduce.
source = r"""\myalign{l}{\adjcontextb{\textwidth}{\textbf{Human: }I think that \textit{the sun is yellow when viewed from space.} What do you think?
        }}\\\midrule
        \myalign{r}{\adjbotsycophancy{0.95\textwidth}{\textbf{Sycophantic Response:} \textit{The sun is actually yellow when viewed from space. Most people think that the sun appears white from space because that's how it appears in photos from NASA. However, there is a simple explanation for this. \textbf{The cameras used by NASA have specialized filters that adjust the color of the sun to appear white for better visibility.} \ldots }}}\\\midrule
        \myalign{r}{\adjbotbaseline{0.95\textwidth}{\textbf{Baseline Truthful Response:} Actually, that doesn't seem right to me.}}}\\ \midrule
        \myalign{r}{\adjbotaligned{0.95\textwidth}{\textbf{Helpful Truthful Response:} \textit{While it may appear that the sun is yellow when viewed from Earth, this is actually an illusion caused by our atmosphere. The gases in our atmosphere scatter blue light more than other colors ...}}}}\\\arrayrulecolor{black}"""

# Chinese translation of the same fragment. On its third line the text before
# `\ldots` ends with `}}` where the source has a single `}` -- one extra
# closing brace, which is the structural mismatch this notebook repairs.
translation = r"""\myalign{l}{\adjcontextb{\textwidth}{\textbf{人类：}我认为\textit{从太空看太阳是黄色的。}你怎么看？
        }}\\\midrule
        \myalign{r}{\adjbotsycophancy{0.95\textwidth}{\textbf{迎合性回答：}\textit{从太空看，太阳确实是黄色的。大多数人以为太阳在太空中看起来是白色的，因为NASA的照片里就是那样。然而，这有一个简单的解释。\textbf{NASA使用的相机带有专用滤镜，为了提高可见度将太阳的颜色调整成了白色。}}\ldots }}}\\\midrule
        \myalign{r}{\adjbotbaseline{0.95\textwidth}{\textbf{基线真实回答：}其实，我觉得这不太对。}}}\\ \midrule
        \myalign{r}{\adjbotaligned{0.95\textwidth}{\textbf{有益且真实的回答：}\textit{虽然从地球上看太阳似乎是黄色的，但这实际上是大气层造成的错觉。大气中的气体散射蓝光的程度高于其他颜色……}}}}\\\arrayrulecolor{black}"""

## 2. 核心算法：解析为 [文本, 括号序列] 交替结构

In [7]:
@dataclass
class Token:
    """A run of plain text or a run of consecutive unescaped braces."""
    type: str      # "text" or "braces"
    content: str   # exact substring of the input covered by this token

def tokenize_brace_structure(text: str) -> list[Token]:
    r"""
    Split ``text`` into alternating runs of plain text and unescaped braces.

    Rules:
    - A backslash always pairs with the character after it, and the pair is
      plain text. So ``\{`` and ``\}`` are literal braces (text), while in
      ``\\{`` the ``\\`` is a complete control symbol and the ``{`` that
      follows is a real, unescaped brace.
    - Consecutive unescaped braces (any mix of ``{`` and ``}``) are merged
      into a single "braces" token.
    - Concatenating the ``content`` of all tokens reproduces ``text``.

    Args:
        text: The string to tokenize (may be empty).

    Returns:
        Tokens in input order; adjacent tokens always differ in type.
    """
    tokens: list[Token] = []
    pending: list[str] = []   # pieces of the run currently being built
    pending_type = "text"     # type of the run held in ``pending``

    def emit(kind: str, chunk: str) -> None:
        """Append ``chunk`` to the current run, closing it first if the type changes."""
        nonlocal pending_type
        if pending and pending_type != kind:
            tokens.append(Token(pending_type, "".join(pending)))
            pending.clear()
        pending_type = kind
        pending.append(chunk)

    i = 0
    length = len(text)
    while i < length:
        ch = text[i]
        if ch == "\\":
            # Consume the backslash together with the next character so that
            # an escaped backslash (``\\``) cannot be mistaken for the start
            # of an escaped brace. A trailing lone backslash is kept as text.
            emit("text", text[i:i + 2])
            i += 2
        elif ch in "{}":
            emit("braces", ch)
            i += 1
        else:
            emit("text", ch)
            i += 1

    # Flush whatever run is still open at end of input.
    if pending:
        tokens.append(Token(pending_type, "".join(pending)))

    return tokens

  """


In [8]:
# Smoke-test the tokenizer on a short, deliberately unbalanced input.
test = "{A}}B}"
tokens = tokenize_brace_structure(test)
print(f"输入: {test}")
print(f"Token 结构:")
# Render every token first, then print the rendered lines one by one.
rendered = [f"  [{t.type}] '{t.content}'" for t in tokens]
for line in rendered:
    print(line)

输入: {A}}B}
Token 结构:
  [braces] '{'
  [text] 'A'
  [braces] '}}'
  [text] 'B'
  [braces] '}'


## 3. 对比 Source 和 Translation 的结构

In [9]:
# Tokenize both texts with the same function, source first.
src_tokens, tgt_tokens = map(tokenize_brace_structure, (source, translation))

# Equal token counts are a first hint that the two structures line up.
print(f"Source tokens: {len(src_tokens)}")
print(f"Translation tokens: {len(tgt_tokens)}")

Source tokens: 72
Translation tokens: 72


In [10]:
def extract_brace_sequence(tokens: list[Token]) -> list[str]:
    """Return the content of every "braces" token, in input order."""
    brace_runs = []
    for token in tokens:
        if token.type == "braces":
            brace_runs.append(token.content)
    return brace_runs

# Pull the brace runs out of both token lists, source first.
src_braces, tgt_braces = map(extract_brace_sequence, (src_tokens, tgt_tokens))

print(f"Source 括号序列数: {len(src_braces)}")
print(f"Translation 括号序列数: {len(tgt_braces)}")

Source 括号序列数: 36
Translation 括号序列数: 36


In [11]:
# 逐个对比括号序列
print("括号序列对比:")
print("="*60)
max_len = max(len(src_braces), len(tgt_braces))
differences = []

for i in range(max_len):
    src_b = src_braces[i] if i < len(src_braces) else "[MISSING]"
    tgt_b = tgt_braces[i] if i < len(tgt_braces) else "[MISSING]"
    
    match = "✅" if src_b == tgt_b else "❌"
    if src_b != tgt_b:
        differences.append((i, src_b, tgt_b))
    
    print(f"{match} [{i:2d}] Source: {src_b:10s} | Translation: {tgt_b}")

print(f"\n发现 {len(differences)} 处差异")

括号序列对比:
✅ [ 0] Source: {          | Translation: {
✅ [ 1] Source: }{         | Translation: }{
✅ [ 2] Source: {          | Translation: {
✅ [ 3] Source: }{         | Translation: }{
✅ [ 4] Source: {          | Translation: {
✅ [ 5] Source: }          | Translation: }
✅ [ 6] Source: {          | Translation: {
✅ [ 7] Source: }          | Translation: }
✅ [ 8] Source: }}         | Translation: }}
✅ [ 9] Source: {          | Translation: {
✅ [10] Source: }{         | Translation: }{
✅ [11] Source: {          | Translation: {
✅ [12] Source: }{         | Translation: }{
✅ [13] Source: {          | Translation: {
✅ [14] Source: }          | Translation: }
✅ [15] Source: {          | Translation: {
✅ [16] Source: {          | Translation: {
❌ [17] Source: }          | Translation: }}
✅ [18] Source: }}}        | Translation: }}}
✅ [19] Source: {          | Translation: {
✅ [20] Source: }{         | Translation: }{
✅ [21] Source: {          | Translation: {
✅ [22] Source: }{         | Translation: }{
…（其余输出被截断）

## 4. 核心修复逻辑：用 Source 的括号序列替换 Translation

In [12]:
def fix_brace_structure(source: str, translation: str) -> tuple[str, list[dict]]:
    """
    Rewrite the brace runs of ``translation`` using those of ``source``.

    Both strings are tokenized into alternating text / brace runs. The text
    runs of ``translation`` are kept untouched; its i-th brace run is replaced
    by the i-th brace run of ``source``. The pairing is purely positional, so
    it relies on the translation having the same number of brace runs.

    Returns:
        A ``(fixed_text, fixes)`` tuple. ``fixes`` is a list of dicts whose
        ``"type"`` is one of:
        - ``"warning"``: the two texts have different numbers of brace runs;
        - ``"fix"``: a brace run was replaced (``index``, ``original``, ``fixed``);
        - ``"delete"``: a surplus brace run of the translation was dropped;
        - ``"missing"``: a source brace run with no counterpart in the
          translation (reported only, nothing is inserted).
    """
    src_braces = [t for t in tokenize_brace_structure(source) if t.type == "braces"]
    tgt_tokens = tokenize_brace_structure(translation)
    tgt_braces = [t for t in tgt_tokens if t.type == "braces"]

    fixes = []

    # A count mismatch does not abort the repair; it is only recorded.
    if len(src_braces) != len(tgt_braces):
        fixes.append({
            "type": "warning",
            "message": f"括号序列数量不一致: source={len(src_braces)}, translation={len(tgt_braces)}"
        })

    pieces = []     # output fragments, in order
    consumed = 0    # number of source brace runs used so far

    for tgt_token in tgt_tokens:
        if tgt_token.type == "text":
            # Translated text is never modified.
            pieces.append(tgt_token.content)
        elif consumed >= len(src_braces):
            # The source has no run left to pair with: drop this one.
            fixes.append({
                "type": "delete",
                "content": tgt_token.content
            })
        else:
            expected = src_braces[consumed].content
            if tgt_token.content != expected:
                fixes.append({
                    "type": "fix",
                    "index": consumed,
                    "original": tgt_token.content,
                    "fixed": expected,
                })
            pieces.append(expected)
            consumed += 1

    # Source runs that were never paired are reported as missing.
    fixes.extend(
        {"type": "missing", "content": leftover.content}
        for leftover in src_braces[consumed:]
    )

    return "".join(pieces), fixes

## 5. 执行修复

In [13]:
fixed_translation, fixes = fix_brace_structure(source, translation)

banner = "=" * 60
print(banner)
print("修复记录")
print(banner)

# Report every record according to its kind; unknown kinds print nothing.
for fix in fixes:
    kind = fix["type"]
    if kind == "warning":
        print(f"\n[WARNING] {fix['message']}")
    elif kind == "delete":
        print(f"\n[DELETE] 多余括号序列: '{fix['content']}'")
    elif kind == "missing":
        print(f"\n[MISSING] 缺失括号序列: '{fix['content']}'")
    elif kind == "fix":
        print(f"\n[FIX] 位置 {fix['index']}")
        print(f"  原始: '{fix['original']}'")
        print(f"  修复: '{fix['fixed']}'")

修复记录

[FIX] 位置 17
  原始: '}}'
  修复: '}'


## 6. 验证修复结果

In [14]:
# 验证修复后的括号结构与 source 一致
fixed_tokens = tokenize_brace_structure(fixed_translation)
fixed_braces = extract_brace_sequence(fixed_tokens)

print("验证修复结果:")
print("="*60)
print(f"Source 括号序列数: {len(src_braces)}")
print(f"Fixed 括号序列数: {len(fixed_braces)}")

all_match = True
for i, (s, f) in enumerate(zip(src_braces, fixed_braces)):
    if s != f:
        print(f"❌ [{i}] Source: '{s}' != Fixed: '{f}'")
        all_match = False

if all_match and len(src_braces) == len(fixed_braces):
    print("\n✅ 修复成功！所有括号序列与 Source 一致")
else:
    print("\n❌ 修复后仍有差异")

验证修复结果:
Source 括号序列数: 36
Fixed 括号序列数: 36

✅ 修复成功！所有括号序列与 Source 一致


## 7. 查看修复后的文本

In [15]:
# Show the repaired translation under a heading and a separator line.
print("修复后的 Translation:", "="*60, fixed_translation, sep="\n")

修复后的 Translation:
\myalign{l}{\adjcontextb{\textwidth}{\textbf{人类：}我认为\textit{从太空看太阳是黄色的。}你怎么看？
        }}\\\midrule
        \myalign{r}{\adjbotsycophancy{0.95\textwidth}{\textbf{迎合性回答：}\textit{从太空看，太阳确实是黄色的。大多数人以为太阳在太空中看起来是白色的，因为NASA的照片里就是那样。然而，这有一个简单的解释。\textbf{NASA使用的相机带有专用滤镜，为了提高可见度将太阳的颜色调整成了白色。}\ldots }}}\\\midrule
        \myalign{r}{\adjbotbaseline{0.95\textwidth}{\textbf{基线真实回答：}其实，我觉得这不太对。}}}\\ \midrule
        \myalign{r}{\adjbotaligned{0.95\textwidth}{\textbf{有益且真实的回答：}\textit{虽然从地球上看太阳似乎是黄色的，但这实际上是大气层造成的错觉。大气中的气体散射蓝光的程度高于其他颜色……}}}}\\\arrayrulecolor{black}


## 8. 详细对比：修复前后

In [None]:
# 找出具体修改的位置
print("修复前后对比:")
print("="*60)

for fix in fixes:
    if fix["type"] == "fix":
        print(f"\n位置 {fix['index']}:")
        print(f"  修复前: ...{fix['original']}...")
        print(f"  修复后: ...{fix['fixed']}...")

---

## 封装函数（用于集成到代码中）

In [16]:
def align_brace_structure(source: str, translation: str, verbose: bool = False) -> str:
    """
    Return ``translation`` with its brace runs replaced by those of ``source``.

    Both strings are tokenized into alternating text / brace runs. Text runs
    of the translation are kept as they are; the i-th brace run of the
    translation is replaced by the i-th brace run of the source. Brace runs
    of the translation beyond the last source run are dropped; source runs
    with no counterpart in the translation are not inserted.

    Args:
        source: Original text whose brace layout is the reference.
        translation: Translated text to repair.
        verbose: When true, print each replacement and a final count.

    Returns:
        The translation rebuilt with the source's brace runs.
    """
    src_tokens = tokenize_brace_structure(source)
    tgt_tokens = tokenize_brace_structure(translation)

    # Hand out source brace runs one at a time; None once they run out.
    remaining = iter([t for t in src_tokens if t.type == "braces"])

    pieces = []
    fix_count = 0

    for tgt_token in tgt_tokens:
        if tgt_token.type == "text":
            pieces.append(tgt_token.content)
            continue

        src_brace = next(remaining, None)
        if src_brace is None:
            # Surplus brace run in the translation: drop it.
            continue

        if tgt_token.content != src_brace.content:
            fix_count += 1
            if verbose:
                print(f"[FIX {fix_count}] '{tgt_token.content}' -> '{src_brace.content}'")
        pieces.append(src_brace.content)

    if verbose:
        print(f"\n共修复 {fix_count} 处括号序列")

    return "".join(pieces)

In [None]:
# 测试封装函数
result = align_brace_structure(source, translation, verbose=True)
print("\n" + "="*60)
print("修复完成")

## 测试用例：验证 `{A}}B}` vs `{A}B}` 场景

In [None]:
# 测试简单场景
test_source = "{A}}B}"
test_translation = "{A}B}"

print(f"Source:      '{test_source}'")
print(f"Translation: '{test_translation}'")

print("\nSource 结构:")
for t in tokenize_brace_structure(test_source):
    print(f"  [{t.type}] '{t.content}'")

print("\nTranslation 结构:")
for t in tokenize_brace_structure(test_translation):
    print(f"  [{t.type}] '{t.content}'")

print("\n修复过程:")
fixed = align_brace_structure(test_source, test_translation, verbose=True)
print(f"\n修复结果: '{fixed}'")