In [6]:
# functions and data structures for feature extraction
from collections import defaultdict
import xml.etree.ElementTree as ET
import os
from tqdm import tqdm
import json
import random

def tree():
    """Return an autovivifying nested mapping.

    Looking up a missing key creates another ``tree()`` under that key, so
    nested paths can be built just by subscripting.
    """
    nested = defaultdict(tree)
    return nested

def split_string(text):
    """Tokenize a bracketed parse string into parentheses and node labels.

    A label is the run of characters that directly follows an opening
    parenthesis, up to the next space or parenthesis. Everything between
    that space and the next parenthesis (the leaf word) is discarded.

    Args:
        text: Bracketed tree string, e.g. ``"(ROOT (NP (NN x)))"``.

    Returns:
        A tuple ``(tokens, joined)`` where ``tokens`` is the list of ``'('``,
        ``')'`` and label strings in order, and ``joined`` is those tokens
        concatenated without separators.
    """
    result = []
    label = ""
    in_label = False  # True only between a '(' and the end of its label

    def flush():
        # Emit the label collected so far, if any.
        nonlocal label
        if label:
            result.append(label)
            label = ""

    for char in text:
        if char == ' ':
            in_label = False
            flush()
        elif char == '(':
            # A label may be directly followed by a parenthesis (no space);
            # flush it here so it is not glued onto the next label.
            flush()
            in_label = True
            result.append(char)
        elif char == ')':
            flush()
            in_label = False
            result.append(char)
        elif in_label:
            label += char
    # A label that runs to the end of the input must not be lost.
    flush()
    return result, ''.join(result)

class Stack:
    """Minimal LIFO container backed by a Python list.

    ``pop`` and ``top`` never raise on an empty stack: they print
    ``"Stack is empty"`` and return ``None`` instead.
    """

    def __init__(self):
        # The end of the list is the top of the stack.
        self.items = []

    def is_empty(self):
        """Return True when the stack holds no items."""
        return len(self.items) == 0

    def push(self, item):
        """Place ``item`` on top of the stack."""
        self.items.append(item)

    def pop(self):
        """Remove and return the top item, or print a notice and return None."""
        if self.is_empty():
            print("Stack is empty")
            return None
        return self.items.pop()

    def top(self):
        """Return the top item without removing it, or print a notice and return None."""
        if self.is_empty():
            print("Stack is empty")
            return None
        return self.items[-1]

    def size(self):
        """Return the number of stored items."""
        return len(self.items)


def dicts(t):
    """Recursively copy a nested mapping into plain ``dict`` objects."""
    plain = {}
    for key in t:
        plain[key] = dicts(t[key])
    return plain

def build_feature_tree(text, MAXLAYER=3):
    """Build a depth-limited nested dict of node labels from a bracketed tree.

    The string is tokenized with ``split_string``. The outermost node is at
    depth 0; nodes deeper than ``MAXLAYER`` are dropped. When a label repeats
    among the children of one parent, later occurrences get a numeric suffix
    (``NP``, ``NP1``, ``NP2`` ...) so every sibling keeps its own entry.
    ``extract_constituency_feature_from_tree`` strips digits again.

    Args:
        text: Bracketed tree string whose outermost label is ``ROOT``.
        MAXLAYER: Deepest level (outermost node = 0) kept in the result.

    Returns:
        A plain nested ``dict`` holding the subtree stored under ``'ROOT'``.

    Raises:
        KeyError: If no top-level node is labelled ``'ROOT'``.
        IndexError: If the token list ends with a ``'('`` that has no label
            while still within ``MAXLAYER``.
    """
    tokens, _ = split_string(text)
    feature_tree = tree()
    stack = Stack()
    i = 0
    layer = -1
    while i < len(tokens):
        token = tokens[i]
        if token == '(':
            i += 1  # step onto the label that follows the parenthesis
            layer += 1
            if layer <= MAXLAYER:
                label = tokens[i]
                if stack.is_empty():
                    stack.push(feature_tree[label])
                else:
                    parent = stack.top()
                    # Deterministic, collision-free key for repeated sibling
                    # labels (a random suffix could hit an existing key and
                    # silently merge two siblings).
                    key = label
                    suffix = 1
                    while key in parent:
                        key = label + str(suffix)
                        suffix += 1
                    stack.push(parent[key])
            i += 1  # step past the label
        elif token == ')':
            # The depth-0 node is never popped, so it stays the parent of
            # everything that follows.
            if 0 < layer <= MAXLAYER:
                stack.pop()
            layer -= 1
            i += 1
        else:
            # Stray token that is neither parenthesis: skip it, otherwise the
            # loop would never advance.
            i += 1
    return dicts(feature_tree)['ROOT']

def extract_constituency_feature_from_tree(feature_tree):
    """Collect one feature string per non-empty subtree of ``feature_tree``.

    The tree is walked with a LIFO work list. Every key whose value is a
    non-empty dict contributes ``str({key: value})`` with all digit
    characters removed.

    Args:
        feature_tree: Nested dict of labels.

    Returns:
        List of feature strings in visiting order.
    """
    pending = [feature_tree]
    features = []
    while pending:
        node = pending.pop()
        for label, child in node.items():
            if not isinstance(child, dict) or not child:
                continue
            pending.append(child)
            rendered = str({label: child})
            # Strip digits
            features.append(''.join(ch for ch in rendered if not ch.isdigit()))
    return features

def extract_dependency_feature_from_list(dependency_list):
    """Render each dependency entry as ``"<upos0>/<upos2>/<item1>"``.

    Each entry must be indexable; elements 0 and 2 need a ``upos`` attribute
    and element 1 is stringified as-is.
    # presumably stanza (head, relation, dependent) triples; verify against caller

    Args:
        dependency_list: Iterable of dependency entries.

    Returns:
        List of feature strings, one per entry, in input order.
    """
    return [
        '/'.join((str(dep[0].upos), str(dep[2].upos), str(dep[1])))
        for dep in dependency_list
    ]

def parse_tmx(file, src_lang="en-US", tgt_lang="zh-CN"):
    """Yield (source, target) segment pairs from a TMX file.

    Because this is a generator, the file is parsed (and any error raised)
    only when iteration starts.

    Args:
        file: Path or file object accepted by ``ET.parse``.
        src_lang: ``xml:lang`` value of the source segments.
        tgt_lang: ``xml:lang`` value of the target segments.

    Yields:
        ``(source_text, target_text)`` for every ``<tu>`` that has a ``<tuv>``
        for both languages. Text is stripped of surrounding whitespace; an
        empty ``<seg/>`` yields an empty string. Only ``seg.text`` is read,
        i.e. the text before any child element inside ``<seg>``.

    Raises:
        ValueError: If the document has no ``<body>`` element.
    """
    # Parse the XML file (named ``document`` so the module-level ``tree()``
    # helper is not shadowed).
    document = ET.parse(file)
    root = document.getroot()

    # Locate the <body> section of the TMX file.
    body = root.find("body")
    if body is None:
        raise ValueError("Invalid TMX file: missing <body> section.")

    # ElementTree exposes the ``xml:lang`` attribute under this qualified name.
    xml_lang = "{http://www.w3.org/XML/1998/namespace}lang"

    # Walk every translation unit.
    for tu in body.findall("tu"):
        translations = {}
        # Collect the segment text of each language variant in this unit.
        for tuv in tu.findall("tuv"):
            lang = tuv.attrib.get(xml_lang)
            seg = tuv.find("seg")
            if lang and seg is not None:
                # seg.text is None for an empty <seg/>; treat that as "".
                translations[lang] = (seg.text or "").strip()

        # Emit a pair only when both languages are present.
        if src_lang in translations and tgt_lang in translations:
            yield translations[src_lang], translations[tgt_lang]

def list_txt_files(folder_path):
    """Return the paths of all ``.txt`` files under ``folder_path``.

    The folder is walked recursively with ``os.walk``; each path is the
    walked directory joined with the file name, in walk order.
    """
    return [
        os.path.join(dirpath, name)
        for dirpath, _subdirs, names in os.walk(folder_path)
        for name in names
        if name.endswith('.txt')  # keep only files ending in .txt
    ]

In [None]:
# stanza pipeline for constituency and dependency parsing
# `nlp` is the shared pipeline object built for language code 'zh'; later
# cells call it on one text line at a time and read
# `doc.sentences[0].constituency` from the result.
import stanza
nlp = stanza.Pipeline('zh')

In [3]:
# Feature string -> number of times it was extracted; incremented by the
# batch-processing cell and later dumped to feature_count.json.
feature_count_map = defaultdict(int)


In [4]:
# Feature string -> list of the input lines it was extracted from; appended
# to by the processing cells and later dumped to feature_sentences.json.
feature_sentences_map = defaultdict(list)

In [5]:
# Sanity check: show every .txt file found under ./instruct_txt.
print(list_txt_files("./instruct_txt"))

['./instruct_txt\\A01A_en_zh.txt', './instruct_txt\\A01B_en_zh.txt', './instruct_txt\\A02A_en_zh.txt', './instruct_txt\\A02B_en_zh.txt', './instruct_txt\\A02C_en_zh.txt', './instruct_txt\\A02D_en_zh.txt', './instruct_txt\\A03A_en_zh.txt', './instruct_txt\\A03B_en_zh.txt', './instruct_txt\\A04A_en_zh.txt', './instruct_txt\\A05A_en_zh.txt', './instruct_txt\\A07A_en_zh.txt', './instruct_txt\\A07B_en_zh.txt', './instruct_txt\\A08A_en_zh.txt', './instruct_txt\\A08B_en_zh.txt', './instruct_txt\\A09A_en_zh.txt', './instruct_txt\\A10A_en_zh.txt', './instruct_txt\\A11A_en_zh.txt', './instruct_txt\\A12A_en_zh.txt', './instruct_txt\\A13A_en_zh.txt', './instruct_txt\\A13B_en_zh.txt', './instruct_txt\\A14A_en_zh.txt', './instruct_txt\\A15B_en_zh.txt', './instruct_txt\\A16A_en_zh.txt', './instruct_txt\\A17A_en_zh.txt', './instruct_txt\\A18A_en_zh.txt', './instruct_txt\\A18B_en_zh.txt', './instruct_txt\\A18C_en_zh.txt', './instruct_txt\\A19A_en_zh.txt', './instruct_txt\\A19B_en_zh.txt', './instruct_t

In [None]:
# Debug run on a single file: for every line, log the raw line, its
# constituency parse, and the features extracted from the depth-2 and depth-3
# feature trees, then write the whole log to test.txt.
# NOTE: this cell also appends to feature_sentences_map, and so does the batch
# cell; running both in one kernel records these sentences twice.
ll = []
with open('./instruct_txt/A01A_en_zh.txt', "r", encoding="utf-8") as file:
    for line in file:
        ll.append(line)
        doc = nlp(line)
        # Only the first sentence of each line is parsed into features.
        m = str(doc.sentences[0].constituency)
        ll.append(m)
        for max_layer in (2, 3):
            result = build_feature_tree(m, max_layer)
            ll.append(f"layer {max_layer}:")
            ll.append(str(result))
            ff = extract_constituency_feature_from_tree(result)
            for item in ff:
                ll.append(item)
                feature_sentences_map[item].append(line)
        ll.append("----------------------------------------------------------------------")
# Write the log once after the loop instead of rewriting the whole file for
# every input line; the final file content is the same.
if ll:
    with open('test.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(ll))

In [7]:
# Batch feature extraction over every .txt file under ./instruct_txt.
# For each line: parse it, build the depth-2 and depth-3 feature trees, update
# the global count / sentence maps, and log everything to ./tmp/feature<i>.txt
# (one log per input file, numbered in listing order).
os.makedirs('./tmp', exist_ok=True)  # the log directory may not exist yet
i = 0
# tqdm takes the total from the length of the list, so no file count is
# hard-coded here.
for txt_file in tqdm(list_txt_files("./instruct_txt"), desc="Processing", unit="file"):
    ll = []
    with open(txt_file, "r", encoding="utf-8") as file:
        for line in file:
            ll.append(line)
            doc = nlp(line)
            # Only the first sentence of each line is parsed into features.
            m = str(doc.sentences[0].constituency)
            ll.append(m)
            for max_layer in (2, 3):
                result = build_feature_tree(m, max_layer)
                ff = extract_constituency_feature_from_tree(result)
                for item in ff:
                    ll.append(item)
                    feature_count_map[item] += 1
                    feature_sentences_map[item].append(line)
            ll.append("----------------------------------------------------------------------")
    # Write each file's log once, not once per line; an input file with no
    # lines still produces no log, as before.
    if ll:
        with open(f'./tmp/feature{i}.txt', 'w', encoding='utf-8') as f:
            f.write('\n'.join(ll))
    i += 1

Processing: 100%|██████████| 88/88 [11:02<00:00,  7.53s/file]


In [8]:
# Order the features by descending occurrence count and save them as JSON.
sorted_data = dict(sorted(feature_count_map.items(), key=lambda item: item[1], reverse=True))
with open('feature_count.json', 'w', encoding='utf-8') as f:
    # ensure_ascii=False writes non-ASCII characters unescaped.
    json.dump(sorted_data, f, ensure_ascii=False, indent=4)

In [9]:
# Order the features by how many lines were recorded for each (most first)
# and save the feature -> lines mapping as JSON.
sorted_sentences_data = dict(sorted(feature_sentences_map.items(), key=lambda item: len(item[1]), reverse=True))
with open('feature_sentences.json', 'w', encoding='utf-8') as f:
    # ensure_ascii=False writes non-ASCII characters unescaped.
    json.dump(sorted_sentences_data, f, ensure_ascii=False, indent=4)

In [12]:
# Worked example on a hand-pasted bracketed constituency tree: build the
# depth-2 feature tree and print each extracted feature on its own line.
# NOTE(review): the saved output under this cell (a token list and a dict)
# is not what this code prints; presumably left over from an earlier
# version of the cell. Re-run to refresh.
text_book = ['(ROOT (IP (IP (NP (NNP 平壤)) (VP (MD 可能) (VP (ADVP (RB 也)) (ADVP (RB 在)) (VP (VV 准备) (IP (VP (VV 试射) (NP (QP (CD 一) (CLP (NNB 枚))) (NP (NP (FW Taepo)) (FW Dong-2)) (NP (NN 导弹))))))))) (, ，) (IP (LCP (NP (NN 理论)) (IN 上)) (, ，) (NP (DP (DT 这) (CLP (NNB 种))) (NP (NN 导弹))) (VP (MD 可以) (VP (VP (VV 携带) (NP (NN 核弹) (SFN 头))) (, ，) (VP (VV 覆盖) (NP (DP (DT 整个)) (NP (NNP 美国)) (NP (NN 大陆))))))) (. 。)))']

result = build_feature_tree(text_book[0], 2)

ff = extract_constituency_feature_from_tree(result)
for item in ff:
    print(item)



['(', 'ROOT', '(', 'IP', '(', 'IP', '(', 'NP', '(', 'NNP', ')', ')', '(', 'VP', '(', 'MD', ')', '(', 'VP', '(', 'ADVP', '(', 'RB', ')', ')', '(', 'ADVP', '(', 'RB', ')', ')', '(', 'VP', '(', 'VV', ')', '(', 'IP', '(', 'VP', '(', 'VV', ')', '(', 'NP', '(', 'QP', '(', 'CD', ')', '(', 'CLP', '(', 'NNB', ')', ')', ')', '(', 'NP', '(', 'NP', '(', 'FW', ')', ')', '(', 'FW', ')', ')', '(', 'NP', '(', 'NN', ')', ')', ')', ')', ')', ')', ')', ')', ')', '(', ')', '(', 'IP', '(', 'LCP', '(', 'NP', '(', 'NN', ')', ')', '(', 'IN', ')', ')', '(', ')', '(', 'NP', '(', 'DP', '(', 'DT', ')', '(', 'CLP', '(', 'NNB', ')', ')', ')', '(', 'NP', '(', 'NN', ')', ')', ')', '(', 'VP', '(', 'MD', ')', '(', 'VP', '(', 'VP', '(', 'VV', ')', '(', 'NP', '(', 'NN', ')', '(', 'SFN', ')', ')', ')', '(', ')', '(', 'VP', '(', 'VV', ')', '(', 'NP', '(', 'DP', '(', 'DT', ')', ')', '(', 'NP', '(', 'NNP', ')', ')', '(', 'NP', '(', 'NN', ')', ')', ')', ')', ')', ')', ')', '(', '.', ')', ')', ')']
{"IP": {"IP": {}, ")": {}}}


In [51]:
# Show the dependency features of the most recently parsed sentence.
# In the cells above `m` holds a constituency *string*, which this function
# cannot consume (it reads `.upos` from elements 0 and 2 of every entry), so
# the dependency triples are taken explicitly from the parsed document.
# NOTE(review): the saved output came from an `m` defined in a cell that is
# no longer in the notebook; confirm which sentence was intended.
dependency_list = doc.sentences[0].dependencies
print(extract_dependency_feature_from_list(dependency_list))

['NOUN/NUM/nummod', 'VERB/NOUN/nsubj', 'VERB/ADV/mark', 'VERB/ADV/advmod', 'VERB/VERB/advcl', 'VERB/ADV/mark', 'None/VERB/root', 'VERB/AUX/aux', 'VERB/NOUN/nmod:tmod', 'VERB/VERB/xcomp', 'NOUN/NUM/nummod', 'VERB/NOUN/obj', 'VERB/PUNCT/punct']


In [56]:
# Print every aligned pair from one TMX file.
# parse_tmx yields (en-US text, zh-CN text), so unpack in that order; the
# printed output is unchanged, only the variable names now match their content.
for en, zh in parse_tmx('./Yiyan_tmx/A01A.tmx'):
    print(en, zh)

Secretary Clinton's Asia Trip: Allied Reassurance 希拉里克林顿的亚洲之行：放心吧，盟友
Coming only three weeks into the Obama Administration, Secretary of State Hillary Clinton's Asia trip will be long on signals and short on substance. 进入奥巴马内阁仅仅三周，国务卿希拉里克林顿的亚洲之行信号多于实质。
That is not necessarily a bad thing, especially when it sends several critically important messages to allies Japan and South Korea. 这并不是坏事，特别是访问向美国的盟友日本和韩国发出了几个极为重要的信息。
Her trip communicates that Asia matters to the United States and that Washington is committed to a predominant role in the region over the long-term. 她的访问告诉大家亚洲事务对于美国来说很重要，华盛顿致力于长期在这个区域内扮演一个有影响力的角色。
Traveling to Tokyo and Seoul prior to Beijing reflects the importance of our allies as well as a direct attempt to assuage fears of "Japan passing." 在访问北京之前访问东京和首尔反映出了我们的盟友的重要性以及想要缓解“日本已经过气了”的担心的直接尝试。
As a Senator, Hillary Clinton authored a Foreign Affairs article in which she stated the U.S.--China relationship was the most important relationship in Asia, rekindling Japanes