In [184]:
import csv
from sqlite3 import enable_callback_tracebacks

import stanza
from spacy.training.example import Alignment

from transformers import BertTokenizer

# Load a Chinese BERT tokenizer (you can use other models like RoBERTa or ERNIE)
ch_tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

class SentencePair:
    def __init__(self, en, zh):
        self.en = en
        self.zh = zh

# Initialize Stanza for Chinese word segmentation
stanza.download('zh')  # Ensure the Chinese model is downloaded
nlp = stanza.Pipeline('zh', processors='tokenize')

rows = []
def load_parallel_sentences(file_path):
    sentence_pairs = []
    with open(file_path, mode='r', encoding='utf-8') as file:
        reader = csv.reader(file, delimiter=';')
        # Loop through each row in the CSV file
        for row in reader:
            if len(row) < 7:
                continue
            english_content = row[5]  # Adjust based on the correct column index
            chinese_content = row[6]  # Adjust based on the correct column index

            # Split the content into sentences based on '@' delimiters
            english_sentences = english_content.split('@')
            chinese_sentences = chinese_content.split('@')
            
            for eng_sent, chi_sent in zip(english_sentences, chinese_sentences):
                # Process the Chinese sentence with Stanza
                doc = nlp(chi_sent)  # Apply the Stanza pipeline
                ch_tokens = [word.text for sentence in doc.sentences for word in sentence.words]
                            
                # Join the words with spaces
                ch_sentence = " ".join(ch_tokens)
                
                sentence_pairs.append(SentencePair(eng_sent.strip(), ch_sentence.strip()))  
            
    
    return sentence_pairs


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 13.6MB/s]                    
2024-11-24 20:45:29 INFO: Downloaded file to /Users/vnnsnnt/stanza_resources/resources.json
2024-11-24 20:45:29 INFO: "zh" is an alias for "zh-hans"
2024-11-24 20:45:29 INFO: Downloading default packages for language: zh-hans (Simplified_Chinese) ...
2024-11-24 20:45:30 INFO: File exists: /Users/vnnsnnt/stanza_resources/zh-hans/default.zip
2024-11-24 20:45:33 INFO: Finished downloading models and saved to /Users/vnnsnnt/stanza_resources
2024-11-24 20:45:33 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 18.1MB/s]                    
2024-11-24 20:45:33 INFO: Downloaded file to /Users/v

In [185]:
import os

all_pairs = []
directory_path = 'FTIE'

# Loop through each file in the directory
for filename in os.listdir(directory_path):
    file_path = os.path.join(directory_path, filename)
    
    # Check if the path is a file and ends with .csv
    if os.path.isfile(file_path) and filename.endswith('.csv'):
        # Load pairs from the current file and extend the all_pairs list
        file_pairs = load_parallel_sentences(file_path)
        all_pairs.extend(file_pairs)


In [190]:
print(all_pairs[40].en)
print(all_pairs[40].zh)

Since the dawn of time, technology has constantly outpaced the law – but never quite as fast as in the digital age. Some of the most profitable and popular places on the internet today – from MySpace, to YouTube and beyond – exist in legal limbo. Every day, MySpacers and YouTubers commit millions of arguably unlawful acts: for when they are not posting riveting cellphone footage of the baby or the cat or the milk train passing, they are lip-syncing to the copyrighted songs of hot young starlets, or posting top-selling albums for free. Much of what the <em><i>digerati</i></em> do online is probably illegal: since the dawn of internet time, it was ever thus.
自古 以来 ， 科技 的 发展 速度 一直 快 于 法律 ， 但 从来 没有 像 数码 时代 这样 差距 悬殊 。 如今 ， 互联 网上 赢利 最高 、 人气 最旺 的 一些 场所 — — 从 My Space 到 You Tube 到 其它 各色 网站 ， 都 生存 在 法律 的 空白 地带 。 MySpace 和 You Tube 用户 每天 的 “ 非法 行为 ” ， 可以 说 数 以 百万 计 ： 除 了 在 网上 发布 用 手机 拍摄 的 有关 婴儿 、 猫咪 或 送奶 卡车 的 视频 之外 ， 他们 还 经常 对 口型 假唱 当红 新 星 们 拥有 版权 的 歌曲 ， 或是 免费 发布 一些 畅销 专辑 。 这些 数码 玩家 的 许多 网络 行为 ，

In [9]:
from transformers import MBartForConditionalGeneration, MBartTokenizer

# Load mBART model and tokenizer
model_name = "facebook/mbart-large-50-many-to-many-mmt"
model = MBartForConditionalGeneration.from_pretrained(model_name)
tokenizer = MBartTokenizer.from_pretrained(model_name)

# Set the source and target languages
tokenizer.src_lang = "zh_CN"  # Use 'zh_CN' for Simplified Chinese
tokenizer.tgt_lang = "en_XX"  # 'en_XX' for English

# Sample Chinese sentence
with open("unmodified_zh-en-translated_sentences.txt", "w", encoding="utf-8") as f:
    print("Translating", len(all_pairs), "sentences")
    for index, pair in enumerate(all_pairs):
        chinese_sentence = pair.zh  # source
        english_sentence = pair.en  # target
        
        # Tokenize the input text
        inputs = tokenizer(chinese_sentence, return_tensors="pt")
        # Generate translation
        translated_tokens = model.generate(**inputs)
        # Decode the translated tokens
        translated_sentence = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
        
        # Save the result in the text file
        f.write(f"{chinese_sentence}; {english_sentence}; {translated_sentence}\n")
        
        if index % 100 == 0: 
            print("Done with", index, "/", len(all_pairs))

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBart50Tokenizer'. 
The class this function is called from is 'MBartTokenizer'.


Translating 255860 sentences
Done with 0 / 255860
Done with 100 / 255860


KeyboardInterrupt: 

In [29]:
import re

# Define the file path
file_path = 'alignments.txt'

# Initialize a list to store (en, zh) pairs
alignment_pairs = []

class AlignmentPair: 
    def __init__(self, en, zh):
        self.en = en
        self.zh = zh
        
# Open and process the file
with open(file_path, 'r', encoding='utf-8') as file:
    i = 0
    for line in file:
    
        # Split the line into segments by spcae
        segments = line.strip().split(' ')
      
        for segment in segments:
            # Split the segment by '<sep>'
            tokens = segment.split('<sep>')
            clean_en = re.sub(r'[^\w\s]', '', tokens[0]).strip()
            clean_zh = re.sub(r'[^\w\s]', '', tokens[1]).strip()
            alignment_pairs.append(AlignmentPair(clean_en, clean_zh))
        

In [32]:
suitable_alignment_pairs = []
english_pattern = re.compile(r'^[A-Za-z0-9\s.,?!\'"-]+$')  # Matches English letters, digits, spaces, and common punctuation
chinese_pattern = re.compile(r'^[\u4e00-\u9fff]+$')         # Matches Chinese characters (CJK Unified Ideographs)

for pair in alignment_pairs:
    if pair.en == pair.zh:
        continue
    if pair.en == "":
        continue
    if pair.zh == "":
        continue

    # Check if pair.en is all English and pair.zh is all Chinese
    if not english_pattern.match(pair.en.strip()):  # Ensure `en` is all English
        continue
    if not chinese_pattern.match(pair.zh.strip()):  # Ensure `zh` is all Chinese
        continue

    suitable_alignment_pairs.append(pair)

In [35]:
for suitable_alignment_pair in suitable_alignment_pairs[0:100]:
    print(suitable_alignment_pair.en, suitable_alignment_pair.zh)

commodity 商品
trader 交易商
fighting 应对
allegations 指控
aggressive 出格
accounting 会计
accounting 操作
has 已
raised 出售
750m 亿美元
the 的
sale 出售
its 其
stake 股份
in 在
agricultural 农业
joint 合资企业
venture 合资企业
with 与
Chinas 中国
statebacked 国有
statebacked 合资企业
grain 粮食
trader 贸易商
The 的
Hong 香港
Kongbased 香港
Kongbased 位于
company 集团
has 近来
scrambling 忙于
to 忙于
raise 筹集资金
cash 筹集资金
to 以免
avoid 以免
losing 失去
its 其
investment 投资
grade 级
credit 信用
rating 评级
which 这种
crucial 至关重要
to 对于
the 的
profitability 盈利
profitability 能力
of 对于
its 其
core 核心
business 业务
moving 从事
moving 贸易
millions 大量
tonnes 大量
of 的
raw 原材料
materials 原材料
around 世界各地
world 世界各地
Noble 来宝
which 的
listed 上市
in 在
Singapore 新加坡
until 最迟在
early 明年初
next 明年初
year 明年初
by 面对
rating 评级
agencies 机构
and 和
raise 筹集
at 至少
least 至少
500m 亿美元
or 否则
being 被
cut 下调
to 至
junk 垃圾
status 级
To 在
his 的
many 诸多
achievements 成就
film 电影
producer 制片人
philanthropist 慈善家
serial 连环
sex 性
offender 侵者
can 可以
add 加上
failed 失败
lobbyist 游说
lobbyist 者
for 获得


In [203]:
import nltk
from nltk.util import ngrams
from nltk.corpus import stopwords
from string import punctuation
from collections import Counter

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Get the set of stopwords and punctuation
stop_words = set(stopwords.words('english')) 
stop_words.update(
    {'cent', 'href=', 'http', 'says', 'told', 'year', 'ago', 'yesterday', 'since', 'last', 'past', 'next',
     'said', 'almost', 'within', 'would', 'nearly', 'years', 'months', 'according', 'compared', 'go', 'also', 
     "n't"})  # Using update to add multiple elements

punctuation_set = set(punctuation)
punctuation_set.update({"’", "’", '”', "''", "“", "'s", '--', 'b', '/b', '/strong', '–', '—'})  # Adding custom punctuation symbols

def extract_ngram_counts(sentence_pairs, n=2):
    ngram_counts = Counter()

    # Iterate through each sentence pair
    for pair in sentence_pairs:
        # Tokenize the English sentence (p.en)
        tokens = nltk.word_tokenize(pair.en)

        # Filter out stopwords, punctuation, and numbers
        filtered_tokens = [token.lower() for token in tokens 
                           if token.lower() not in stop_words 
                           and token not in punctuation_set 
                           and not token.isdigit()]  # Exclude numeric tokens

        # Generate n-grams for the filtered tokens
        ngram_list = ngrams(filtered_tokens, n)

        # Count the frequency of each n-gram
        ngram_counts.update(ngram_list)

    return ngram_counts

# extract bigrams, trigrams, and 4-grams
bigram = extract_ngram_counts(all_pairs, 2)     # 10k
trigram = extract_ngram_counts(all_pairs, 3)    # 5k
quad_gram = extract_ngram_counts(all_pairs, 4)  # 3k
# pent_gram = extract_ngram_counts(all_pairs, 5)  # 1k

[nltk_data] Downloading package punkt to /Users/vnnsnnt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vnnsnnt/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [204]:
bigrams_to_consider = bigram.most_common()[0:5000]
trigrams_to_consider = trigram.most_common()[:1000]
quad_grams_to_consider = quad_gram.most_common()[:3000]

In [193]:
multi_word_term = '_'.join(quad_grams_to_consider[0][0])
print(multi_word_term)

per_gross_domestic_product


In [205]:
# Initialize a set for multi-word terms
multi_grams_to_consider = set()

# Add multi-word terms from quad_grams_to_consider
for quad_gram in quad_grams_to_consider:
    multi_word_term = '_'.join(quad_gram[0])
    multi_grams_to_consider.add(multi_word_term)

# Add multi-word terms from trigrams_to_consider
for trigram in trigrams_to_consider:
    multi_word_term = '_'.join(trigram[0])
    multi_grams_to_consider.add(multi_word_term)

# Add multi-word terms from bigrams_to_consider
for bigram in bigrams_to_consider:
    multi_word_term = '_'.join(bigram[0])
    multi_grams_to_consider.add(multi_word_term)

In [207]:
len(multi_grams_to_consider)

9000

In [208]:

def replace_with_multi_word_terms(sentence, multi_grams_to_consider):
    # Tokenize the sentence into words
    words = sentence.split(' ')
    
    # Prepare a list to hold the modified sentence
    modified_sentence = []
    i = 0
    while i < len(words):
        found = False
        
        # Check for quadgrams (4-word sequences)
        for length in range(4, 1, -1):  # Check for quadgram to bigram
            if i + length <= len(words):
                multi_word_candidate = '_'.join(words[i:i+length]).lower()
                if multi_word_candidate in multi_grams_to_consider:
                    # If a match is found, replace the words with the multi-word term
                    modified_sentence.append(multi_word_candidate)
                    i += length
                    found = True
                    break
        
        if not found:
            # If no match is found, just add the word as is
            modified_sentence.append(words[i])
            i += 1

    # Return the modified sentence as a string
    return ' '.join(modified_sentence)

In [209]:
# Assuming all_pairs contains your sentence pairs and you have the function replace_with_multi_word_terms defined
with open("zhen.src-tgt", "w") as f:
    for pair in all_pairs:
        # Modify the sentence using your function
        modified_sentence = replace_with_multi_word_terms(pair.en, multi_grams_to_consider)
        
        # Save the modified sentence along with the corresponding Chinese sentence
        f.write(f"{modified_sentence} ||| {pair.zh}\n")

print("Modified sentences saved to 'modified_sentences.txt'")

Modified sentences saved to 'modified_sentences.txt'


In [None]:
class TermAlignment:
    def __init__(self, en, zh):
        self.en = en
        self.zh = zh
        
# Function to clean text to retain only alphabets

def clean_text(text):
    return re.sub(r'[^a-zA-Z_]', '', text)

multiword_alignments = []
# Process the alignments.txt file
with open('alignments.txt', 'r', encoding='utf-8') as file:
    for line in file:
        alignment_pairs = line.strip().split(' ')
        for index, pair in enumerate(alignment_pairs):
            en_entry, zh_entry = pair.split('<sep>')[0], pair.split('<sep>')[1]
            if en_entry not in multi_grams_to_consider: continue
            # Clean the English entry
            cleaned_en_entry = clean_text(en_entry)

            # Append only if conditions are met
            if cleaned_en_entry:
                if multiword_alignments and multiword_alignments[len(multiword_alignments)-1].en == cleaned_en_entry:
                    if zh_entry not in multiword_alignments[len(multiword_alignments)-1].zh:
                        multiword_alignments[len(multiword_alignments)-1].zh += zh_entry
                else:
                    multiword_alignments.append(TermAlignment(cleaned_en_entry, zh_entry))

In [228]:
unique_alignments = set(TermAlignment(alignment.en, alignment.zh) for alignment in multiword_alignments)

In [231]:
from collections import defaultdict, Counter

# Step 1: Count frequencies of `zh` entries for each `en`
alignment_freq = defaultdict(Counter)

for alignment in unique_alignments:
    alignment_freq[alignment.en][alignment.zh] += 1

# Step 2: Select the most frequent `zh` entry for each `en`
simplified_alignments = []
for en, zh_counter in alignment_freq.items():
    most_frequent_zh = zh_counter.most_common(1)[0][0]  # Get the most frequent `zh`
    simplified_alignments.append(TermAlignment(en, most_frequent_zh))

# Step 3: Sort alphabetically by `en`
sorted_simplified_alignments = sorted(simplified_alignments, key=lambda alignment: alignment.en)

# Step 4: Write to file
with open('simplified-alignments.txt', 'w') as file:
    for alignment in sorted_simplified_alignments:
        file.write(f"{alignment.en} {alignment.zh}\n")