In [1]:
import csv
from lib2to3.btm_utils import tokens

import stanza
import os
import nltk
from awesome_align.run_train import train
from nltk.util import ngrams
from nltk.corpus import stopwords
from string import punctuation
import re 
from collections import defaultdict, Counter

# Download necessary resources
nltk.download('punkt')
nltk.download('stopwords')
stanza.download('zh') 
nlp = stanza.Pipeline('zh', processors='tokenize')

# Get the set of stopwords and punctuation
stop_words = set(stopwords.words('english')) 
stop_words.update(
    {'cent', 'href=', 'http', 'says', 'told', 'year', 'ago', 'yesterday', 'since', 'last', 'past', 'next',
     'said', 'almost', 'within', 'would', 'nearly', 'years', 'months', 'according', 'compared', 'go', 'also', 
     "n't"})  
punctuation_set = set(punctuation)
punctuation_set.update({"’", "’", '”', "''", "“", "'s", '--', 'b', '/b', '/strong', '–', '—'})

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /Users/vnnsnnt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vnnsnnt/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 75.1MB/s]                    
2024-12-01 05:11:11 INFO: Downloaded file to /Users/vnnsnnt/stanza_resources/resources.json
2024-12-01 05:11:11 INFO: "zh" is an alias for "zh-hans"
2024-12-01 05:11:11 INFO: Downloading default packages for language: zh-hans (Simplified_Chinese) ...
2024-12-01 05:11:12 INFO: File exists: /Users/vnnsnnt/stanza_resources/zh-hans/default.zip
2024-12-01 05:11:14 INFO: Finished downloading models and saved to /Users/vnnsnnt/stanza_resources
2024-12-01 05:11:14 INFO: Checking for updates to resources.json in case models have been updated. 

In [2]:
# Data Structures
class ParallelSentence: 
    def __init__(self, en, zh):
        self.en = en
        self.zh = zh
        
class AnchorWord:
    def __init__(self, en, zh):
        self.en = en
        self.zh = zh
        
class ParallelCorpus: 
    def __init__(self):
        self.parallel_sentences = []
        self.multi_grams_to_consider = []
        self.anchor_words = {}
        
    def load_parallel_sentences(self, data_source):
        parallel_sentences = []
        for file in os.listdir(data_source):
            file_path = os.path.join(data_source, file)
            with open(file_path, mode='r', encoding='utf-8') as data_file:
                reader = csv.reader(data_file, delimiter=';')
                for row in reader:
                    if len(row) < 7: continue   # escape bad data
                    english_content = row[5]    # get english sentences
                    chinese_content = row[6]    # get chinese sentences
        
                    # break apart sentence content by @ delimiter
                    english_sentences = english_content.split('@')  
                    chinese_sentences = chinese_content.split('@')
                    
                    for english_sentence, chinese_sentence in zip(english_sentences, chinese_sentences):
                        clean_english_sentence = english_sentence.strip()
                        
                        # Process the Chinese sentence with Stanza
                        # doc = nlp(chinese_sentence)  
                        # chinese_tokens = [word.text for sentence in doc.sentences for word in sentence.words]
                        # clean_chinese_sentence = " ".join(chinese_tokens)
                        
                        parallel_sentences.append(ParallelSentence(clean_english_sentence, chinese_sentence))
                        
        self.parallel_sentences = parallel_sentences
    
    def generate_multi_grams(self):
        bigrams = self.extract_ngram_counts(n=2).most_common()[:5000]
        trigrams = self.extract_ngram_counts(n=3).most_common()[:3000]
        quadgrams = self.extract_ngram_counts(n=4).most_common()[:1000]
        
        multi_grams_to_consider = set()
        # Add multi-word terms from quad_grams_to_consider
        for quad_gram in quadgrams:
            multi_word_term = '_'.join(quad_gram[0])
            multi_grams_to_consider.add(multi_word_term)
        
        # Add multi-word terms from trigrams_to_consider
        for trigram in trigrams:
            multi_word_term = '_'.join(trigram[0])
            multi_grams_to_consider.add(multi_word_term)
        
        # Add multi-word terms from bigrams_to_consider
        for bigram in bigrams:
            multi_word_term = '_'.join(bigram[0])
            multi_grams_to_consider.add(multi_word_term)
        
        self.multi_grams_to_consider = multi_grams_to_consider
        
    @staticmethod
    def refactor_sentence_with_multiword_term(sentence, multi_word_terms):
        # Tokenize the sentence into words
        words = sentence.split(' ')
        modified_sentence = []
        i = 0
        while i < len(words):
            found = False
            
            # Check for quadgrams (4-word sequences)
            for length in range(4, 1, -1):  # Check for quadgram to bigram
                if i + length <= len(words):
                    multi_word_candidate = '_'.join(words[i:i+length]).lower()
                    if multi_word_candidate in multi_word_terms:
                        # If a match is found, replace the words with the multi-word term
                        modified_sentence.append(multi_word_candidate)
                        i += length
                        found = True
                        break
            
            if not found:
                # If no match is found, just add the word as is
                modified_sentence.append(words[i])
                i += 1

        # Return the modified sentence as a string
        return ' '.join(modified_sentence)
    
    def extract_ngram_counts(self, n):
        ngram_counts = Counter()
        for parallel_sentence in self.parallel_sentences:
            tokens = nltk.word_tokenize(parallel_sentence.en)
            # Filter out stopwords, punctuation, and numbers
            filtered_tokens = [token.lower() for token in tokens 
                               if token.lower() not in stop_words 
                               and token not in punctuation_set 
                               and not token.isdigit()] 
    
            # Generate n-grams for the filtered tokens
            ngram_list = ngrams(filtered_tokens, n)
            # Count the frequency of each n-gram
            ngram_counts.update(ngram_list)
        return ngram_counts
    
    def format_parallel_sentences_for_awesome_align(self):
        with open("zhen.src-tgt", "w") as f:
            for parallel_sentence in self.parallel_sentences:
                modified_sentence = self.refactor_sentence_with_multiword_term(parallel_sentence.en, self.multi_grams_to_consider)
                f.write(f"{modified_sentence} ||| {parallel_sentence.zh}\n")
    
    def build_anchor_words_from_awesome_align_output(self, alignments_path):
        anchor_words = []
        with open(alignments_path, 'r', encoding='utf-8') as file:
            for line in file:
                alignment_pairs = line.strip().split(' ')
                for index, pair in enumerate(alignment_pairs):
                    en_entry, zh_entry = pair.split('<sep>')[0], pair.split('<sep>')[1]
                    if en_entry not in self.multi_grams_to_consider: continue
                    # Clean the English entry
                    cleaned_en_entry = re.sub(r'[^a-zA-Z_]', '', en_entry)
                    
                    # Append only if conditions are met
                    if cleaned_en_entry:
                        if anchor_words and anchor_words[len(anchor_words)-1].en == cleaned_en_entry:
                            if zh_entry not in anchor_words[len(anchor_words)-1].zh:
                                anchor_words[len(anchor_words)-1].zh += zh_entry
                        else:
                            anchor_words.append(AnchorWord(cleaned_en_entry, zh_entry))
                            
        unique_anchors = set(AnchorWord(anchor.en, anchor.zh) for anchor in anchor_words)
        
        # Step 1: Count frequencies of `zh` entries for each `en`
        anchor_freq = defaultdict(Counter)
    
        for anchor in unique_anchors:
            anchor_freq[anchor.en][anchor.zh] += 1
        
        # Step 2: Select the most frequent `zh` entry for each `en`
        filtered_alignments = []
        for en, zh_counter in anchor_freq.items():
            most_frequent_zh = zh_counter.most_common(1)[0][0]  # Get the most frequent `zh`
            filtered_alignments.append(AnchorWord(en, most_frequent_zh))
        
        # Step 3: Sort alphabetically by `en`
        sorted_filtered_anchors = sorted(filtered_alignments, key=lambda anchor: anchor.en)
        
        # Step 4: Write to file
        with open('possible-anchors.txt', 'w') as file:
            for alignment in sorted_filtered_anchors:
                file.write(f"{alignment.en} {alignment.zh}\n")
    
    def load_sorted_anchors(self, anchor_path):
        anchors = set()
        with open(anchor_path, 'r') as file:
            for line in file: 
                alignment = line.strip().split(' ')
                en = alignment[0].replace('_', ' ')
                zh = alignment[1] 
                anchors.add((en, zh))  # Store as a tuple for paired lookup
        self.anchor_words = anchors

In [3]:
parallel_corpus = ParallelCorpus()  # Initialize Corpus Object

In [4]:
parallel_corpus.load_parallel_sentences(data_source='./FTIE/')  # Load parallel sentences from data source

In [26]:
parallel_corpus.generate_multi_grams()  # Generate Multi grams e.g Asian Financial Crisis -> asian_financial_crisis

In [27]:
bigrams = parallel_corpus.extract_ngram_counts(n=2).most_common()[:5000]

In [30]:
trigrams = parallel_corpus.extract_ngram_counts(n=3).most_common()[:3000]

[(('gross', 'domestic', 'product'), 1713), (('international', 'monetary', 'fund'), 1222), (('president', 'donald', 'trump'), 1017), (('european', 'central', 'bank'), 928), (('initial', 'public', 'offering'), 814), (('president', 'xi', 'jinping'), 729), (('us', 'federal', 'reserve'), 671), (('president', 'barack', 'obama'), 664), (('people', 'bank', 'china'), 652), (('global', 'financial', 'crisis'), 637), (('per', 'gross', 'domestic'), 477), (('foreign', 'exchange', 'reserves'), 477), (('us', 'president', 'donald'), 460), (('south', 'china', 'sea'), 459), (('second', 'world', 'war'), 448), (('world', 'trade', 'organisation'), 448), (('us', 'treasury', 'secretary'), 430), (('kim', 'jong', 'un'), 377), (('people', 'familiar', 'matter'), 365), (('george', 'w.', 'bush'), 362), (('interview', 'financial', 'times'), 361), (('sovereign', 'wealth', 'funds'), 360), (('initial', 'public', 'offerings'), 358), (('sovereign', 'wealth', 'fund'), 341), (('china', 'central', 'bank'), 340), (('world', 

In [31]:
trigrams = parallel_corpus.extract_ngram_counts(n=3).most_common()[:3000]
print(trigrams)

[(('gross', 'domestic', 'product'), 1713),
 (('international', 'monetary', 'fund'), 1222),
 (('president', 'donald', 'trump'), 1017),
 (('european', 'central', 'bank'), 928),
 (('initial', 'public', 'offering'), 814),
 (('president', 'xi', 'jinping'), 729),
 (('us', 'federal', 'reserve'), 671),
 (('president', 'barack', 'obama'), 664),
 (('people', 'bank', 'china'), 652),
 (('global', 'financial', 'crisis'), 637),
 (('per', 'gross', 'domestic'), 477),
 (('foreign', 'exchange', 'reserves'), 477),
 (('us', 'president', 'donald'), 460),
 (('south', 'china', 'sea'), 459),
 (('second', 'world', 'war'), 448),
 (('world', 'trade', 'organisation'), 448),
 (('us', 'treasury', 'secretary'), 430),
 (('kim', 'jong', 'un'), 377),
 (('people', 'familiar', 'matter'), 365),
 (('george', 'w.', 'bush'), 362),
 (('interview', 'financial', 'times'), 361),
 (('sovereign', 'wealth', 'funds'), 360),
 (('initial', 'public', 'offerings'), 358),
 (('sovereign', 'wealth', 'fund'), 341),
 (('china', 'central', 'b

In [32]:
quadgrams = parallel_corpus.extract_ngram_counts(n=4).most_common()[:1000]

In [33]:
quadgrams = parallel_corpus.extract_ngram_counts(n=4).most_common()[:1000]
quadgrams

[(('per', 'gross', 'domestic', 'product'), 474),
 (('us', 'president', 'donald', 'trump'), 459),
 (('industrial', 'commercial', 'bank', 'china'), 263),
 (('hong', 'kong', 'stock', 'exchange'), 223),
 (('organisation', 'economic', 'co-operation', 'development'), 186),
 (('president', 'george', 'w.', 'bush'), 171),
 (('new', 'york', 'stock', 'exchange'), 170),
 (('chinese', 'president', 'xi', 'jinping'), 169),
 (('gross', 'domestic', 'product', 'growth'), 167),
 (('brazil', 'russia', 'india', 'china'), 150),
 (('national', 'development', 'reform', 'commission'), 150),
 (('china', 'securities', 'regulatory', 'commission'), 150),
 (('prime', 'minister', 'shinzo', 'abe'), 143),
 (('leader', 'kim', 'jong', 'un'), 132),
 (('angela', 'merkel', 'german', 'chancellor'), 127),
 (('world', 'economic', 'forum', 'davos'), 122),
 (('us', 'securities', 'exchange', 'commission'), 121),
 (('gross', 'domestic', 'product', 'per'), 119),
 (('barack', 'obama', 'us', 'president'), 118),
 (('official', 'xinhu

In [34]:
pentgrams = parallel_corpus.extract_ngram_counts(n=5).most_common()[:1000]

In [35]:
pentgrams = parallel_corpus.extract_ngram_counts(n=5).most_common()[:1000]
pentgrams

[(('hank', 'paulson', 'us', 'treasury', 'secretary'), 104),
 (('north', 'korean', 'leader', 'kim', 'jong'), 83),
 (('korean', 'leader', 'kim', 'jong', 'un'), 82),
 (('earnings', 'interest', 'tax', 'depreciation', 'amortisation'), 80),
 (('tim', 'geithner', 'us', 'treasury', 'secretary'), 73),
 (('managing', 'director', 'international', 'monetary', 'fund'), 56),
 (('robert', 'lighthizer', 'us', 'trade', 'representative'), 55),
 (('people', 'bank', 'china', 'central', 'bank'), 54),
 (('hong', 'kong', 'hang', 'seng', 'index'), 54),
 (('initial', 'public', 'offering', 'hong', 'kong'), 51),
 (('international', 'monetary', 'fund', 'world', 'bank'), 47),
 (('north', 'american', 'free', 'trade', 'agreement'), 46),
 (('steven', 'mnuchin', 'us', 'treasury', 'secretary'), 45),
 (('finance', 'ministers', 'central', 'bank', 'governors'), 44),
 (('ben', 'bernanke', 'federal', 'reserve', 'chairman'), 43),
 (('crown', 'prince', 'mohammed', 'bin', 'salman'), 41),
 (('shinzo', 'abe', 'japan', 'prime', '

In [None]:
parallel_corpus.format_parallel_sentences_for_awesome_align() # Format English Sentence With Multi Grams 
# Prepare a data source for awesome align 

# DATA_FILE=./zhen.src-tgt
# MODEL_NAME_OR_PATH=./model_without_co
# OUTPUT_FILE=./output.txt
# OUTPUT_WORDS=./alignments.txt
# OUTPUT_PROB=./alignments-prob.txt
# 
# CUDA_VISIBLE_DEVICES=0 awesome-align \
#     --output_file=$OUTPUT_FILE \
#     --model_name_or_path=$MODEL_NAME_OR_PATH \
#     --data_file=$DATA_FILE \
#     --extraction 'softmax' \
#     --batch_size 32 \
#     --num_workers 0 \
#     --output_word_file=$OUTPUT_WORDS \
#     --output_prob_file=$OUTPUT_PROB 

In [None]:
parallel_corpus.build_anchor_words_from_awesome_align_output('./alignments.txt')    # Generate Possible Anchor Words

In [None]:
parallel_corpus.load_sorted_anchors('./final_anchors.txt')  # Load Final and Verified Anchors

In [None]:
# # How would the original model translate these anchor words? 
# def translate_anchor_words(src_lang, tgt_lang, output_file):
#     # Set the source and target languages
#     tokenizer.src_lang = src_lang
#     tokenizer.tgt_lang = tgt_lang
#     forced_bos_token_id = tokenizer.lang_code_to_id[tgt_lang]  # Ensure the target language is correct
# 
#     # Translate and save results
#     with open(output_file, "w", encoding="utf-8") as f:
#         for index, pair in enumerate(parallel_corpus.anchor_words):
#             # Select source and target based on direction
#             source_anchor = pair.zh if src_lang == "zh_CN" else pair.en
#             target_anchor = pair.en if src_lang == "zh_CN" else pair.zh
# 
#             # Tokenize the input text
#             inputs = tokenizer(source_anchor, return_tensors="pt")
#             # Generate translation with forced BOS token for the target language
#             translated_tokens = model.generate(**inputs, forced_bos_token_id=forced_bos_token_id)
#             # Decode the translated tokens
#             translation = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
# 
#             # Save the result in the text file
#             f.write(f"{source_anchor}; {target_anchor}; {translation.lower()}\n")
# 
#             if index % 100 == 0:
#                 print(f"Done translating {index} / {len(parallel_corpus.anchor_words)}")
# 


In [None]:
# # Translate English to Chinese
# translate_anchor_words(
#     src_lang="en_XX",
#     tgt_lang="zh_CN",
#     output_file="unmodified_en-zh-translated_anchor_words.txt"
# )

In [None]:
# translate_anchor_words(
#     src_lang="zh_CN",
#     tgt_lang="en_XX",
#     output_file="unmodified_zh-en-translated_anchor_words.txt"
# )

In [10]:
import Levenshtein
anchor_count = 0
perfect_match_count = 0
matching_translations = 0
with open("unmodified_zh-en-translated_anchor_words.txt", "r", encoding="utf-8") as f:
    for line in f: 
        items = line.split(';')
        zh_anchor = items[0].strip()
        en_anchor = items[1].strip()
        translation = items[2].strip()

        if translation == en_anchor:
            perfect_match_count += 1
            matching_translations += 1
        anchor_count += 1

print("Unmodified Accuracy on Chinese Anchor Words (zh->en):", matching_translations / anchor_count)
print("Perfect Match Count:", perfect_match_count, "out of", anchor_count)

Unmodified Accuracy on Chinese Anchor Words (zh->en): 0.32424441524310116
Perfect Match Count: 987 out of 3044


In [None]:
# import Levenshtein
# anchor_count = len(parallel_corpus.anchor_words)
# perfect_match_count = 0
# matching_translations = 0
# with open("unmodified_en-zh-translated_anchor_words.txt", "r", encoding="utf-8") as f:
#     for line in f: 
#         items = line.split(';')
#         en_anchor = items[0].strip()
#         zh_anchor = items[1].strip()
#         translation = items[2].strip()
#             
#         if translation == zh_anchor:
#             perfect_match_count += 1
#             matching_translations += 1
# 
# print("Unmodified Accuracy on English Anchor Words (en->zh):", matching_translations / anchor_count)
# print("Perfect Match Count:", perfect_match_count, "out of", anchor_count)

In [None]:
anchor_words_dict = {en: zh for en, zh in parallel_corpus.anchor_words}

def refactor_sentence_with_anchors(en_sentence, chinese_sentence, anchor_words):
    # Tokenize the sentence into words
    words = en_sentence.split(' ')
    modified_sentence = []
    i = 0
    refactored_chinese_sentence = chinese_sentence.replace(' ', '')
    while i < len(words):
        found = False
        
        # Check for multi-word anchor terms in English
        for length in range(4, 1, -1):  # Check from 4 words (quadgram) to 2 words (bigram)
            if i + length <= len(words):
                multi_word_candidate = ' '.join(words[i:i+length]).lower()  # Make sure we match underscore-separated terms
                # Iterate over the anchor words and check the English part of the pair
                for en_term, zh_term in anchor_words:
                    if multi_word_candidate == en_term:
                        modified_sentence.append(f"<{multi_word_candidate.replace(' ', '_')}>")  # Replace with English term
                        i += length  # Skip the words that are part of the multi-word term
                        found = True
                        refactored_chinese_sentence = refactored_chinese_sentence.replace(anchor_words_dict[en_term], '<'+zh_term+'>')
                        
                        break
                if found:
                    break
        
        if not found:
            # If no multi-word term is found, just add the current word
            modified_sentence.append(words[i])
            i += 1
    
    # Return the modified sentence as a string
    return ' '.join(modified_sentence), refactored_chinese_sentence


# Example to refactor both English and Chinese sentences
refactored_parallel_sentences = []
for index, parallel_sentence in enumerate(parallel_corpus.parallel_sentences):
    # Refactor the English sentence with anchor words
    modified_english_sentence, modified_chinese_sentence = refactor_sentence_with_anchors(parallel_sentence.en, parallel_sentence.zh, parallel_corpus.anchor_words)

    # Append the refactored sentence pair to the list
    refactored_parallel_sentences.append(ParallelSentence(modified_english_sentence, modified_chinese_sentence))
    if index % 1000 == 0: 
        print("Done refactoring", index, "out of", len(parallel_corpus.parallel_sentences))

In [None]:
with open('refactored_parallel_sentences.txt', 'w', encoding='utf-8') as f:
    for pair in refactored_parallel_sentences:
        f.write(f"{pair.en} ; {pair.zh}\n")
        
print("Refactored sentences saved to 'refactored_parallel_sentences.txt'")

In [None]:
parallel_corpus = ParallelCorpus()
parallel_corpus.load_sorted_anchors('./final_anchors.txt')

In [None]:
import os
import csv

def save_translation_to_csv(ps, output_file):
    # Extract English and Chinese texts from the input object
    english_text = ps.en
    chinese_text = ps.zh

    # Create a list of dictionaries containing the data to be saved in the CSV file
    data = [{'zh': chinese_text, 'en': english_text}]

    # Define the CSV file headers
    fieldnames = ['zh', 'en']

    # Check if the CSV file already exists and is non-empty
    file_exists = os.path.isfile(output_file) and os.path.getsize(output_file) > 0

    # Write the data to the CSV file
    with open(output_file, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        # Write header only if the file is newly created or empty
        if not file_exists:
            writer.writeheader()

        # Write the data row
        writer.writerows(data)


In [None]:
for parallel_sentence in refactored_parallel_sentences:
    save_translation_to_csv(parallel_sentence, 'training_data.csv')

In [7]:
tokens_to_be_added = []
with open('tokens_to_be_added.txt', 'r', encoding='utf-8') as f:
    for line in f:
        tokens_to_be_added.append(line.strip())

In [8]:
tokens_to_be_added

['<german_chancellor_angela>',
 '<德国总理安格拉•默克尔>',
 '<foreign_assets>',
 '<外国资产>',
 '<st_petersburg>',
 '<圣彼得堡>',
 '<interbank_lending_rates>',
 '<间拆借利率>',
 '<exiled_tibetan_spiritual_leader>',
 '<负责“利益”者>',
 '<economic_outlook>',
 '<经济前景>',
 '<coking_coal>',
 '<炼焦煤>',
 '<us_securities_exchange>',
 '<美国《SecuritiesExchange>',
 '<operating_profits>',
 '<营业利润>',
 '<chinese_telecoms_company_huawei>',
 '<中国电信设备商(华Huawei)>',
 '<share_price>',
 '<股价>',
 '<purchasing_managers_indices>',
 '<采购经理指数>',
 '<international_trade>',
 '<国际贸易>',
 '<dry_bulk>',
 '<干散货>',
 '<business_model>',
 '<商业模式>',
 '<market_interest_rates>',
 '<市场利率>',
 '<north_korean_supreme_leader>',
 '<朝鲜最高人>',
 '<silicon_valley>',
 '<硅谷>',
 '<business_education>',
 '<商业教育>',
 '<potential_buyers>',
 '<潜在买家>',
 '<federal_open_market>',
 '<联邦公开市场>',
 '<banking_crisis>',
 '<银行危机>',
 '<cut_us_interest_rates>',
 '<减息美联>',
 '<us_politics>',
 '<美国政治>',
 '<crown_prince_mohammed_bin>',
 '<王储穆罕默德•本bin>',
 '<uk_prime_minister_david>',
 '<英国首相

In [14]:
from datasets import concatenate_datasets, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import torch

In [18]:
dataset = './training_data.csv'
dataframe = pd.read_csv(dataset)

subset_df = dataframe.sample(n=100000, random_state=42)
train_df, temp_df = train_test_split(subset_df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.33, random_state=42)


train_dataset = Dataset.from_pandas(train_df)
validation_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

import re

# Strict regex to match self-contained one-liner HTML tags like <a href=>
allowed_tags = ['a', 'html', 'p', 'div', 'img']
html_pattern = re.compile(
    r'<\s*({})\s*[^<>]*?>'.format('|'.join(allowed_tags))
)

# Assuming the English sentences are in a column named 'en'
html_count = train_df['en'].apply(lambda x: bool(html_pattern.search(str(x)))).sum()

print(f"Number of English sentences with valid self-contained tags: {html_count}")


Number of English sentences with valid self-contained tags: 6263


In [16]:
from transformers import MBartForConditionalGeneration, MBartTokenizer

# Load mBART model and tokenizer
model_name = "facebook/mbart-large-50-many-to-many-mmt"
model = MBartForConditionalGeneration.from_pretrained(model_name)
tokenizer = MBartTokenizer.from_pretrained(model_name)

tokenizer.src_lang = "zh_CN"  # Source language (Chinese)
tokenizer.tgt_lang = "en_XX"  # Target language (English)

# Add custom tokens and resize model embeddings
# tokenizer.add_tokens(tokens_to_be_added)
# model.resize_token_embeddings(len(tokenizer))

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBart50Tokenizer'. 
The class this function is called from is 'MBartTokenizer'.


In [13]:
output_file = "unmodified_model_test_translations.txt"

# Open the file in write mode
with open(output_file, "w", encoding="utf-8") as f:
    print("Starting test_dataset unmodified model translations total examples =", len(test_dataset))
    for idx, example in enumerate(test_dataset):
        test_sentence = example['zh']
        inputs = tokenizer(test_sentence, return_tensors="pt", padding="longest", truncation=True)

        # Generate translation
        model.eval()
        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=512,
            )

        translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        # Write translation to file
        f.write(translated_text + "\n")
        
        if idx % 1000 == 0:
            print("Done with", idx, "out of", len(test_dataset))


Starting test_dataset unmodified model translations total examples = 9900
Done with 0 out of 9900


KeyboardInterrupt: 

In [None]:
def convert_examples_to_features(example_batch):
    input_encodings = tokenizer(example_batch["zh"], max_length=512, padding="max_length",truncation=True)

    target_encodings = tokenizer(example_batch["en"], max_length=512, padding="max_length", truncation=True)

    return {"input_ids": input_encodings["input_ids"],
           "attention_mask": input_encodings["attention_mask"],
           "labels": target_encodings["input_ids"]}

train_dataset_tf = train_dataset_hf.map(convert_examples_to_features, batched=True, remove_columns=["zh","en"])
val_dataset_tf = validation_dataset_hf.map(convert_examples_to_features, batched=True, remove_columns=["zh","en"])

In [None]:
train_dataset_tf

In [None]:
from transformers import DataCollatorForSeq2Seq
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer,TrainingArguments, Trainer

training_args = Seq2SeqTrainingArguments(
    output_dir='mbartTrans',
    num_train_epochs=2,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    evaluation_strategy='steps',
    save_strategy='no',
    eval_steps=2000,
    logging_steps=1000,
    weight_decay=0.01,
    push_to_hub=False,
    fp16=False,
    learning_rate=2e-5,
    optim="adafactor",
    no_cuda=True  # Forces CPU training
)


In [None]:
import gc
gc.collect()

In [None]:
trainer = Seq2SeqTrainer(model=model, args=training_args, tokenizer=tokenizer,
                  data_collator=seq2seq_data_collator,
                  train_dataset=train_dataset_tf,
                        eval_dataset=val_dataset_tf)

trainer.train()

In [None]:
with open('cleaned_training_data.csv') as f: 
    line = f.readline()
    print(line)

In [18]:
# Input file names
base_file = "modified_model_test_translations.txt"
rl_file = "rl_model_test_translations.txt"
output_file = "final_rl_translations.txt"

# Process the files
with open(base_file, "r", encoding="utf-8") as base_f, open(rl_file, "r", encoding="utf-8") as rl_f:
    combined_translations = []
    
    # Process both files line by line
    for base_line, rl_line in zip(base_f, rl_f):
        # Split base and RL lines into sentence and translation
        base_sentence, base_translation = base_line.strip().split(" ; ", maxsplit=1)
        rl_sentence, rl_translation = rl_line.strip().split(";")

        
        # Use the RL translation unless it's empty
        final_translation = rl_translation.strip()
        if not final_translation:
            final_translation = base_translation.strip()
        
        # Append the combined line
        combined_translations.append(f"{rl_sentence} ; {final_translation}")

# Write to the output file
with open(output_file, "w", encoding="utf-8") as out_f:
    for line in combined_translations:
        out_f.write(line + "\n")

print(f"Processed translations saved to {output_file}")


Processed translations saved to final_rl_translations.txt


In [2]:
import re
import sacrebleu

# Function to clean the text: remove non-alphabetic characters (including punctuation, numbers)
def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove all non-alphabetic characters
    text = re.sub(r'\s+', ' ', text)  # Normalize whitespace to a single space
    return text.strip()

# Load the source-target translation pairs from the files
def load_translation_pairs(file_path):
    src_lines = []
    tgt_lines = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Skip empty lines or lines without a semicolon separator
            if line.strip() and ';' in line:
                src, tgt = line.strip().split(';', 1)  # Split only once on the first semicolon
                src_lines.append(src)
                tgt_lines.append(tgt)
            else:
                print(f"Skipping invalid line: {line.strip()}")
    return src_lines, tgt_lines

# Load unmodified and modified translations
unmodified_src, unmodified_tgt = load_translation_pairs('processed_unmodified_model_test_translations.txt')
modified_src, modified_tgt = load_translation_pairs('processed_modified_model_test_translations.txt')

# Load reference translations (English references)
def load_references(reference_file):
    references = []
    with open(reference_file, 'r', encoding='utf-8') as file:
        for line in file:
            references.append([line.strip()])  # BLEU expects each reference to be a list of sentences
    return references

# Load the reference translations from the file
references = load_references('reference_english_translations.txt')

# Clean the text (remove punctuation, normalize whitespace, and lowercase)
unmodified_tgt_cleaned = [clean_text(tgt.lower()) for tgt in unmodified_tgt]
modified_tgt_cleaned = [clean_text(tgt.lower()) for tgt in modified_tgt]
references_cleaned = [[clean_text(ref[0].lower())] for ref in references]  # Clean and lowercased references

# Calculate BLEU scores for unmodified and modified models (cleaned text)
unmodified_bleu = sacrebleu.corpus_bleu(unmodified_tgt_cleaned, references_cleaned)
modified_bleu = sacrebleu.corpus_bleu(modified_tgt_cleaned, references_cleaned)

# Print the BLEU scores
print(f'Unmodified model BLEU score (cleaned): {unmodified_bleu.score}')
print(f'Modified model BLEU score (cleaned): {modified_bleu.score}')

Unmodified model BLEU score (cleaned): 28.050892427639997
Modified model BLEU score (cleaned): 26.420512124185844


## Hypothesis Testing

In [1]:
import re
import sacrebleu
from scipy.stats import ttest_rel

# Function to clean the text: remove non-alphabetic characters (including punctuation, numbers)
def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove all non-alphabetic characters
    text = re.sub(r'\s+', ' ', text)  # Normalize whitespace to a single space
    return text.strip()

# Load the source-target translation pairs from the files
def load_translation_pairs(file_path):
    src_lines = []
    tgt_lines = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if line.strip() and ';' in line:
                src, tgt = line.strip().split(';', 1)  # Split only once on the first semicolon
                src_lines.append(src)
                tgt_lines.append(tgt)
            else:
                print(f"Skipping invalid line: {line.strip()}")
    return src_lines, tgt_lines

# Load reference translations (English references)
def load_references(reference_file):
    references = []
    with open(reference_file, 'r', encoding='utf-8') as file:
        for line in file:
            references.append(line.strip())
    return references

# Function to compute sentence-level BLEU scores
def compute_sentence_bleu(hypotheses, references):
    bleu_scores = []
    for hyp, ref in zip(hypotheses, references):
        bleu = sacrebleu.sentence_bleu(hyp, [ref])
        bleu_scores.append(bleu.score)
    return bleu_scores

# Load translations and references
unmodified_src, unmodified_tgt = load_translation_pairs('processed_unmodified_model_test_translations.txt')
modified_src, modified_tgt = load_translation_pairs('processed_modified_model_test_translations.txt')
references = load_references('reference_english_translations.txt')

# Clean the text (remove punctuation, normalize whitespace, and lowercase)
unmodified_tgt_cleaned = [clean_text(tgt.lower()) for tgt in unmodified_tgt]
modified_tgt_cleaned = [clean_text(tgt.lower()) for tgt in modified_tgt]
references_cleaned = [clean_text(ref.lower()) for ref in references]

# Compute sentence-level BLEU scores
unmodified_sentence_bleu = compute_sentence_bleu(unmodified_tgt_cleaned, references_cleaned)
modified_sentence_bleu = compute_sentence_bleu(modified_tgt_cleaned, references_cleaned)

# Perform paired t-test
t_stat, p_value = ttest_rel(modified_sentence_bleu, unmodified_sentence_bleu)

# Print results
print(f"Paired t-test results for BLEU scores:")
print(f"t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")

# Interpret results
alpha = 0.05
if p_value < alpha:
    print("The difference in BLEU scores is statistically significant (p < 0.05).")
else:
    print("The difference in BLEU scores is not statistically significant (p >= 0.05).")

# Optional: Print average BLEU scores for reference
print(f"Mean BLEU score - Unmodified Model: {sum(unmodified_sentence_bleu)/len(unmodified_sentence_bleu):.2f}")
print(f"Mean BLEU score - Modified Model: {sum(modified_sentence_bleu)/len(modified_sentence_bleu):.2f}")


Paired t-test results for BLEU scores:
t-statistic: 51.5054, p-value: 0.0000
The difference in BLEU scores is statistically significant (p < 0.05).
Mean BLEU score - Unmodified Model: 14.96
Mean BLEU score - Modified Model: 20.01


In [3]:
import re
import sacrebleu
from scipy.stats import ttest_rel

# Function to clean the text: remove non-alphabetic characters (including punctuation, numbers)
def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove all non-alphabetic characters
    text = re.sub(r'\s+', ' ', text)  # Normalize whitespace to a single space
    return text.strip()

# Load the source-target translation pairs from the files
def load_translation_pairs(file_path):
    src_lines = []
    tgt_lines = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if line.strip() and ';' in line:
                src, tgt = line.strip().split(';', 1)  # Split only once on the first semicolon
                src_lines.append(src)
                tgt_lines.append(tgt)
            else:
                print(f"Skipping invalid line: {line.strip()}")
    return src_lines, tgt_lines

# Load reference translations (English references)
def load_references(reference_file):
    references = []
    with open(reference_file, 'r', encoding='utf-8') as file:
        for line in file:
            references.append(line.strip())
    return references

# Function to compute sentence-level BLEU scores
def compute_sentence_bleu(hypotheses, references):
    bleu_scores = []
    for hyp, ref in zip(hypotheses, references):
        bleu = sacrebleu.sentence_bleu(hyp, [ref])
        bleu_scores.append(bleu.score)
    return bleu_scores

# Load translations and references
unmodified_src, unmodified_tgt = load_translation_pairs('processed_unmodified_model_test_translations.txt')
modified_src, modified_tgt = load_translation_pairs('final_rl_translations.txt')
references = load_references('reference_english_translations.txt')

# Clean the text (remove punctuation, normalize whitespace, and lowercase)
unmodified_tgt_cleaned = [clean_text(tgt.lower()) for tgt in unmodified_tgt]
modified_tgt_cleaned = [clean_text(tgt.lower()) for tgt in modified_tgt]
references_cleaned = [clean_text(ref.lower()) for ref in references]

# Compute sentence-level BLEU scores
unmodified_sentence_bleu = compute_sentence_bleu(unmodified_tgt_cleaned, references_cleaned)
modified_sentence_bleu = compute_sentence_bleu(modified_tgt_cleaned, references_cleaned)

# Perform paired t-test
t_stat, p_value = ttest_rel(modified_sentence_bleu, unmodified_sentence_bleu)

# Print results
print(f"Paired t-test results for BLEU scores:")
print(f"t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")

# Interpret results
alpha = 0.05
if p_value < alpha:
    print("The difference in BLEU scores is statistically significant (p < 0.05).")
else:
    print("The difference in BLEU scores is not statistically significant (p >= 0.05).")

# Optional: Print average BLEU scores for reference
print(f"Mean BLEU score - Unmodified Model: {sum(unmodified_sentence_bleu)/len(unmodified_sentence_bleu):.2f}")
print(f"Mean BLEU score - Modified Model: {sum(modified_sentence_bleu)/len(modified_sentence_bleu):.2f}")


Paired t-test results for BLEU scores:
t-statistic: 37.7588, p-value: 0.0000
The difference in BLEU scores is statistically significant (p < 0.05).
Mean BLEU score - Unmodified Model: 14.96
Mean BLEU score - Modified Model: 18.95


In [14]:
import re
from collections import defaultdict
from scipy.stats import ttest_rel

# Load anchor translation dictionary
def load_anchor_translation_dict(anchor_file):
    anchor_translation_dict = defaultdict(list)
    with open(anchor_file, 'r', encoding='utf-8') as f:
        for line in f:
            items = line.split()
            anchor_word = items[1].strip()
            translation = items[0].strip()
            anchor_translation_dict[anchor_word].append(translation)
    return anchor_translation_dict

# Function to compute anchor word usage rate for individual sentences
def compute_anchor_usage_rate(file_path, anchor_translation_dict):
    sentence_usage_rates = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            # Split the line into original sentence and translated text using only the first semicolon
            parts = line.strip().split(" ; ", 1)
            if len(parts) == 2:
                test_sentence, translated_text = parts
            else:
                print(f"Skipping invalid line: {line.strip()}")
                continue
            
            # Extract anchor words from the original sentence
            anchor_words = re.findall(r"<([^>]+)>", test_sentence)
            
            # Extract anchor translations from the translated text
            anchor_translations = re.findall(r"<([^>]+)>", translated_text)
            
            total_anchor_words = len(anchor_words)
            correct_translations = 0
            
            # Check translations for each anchor word
            for anchor_word in anchor_words:
                for anchor_translation in anchor_translation_dict[anchor_word]:
                    if anchor_translation in anchor_translations:
                        correct_translations += 1
                        break
            
            # Calculate sentence-level usage rate
            if total_anchor_words > 0:
                usage_rate = correct_translations / total_anchor_words
            else:
                usage_rate = 0  # No anchor words in the sentence
            
            sentence_usage_rates.append(usage_rate)
    
    return sentence_usage_rates

# Load the anchor translation dictionary
anchor_translation_dict = load_anchor_translation_dict('final_anchors.txt')

# Compute anchor usage rates for unmodified and modified models
unmodified_usage_rates = compute_anchor_usage_rate('unmodified_model_test_translations_marked.txt', anchor_translation_dict)
modified_usage_rates = compute_anchor_usage_rate('modified_model_test_translations.txt', anchor_translation_dict)

# Perform paired t-test
t_stat, p_value = ttest_rel(modified_usage_rates, unmodified_usage_rates)

# Print results
print(f"Paired t-test results for anchor word usage rates:")
print(f"t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")

# Interpret results
alpha = 0.05
if p_value < alpha:
    print("The difference in anchor word usage rates is statistically significant (p < 0.05).")
else:
    print("The difference in anchor word usage rates is not statistically significant (p >= 0.05).")

# Optional: Print average usage rates for reference
print(f"Mean anchor usage rate - Unmodified Model: {sum(unmodified_usage_rates)/len(unmodified_usage_rates):.4f}")
print(f"Mean anchor usage rate - Modified Model: {sum(modified_usage_rates)/len(modified_usage_rates):.4f}")


Paired t-test results for anchor word usage rates:
t-statistic: 39.3776, p-value: 0.0000
The difference in anchor word usage rates is statistically significant (p < 0.05).
Mean anchor usage rate - Unmodified Model: 0.2125
Mean anchor usage rate - Modified Model: 0.3557


In [15]:
import re
from collections import defaultdict
from scipy.stats import ttest_rel

# Load anchor translation dictionary
def load_anchor_translation_dict(anchor_file):
    anchor_translation_dict = defaultdict(list)
    with open(anchor_file, 'r', encoding='utf-8') as f:
        for line in f:
            items = line.split()
            anchor_word = items[1].strip()
            translation = items[0].strip()
            anchor_translation_dict[anchor_word].append(translation)
    return anchor_translation_dict

# Function to compute anchor word usage rate for individual sentences
def compute_anchor_usage_rate(file_path, anchor_translation_dict):
    sentence_usage_rates = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            # Split the line into original sentence and translated text using only the first semicolon
            parts = line.strip().split(" ; ", 1)
            if len(parts) == 2:
                test_sentence, translated_text = parts
            else:
                print(f"Skipping invalid line: {line.strip()}")
                continue
            
            # Extract anchor words from the original sentence
            anchor_words = re.findall(r"<([^>]+)>", test_sentence)
            
            # Extract anchor translations from the translated text
            anchor_translations = re.findall(r"<([^>]+)>", translated_text)
            
            total_anchor_words = len(anchor_words)
            correct_translations = 0
            
            # Check translations for each anchor word
            for anchor_word in anchor_words:
                for anchor_translation in anchor_translation_dict[anchor_word]:
                    if anchor_translation in anchor_translations:
                        correct_translations += 1
                        break
            
            # Calculate sentence-level usage rate
            if total_anchor_words > 0:
                usage_rate = correct_translations / total_anchor_words
            else:
                usage_rate = 0  # No anchor words in the sentence
            
            sentence_usage_rates.append(usage_rate)
    
    return sentence_usage_rates

# Load the anchor translation dictionary
anchor_translation_dict = load_anchor_translation_dict('final_anchors.txt')

# Compute anchor usage rates for unmodified and modified models
unmodified_usage_rates = compute_anchor_usage_rate('unmodified_model_test_translations_marked.txt', anchor_translation_dict)
modified_usage_rates = compute_anchor_usage_rate('final_rl_translations.txt', anchor_translation_dict)

# Perform paired t-test
t_stat, p_value = ttest_rel(modified_usage_rates, unmodified_usage_rates)

# Print results
print(f"Paired t-test results for anchor word usage rates:")
print(f"t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")

# Interpret results
alpha = 0.05
if p_value < alpha:
    print("The difference in anchor word usage rates is statistically significant (p < 0.05).")
else:
    print("The difference in anchor word usage rates is not statistically significant (p >= 0.05).")

# Optional: Print average usage rates for reference
print(f"Mean anchor usage rate - Unmodified Model: {sum(unmodified_usage_rates)/len(unmodified_usage_rates):.4f}")
print(f"Mean anchor usage rate - Modified Model: {sum(modified_usage_rates)/len(modified_usage_rates):.4f}")


Paired t-test results for anchor word usage rates:
t-statistic: 20.0941, p-value: 0.0000
The difference in anchor word usage rates is statistically significant (p < 0.05).
Mean anchor usage rate - Unmodified Model: 0.2125
Mean anchor usage rate - Modified Model: 0.2883
