In [9]:
import csv

import stanza
import os
import nltk
from nltk.util import ngrams
from nltk.corpus import stopwords
from string import punctuation
import re 
from collections import defaultdict, Counter

# Download necessary resources
nltk.download('punkt')
# Recent NLTK releases load the word tokenizer from 'punkt_tab' instead of 'punkt';
# requesting both keeps nltk.word_tokenize working on old and new installs.
nltk.download('punkt_tab')
nltk.download('stopwords')
stanza.download('zh')
# Tokenize-only Chinese pipeline, used to split Chinese sentences into words
nlp = stanza.Pipeline('zh', processors='tokenize')

# Get the set of stopwords and punctuation
stop_words = set(stopwords.words('english'))
# Corpus-specific filler words and markup fragments that should never be part of an n-gram
stop_words.update(
    {'cent', 'href=', 'http', 'says', 'told', 'year', 'ago', 'yesterday', 'since', 'last', 'past', 'next',
     'said', 'almost', 'within', 'would', 'nearly', 'years', 'months', 'according', 'compared', 'go', 'also',
     "n't"})
punctuation_set = set(punctuation)
# Tokens that behave like punctuation after tokenization. The original listed the
# closing curly quote twice; the opening one ("‘") is what was missing. word_tokenize
# emits `` for an opening double quote and '' for a closing one, so both are filtered.
punctuation_set.update({"‘", "’", '”', "''", "``", "“", "'s", '--', 'b', '/b', '/strong', '–', '—'})

ModuleNotFoundError: No module named 'stanza'

In [1]:
# Data Structures
class ParallelSentence:
    """One aligned sentence pair: English text in ``en``, Chinese text in ``zh``."""

    def __init__(self, en, zh):
        self.en, self.zh = en, zh
        
class AnchorWord:
    """An English term (``en``) together with the Chinese text (``zh``) aligned to it.

    Instances compare and hash by identity; two anchors with equal fields are
    still distinct objects.
    """

    def __init__(self, en, zh):
        self.en, self.zh = en, zh
        
class ParallelCorpus:
    """English/Chinese parallel corpus plus the multi-word terms mined from it.

    Attributes:
        parallel_sentences: list of ParallelSentence objects.
        multi_grams_to_consider: set of underscore-joined, lower-cased English
            n-grams (e.g. ``asian_financial_crisis``).
        anchor_words: set of ``(en, zh)`` tuples loaded by load_sorted_anchors.
    """

    def __init__(self):
        self.parallel_sentences = []
        # Sets, because that is what generate_multi_grams / load_sorted_anchors store here
        self.multi_grams_to_consider = set()
        self.anchor_words = set()

    def load_parallel_sentences(self, data_source):
        """Read every ';'-delimited file in ``data_source`` into ``self.parallel_sentences``.

        Rows with fewer than 7 columns are skipped. Column index 5 holds the English
        text and index 6 the Chinese text; each is a list of sentences separated by
        '@'. Sentences are paired positionally (extra sentences on the longer side
        are dropped by ``zip``). Chinese sentences are word-segmented with the
        module-level Stanza pipeline ``nlp`` and stored space-joined.
        """
        parallel_sentences = []
        # os.listdir order is arbitrary; sorting makes the corpus order reproducible
        for file_name in sorted(os.listdir(data_source)):
            file_path = os.path.join(data_source, file_name)
            if not os.path.isfile(file_path):
                continue  # a sub-directory cannot be opened as a data file
            # newline='' is what the csv module expects so quoted line breaks survive
            with open(file_path, mode='r', encoding='utf-8', newline='') as data_file:
                reader = csv.reader(data_file, delimiter=';')
                for row in reader:
                    if len(row) < 7:
                        continue  # escape bad data
                    english_sentences = row[5].split('@')
                    chinese_sentences = row[6].split('@')

                    for english_sentence, chinese_sentence in zip(english_sentences, chinese_sentences):
                        clean_english_sentence = english_sentence.strip()

                        if chinese_sentence.strip():
                            doc = nlp(chinese_sentence)
                            chinese_tokens = [word.text for sentence in doc.sentences for word in sentence.words]
                        else:
                            # Nothing to segment; do not hand blank text to the pipeline
                            chinese_tokens = []
                        clean_chinese_sentence = " ".join(chinese_tokens)

                        parallel_sentences.append(ParallelSentence(clean_english_sentence, clean_chinese_sentence))

        self.parallel_sentences = parallel_sentences

    @staticmethod
    def _filtered_english_tokens(sentence):
        """Tokenize an English sentence and drop stopwords, punctuation and digit-only tokens."""
        return [token.lower() for token in nltk.word_tokenize(sentence)
                if token.lower() not in stop_words
                and token not in punctuation_set
                and not token.isdigit()]

    def generate_multi_grams(self):
        """Collect the most frequent English 2-, 3- and 4-grams as underscore-joined terms.

        Keeps the top 5000 bigrams, 3000 trigrams and 1000 quadgrams and stores
        their union in ``self.multi_grams_to_consider``.
        """
        top_k_by_order = {2: 5000, 3: 3000, 4: 1000}
        counts_by_order = {n: Counter() for n in top_k_by_order}
        # Tokenize each sentence once and feed all three counters from the same tokens
        for parallel_sentence in self.parallel_sentences:
            tokens = self._filtered_english_tokens(parallel_sentence.en)
            for n, counter in counts_by_order.items():
                counter.update(ngrams(tokens, n))

        self.multi_grams_to_consider = {
            '_'.join(ngram)
            for n, top_k in top_k_by_order.items()
            for ngram, _count in counts_by_order[n].most_common(top_k)
        }

    @staticmethod
    def refactor_sentence_with_multiword_term(sentence, multi_word_terms):
        """Replace known multi-word terms in ``sentence`` by their underscore-joined form.

        The sentence is split on single spaces. At each position the longest window
        (4 words down to 2) whose lower-cased, underscore-joined form is in
        ``multi_word_terms`` is collapsed into that one token; other words are kept
        unchanged. Returns the rebuilt sentence as a space-joined string.
        """
        words = sentence.split(' ')
        modified_sentence = []
        i = 0
        while i < len(words):
            for length in range(4, 1, -1):  # longest match first
                if i + length > len(words):
                    continue
                multi_word_candidate = '_'.join(words[i:i + length]).lower()
                if multi_word_candidate in multi_word_terms:
                    modified_sentence.append(multi_word_candidate)
                    i += length
                    break
            else:
                # No window matched at this position: keep the word as is
                modified_sentence.append(words[i])
                i += 1

        return ' '.join(modified_sentence)

    def extract_ngram_counts(self, n):
        """Return a Counter of English n-grams (tuples of lower-cased tokens) over the corpus."""
        ngram_counts = Counter()
        for parallel_sentence in self.parallel_sentences:
            filtered_tokens = self._filtered_english_tokens(parallel_sentence.en)
            ngram_counts.update(ngrams(filtered_tokens, n))
        return ngram_counts

    def format_parallel_sentences_for_awesome_align(self, output_path="zhen.src-tgt"):
        """Write one ``<english> ||| <chinese>`` line per sentence pair to ``output_path``.

        English sentences first have their multi-word terms collapsed with
        refactor_sentence_with_multiword_term.
        """
        # Explicit UTF-8: the lines contain Chinese text and the platform default may not encode it
        with open(output_path, "w", encoding="utf-8") as f:
            for parallel_sentence in self.parallel_sentences:
                modified_sentence = self.refactor_sentence_with_multiword_term(
                    parallel_sentence.en, self.multi_grams_to_consider)
                f.write(f"{modified_sentence} ||| {parallel_sentence.zh}\n")

    def build_anchor_words_from_awesome_align_output(self, alignments_path, output_path='possible-anchors.txt'):
        """Derive candidate ``en zh`` anchor pairs from an aligned-word file.

        Each line of ``alignments_path`` is a space-separated list of
        ``<english><sep><chinese>`` pairs. Only English entries present in
        ``self.multi_grams_to_consider`` are used. Consecutive hits for the same
        English term within one line have their Chinese parts concatenated. For
        every English term the most frequent Chinese string is kept, and the result
        is written to ``output_path`` sorted by English term, one ``en zh`` per line.
        """
        zh_counts_by_en = defaultdict(Counter)
        with open(alignments_path, 'r', encoding='utf-8') as file:
            for line in file:
                # [en, zh] entries for this line only, so a term ending one sentence
                # is never merged with the same term starting the next sentence
                line_anchors = []
                for pair in line.split():
                    parts = pair.split('<sep>')
                    if len(parts) < 2:
                        continue  # blank or malformed entry
                    en_entry, zh_entry = parts[0], parts[1]
                    if en_entry not in self.multi_grams_to_consider:
                        continue
                    cleaned_en_entry = re.sub(r'[^a-zA-Z_]', '', en_entry)
                    if not cleaned_en_entry:
                        continue

                    if line_anchors and line_anchors[-1][0] == cleaned_en_entry:
                        # Same English term again: extend its Chinese side with the new piece
                        if zh_entry not in line_anchors[-1][1]:
                            line_anchors[-1][1] += zh_entry
                    else:
                        line_anchors.append([cleaned_en_entry, zh_entry])

                for en, zh in line_anchors:
                    zh_counts_by_en[en][zh] += 1

        # Most frequent Chinese rendering per English term, alphabetical by English term
        best_pairs = sorted(
            (en, zh_counter.most_common(1)[0][0]) for en, zh_counter in zh_counts_by_en.items()
        )

        with open(output_path, 'w', encoding='utf-8') as file:
            for en, zh in best_pairs:
                file.write(f"{en} {zh}\n")

    def load_sorted_anchors(self, anchor_path):
        """Load ``en zh`` lines from ``anchor_path`` into ``self.anchor_words``.

        Underscores in the English term are turned back into spaces. Lines with
        fewer than two whitespace-separated fields (e.g. blank lines) are skipped.
        The result is a set of ``(en, zh)`` tuples.
        """
        anchors = set()
        with open(anchor_path, 'r', encoding='utf-8') as file:
            for line in file:
                fields = line.split()
                if len(fields) < 2:
                    continue
                en = fields[0].replace('_', ' ')
                zh = fields[1]
                anchors.add((en, zh))  # Store as a tuple for paired lookup
        self.anchor_words = anchors

In [None]:
parallel_corpus = ParallelCorpus()  # Empty corpus object; the cells below fill in sentences, multi-grams and anchors

In [None]:
parallel_corpus.load_parallel_sentences(data_source='./FTIE/')  # Read the ';'-delimited files under ./FTIE/ into parallel_corpus.parallel_sentences

In [None]:
parallel_corpus.generate_multi_grams()  # Collect frequent English 2- to 4-grams as underscore-joined terms, e.g. Asian Financial Crisis -> asian_financial_crisis

In [None]:
parallel_corpus.format_parallel_sentences_for_awesome_align()  # Collapse multi-word English terms and write "<english> ||| <chinese>" lines to zhen.src-tgt
# zhen.src-tgt is the DATA_FILE for the awesome-align command sketched below (kept as comments; presumably run from a shell)

# DATA_FILE=./zhen.src-tgt
# MODEL_NAME_OR_PATH=./model_without_co
# OUTPUT_FILE=./output.txt
# OUTPUT_WORDS=./alignments.txt
# OUTPUT_PROB=./alignments-prob.txt
# 
# CUDA_VISIBLE_DEVICES=0 awesome-align \
#     --output_file=$OUTPUT_FILE \
#     --model_name_or_path=$MODEL_NAME_OR_PATH \
#     --data_file=$DATA_FILE \
#     --extraction 'softmax' \
#     --batch_size 32 \
#     --num_workers 0 \
#     --output_word_file=$OUTPUT_WORDS \
#     --output_prob_file=$OUTPUT_PROB 

In [None]:
parallel_corpus.build_anchor_words_from_awesome_align_output('./alignments.txt')    # Writes candidate "en zh" anchor pairs to possible-anchors.txt

In [None]:
parallel_corpus.load_sorted_anchors('./final_anchors.txt')  # Load the final, verified anchors into parallel_corpus.anchor_words as (en, zh) tuples

In [None]:
# # How would the original model translate these anchor words? 
# def translate_anchor_words(src_lang, tgt_lang, output_file):
#     # Set the source and target languages
#     tokenizer.src_lang = src_lang
#     tokenizer.tgt_lang = tgt_lang
#     forced_bos_token_id = tokenizer.lang_code_to_id[tgt_lang]  # Ensure the target language is correct
# 
#     # Translate and save results
#     with open(output_file, "w", encoding="utf-8") as f:
#         for index, pair in enumerate(parallel_corpus.anchor_words):
#             # Select source and target based on direction
#             source_anchor = pair.zh if src_lang == "zh_CN" else pair.en
#             target_anchor = pair.en if src_lang == "zh_CN" else pair.zh
# 
#             # Tokenize the input text
#             inputs = tokenizer(source_anchor, return_tensors="pt")
#             # Generate translation with forced BOS token for the target language
#             translated_tokens = model.generate(**inputs, forced_bos_token_id=forced_bos_token_id)
#             # Decode the translated tokens
#             translation = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
# 
#             # Save the result in the text file
#             f.write(f"{source_anchor}; {target_anchor}; {translation.lower()}\n")
# 
#             if index % 100 == 0:
#                 print(f"Done translating {index} / {len(parallel_corpus.anchor_words)}")
# 


In [None]:
# # Translate English to Chinese
# translate_anchor_words(
#     src_lang="en_XX",
#     tgt_lang="zh_CN",
#     output_file="unmodified_en-zh-translated_anchor_words.txt"
# )

In [None]:
# translate_anchor_words(
#     src_lang="zh_CN",
#     tgt_lang="en_XX",
#     output_file="unmodified_zh-en-translated_anchor_words.txt"
# )

In [None]:
# import Levenshtein
# anchor_count = len(parallel_corpus.anchor_words)
# perfect_match_count = 0
# matching_translations = 0
# with open("unmodified_zh-en-translated_anchor_words.txt", "r", encoding="utf-8") as f:
#     for line in f: 
#         items = line.split(';')
#         zh_anchor = items[0].strip()
#         en_anchor = items[1].strip()
#         translation = items[2].strip()
#         
#         if translation == en_anchor:
#             perfect_match_count += 1
#             matching_translations += 1
# 
# print("Unmodified Accuracy on Chinese Anchor Words (zh->en):", matching_translations / anchor_count)
# print("Perfect Match Count:", perfect_match_count, "out of", anchor_count)

In [None]:
# import Levenshtein
# anchor_count = len(parallel_corpus.anchor_words)
# perfect_match_count = 0
# matching_translations = 0
# with open("unmodified_en-zh-translated_anchor_words.txt", "r", encoding="utf-8") as f:
#     for line in f: 
#         items = line.split(';')
#         en_anchor = items[0].strip()
#         zh_anchor = items[1].strip()
#         translation = items[2].strip()
#             
#         if translation == zh_anchor:
#             perfect_match_count += 1
#             matching_translations += 1
# 
# print("Unmodified Accuracy on English Anchor Words (en->zh):", matching_translations / anchor_count)
# print("Perfect Match Count:", perfect_match_count, "out of", anchor_count)

In [None]:
# English anchor -> Chinese anchor lookup. If an English term appears in more than
# one (en, zh) pair, the pair iterated last wins.
anchor_words_dict = {en: zh for en, zh in parallel_corpus.anchor_words}

def refactor_sentence_with_anchors(en_sentence, chinese_sentence, anchor_words):
    """Wrap anchor terms in angle brackets on both sides of a sentence pair.

    Args:
        en_sentence: English sentence; split on single spaces.
        chinese_sentence: Chinese sentence; all spaces are removed before tagging.
        anchor_words: iterable of ``(en, zh)`` pairs (English words separated by
            spaces, lower-case), or a dict mapping ``en`` to ``zh``.

    Returns:
        ``(english, chinese)``. In the English text each matched 2- to 4-word
        anchor (longest window first, case-insensitive) becomes one token
        ``<word_word>``. In the Chinese text every occurrence of the matching
        Chinese anchor is wrapped as ``<zh>``.
    """
    # Look terms up in the argument itself (no reliance on a module-level dict) and
    # in constant time per candidate instead of scanning every anchor.
    zh_by_en = anchor_words if isinstance(anchor_words, dict) else dict(anchor_words)

    words = en_sentence.split(' ')
    refactored_chinese_sentence = chinese_sentence.replace(' ', '')
    modified_sentence = []
    # Chinese anchors already wrapped in this sentence. str.replace tags every
    # occurrence at once, so repeating it for a second English hit would produce
    # "<<zh>>".
    tagged_zh_terms = set()

    i = 0
    while i < len(words):
        for length in range(4, 1, -1):  # quadgram down to bigram
            if i + length > len(words):
                continue
            multi_word_candidate = ' '.join(words[i:i + length]).lower()
            zh_term = zh_by_en.get(multi_word_candidate)
            if zh_term is None:
                continue

            modified_sentence.append(f"<{multi_word_candidate.replace(' ', '_')}>")
            # An empty Chinese term would make replace() insert "<>" between all characters
            if zh_term and zh_term not in tagged_zh_terms:
                refactored_chinese_sentence = refactored_chinese_sentence.replace(zh_term, '<' + zh_term + '>')
                tagged_zh_terms.add(zh_term)
            i += length  # skip the words consumed by the anchor
            break
        else:
            # No anchor starts here: keep the current word
            modified_sentence.append(words[i])
            i += 1

    return ' '.join(modified_sentence), refactored_chinese_sentence


# Tag anchor terms on both sides of every sentence pair in the corpus
refactored_parallel_sentences = []
for index, parallel_sentence in enumerate(parallel_corpus.parallel_sentences):
    tagged_en, tagged_zh = refactor_sentence_with_anchors(
        parallel_sentence.en,
        parallel_sentence.zh,
        parallel_corpus.anchor_words,
    )
    refactored_parallel_sentences.append(ParallelSentence(tagged_en, tagged_zh))

    # Progress report every 1000 pairs
    if not index % 1000:
        print("Done refactoring", index, "out of", len(parallel_corpus.parallel_sentences))

In [None]:
# Dump the tagged pairs, one "<english> ; <chinese>" line each
with open('refactored_parallel_sentences.txt', 'w', encoding='utf-8') as f:
    f.writelines(f"{pair.en} ; {pair.zh}\n" for pair in refactored_parallel_sentences)

print("Refactored sentences saved to 'refactored_parallel_sentences.txt'")

In [None]:
# Rebinds parallel_corpus to a fresh object that holds only the anchors:
# parallel_sentences and multi_grams_to_consider are empty again from here on.
parallel_corpus = ParallelCorpus()
parallel_corpus.load_sorted_anchors('./final_anchors.txt')

In [None]:
import os
import csv

def save_translation_to_csv(ps, output_file):
    """Append one sentence pair to a two-column (``zh``, ``en``) CSV file.

    ``ps`` must expose ``.en`` and ``.zh``. The header row is written first when
    ``output_file`` does not exist yet or is empty.
    """
    columns = ['zh', 'en']
    record = {'zh': ps.zh, 'en': ps.en}

    # A header is only needed when there is no existing content to append to
    needs_header = not os.path.isfile(output_file) or os.path.getsize(output_file) == 0

    with open(output_file, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=columns)
        if needs_header:
            writer.writeheader()
        writer.writerow(record)


In [None]:
# save_translation_to_csv appends, so re-running this cell would add every row a
# second time. Remove the previous export first; the check on
# refactored_parallel_sentences keeps the old file when there is nothing to write.
if refactored_parallel_sentences and os.path.exists('training_data.csv'):
    os.remove('training_data.csv')

for parallel_sentence in refactored_parallel_sentences:
    save_translation_to_csv(parallel_sentence, 'training_data.csv')

In [None]:
# Special tokens for every anchor: "<english_term>" and "<chinese_term>".
# anchor_words is a set, whose iteration order changes between Python processes;
# sorting fixes the order so the tokens are added in the same order on every run.
tokens_to_be_added = []
for (en_anchor, zh_anchor) in sorted(parallel_corpus.anchor_words):
    tokens_to_be_added.append('<'+en_anchor.replace(' ', '_')+'>')
    tokens_to_be_added.append('<'+zh_anchor+'>')

In [None]:
from datasets import concatenate_datasets, Dataset

In [None]:
import pandas as pd

In [None]:
dataset = './training_data.csv'
# Read both columns as plain strings. With pandas' default NA handling an empty
# cell, or a sentence that is literally "NA" / "null", becomes NaN, which is not
# valid tokenizer input later on.
dataset1 = pd.read_csv(dataset, dtype=str, keep_default_na=False)

In [None]:
# Display the loaded training data for a quick visual check
dataset1

In [None]:
# First 175,000 rows are used for training, the following 25,000 for validation
train_dataset, validation_dataset = dataset1.iloc[:175000], dataset1.iloc[175000:200000]

# Wrap both pandas frames as Hugging Face datasets
train_dataset_hf, validation_dataset_hf = (
    Dataset.from_pandas(frame) for frame in (train_dataset, validation_dataset)
)

In [None]:
from transformers import MBart50Tokenizer, MBartForConditionalGeneration, MBartTokenizer
import torch

# Load mBART model and tokenizer
model_name = "facebook/mbart-large-50-many-to-many-mmt"
model = MBartForConditionalGeneration.from_pretrained(model_name)
# This is an mBART-50 checkpoint. Its documented tokenizer is MBart50Tokenizer
# (50 language codes, "[lang_code] X [eos]" layout); MBartTokenizer is the tokenizer
# of the original 25-language mBART and uses a different layout.
tokenizer = MBart50Tokenizer.from_pretrained(model_name)

# Add custom tokens and resize model embeddings
tokenizer.add_tokens(tokens_to_be_added)
model.resize_token_embeddings(len(tokenizer))

In [None]:
def convert_examples_to_features(example_batch):
    """Tokenize a batch of zh -> en pairs into model inputs.

    Args:
        example_batch: mapping with a list of Chinese source strings under "zh"
            and a list of English target strings under "en".

    Returns:
        dict with "input_ids" and "attention_mask" for the source side and
        "labels" for the target side. Sequences are truncated to 1024 tokens and
        left unpadded.
    """
    # Source is Chinese, target is English; the tokenizer uses these to pick the
    # language-code token for each side.
    tokenizer.src_lang = "zh_CN"
    tokenizer.tgt_lang = "en_XX"

    # text_target tokenizes the English side in target mode. No padding here:
    # padding to max_length would put pad ids into "labels" (where they count
    # towards the loss) and make every row 1024 tokens long. Padding is left to the
    # data collator used for training.
    encodings = tokenizer(example_batch["zh"], text_target=example_batch["en"],
                          max_length=1024, truncation=True)

    return {"input_ids": encodings["input_ids"],
            "attention_mask": encodings["attention_mask"],
            "labels": encodings["labels"]}

# Tokenize both splits batch-wise; the raw text columns are dropped so only the
# fields returned by convert_examples_to_features remain.
train_dataset_tf = train_dataset_hf.map(convert_examples_to_features, batched=True, remove_columns=["zh","en"])
val_dataset_tf = validation_dataset_hf.map(convert_examples_to_features, batched=True, remove_columns=["zh","en"])

In [None]:
# Show the tokenized training dataset summary
train_dataset_tf

In [None]:
from transformers import DataCollatorForSeq2Seq
# Batch collator for sequence-to-sequence training, built from the tokenizer and model loaded above
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer,TrainingArguments, Trainer

# Fine-tuning configuration: 2 epochs, batch size 1, Adafactor optimizer, CPU only
training_args = Seq2SeqTrainingArguments(
    output_dir='mbartTrans',
    num_train_epochs=2,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    evaluation_strategy='steps',  # evaluate on a step schedule (see eval_steps)
    save_strategy='no',  # NOTE(review): no checkpoints are requested here, so the model has to be saved explicitly after training
    eval_steps=2000,
    logging_steps=1000,
    weight_decay=0.01,
    push_to_hub=False,
    fp16=False,
    learning_rate=2e-5,
    optim="adafactor",
    no_cuda=True  # Forces CPU training
)


In [None]:
import gc
# Run a garbage-collection pass before training starts
gc.collect()

In [None]:
# Assemble the trainer from the model, arguments, collator and tokenized splits defined in the cells above
trainer = Seq2SeqTrainer(model=model, args=training_args, tokenizer=tokenizer,
                  data_collator=seq2seq_data_collator,
                  train_dataset=train_dataset_tf,
                        eval_dataset=val_dataset_tf)

# Start fine-tuning
trainer.train()