In [7]:
# Imports
import csv  
import stanza
import os
import nltk
from nltk.util import ngrams
from nltk.corpus import stopwords
from string import punctuation
import re 
from collections import defaultdict, Counter

# Download necessary resources
nltk.download('punkt')
# NLTK >= 3.8.2 loads the 'punkt_tab' resource inside word_tokenize(); without
# it a fresh environment fails with a LookupError. Older NLTK releases do not
# know this package and simply report that instead of raising.
nltk.download('punkt_tab')
nltk.download('stopwords')
stanza.download('zh')
# Only the tokenizer is needed: Chinese sentences are merely segmented into words.
nlp = stanza.Pipeline('zh', processors='tokenize')

# English stopwords, extended with corpus-specific filler words and markup
# residue that should never take part in an n-gram.
stop_words = set(stopwords.words('english'))
stop_words.update(
    {'cent', 'href=', 'http', 'says', 'told', 'year', 'ago', 'yesterday', 'since', 'last', 'past', 'next',
     'said', 'almost', 'within', 'would', 'nearly', 'years', 'months', 'according', 'compared', 'go', 'also',
     "n't"})

# ASCII punctuation plus typographic quotes/dashes and leftover tokens that are
# filtered out together with punctuation. The original literal listed the
# closing single quote twice; the opening one (‘) is now covered as well.
punctuation_set = set(punctuation)
punctuation_set.update({"‘", "’", '”', "''", "“", "'s", '--', 'b', '/b', '/strong', '–', '—'})

[nltk_data] Downloading package punkt to /Users/vnnsnnt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vnnsnnt/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 11.5MB/s]                    
2024-11-26 01:08:49 INFO: Downloaded file to /Users/vnnsnnt/stanza_resources/resources.json
2024-11-26 01:08:49 INFO: "zh" is an alias for "zh-hans"
2024-11-26 01:08:49 INFO: Downloading default packages for language: zh-hans (Simplified_Chinese) ...
2024-11-26 01:08:50 INFO: File exists: /Users/vnnsnnt/stanza_resources/zh-hans/default.zip
2024-11-26 01:08:52 INFO: Finished downloading models and saved to /Users/vnnsnnt/stanza_resources
2024-11-26 01:08:52 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with downlo

In [17]:
# Data Structures
class ParallelSentence:
    """One aligned sentence pair.

    Attributes:
        en: the English side of the pair, stored as given.
        zh: the Chinese side of the pair, stored as given.
    """

    def __init__(self, en, zh):
        self.en, self.zh = en, zh
        
class AnchorWord:
    """An English term paired with its Chinese counterpart.

    Instances are plain mutable records: no equality or hashing is defined,
    so two anchors with the same text are still distinct objects.

    Attributes:
        en: the English term, stored as given.
        zh: the Chinese term, stored as given.
    """

    def __init__(self, en, zh):
        self.en, self.zh = en, zh
        
class ParallelCorpus:
    """English-Chinese parallel corpus plus helpers for mining anchor terms.

    Intended flow, as driven by the notebook cells:
    load_parallel_sentences() -> generate_multi_grams() ->
    format_parallel_sentences_for_awesome_align() -> (awesome-align is run
    outside this class) -> build_anchor_words_from_awesome_align_output() ->
    load_sorted_anchors().

    The class relies on these module-level names: ``nlp`` (Stanza pipeline),
    ``stop_words``, ``punctuation_set``, ``ParallelSentence`` and ``AnchorWord``.

    Attributes:
        parallel_sentences: list of ParallelSentence objects.
        multi_grams_to_consider: set of underscore-joined, lowercased English
            n-grams (e.g. ``asian_financial_crisis``).
        anchor_words: list of AnchorWord objects read by load_sorted_anchors().
    """

    # (n, number of most frequent n-grams to keep) for each n-gram size.
    _NGRAM_LIMITS = ((2, 5000), (3, 3000), (4, 1000))

    def __init__(self):
        self.parallel_sentences = []
        # A set from the start: it is only used for membership tests and
        # generate_multi_grams() replaces it with a set anyway.
        self.multi_grams_to_consider = set()
        self.anchor_words = []

    def load_parallel_sentences(self, data_source):
        """Read every ';'-delimited file in ``data_source`` into sentence pairs.

        Column 5 of each row holds the English sentences and column 6 the
        Chinese ones, each joined by '@'. The two lists are paired
        positionally (extra sentences on the longer side are dropped by zip).
        Chinese sentences are segmented with the module-level Stanza pipeline
        and stored as space-joined tokens.

        Args:
            data_source: path of the directory containing the data files.

        Side effects:
            Replaces ``self.parallel_sentences``.
        """
        parallel_sentences = []
        # sorted(): os.listdir order is OS-dependent; sorting keeps the corpus
        # (and everything derived from it) reproducible.
        for file_name in sorted(os.listdir(data_source)):
            file_path = os.path.join(data_source, file_name)
            # Sub-directories and hidden files (e.g. macOS .DS_Store) are not
            # data files and would make open()/decoding fail.
            if file_name.startswith('.') or not os.path.isfile(file_path):
                continue
            # newline='' is what the csv module requires so that newlines
            # embedded in quoted fields are handled correctly.
            with open(file_path, mode='r', encoding='utf-8', newline='') as data_file:
                reader = csv.reader(data_file, delimiter=';')
                for row in reader:
                    if len(row) < 7:
                        continue  # escape bad data
                    english_sentences = row[5].split('@')
                    chinese_sentences = row[6].split('@')

                    for english_sentence, chinese_sentence in zip(english_sentences, chinese_sentences):
                        clean_english_sentence = english_sentence.strip()
                        # A pair with an empty side (e.g. produced by a trailing
                        # '@') carries nothing to align, so it is skipped.
                        if not clean_english_sentence or not chinese_sentence.strip():
                            continue

                        # Segment the Chinese sentence with Stanza
                        doc = nlp(chinese_sentence)
                        chinese_tokens = [word.text for sentence in doc.sentences for word in sentence.words]
                        clean_chinese_sentence = " ".join(chinese_tokens)

                        parallel_sentences.append(ParallelSentence(clean_english_sentence, clean_chinese_sentence))

        self.parallel_sentences = parallel_sentences

    def generate_multi_grams(self):
        """Collect the most frequent English 2-, 3- and 4-grams as multi-word terms.

        Each selected n-gram is stored as its tokens joined by '_'.
        The corpus is tokenized once and reused for all three n-gram sizes.

        Side effects:
            Replaces ``self.multi_grams_to_consider`` with a new set.
        """
        token_lists = list(self._iter_filtered_english_tokens())

        multi_grams_to_consider = set()
        for n, top_k in self._NGRAM_LIMITS:
            counts = Counter()
            for tokens in token_lists:
                counts.update(ngrams(tokens, n))
            multi_grams_to_consider.update('_'.join(ngram) for ngram, _count in counts.most_common(top_k))

        self.multi_grams_to_consider = multi_grams_to_consider

    @staticmethod
    def refactor_sentence_with_multiword_term(sentence, multi_word_terms):
        """Replace known multi-word terms in ``sentence`` by their '_'-joined form.

        The sentence is split on single spaces. At each position the longest
        candidate (4 words, then 3, then 2) is lowercased, joined with '_' and
        looked up in ``multi_word_terms``; on a hit the words are replaced by
        that candidate, otherwise the word is kept unchanged.

        Args:
            sentence: the English sentence.
            multi_word_terms: container of '_'-joined lowercase terms.

        Returns:
            The rewritten sentence, words joined by single spaces.
        """
        words = sentence.split(' ')
        modified_sentence = []
        i = 0
        while i < len(words):
            # Longest match first: quadgram, then trigram, then bigram.
            for length in range(4, 1, -1):
                if i + length > len(words):
                    continue
                multi_word_candidate = '_'.join(words[i:i + length]).lower()
                if multi_word_candidate in multi_word_terms:
                    modified_sentence.append(multi_word_candidate)
                    i += length
                    break
            else:
                # No term starts here: keep the word as is.
                modified_sentence.append(words[i])
                i += 1

        return ' '.join(modified_sentence)

    def _iter_filtered_english_tokens(self):
        """Yield, per sentence, the lowercased English tokens kept for n-grams.

        Stopwords (compared in lowercase), entries of ``punctuation_set`` and
        all-digit tokens are removed.
        """
        for parallel_sentence in self.parallel_sentences:
            tokens = nltk.word_tokenize(parallel_sentence.en)
            yield [token.lower() for token in tokens
                   if token.lower() not in stop_words
                   and token not in punctuation_set
                   and not token.isdigit()]

    def extract_ngram_counts(self, n):
        """Count the n-grams of the filtered English tokens of every sentence.

        Args:
            n: n-gram size.

        Returns:
            collections.Counter mapping token tuples of length ``n`` to counts.
        """
        ngram_counts = Counter()
        for filtered_tokens in self._iter_filtered_english_tokens():
            ngram_counts.update(ngrams(filtered_tokens, n))
        return ngram_counts

    def format_parallel_sentences_for_awesome_align(self, output_path="zhen.src-tgt"):
        """Write the corpus as ``<english> ||| <chinese>`` lines.

        The English side has its multi-word terms rewritten by
        refactor_sentence_with_multiword_term().

        Args:
            output_path: file to write; defaults to the original fixed name.
        """
        # Explicit UTF-8: the file contains Chinese text and must not depend
        # on the platform's default encoding.
        with open(output_path, "w", encoding="utf-8") as f:
            for parallel_sentence in self.parallel_sentences:
                modified_sentence = self.refactor_sentence_with_multiword_term(
                    parallel_sentence.en, self.multi_grams_to_consider)
                f.write(f"{modified_sentence} ||| {parallel_sentence.zh}\n")

    def _anchors_from_alignment_line(self, line):
        """Turn one line of ``en<sep>zh`` pairs into AnchorWord objects.

        Only pairs whose English side is in ``self.multi_grams_to_consider``
        are used. When the same English term is seen again directly after the
        previous kept pair of the same line, its Chinese token is appended to
        that anchor (unless already contained) instead of starting a new one.
        Tokens without '<sep>' (including the empty token of a blank line) are
        ignored.
        """
        line_anchors = []
        for pair in line.strip().split(' '):
            parts = pair.split('<sep>')
            if len(parts) < 2:
                continue  # blank line or malformed token
            en_entry, zh_entry = parts[0], parts[1]
            if en_entry not in self.multi_grams_to_consider:
                continue
            # Clean the English entry
            cleaned_en_entry = re.sub(r'[^a-zA-Z_]', '', en_entry)
            if not cleaned_en_entry:
                continue

            previous = line_anchors[-1] if line_anchors else None
            if previous is not None and previous.en == cleaned_en_entry:
                if zh_entry not in previous.zh:
                    previous.zh += zh_entry
            else:
                line_anchors.append(AnchorWord(cleaned_en_entry, zh_entry))
        return line_anchors

    def build_anchor_words_from_awesome_align_output(self, alignments_path, output_path='possible-anchors.txt'):
        """Derive candidate anchors from an aligned-words file and save them.

        For every English multi-word term the Chinese string it was aligned to
        most often is selected (ties: the one seen first in the file). The
        result is written as ``<en> <zh>`` lines sorted by the English term.

        Anchors are built per line, so a term ending one sentence is never
        merged with the same term starting the next one.

        Args:
            alignments_path: UTF-8 file with space-separated ``en<sep>zh``
                pairs, one sentence per line.
            output_path: file to write; defaults to the original fixed name.

        Returns:
            None. ``self.anchor_words`` is not modified.
        """
        # Step 1: count how often each `zh` string was aligned to each `en`.
        # Counting in file order (no intermediate set) keeps ties deterministic.
        anchor_freq = defaultdict(Counter)
        with open(alignments_path, 'r', encoding='utf-8') as file:
            for line in file:
                for anchor in self._anchors_from_alignment_line(line):
                    anchor_freq[anchor.en][anchor.zh] += 1

        # Step 2: select the most frequent `zh` entry for each `en`
        filtered_alignments = [
            AnchorWord(en, zh_counter.most_common(1)[0][0])
            for en, zh_counter in anchor_freq.items()
        ]

        # Step 3: sort alphabetically by `en`
        sorted_filtered_anchors = sorted(filtered_alignments, key=lambda anchor: anchor.en)

        # Step 4: write to file (explicit UTF-8 because of the Chinese text)
        with open(output_path, 'w', encoding='utf-8') as file:
            for alignment in sorted_filtered_anchors:
                file.write(f"{alignment.en} {alignment.zh}\n")

    def load_sorted_anchors(self, anchor_path):
        """Load ``<en> <zh>`` anchor lines into ``self.anchor_words``.

        Underscores in the English term are turned back into spaces. Blank
        lines and lines with fewer than two space-separated fields are skipped.

        Args:
            anchor_path: UTF-8 text file with one anchor per line.
        """
        anchors = []
        with open(anchor_path, 'r', encoding='utf-8') as file:
            for line in file:
                alignment = line.strip().split(' ')
                if len(alignment) < 2:
                    continue  # blank or incomplete line
                en = alignment[0].replace('_', ' ')
                zh = alignment[1]
                anchors.append(AnchorWord(en, zh))
        self.anchor_words = anchors

In [9]:
# Start from an empty corpus; its sentences, multi-grams and anchors are
# filled in by the method calls in the following cells.
parallel_corpus = ParallelCorpus()  # Initialize Corpus Object

In [10]:
# Reads the ';'-delimited files found in ./FTIE/ and runs the Stanza tokenizer
# once per Chinese sentence.
parallel_corpus.load_parallel_sentences(data_source='./FTIE/')  # Load parallel sentences from data source

In [14]:
# Collect the most frequent English 2-, 3- and 4-grams as underscore-joined,
# lowercased terms, e.g. "Asian Financial Crisis" -> asian_financial_crisis.
# Requires load_parallel_sentences() to have been run first.
parallel_corpus.generate_multi_grams()  # Generate multi-grams

In [None]:
# parallel_corpus.format_parallel_sentences_for_awesome_align() # Format English Sentence With Multi Grams 
# Prepare a data source for awesome align 

# DATA_FILE=./zhen.src-tgt
# MODEL_NAME_OR_PATH=./model_without_co
# OUTPUT_FILE=./output.txt
# OUTPUT_WORDS=./alignments.txt
# OUTPUT_PROB=./alignments-prob.txt
# 
# CUDA_VISIBLE_DEVICES=0 awesome-align \
#     --output_file=$OUTPUT_FILE \
#     --model_name_or_path=$MODEL_NAME_OR_PATH \
#     --data_file=$DATA_FILE \
#     --extraction 'softmax' \
#     --batch_size 32 \
#     --num_workers 0 \
#     --output_word_file=$OUTPUT_WORDS \
#     --output_prob_file=$OUTPUT_PROB 

In [20]:
# Reads the aligned-word file produced by awesome-align and writes candidate
# anchors to possible-anchors.txt; parallel_corpus.anchor_words is not changed here.
parallel_corpus.build_anchor_words_from_awesome_align_output('./alignments.txt')    # Generate Possible Anchor Words

In [23]:
# NOTE(review): this loads ./final_anchors.txt, not the possible-anchors.txt
# written by the previous step - presumably a manually verified copy; confirm
# how that file is produced so the notebook can be re-run from scratch.
parallel_corpus.load_sorted_anchors('./final_anchors.txt')  # Load Final and Verified Anchors

In [None]:
# from transformers import MBartForConditionalGeneration, MBartTokenizer
# 
# # Load mBART model and tokenizer
# model_name = "facebook/mbart-large-50-many-to-many-mmt"
# model = MBartForConditionalGeneration.from_pretrained(model_name)
# tokenizer = MBartTokenizer.from_pretrained(model_name)
# 
# # Set the source and target languages
# tokenizer.src_lang = "zh_CN"  # Use 'zh_CN' for Simplified Chinese
# tokenizer.tgt_lang = "en_XX"  # 'en_XX' for English
# 
# # Translate every Chinese sentence and save source, reference and translation to a text file
# with open("unmodified_zh-en-translated_sentences.txt", "w", encoding="utf-8") as f:
#     print("Translating", len(all_pairs), "sentences")
#     for index, pair in enumerate(all_pairs):
#         chinese_sentence = pair.zh  # source
#         english_sentence = pair.en  # target
#         
#         # Tokenize the input text
#         inputs = tokenizer(chinese_sentence, return_tensors="pt")
#         # Generate translation
#         translated_tokens = model.generate(**inputs)
#         # Decode the translated tokens
#         translated_sentence = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
#         
#         # Save the result in the text file
#         f.write(f"{chinese_sentence}; {english_sentence}; {translated_sentence}\n")
#         
#         if index % 100 == 0: 
#             print("Done with", index, "/", len(all_pairs))