In [6]:
import csv
import jieba

class SentencePair:
    def __init__(self, en, zh):
        self.en = en
        self.zh = zh

def load_parallel_sentences(file_path):
    sentence_pairs = []
    with open(file_path, mode='r', encoding='utf-8') as file:
        reader = csv.reader(file, delimiter=';')
        # Loop through each row in the CSV file
        for row in reader:
            if len(row) < 7:
                continue
            english_content = row[5]  # Adjust based on the correct column index
            chinese_content = row[6]  # Adjust based on the correct column index
            
            # Split the content into sentences based on '@' delimiters
            english_sentences = english_content.split('@')
            chinese_sentences = chinese_content.split('@')
            
            for eng_sent, chi_sent in zip(english_sentences, chinese_sentences):
                # Use jieba to cut the sentence into words
                words = jieba.lcut(chi_sent)
                
                # Join the words with spaces
                ch_sentence = " ".join(words)
                sentence_pairs.append(SentencePair(eng_sent.strip(), ch_sentence.strip()))  
                
    return sentence_pairs

In [7]:
import os

all_pairs = []
directory_path = 'FTIE'

# Loop through each file in the directory
for filename in os.listdir(directory_path):
    file_path = os.path.join(directory_path, filename)
    
    # Check if the path is a file and ends with .csv
    if os.path.isfile(file_path) and filename.endswith('.csv'):
        # Load pairs from the current file and extend the all_pairs list
        file_pairs = load_parallel_sentences(file_path)
        all_pairs.extend(file_pairs)

In [9]:
from transformers import MBartForConditionalGeneration, MBartTokenizer

# Load mBART model and tokenizer
model_name = "facebook/mbart-large-50-many-to-many-mmt"
model = MBartForConditionalGeneration.from_pretrained(model_name)
tokenizer = MBartTokenizer.from_pretrained(model_name)

# Set the source and target languages
tokenizer.src_lang = "zh_CN"  # Use 'zh_CN' for Simplified Chinese
tokenizer.tgt_lang = "en_XX"  # 'en_XX' for English

# Sample Chinese sentence
with open("unmodified_zh-en-translated_sentences.txt", "w", encoding="utf-8") as f:
    print("Translating", len(all_pairs), "sentences")
    for index, pair in enumerate(all_pairs):
        chinese_sentence = pair.zh  # source
        english_sentence = pair.en  # target
        
        # Tokenize the input text
        inputs = tokenizer(chinese_sentence, return_tensors="pt")
        # Generate translation
        translated_tokens = model.generate(**inputs)
        # Decode the translated tokens
        translated_sentence = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
        
        # Save the result in the text file
        f.write(f"{chinese_sentence}; {english_sentence}; {translated_sentence}\n")
        
        if index % 100 == 0: 
            print("Done with", index, "/", len(all_pairs))

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBart50Tokenizer'. 
The class this function is called from is 'MBartTokenizer'.


Translating 255860 sentences
Done with 0 / 255860
Done with 100 / 255860


KeyboardInterrupt: 