In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
from transformers import MarianMTModel, MarianTokenizer

In [2]:
MAX_LEN = 128

In [3]:
en_zh_df = pd.read_csv('news-commentary-v15.en-zh.tsv',sep = '\t', header=None).dropna()

In [4]:
train_df, test_df = train_test_split(en_zh_df, test_size=0.2, random_state=1)

train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=1) # 0.25 x 0.8 = 0.2

In [5]:
bilingual_data = train_df.copy()

In [6]:
bilingual_data

Unnamed: 0,0,1
122819,NEW YORK – The world’s high-income countries a...,发自纽约 — — 世界上的高收入国家正陷入主要与经济增长及就业有关的经济困境之中，同时这些危...
259692,"Overall, I see a broad shift from understandin...",总的来说，我看到了从把资产理解为你所占有的东西，到把资产运作起来产生效用的显著转变。
15499,"From 1992 to 2011, labor productivity grew at ...",从1992到2011年，意大利的劳动生产率每年平均只增长0.9 % ， 在经合组织成员国中倒...
124254,BEIJING – In what some might consider a surpri...,北京—在当今世界最大的二氧化碳排放国中国，正在发生可能让有些人感到惊奇的事情 — — 随着中...
263896,Reduced European dependence on the US export m...,欧洲对美国出口市场依赖的减少并不意味着欧洲就不会受美国经济疲软的影响，如果欧元在对美元增值的...
...,...,...
3097,A Balkan Monitor survey recently conducted by ...,但就在最近，由盖洛普欧洲完成的一份名为《巴尔干观察》的调查给出了自柏林墙倒塌20年以及科索沃...
247052,The event is so hideous that it seems like a b...,Lo ocurrido es tan espantoso que parece una br...
264784,CAMBRIDGE – The opaque nature of China’s gover...,发自剑桥 — — 中国政府的不透明本质使外界难以摸清中国经济政策的走向，从而也很难揣摩中国经...
80247,China soon surpassed the US in other important...,很快，中国在其他重要方面也超过了美国。


In [7]:
model_name = "Helsinki-NLP/opus-mt-en-zh"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)



In [85]:
from collections import Counter

def identify_anchor_points(source_corpus, top_n=50):
    # Tokenize and find word frequencies in both source (Chinese) and target (English) corpora
    src_tokens = [token for sentence in source_corpus for token in sentence.split()]

    # Get the most common words in each language
    src_common = [word for word, freq in Counter(src_tokens).most_common(top_n)]

    # Intersection of common terms between both languages
    return src_common

In [87]:
src_common = identify_anchor_points(bilingual_data[0], 1000)

In [None]:
src_common[300:400]

In [8]:
# Example anchor words for alignment (These should be selected based on similarity across languages)
anchor_points = {"PARIS": "巴黎", "Europe": "欧洲", "US": "美国", "China": "中国", "Trump": "特朗普", "Russia": "俄罗斯", "Obama": "奥巴马"}

In [9]:
def embed_anchor_points(bilingual_data, anchor_points):
    for src_word, tgt_word in anchor_points.items():
        bilingual_data[0] = bilingual_data[0].str.replace(src_word, f"<<{src_word}>>", regex=False)
        bilingual_data[1] = bilingual_data[1].str.replace(tgt_word, f"<<{tgt_word}>>", regex=False)
    return bilingual_data

bilingual_data = embed_anchor_points(bilingual_data, anchor_points)

In [10]:
from torch.utils.data import DataLoader, Dataset

# Loading bilingual educational data
class BilingualDataset(Dataset):
    def __init__(self, bilingual_data, tokenizer, max_length=MAX_LEN):
        self.data = bilingual_data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        src_text, tgt_text = self.data.iloc[idx]
        src_encodings = self.tokenizer(src_text, return_tensors="pt", truncation=True, padding="max_length", max_length=self.max_length)
        tgt_encodings = self.tokenizer(tgt_text, return_tensors="pt", truncation=True, padding="max_length", max_length=self.max_length)

        return {
            "idx": idx,
            "input_ids": src_encodings["input_ids"].squeeze(),
            "attention_mask": src_encodings["attention_mask"].squeeze(),
            "labels": tgt_encodings["input_ids"].squeeze()
        }

In [11]:
# Instantiate dataset and dataloader
dataset = BilingualDataset(bilingual_data, tokenizer)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

In [12]:
import torch.nn.functional as F

def custom_loss_function(output_logits, target_ids, anchor_mask):
    """
    Compute custom loss with additional emphasis on anchor points.
    
    Parameters:
    - output_logits: Model's output logits
    - target_ids: Actual target token IDs
    - anchor_mask: Mask indicating anchor positions in the target sequence
    
    Returns:
    - Loss with penalties on anchor errors
    """
    loss = F.cross_entropy(output_logits.view(-1, output_logits.size(-1)), target_ids.view(-1), reduction="none")
    
    # Penalty: amplify loss for anchor point errors
    # print(loss.view(target_ids.shape).shape)
    # print(anchor_mask.shape)
    loss = loss.view(target_ids.shape) * (1 + anchor_mask * 2)  # Triple the loss weight for anchors
    return loss.mean()

In [13]:
def create_anchor_mask(tokenizer, bilingual_data, anchor_points):
    """
    Create a mask to identify anchor point tokens in the target sequence.
    
    Parameters:
    - tokenizer: The tokenizer for the model
    - sentences: List of target sentences
    - anchor_points: List of anchor words
    
    Returns:
    - Mask tensor indicating anchor tokens
    """
    anchor_mask = []
    for sentence in bilingual_data:
        mask = [1 if word in anchor_points else 0 for word in tokenizer.tokenize(sentence)]
        mask = mask + [0] * (MAX_LEN - len(mask))  # pad to max length if needed
        anchor_mask.append(mask[:MAX_LEN])
    return torch.tensor(anchor_mask)

In [14]:
# Define optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

In [18]:
# Training loop with custom loss function
model.train()
epochs = 10

for epoch in range(epochs):
    for batch in tqdm(dataloader):
        # print(batch["idx"])
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(model.device)
        attention_mask = batch["attention_mask"].to(model.device)
        labels = batch["labels"].to(model.device)
        # print(input_ids)
        # print(attention_mask)
        # print(labels)

        # Generate anchor mask
        anchor_mask = create_anchor_mask(tokenizer, bilingual_data.iloc[batch["idx"]][1], anchor_points.items()).to(model.device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        
        # Calculate custom loss with anchor points
        loss = custom_loss_function(outputs.logits, labels, anchor_mask)
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item()}")

  0%|                                                                               | 4/2928 [00:41<8:27:21, 10.41s/it]


KeyboardInterrupt: 

In [56]:
model.eval()
new_sentence = "The tendency is either excessive restraint (Europe) or a diffusion of the effort (the United States)."  # Test sentence with anchor points
input_ids = tokenizer.encode(new_sentence, return_tensors="pt")

# Generate translation
with torch.no_grad():
    generated_ids = model.generate(input_ids)
    translation = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

print(f"English: {new_sentence}")
print(f"Chinese Translation: {translation}")

English: The tendency is either excessive restraint (Europe) or a diffusion of the effort (the United States).
Chinese Translation: 趋势要么是过度限制(欧洲),要么是分散努力(美国)。


In [16]:
from sacrebleu.metrics import BLEU
from tqdm import tqdm

In [83]:
def evaluate_model(model, tokenizer, val_df):
    model.eval()
    bleu = BLEU(effective_order=True)  # Enable effective_order for sentence-level BLEU
    total_bleu_score = 0.0

    for idx in tqdm(range(len(val_df))):
        # Tokenize and encode the source sentence
        input_ids = tokenizer.encode(val_df.iloc[idx][0], return_tensors="pt").to(model.device)

        # Generate translation
        with torch.no_grad():
            generated_ids = model.generate(input_ids)
            translation = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

        # Calculate sentence-level BLEU score
        sentence_bleu = bleu.sentence_score(translation, [val_df.iloc[idx][1]]).score
        total_bleu_score += sentence_bleu

    # Compute the average BLEU score across all sentences
    avg_bleu_score = total_bleu_score / len(val_df)
    print(f"Average BLEU Score: {avg_bleu_score}")

    # If you want to add ROUGE or other metrics, you can compute them here

    return avg_bleu_score

In [84]:
# Evaluate the model
avg_bleu_score = evaluate_model(model, tokenizer, val_df.head(100))
print(f"Validation BLEU Score: {avg_bleu_score}")

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:57<00:00,  1.75it/s]

Average BLEU Score: 1.5332670784991174
Validation BLEU Score: 1.5332670784991174





In [75]:
val_df.head()

Unnamed: 0,0,1
2284,Unless the democratic world understands that n...,除非民主世界明白现在决非相信外交妥协的时候、必须以足够强硬的立场阻止普京的帝国图谋，否则事件...
98676,“Liberal” cycles are followed by “conservative...,“自由主义”周期紧接着就是“保守主义”周期，然后“保守主义”周期又会让位于新的“自由主义”周...
185028,The sites – six of which are being funded by t...,它旗下的站点 — — 其中六个由比尔和梅琳达·盖茨基金会出资，前三年提供730万美元初始资金...
17323,"In Ukraine, Putin would be happy to turn a pea...",在乌克兰，普京乐于见到和平反对派驱逐腐败政府演变为内战。
79280,"In other words, $100 a half-century from now i...",换句话讲，按照不同的假定利率，50年后的100美元今天只值15美元、10美元甚至更低。
