In [None]:
from google.colab import drive
import pandas as pd
# Attach Google Drive so the dataset CSVs stored under "My Drive" are readable.
drive.mount('/content/drive')

# Locations of the two dev-set files inside the mounted Drive.
tcn_csv = '/content/drive/My Drive/Shopee-Product-Title-Translation_ZH-EN/datasets/dev_tcn.csv'
en_csv = '/content/drive/My Drive/Shopee-Product-Title-Translation_ZH-EN/datasets/dev_en.csv'

# `tcn` is the frame that is later fed to the translator; `en` is the frame
# that later receives the translated column and is scored against.
tcn = pd.read_csv(tcn_csv)
en = pd.read_csv(en_csv)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
pip install sacrebleu



In [None]:
# Import the BLEU metric computation module from sacrebleu library
from sacrebleu.metrics import BLEU

def get_score(pred, actual):
    """
    Computes the corpus-level BLEU score of predicted sentences against reference sentences.

    Args:
        pred (list or array-like): Predicted (generated) sentences, one per segment.
            Objects exposing ``.tolist()`` (pandas Series, NumPy arrays) and plain
            iterables such as lists are both accepted.
        actual (list or array-like): Ground-truth reference sentences, aligned
            position-by-position with ``pred`` (one reference per prediction).

    Returns:
        sacrebleu.BLEUScore: A BLEU score object containing the corpus-level score and breakdown.

    Raises:
        ValueError: If ``pred`` and ``actual`` do not contain the same number of sentences.
    """

    # Normalise both inputs to plain Python lists.
    hypotheses = pred.tolist() if hasattr(pred, "tolist") else list(pred)
    reference_stream = actual.tolist() if hasattr(actual, "tolist") else list(actual)

    # Every hypothesis needs exactly one aligned reference.
    if len(hypotheses) != len(reference_stream):
        raise ValueError(
            "pred and actual must have the same number of sentences "
            f"(got {len(hypotheses)} and {len(reference_stream)})"
        )

    # sacreBLEU takes references as a list of reference *streams*: each stream is
    # a full list holding one reference per hypothesis, i.e.
    # [[ref_for_hyp1, ref_for_hyp2, ...]] when there is a single reference set.
    # Wrapping every sentence in its own list ([[ref1], [ref2], ...]) would
    # instead declare N one-sentence streams, and only the first hypothesis
    # would be scored (against all N references).
    references = [reference_stream]

    # Initialize the BLEU scorer
    bleu = BLEU()

    # Compute and return the corpus-level BLEU score
    return bleu.corpus_score(hypotheses, references)

In [None]:
from transformers import MarianMTModel, MarianTokenizer
import pandas as pd
import os # import the os module
import torch

# Pick the compute device.
# On a Colab TPU runtime (signalled by COLAB_TPU_ADDR) use the XLA device from
# torch_xla; otherwise prefer CUDA and fall back to CPU. No TensorFlow TPU
# strategy is set up here because the model below is a PyTorch model.
device = None
if 'COLAB_TPU_ADDR' in os.environ:
  try:
    # Imported lazily so the notebook still runs where torch_xla is not installed.
    import torch_xla.core.xla_model as xm
    device = xm.xla_device()
    print('Running on TPU')
  except ImportError:
    print('COLAB_TPU_ADDR is set but torch_xla is not installed; falling back to GPU/CPU')
if device is None:
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  print('Running on', device)

# Load the pre-trained model and tokenizer for Chinese-to-English translation
# Model: 'Helsinki-NLP/opus-mt-zh-en' (OPUS-MT)
model_name = 'Helsinki-NLP/opus-mt-zh-en'
tokenizer = MarianTokenizer.from_pretrained(model_name)  # Handles text splitting/encoding
# Move the weights to the selected device so generation actually runs there.
model = MarianMTModel.from_pretrained(model_name).to(device)

def translate_tcn(text):
    """
    Translates Chinese text to English using the pre-trained Helsinki-NLP model.

    Args:
        text (str): Input Chinese text to be translated.

    Returns:
        str: Translated English text, with special tokens stripped.
    """
    # Tokenize into PyTorch tensors. `truncation=True` caps the input at the
    # tokenizer's maximum length instead of passing an overlong sequence on.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)

    # The input tensors must live on the same device as the model weights.
    inputs = inputs.to(model.device)

    # Generate translation using the model (unpacks input_ids, attention_mask, ...)
    translated = model.generate(**inputs)

    # Decode the first (only) generated sequence, skipping special tokens (e.g., <pad>, </s>)
    return tokenizer.decode(translated[0], skip_special_tokens=True)

Running on cpu


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Preview the first rows of the frame loaded from dev_tcn.csv.
tcn.head()

Unnamed: 0,text,split
0,OPPO A75 A75s A73 手机壳 软壳 挂绳壳 大眼兔硅胶壳,private
1,SOFT 99 鍍膜車蠟(強力撥水型),private
2,低糖芒果乾 250g 臻御行,private
3,＊小徑文化＊日本進口ROUND TOP space craft - diamond (SC-...,private
4,Hello Kitty 凱蒂貓 KITTY 涼鞋 童鞋 白/紅色 小童 no739,private


# Translation without modification for Chinese and English names

In [None]:
from tqdm import tqdm # Import tqdm for progress bar

# Translate every entry of tcn['text'] in order (tqdm shows progress) and store
# the results as a new `text_translated` column on the `en` frame.
translated_titles = []
for source_title in tqdm(tcn['text']):
    translated_titles.append(translate_tcn(source_title))
en['text_translated'] = translated_titles

100%|██████████| 1000/1000 [27:41<00:00,  1.66s/it]


NameError: name 'translation_output' is not defined

In [None]:
# Display the frame, now holding `text_translated` alongside `translation_output`.
en

Unnamed: 0,translation_output,text_translated
0,Oppo A75 A75S A73 Phone Case Soft Rabbit Silic...,"OPPO A75 A75s A73 Cell phone shell, soft shell..."
1,SOFT 99 Coating Car Wax Strong Water Watt,SOFT 99 Diambrane Wax (strength-dip)
2,Low Sugar Mango Dry 250g Be The Royal,"Low-sugar mango dry, 250 g."
3,* the culture Japan Imported Round Top Space C...,*Title culture* Japanese import ROUND TOP spac...
4,Hello Kitty Sandals Shoes White/Red Children n...,"Hello, Kitty. Kitty, sandals, boys' shoes, whi..."
...,...,...
995,Hippored Torn Fun Unique Style Straight Jeans ...,[HippoRed] Tore the fun with a unique style of...
996,Kids Set Table Bay - Thin Long Sleeve Home Sui...,"Children's suits, Taiwan's thin-sleeved sleeve..."
997,LONGCHAMP Le Pliage Neo High Density Nylon Bac...,LONGCHAMP Le Pliage Neo High Minilon Backpack ...
998,IFairies Opening Adjustable Ring ifairies [564...,iFairies.


In [None]:
# BLEU of the untouched translations (pred) against `translation_output` (actual).
get_score(
    pred=en['text_translated'],
    actual=en['translation_output'],
)

BLEU = 6.05 52.6/5.6/2.9/1.6 (BP = 1.000 ratio = 1.000 hyp_len = 19 ref_len = 19)

# Score for translation with lowercase

In [None]:
# Same comparison after lowercasing both columns, so casing differences are ignored.
get_score(
    pred=en['text_translated'].str.lower(),
    actual=en['translation_output'].str.lower(),
)

BLEU = 19.26 84.2/22.2/11.8/6.2 (BP = 1.000 ratio = 1.000 hyp_len = 19 ref_len = 19)

# Score for translation with special characters and extra whitespace removed

In [None]:
import re

def cleaning_string(my_string):
    """
    Normalises a string to lowercase ASCII letters, digits and single spaces.

    Args:
        my_string (str): Text to normalise.

    Returns:
        str: The lowercased text in which every run of characters other than
            a-z, 0-9 and space has been replaced by a space, with surrounding
            whitespace stripped and inner whitespace collapsed to single spaces.
    """
    lowered = my_string.lower()
    # Each run of disallowed characters becomes one space (not an empty string),
    # so words separated only by punctuation stay separate.
    tokens = re.sub(r"[^a-z0-9 ]+", ' ', lowered).split()
    # split() + join collapses repeated spaces and trims both ends.
    return " ".join(tokens)

In [None]:
# Score after running both columns through cleaning_string.
get_score(
    pred=en['text_translated'].map(cleaning_string),
    actual=en['translation_output'].map(cleaning_string),
)

BLEU = 27.23 80.0/35.7/23.1/8.3 (BP = 1.000 ratio = 1.000 hyp_len = 15 ref_len = 15)

In [None]:
# Keep the cleaned variants as their own columns so the raw text stays available.
lowered_hypotheses = en['text_translated'].str.lower()
lowered_references = en['translation_output'].str.lower()
en['text_translated_processed'] = lowered_hypotheses.map(cleaning_string)
en['translation_output_processed'] = lowered_references.map(cleaning_string)

In [None]:
import re

def remove_duplicated_words(text):
    """
    Drops repeated words from a string, keeping the first occurrence of each.

    Words are the matches of ``\\b\\w+\\b``, so punctuation is discarded.
    Comparison is case-insensitive; the spelling of the first occurrence is kept.

    Args:
        text (str): Input text.

    Returns:
        str: The retained words, in their original order, joined by single spaces.
    """
    # Maps lowercase key -> first spelling seen; dict insertion order preserves
    # the order of first appearance.
    first_spelling = {}
    for token in re.findall(r'\b\w+\b', text):
        first_spelling.setdefault(token.lower(), token)
    return " ".join(first_spelling.values())

# Quick sanity check: repeated words and punctuation are dropped, first occurrences kept.
input_text = "hello, hello kitty! kitty sandals boys shoes whites reds no no no no no no no no no no no no no no no no no no no no"
output_text = remove_duplicated_words(input_text)
print(output_text)

hello kitty sandals boys shoes whites reds no


In [None]:
# Score after also removing repeated words from the cleaned translations;
# the cleaned reference column is used unchanged.
get_score(
    pred=en['text_translated_processed'].map(remove_duplicated_words),
    actual=en['translation_output_processed'],
)

BLEU = 34.33 91.7/45.5/30.0/11.1 (BP = 1.000 ratio = 1.000 hyp_len = 12 ref_len = 12)

In [None]:
# Store the de-duplicated translations back on the frame, then export the whole
# frame (without the index) to the project's `preprocessed` folder on Drive.
deduplicated_titles = en['text_translated_processed'].map(remove_duplicated_words)
en['text_translated_processed'] = deduplicated_titles
en.to_csv(
    '/content/drive/My Drive/Shopee-Product-Title-Translation_ZH-EN/preprocessed/translation.csv',
    index=False,
)