In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import unicodedata
import re
import numpy as np
import os
import time
import pandas as pd
from sklearn.model_selection import train_test_split

from pylab import *
from matplotlib.font_manager import FontProperties
print('tf.__version__.is', tf.__version__)
print('tf.keras.__version__ is:', tf.keras.__version__)
print(tf.config.experimental.list_physical_devices('GPU'))

tf.__version__.is 2.5.0
tf.keras.__version__ is: 2.5.0
[]


Dataset loading

In [4]:
tcn_clean = pd.read_csv('./preprocessed/cleaned_dev_tcn.csv')
tcn_clean.head()

Unnamed: 0,text,split,product_title_clean,product_title_processed
0,OPPO A75 A75s A73 手机壳 软壳 挂绳壳 大眼兔硅胶壳,private,手机壳软壳挂绳壳大眼兔硅胶壳,手机 壳 软壳 挂绳 壳 大眼 兔 硅胶 壳
1,SOFT 99 鍍膜車蠟(強力撥水型),private,鍍膜車蠟強力撥水型,鍍膜車 蠟 強力 撥水型
2,低糖芒果乾 250g 臻御行,private,低糖芒果乾臻御行,低糖 芒果乾 臻 御行
3,＊小徑文化＊日本進口ROUND TOP space craft - diamond (SC-...,private,小徑文化日本進口,小徑 文化 日本 進口
4,Hello Kitty 凱蒂貓 KITTY 涼鞋 童鞋 白/紅色 小童 no739,private,凱蒂貓涼鞋童鞋白紅色小童,凱蒂貓 涼鞋 童鞋 白紅色 小童


In [5]:
en_clean = pd.read_csv('./preprocessed/cleaned_dev_en.csv')
en_clean.head()

Unnamed: 0,translation_output
0,oppo phone case soft rabbit silicone case
1,soft coating car wax strong water watt
2,low sugar mango dry be the royal
3,the culture japan imported round top space cra...
4,hello kitty sandals shoes white red children


Preprocessing (change from unicode to ascii & add \<start\> and \<end\> tokens

In [6]:
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')

In [7]:
def preprocess(w):
    w = unicode_to_ascii(w.lower().strip())

    # adding a start and an end token to the sentence
    # so that the model know when to start and stop predicting.
    w = '<start> ' + w + ' <end>'
    return w

In [8]:
# u means unicode encoder
en_sentence = en_clean['translation_output'][0]
chn_sentence = tcn_clean['product_title_processed'][0]
print(preprocess(en_sentence))
print(preprocess(chn_sentence))
print(preprocess(chn_sentence).encode('utf-8'))

<start> oppo phone case soft rabbit silicone case <end>
<start> 手机 壳 软壳 挂绳 壳 大眼 兔 硅胶 壳 <end>
b'<start> \xe6\x89\x8b\xe6\x9c\xba \xe5\xa3\xb3 \xe8\xbd\xaf\xe5\xa3\xb3 \xe6\x8c\x82\xe7\xbb\xb3 \xe5\xa3\xb3 \xe5\xa4\xa7\xe7\x9c\xbc \xe5\x85\x94 \xe7\xa1\x85\xe8\x83\xb6 \xe5\xa3\xb3 <end>'


Prepare Embedding Layer

In [9]:
def create_dataset(target, source):
    assert len(source) == len(target)
    word_pairs = [[preprocess(str(target[i])), preprocess(str(source[i]))]
                  for i in range(len(source))]

    # return two tuple: one tuple includes all English sentenses, and 
    # another tuple includes all Chinese sentenses
    return word_pairs

word_pairs = create_dataset(en_clean['translation_output'], tcn_clean['product_title_processed'])
word_pairs[:20]

[['<start> oppo phone case soft rabbit silicone case <end>',
  '<start> 手机 壳 软壳 挂绳 壳 大眼 兔 硅胶 壳 <end>'],
 ['<start> soft coating car wax strong water watt <end>',
  '<start> 鍍膜車 蠟 強力 撥水型 <end>'],
 ['<start> low sugar mango dry be the royal <end>',
  '<start> 低糖 芒果乾 臻 御行 <end>'],
 ['<start> the culture japan imported round top space craft diamond sc mk <end>',
  '<start> 小徑 文化 日本 進口 <end>'],
 ['<start> hello kitty sandals shoes white red children <end>',
  '<start> 凱蒂貓 涼鞋 童鞋 白紅色 小童 <end>'],
 ['<start> thunderbird lt japanese wood grain table mat x cm pc <end>',
  '<start> 雷鳥 日式 木紋 桌墊片 <end>'],
 ['<start> the sand kid dvd vol <end>', '<start> 狂砂 小子 <end>'],
 ['<start> anacomda python titan series ssd drive <end>',
  '<start> 巨蟒 泰坦 系列 固態 硬碟 <end>'],
 ['<start> ifairies shoulder messenger bag shoulder bag <end>',
  '<start> 單肩 斜 背包 側 背包 肩 背包 <end>'],
 ['<start> the farm mountain bitter gourd tea pack box <end>',
  '<start> 大雪山 農場 山 苦瓜 茶 包盒 <end>'],
 ['<start> colgate dual clean toothbrush <

In [10]:
en, chn = zip(*create_dataset(en_clean['translation_output'], tcn_clean['product_title_processed']))
print(en[-1])
print(chn[-1])
# show the size of the dataset
assert len(en) == len(chn)
print("Size:", len(en))

<start> polarstar women sweat quick dry t shirt black <end>
<start> 女排 汗 快干 恤 黑 <end>
Size: 1000


In [11]:
def tokenize(lang):
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(
        filters='')
    # generate a dictionary, e.g. word -> index(of the dictionary)
    lang_tokenizer.fit_on_texts(lang)

    # output the vector sequences, e.g. [1, 7, 237, 3, 2]
    tensor = lang_tokenizer.texts_to_sequences(lang)

    # padding sentences to the same length
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,
                                                           padding='post')
    return tensor, lang_tokenizer

In [12]:
def load_dataset(target, source):
    # creating cleaned input, output pairs
    # regard Chinese as source sentence, regard English as target sentence
    targ_lang, inp_lang = zip(*create_dataset(target, source))

    input_tensor, inp_lang_tokenizer = tokenize(inp_lang)
    target_tensor, targ_lang_tokenizer = tokenize(targ_lang)

    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

In [13]:
def max_length(tensor):
    # padding the sentence to max_length
    return max(len(t) for t in tensor)

In [14]:
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(en_clean['translation_output'], 
                                                                tcn_clean['product_title_processed'])

# Calculate max_length of the target tensors
max_length_targ, max_length_inp = max_length(
    target_tensor), max_length(input_tensor)

# Creating training and validation sets using an 95-5 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
    input_tensor, target_tensor, test_size=0.05)

# Show length of the training data and validation data
print("# training data: {:d}\n# test data: {:d}".format(len(input_tensor_train), len(input_tensor_val)))

# training data: 950
# test data: 50


In [15]:
def convert(lang, tensor):
    for t in tensor:
        if t != 0:
            print("%d ----> %s" % (t, lang.index_word[t]))

print("Input Language; index to word mapping")
convert(inp_lang, input_tensor_train[0])
print()
print("Target Language; index to word mapping")
convert(targ_lang, target_tensor_train[0])

Input Language; index to word mapping
1 ----> <start>
277 ----> 凱蒂貓
112 ----> 涼鞋
152 ----> 童鞋
849 ----> 白紅色
278 ----> 小童
2 ----> <end>

Target Language; index to word mapping
1 ----> <start>
181 ----> hello
150 ----> kitty
182 ----> sandals
3 ----> shoes
13 ----> white
76 ----> red
207 ----> children
2 ----> <end>


Initialise Parameters 

In [16]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 128
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 256
units = 1024
# 0 is a reserved index that won't be assigned to any word, so the size of vocabulary should add 1
vocab_inp_size = len(inp_lang.word_index) + 1
vocab_tar_size = len(targ_lang.word_index) + 1

dataset = tf.data.Dataset.from_tensor_slices(
    (input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

2025-03-11 17:00:34.020954: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


(TensorShape([128, 18]), TensorShape([128, 17]))

In [17]:
print('vocab_inp_size =', vocab_inp_size)
print('vocab_tar_size =', vocab_tar_size)

vocab_inp_size = 3224
vocab_tar_size = 2494


Encoder class construction

In [18]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        # vacab_size=vocab_inp_size=3224, embedding_dim=256 enc_units=1024 batch_sz=128
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_activation='sigmoid',
                                       recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        # x is the training data with shape == (batch_size, max_length)  -> (128, 18)
        # which means there are batch_size sentences in one batch, the length of each sentence is max_length
        # hidden state shape == (batch_size, units) -> (128, 1024)
        # after embedding, x shape == (batch_size, max_length, embedding_dim) -> (128, 18, 256)
        x = self.embedding(x)
        
        # output contains the state(in GRU, the hidden state and the output are same) from all timestamps,
        # output shape == (batch_size, max_length, units) -> (128, 18, 1024)
        # state is the hidden state of the last timestamp, shape == (batch_size, units) -> (128, 1024)
        output, state = self.gru(x, initial_state=hidden)
        
        return output, state

    def initialize_hidden_state(self):
        # initialize the first state of the gru,  shape == (batch_size, units) -> (128, 1024)
        return tf.zeros((self.batch_sz, self.enc_units))

In [19]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

# the output and the hidden state of GRU is equal
print(sample_output[-1, -1, :] == sample_hidden[-1, :])

Encoder output shape: (batch size, sequence length, units) (128, 18, 1024)
Encoder Hidden state shape: (batch size, units) (128, 1024)
tf.Tensor([ True  True  True ...  True  True  True], shape=(1024,), dtype=bool)


Define Attention layer

In [20]:
class BahdanauAttention(tf.keras.Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        # query shape == (batch_size, hidden size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden size)
        # we are doing this to perform addition to calculate the score
        hidden_with_time_axis = tf.expand_dims(query, 1)

        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        # the shape of the tensor before applying self.V is (batch_size, max_length, units)
        score = self.V(tf.nn.tanh(self.W1(values) + self.W2(hidden_with_time_axis)))

        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape == (batch_size, max_length, hidden_size)
        context_vector = attention_weights * values
        
        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights


In [21]:
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (128, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (128, 18, 1)


Decoder class construction

In [22]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        # vocab_size=vocab_tar_size=2494, embedding_dim=256, dec_units=1024, batch_sz=128
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        # the dimension of the output is the vocab size, through the softmax function,
        # this layer will return the probability of each word in the dictory
        self.fc = tf.keras.layers.Dense(vocab_size)

        # used for attention
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        # This function outputs a result at each timestamp
        # The hidden state of fisrt timestamp in the decoder is 
        # the hidden state of last timestamp in the encoder
        context_vector, attention_weights = self.attention(hidden, enc_output)
        
        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # concatenate the input x and the context_vector, as the input of the GRU
        # context_vector shape == (batch_size, units) -> (128, 1024)
        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size) -> (128, 1, 1024 + 256)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # passing the concatenated vector to the GRU
        # get the output and state of the current timestamp
        # output shape == (batch_size, 1, units) -> (128, 1, 1024) 
        # state shape == (batch_size, units) -> (128, 1024)
        output, state = self.gru(x)

        # output shape == (batch_size, hidden_size) -> (128, 1024)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab) -> (128, 2494)
        x = self.fc(output)

        return x, state, attention_weights

In [23]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)
sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)), sample_hidden, sample_output)
print('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (128, 2494)


Define optimizer and loss function

In [24]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    """Calculate the loss value
    Args:
        real: the true label  shape == (batch_size,) -> (128,)
        pred: the probability of each word from the vocabulary, is the output from the decoder 
                 shape == (batch_size, vocab_size) -> (128, 6082)

    Returns: 
        the average loss of the data in a batch size
    """
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

Checkpoints saving

In [25]:
checkpoint_dir = './checkpoints/tcn-eng'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

Define Training

In [26]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)

        dec_hidden = enc_hidden
        
        # feed the <start> as the first input of the decoder
        # dec input shape == (batch_size, 1) -> (128, 1)
        dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

        # Teacher forcing - feeding the target as the next input
        # because of the data preprocessing(add a start token to the sentence)
        # the first word is <start>, so t starts from 1(not 0)
        for t in range(1, targ.shape[1]):
            # passing enc_output to the decoder
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

            # targ[:, t] is the true label(index of the word) of every sentence(in a batch) 
            # at the current timestamp
            # like [  85   18   25   25  ···  1047   79   13], shape == (batch_size,) -> (128,)
            # predictions shape == (batch_size, vocab_size) -> (128, 6082)
            loss += loss_function(targ[:, t], predictions)

            # using teacher forcing
            dec_input = tf.expand_dims(targ[:, t], 1)

    batch_loss = (loss / int(targ.shape[1]))

    # collect all trainable variables
    variables = encoder.trainable_variables + decoder.trainable_variables

    # calculate the gradients for the whole variables
    gradients = tape.gradient(loss, variables)

    # apply the gradients on the variables
    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [27]:
EPOCHS = 100

for epoch in range(EPOCHS):
    start = time.time()

    # get the initial hidden state of gru
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                         batch,
                                                         batch_loss.numpy()))

    # saving (checkpoint) the model every 5 epochs
    if (epoch + 1) % 5 == 0:
        checkpoint.save(file_prefix=checkpoint_prefix)
        
    print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

2025-03-11 17:01:10.440873: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1 Batch 0 Loss 3.9756
Epoch 1 Loss 3.8731
Time taken for 1 epoch 27.74392604827881 sec

Epoch 2 Batch 0 Loss 3.8295
Epoch 2 Loss 3.7110
Time taken for 1 epoch 14.016651630401611 sec

Epoch 3 Batch 0 Loss 3.4079
Epoch 3 Loss 3.4706
Time taken for 1 epoch 14.622299194335938 sec

Epoch 4 Batch 0 Loss 3.4956
Epoch 4 Loss 3.3956
Time taken for 1 epoch 15.022383689880371 sec

Epoch 5 Batch 0 Loss 3.2452
Epoch 5 Loss 3.3492
Time taken for 1 epoch 14.500324010848999 sec

Epoch 6 Batch 0 Loss 3.3148
Epoch 6 Loss 3.3049
Time taken for 1 epoch 16.9169340133667 sec

Epoch 7 Batch 0 Loss 3.1949
Epoch 7 Loss 3.2631
Time taken for 1 epoch 18.21966290473938 sec

Epoch 8 Batch 0 Loss 3.1386
Epoch 8 Loss 3.2259
Time taken for 1 epoch 15.763952016830444 sec

Epoch 9 Batch 0 Loss 3.2027
Epoch 9 Loss 3.1731
Time taken for 1 epoch 17.889650106430054 sec

Epoch 10 Batch 0 Loss 3.2285
Epoch 10 Loss 3.1325
Time taken for 1 epoch 14.284749031066895 sec

Epoch 11 Batch 0 Loss 3.0848
Epoch 11 Loss 3.0834
Ti

Evaluation (Translation)

In [28]:
def evaluate(sentence):
    """Translate a sentence
    Args:
        sentence: the test sentence        
    """
    
    # max_length_targ 38, max_length_inp 64
    attention_plot = np.zeros((max_length_targ, max_length_inp))

    sentence = preprocess(sentence)

    # convert each word to the index in the test sentence
    inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                           maxlen=max_length_inp,
                                                           padding='post')

    inputs = tf.convert_to_tensor(inputs)

    result = ''

    # hidden shape == (1, 1024)
    hidden = [tf.zeros((1, units))]
    
    # enc out shape == (1, max_length_inp, 1024) -> (1, 18, 1024)
    # enc hidden shape == (1, 1024)
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)
        
        # storing the attention weigths to plot later on
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()

        # get the index which has the highest probability
        predicted_id = tf.argmax(predictions[0]).numpy()
        #  convert the index to the word
        result += targ_lang.index_word[predicted_id] + ' '

        # when the decoder predicts the end, stop prediction
        if targ_lang.index_word[predicted_id] == '<end>':
            return result, sentence, attention_plot

        # the predicted id is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence, attention_plot


def translate(sentence):
    result, sentence, attention_plot= evaluate(sentence)
    return result.rpartition('<end>')[0].strip()


Restore the latest checkpoint and translate test set

In [29]:
checkpoint_dir = './checkpoints/tcn-eng'
print(tf.train.latest_checkpoint(checkpoint_dir))

# restoring the latest checkpoint in checkpoint_dir
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

./checkpoints/tcn-eng/ckpt-20


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fa74279fb38>

In [30]:
test_set = pd.read_csv('./preprocessed/cleaned_test_tcn.csv').drop('split', axis=1)

In [31]:
test_set

Unnamed: 0,text,product_title_clean,product_title_processed
0,【PolarStar】美麗諾羊毛保暖襪『淺灰』P18634,美麗諾羊毛保暖襪淺灰,美麗諾 羊毛 保暖 襪淺 灰
1,甜蜜水晶~天然水晶五行珠手鍊10mm手鍊,甜蜜水晶天然水晶五行珠手鍊手鍊,甜蜜 水晶 天然 水晶 五行 珠手 鍊 手 鍊
2,粉晶六角柱純銀項鍊,粉晶六角柱純銀項鍊,粉晶 六角 柱純 銀項 鍊
3,3M SCOTCH VHB 超強力雙面膠-戶外專用 V1808,超強力雙面膠戶外專用,超強力 雙面 膠戶 外 專用
4,燈專屬優惠 *4盒,燈專屬優惠盒,燈 專屬 優惠盒
5,特價促銷款 NIKE 耐吉 WMNS NIKE INTERNATIONALIST JCRD ...,特價促銷款耐吉運動休閒鞋女,特價 促銷款 耐吉運動 休閒鞋 女
6,Jo Malone 禮盒(附上緞帶),禮盒附上緞帶,禮盒 附上 緞帶
7,【TORO】自然風2.7尺鞋櫃 (HX-926(胡桃) HX927雪白,自然風尺鞋櫃胡桃雪白,自然 風尺 鞋櫃 胡桃 雪白
8,G.P 涼鞋 黑紅 男款 no686,涼鞋黑紅男款,涼鞋 黑紅 男款
9,【紐西蘭 Olive】橄欖嬰兒天然按摩油 100ml,紐西蘭橄欖嬰兒天然按摩油,紐西蘭 橄欖 嬰兒 天然 按摩 油


In [32]:
test_tr_sents_all = []
for i in range(len(test_set['product_title_processed'])):
    try: test_tr_sents_all.append(translate(test_set['product_title_processed'][i]))
    except: test_tr_sents_all.append(np.nan)
        

In [33]:
test_tr_sents_all

[nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 'temperament fresh practical hair band hair accessories hair hoop',
 nan,
 nan,
 nan,
 nan,
 'apollo safe iron cabinet gold treasury library wealth lucky essential original warranty',
 nan,
 'streamlight survivor led flashlight',
 nan,
 nan,
 nan,
 nan,
 'charcoal pit drawing artist yamamoto the guard dvd',
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 'alfastar bluetooth sports headset',
 nan,
 nan,
 nan,
 'temperament fresh practical hair band hair accessories hair hoop',
 nan,
 nan,
 nan,
 'desk bay single feet double sided cloth breathable mattress magic baby',
 nan,
 nan,
 nan,
 'nekopara vol mobile phone shell sony x xa ultra xzs xz premium',
 'caco splicing petticoat university t female',
 nan,
 nan,
 nan,
 'htc phone case samsung case',
 'kimo outdoor casual shoes black women',
 'liyo v collar lace penetration',
 nan,
 nan,
 'phyto hair flower plant shampoo paris d',
 nan,


In [34]:
test_set['prediction'] = test_tr_sents_all

In [35]:
test_set

Unnamed: 0,text,product_title_clean,product_title_processed,prediction
0,【PolarStar】美麗諾羊毛保暖襪『淺灰』P18634,美麗諾羊毛保暖襪淺灰,美麗諾 羊毛 保暖 襪淺 灰,
1,甜蜜水晶~天然水晶五行珠手鍊10mm手鍊,甜蜜水晶天然水晶五行珠手鍊手鍊,甜蜜 水晶 天然 水晶 五行 珠手 鍊 手 鍊,
2,粉晶六角柱純銀項鍊,粉晶六角柱純銀項鍊,粉晶 六角 柱純 銀項 鍊,
3,3M SCOTCH VHB 超強力雙面膠-戶外專用 V1808,超強力雙面膠戶外專用,超強力 雙面 膠戶 外 專用,
4,燈專屬優惠 *4盒,燈專屬優惠盒,燈 專屬 優惠盒,
5,特價促銷款 NIKE 耐吉 WMNS NIKE INTERNATIONALIST JCRD ...,特價促銷款耐吉運動休閒鞋女,特價 促銷款 耐吉運動 休閒鞋 女,
6,Jo Malone 禮盒(附上緞帶),禮盒附上緞帶,禮盒 附上 緞帶,
7,【TORO】自然風2.7尺鞋櫃 (HX-926(胡桃) HX927雪白,自然風尺鞋櫃胡桃雪白,自然 風尺 鞋櫃 胡桃 雪白,
8,G.P 涼鞋 黑紅 男款 no686,涼鞋黑紅男款,涼鞋 黑紅 男款,
9,【紐西蘭 Olive】橄欖嬰兒天然按摩油 100ml,紐西蘭橄欖嬰兒天然按摩油,紐西蘭 橄欖 嬰兒 天然 按摩 油,


In [36]:
test_set.isna().sum()

text                          0
product_title_clean          72
product_title_processed      72
prediction                 8606
dtype: int64

In [37]:
test_set.dropna(inplace=True)
test_set

Unnamed: 0,text,product_title_clean,product_title_processed,prediction
14,時尚氣質清新實用髮帶 髮飾 髮箍94,時尚氣質清新實用髮帶髮飾髮箍,時尚 氣質 清新 實用 髮 帶 髮 飾 髮 箍,temperament fresh practical hair band hair acc...
19,阿波羅保險箱 保險櫃 鐵櫃 金庫 財庫 財神 招財必備 公司原廠保固 600ALD,阿波羅保險箱保險櫃鐵櫃金庫財庫財神招財必備公司原廠保固,阿波羅 保險箱 保險 櫃鐵櫃 金庫 財庫 財神 招財 必備 公司 原廠 保固,apollo safe iron cabinet gold treasury library...
21,國際牌 CR-1632/1B,國際牌,國際牌,streamlight survivor led flashlight
26,Abercrombie & Fitch A&F 經典logo刺繡連帽外套,經典刺繡連帽外套,經典 刺 繡 連帽 外套,charcoal pit drawing artist yamamoto the guard...
38,GOR ASUS ZenFone 3 2 Deluxe Ultra Zoon Max Las...,玻璃保護貼,玻璃 保護 貼,alfastar bluetooth sports headset
42,時尚氣質清新實用髮帶 髮飾 髮箍51,時尚氣質清新實用髮帶髮飾髮箍,時尚 氣質 清新 實用 髮 帶 髮 飾 髮 箍,temperament fresh practical hair band hair acc...
46,台灣製雙人5x6尺透氣床墊 魔法baby~u3104,台灣製雙人尺透氣床墊魔法,台 灣 製 雙 人 尺 透 氣 床 墊 魔法,desk bay single feet double sided cloth breath...
50,NIKE AIR MAX 98 TRIPLE WHITE 皮革 網布 640744-106,皮革網布,皮革 網布,nekopara vol mobile phone shell sony x xa ultr...
51,美津浓 Mizuno MAXIMIZER 21 慢跑鞋 黑色 男鞋 K1GA190209 n...,美津浓慢跑鞋黑色男鞋,美津浓 慢跑鞋 黑色 男鞋,caco splicing petticoat university t female
55,創意 iPhone8 iPhone7 Plus i7 i8 手機殼 掛繩 全包防摔 PH01214,創意手機殼掛繩全包防摔,創意手 機殼 掛繩 全包防 摔,htc phone case samsung case


In [38]:
test_set.to_csv('./output/1_NMT_test_output1.csv', index=False, encoding='utf_8_sig')

Translate DEV set for BLEU Score Evaluation

In [39]:
tcn_clean

Unnamed: 0,text,split,product_title_clean,product_title_processed
0,OPPO A75 A75s A73 手机壳 软壳 挂绳壳 大眼兔硅胶壳,private,手机壳软壳挂绳壳大眼兔硅胶壳,手机 壳 软壳 挂绳 壳 大眼 兔 硅胶 壳
1,SOFT 99 鍍膜車蠟(強力撥水型),private,鍍膜車蠟強力撥水型,鍍膜車 蠟 強力 撥水型
2,低糖芒果乾 250g 臻御行,private,低糖芒果乾臻御行,低糖 芒果乾 臻 御行
3,＊小徑文化＊日本進口ROUND TOP space craft - diamond (SC-...,private,小徑文化日本進口,小徑 文化 日本 進口
4,Hello Kitty 凱蒂貓 KITTY 涼鞋 童鞋 白/紅色 小童 no739,private,凱蒂貓涼鞋童鞋白紅色小童,凱蒂貓 涼鞋 童鞋 白紅色 小童
5,雷鳥 LT-234 日式木紋桌墊 45 x 60 cm / 片,private,雷鳥日式木紋桌墊片,雷鳥 日式 木紋 桌墊片
6,狂砂小子DVD VOL-08,private,狂砂小子,狂砂 小子
7,ANACOMDA 巨蟒 T1 泰坦系列 120GB SSD固態硬碟,private,巨蟒泰坦系列固態硬碟,巨蟒 泰坦 系列 固態 硬碟
8,iFairies 單肩斜背包側背包肩背包【49146】,private,單肩斜背包側背包肩背包,單肩 斜 背包 側 背包 肩 背包
9,大雪山農場 山苦瓜茶20包/盒,private,大雪山農場山苦瓜茶包盒,大雪山 農場 山 苦瓜 茶 包盒


In [40]:
dev_tr_sents_all = []
for i in range(len(tcn_clean['product_title_processed'])):
    try: dev_tr_sents_all.append(translate(tcn_clean['product_title_processed'][i]))
    except: dev_tr_sents_all.append(np.nan)
        

In [41]:
tcn_clean['prediction']= dev_tr_sents_all

In [42]:
tcn_clean.drop('split', axis=1, inplace = True)
tcn_clean

Unnamed: 0,text,product_title_clean,product_title_processed,prediction
0,OPPO A75 A75s A73 手机壳 软壳 挂绳壳 大眼兔硅胶壳,手机壳软壳挂绳壳大眼兔硅胶壳,手机 壳 软壳 挂绳 壳 大眼 兔 硅胶 壳,oppo phone case soft rabbit silicone case
1,SOFT 99 鍍膜車蠟(強力撥水型),鍍膜車蠟強力撥水型,鍍膜車 蠟 強力 撥水型,soft coating car wax strong water watt
2,低糖芒果乾 250g 臻御行,低糖芒果乾臻御行,低糖 芒果乾 臻 御行,low sugar mango dry be the royal
3,＊小徑文化＊日本進口ROUND TOP space craft - diamond (SC-...,小徑文化日本進口,小徑 文化 日本 進口,the culture japan imported round top space cra...
4,Hello Kitty 凱蒂貓 KITTY 涼鞋 童鞋 白/紅色 小童 no739,凱蒂貓涼鞋童鞋白紅色小童,凱蒂貓 涼鞋 童鞋 白紅色 小童,hello kitty sandals shoes white red children
5,雷鳥 LT-234 日式木紋桌墊 45 x 60 cm / 片,雷鳥日式木紋桌墊片,雷鳥 日式 木紋 桌墊片,thunderbird lt japanese wood grain table mat x...
6,狂砂小子DVD VOL-08,狂砂小子,狂砂 小子,the sand kid dvd vol
7,ANACOMDA 巨蟒 T1 泰坦系列 120GB SSD固態硬碟,巨蟒泰坦系列固態硬碟,巨蟒 泰坦 系列 固態 硬碟,anacomda python titan series ssd drive
8,iFairies 單肩斜背包側背包肩背包【49146】,單肩斜背包側背包肩背包,單肩 斜 背包 側 背包 肩 背包,ifairies shoulder messenger bag shoulder bag
9,大雪山農場 山苦瓜茶20包/盒,大雪山農場山苦瓜茶包盒,大雪山 農場 山 苦瓜 茶 包盒,the farm mountain bitter gourd tea pack box


In [43]:
tcn_clean['prediction'] = tcn_clean['prediction'].replace(r'^\s*$', np.NaN, regex=True) #removes empty strings or strings containing only spaces with NaN

In [44]:
tcn_clean.isna().sum()

text                        0
product_title_clean         6
product_title_processed     6
prediction                 10
dtype: int64

In [45]:
tcn_clean.loc[pd.isna(tcn_clean).any(1), :].index #check nan after translated rows indexes

Int64Index([260, 396, 505, 568, 593, 597, 626, 654, 820, 904], dtype='int64')

In [46]:
tr_nan_indexes = tcn_clean.loc[pd.isna(tcn_clean).any(1), :].index #stores nan indexes for later dev_en follow to drop nan rows as well
tr_nan_indexes

Int64Index([260, 396, 505, 568, 593, 597, 626, 654, 820, 904], dtype='int64')

In [47]:
tcn_clean = tcn_clean.dropna()
tcn_clean.isna().sum()

text                       0
product_title_clean        0
product_title_processed    0
prediction                 0
dtype: int64

In [48]:
tcn_clean.to_csv('./output/1_NMT_dev_output1.csv',index = False, encoding='utf_8_sig')

In [49]:
dev_tr_sents_all_df = pd.read_csv('./output/1_NMT_dev_output1.csv')

In [50]:
dev_tr_sents_all_df

Unnamed: 0,text,product_title_clean,product_title_processed,prediction
0,OPPO A75 A75s A73 手机壳 软壳 挂绳壳 大眼兔硅胶壳,手机壳软壳挂绳壳大眼兔硅胶壳,手机 壳 软壳 挂绳 壳 大眼 兔 硅胶 壳,oppo phone case soft rabbit silicone case
1,SOFT 99 鍍膜車蠟(強力撥水型),鍍膜車蠟強力撥水型,鍍膜車 蠟 強力 撥水型,soft coating car wax strong water watt
2,低糖芒果乾 250g 臻御行,低糖芒果乾臻御行,低糖 芒果乾 臻 御行,low sugar mango dry be the royal
3,＊小徑文化＊日本進口ROUND TOP space craft - diamond (SC-...,小徑文化日本進口,小徑 文化 日本 進口,the culture japan imported round top space cra...
4,Hello Kitty 凱蒂貓 KITTY 涼鞋 童鞋 白/紅色 小童 no739,凱蒂貓涼鞋童鞋白紅色小童,凱蒂貓 涼鞋 童鞋 白紅色 小童,hello kitty sandals shoes white red children
5,雷鳥 LT-234 日式木紋桌墊 45 x 60 cm / 片,雷鳥日式木紋桌墊片,雷鳥 日式 木紋 桌墊片,thunderbird lt japanese wood grain table mat x...
6,狂砂小子DVD VOL-08,狂砂小子,狂砂 小子,the sand kid dvd vol
7,ANACOMDA 巨蟒 T1 泰坦系列 120GB SSD固態硬碟,巨蟒泰坦系列固態硬碟,巨蟒 泰坦 系列 固態 硬碟,anacomda python titan series ssd drive
8,iFairies 單肩斜背包側背包肩背包【49146】,單肩斜背包側背包肩背包,單肩 斜 背包 側 背包 肩 背包,ifairies shoulder messenger bag shoulder bag
9,大雪山農場 山苦瓜茶20包/盒,大雪山農場山苦瓜茶包盒,大雪山 農場 山 苦瓜 茶 包盒,the farm mountain bitter gourd tea pack box


In [51]:
# !pip install sacrebleu

In [52]:
import sacrebleu

In [53]:
dev_en_sents = en_clean.drop(tr_nan_indexes) # dropping same set of indexes as dev_tr for bleu scorer to match

In [54]:
refs = [dev_en_sents['translation_output']]
sys = dev_tr_sents_all_df['prediction']
bleu = sacrebleu.corpus_bleu(sys, refs, lowercase=True)
print(bleu.precisions)

[95.45272167981811, 95.28287343918606, 95.27014735310169, 95.30245956126745]
