In [2]:
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import unicodedata
import re
import numpy as np
import os
import time
import pandas as pd
from sklearn.model_selection import train_test_split

from pylab import *
from matplotlib.font_manager import FontProperties
print('tf.__version__.is', tf.__version__)
print('tf.keras.__version__ is:', tf.keras.__version__)
print(tf.config.experimental.list_physical_devices('GPU'))

tf.__version__.is 2.5.0
tf.keras.__version__ is: 2.5.0
[]


Dataset loading

In [3]:
tcn_clean = pd.read_csv('./preprocessed/cleaned_dev_tcn.csv')
tcn_clean

Unnamed: 0,text,split,product_title_clean,product_title_processed
0,OPPO A75 A75s A73 手机壳 软壳 挂绳壳 大眼兔硅胶壳,private,手机壳软壳挂绳壳大眼兔硅胶壳,手机 壳 软壳 挂绳 壳 大眼 兔 硅胶 壳
1,SOFT 99 鍍膜車蠟(強力撥水型),private,鍍膜車蠟強力撥水型,鍍膜車 蠟 強力 撥水型
2,低糖芒果乾 250g 臻御行,private,低糖芒果乾臻御行,低糖 芒果乾 臻 御行
3,＊小徑文化＊日本進口ROUND TOP space craft - diamond (SC-...,private,小徑文化日本進口,小徑 文化 日本 進口
4,Hello Kitty 凱蒂貓 KITTY 涼鞋 童鞋 白/紅色 小童 no739,private,凱蒂貓涼鞋童鞋白紅色小童,凱蒂貓 涼鞋 童鞋 白紅色 小童
...,...,...,...,...
995,【HippoRed】撕破乐趣★独特风格★中直筒牛仔裤 O591_445,private,撕破乐趣独特风格中直筒牛仔裤,撕破 乐趣 独特 风格 中直 筒 牛仔裤
996,兒童套裝 台灣製薄長袖居家套裝 魔法Baby~k60092,private,兒童套裝台灣製薄長袖居家套裝魔法,兒童 套 裝台 灣 製 薄 長 袖 居家 套裝 魔法
997,LONGCHAMP Le Pliage Neo高密尼龍後背包(中型),private,高密尼龍後背包中型,高密 尼龍 後 背包 中型
998,iFairies 開口可調節戒指★ifairies【56472】【56472】,private,開口可調節戒指,開口 可 調節 戒指


In [4]:
en_clean = pd.read_csv('./preprocessed/cleaned_dev_en.csv')
en_clean

Unnamed: 0,translation_output
0,oppo phone case soft rabbit silicone case
1,soft coating car wax strong water watt
2,low sugar mango dry be the royal
3,the culture japan imported round top space cra...
4,hello kitty sandals shoes white red children
...,...
995,hippored torn fun unique style straight jeans
996,kids set table bay thin long sleeve home suit ...
997,longchamp le pliage neo high density nylon bac...
998,ifairies opening adjustable ring ifairies


Preprocessing (change from unicode to ascii & add \<start\> and \<end\> tokens

In [5]:
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')

In [6]:
def preprocess(w):
    w = unicode_to_ascii(w.lower().strip())

    # adding a start and an end token to the sentence
    # so that the model know when to start and stop predicting.
    # w = '<start> ' + w + ' <end>'
    return w

In [7]:
# u means unicode encoder
en_sentence = en_clean['translation_output'][0]
chn_sentence = tcn_clean['product_title_processed'][0]
print(preprocess(en_sentence))
print(preprocess(chn_sentence))
print(preprocess(chn_sentence).encode('utf-8'))

oppo phone case soft rabbit silicone case
手机 壳 软壳 挂绳 壳 大眼 兔 硅胶 壳
b'\xe6\x89\x8b\xe6\x9c\xba \xe5\xa3\xb3 \xe8\xbd\xaf\xe5\xa3\xb3 \xe6\x8c\x82\xe7\xbb\xb3 \xe5\xa3\xb3 \xe5\xa4\xa7\xe7\x9c\xbc \xe5\x85\x94 \xe7\xa1\x85\xe8\x83\xb6 \xe5\xa3\xb3'


Prepare Embedding Layer

In [8]:
def create_dataset(target, source):
    assert len(source) == len(target)
    word_pairs = [[preprocess(str(target[i])), preprocess(str(source[i]))]
                  for i in range(len(source))]

    # return two tuple: one tuple includes all English sentenses, and 
    # another tuple includes all Chinese sentenses
    return word_pairs

word_pairs = create_dataset(en_clean['translation_output'], tcn_clean['product_title_processed'])
word_pairs[:20]

[['oppo phone case soft rabbit silicone case', '手机 壳 软壳 挂绳 壳 大眼 兔 硅胶 壳'],
 ['soft coating car wax strong water watt', '鍍膜車 蠟 強力 撥水型'],
 ['low sugar mango dry be the royal', '低糖 芒果乾 臻 御行'],
 ['the culture japan imported round top space craft diamond sc mk',
  '小徑 文化 日本 進口'],
 ['hello kitty sandals shoes white red children', '凱蒂貓 涼鞋 童鞋 白紅色 小童'],
 ['thunderbird lt japanese wood grain table mat x cm pc', '雷鳥 日式 木紋 桌墊片'],
 ['the sand kid dvd vol', '狂砂 小子'],
 ['anacomda python titan series ssd drive', '巨蟒 泰坦 系列 固態 硬碟'],
 ['ifairies shoulder messenger bag shoulder bag', '單肩 斜 背包 側 背包 肩 背包'],
 ['the farm mountain bitter gourd tea pack box', '大雪山 農場 山 苦瓜 茶 包盒'],
 ['colgate dual clean toothbrush', '高露潔 雙效潔淨 牙刷 單支'],
 ['apieu color lip pencil matt silk satin', '絲緞'],
 ['reese s away milan rose gold ball pen', '芮菲 客米蘭 玫瑰 金鋼 珠筆'],
 ['finding nemo glass magnet dolly _ the beast', '海底 總動員 玻璃 磁鐵多莉款 野獸國'],
 ['natural licorice oval pumpkin seeds', '甘草 瓜子 南瓜子'],
 ['papago hd driving recorder', '行車 記錄器'],

In [9]:
en, chn = zip(*create_dataset(en_clean['translation_output'], tcn_clean['product_title_processed']))
print(en[-1])
print(chn[-1])
# show the size of the dataset
assert len(en) == len(chn)
print("Size:", len(en))

polarstar women sweat quick dry t shirt black
女排 汗 快干 恤 黑
Size: 1000


download and import pretrained tokenizer

In [10]:
#from transformers import BertTokenizer
#from transformers import BertModel

#from tokenizers import BertWordPieceTokenizer
#from transformers import BertTokenizer

In [11]:
#disable warning
import logging
logging.basicConfig(level=logging.ERROR)

In [12]:
from transformers import TFAutoModelWithLMHead, TFXLMRobertaForMaskedLM, TFXLMRobertaForTokenClassification, AutoTokenizer

In [13]:
#test cell

lang_tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
type(lang_tokenizer)

transformers.tokenization_xlm_roberta.XLMRobertaTokenizer

In [14]:
#test cell

lang_tokenizer.special_tokens_map

{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>'}

In [15]:
#test cell

print(lang_tokenizer.convert_tokens_to_ids('<s>'))
print(lang_tokenizer.convert_tokens_to_ids('</s>'))
print(lang_tokenizer.convert_tokens_to_ids('<unk>'))
print(lang_tokenizer.convert_tokens_to_ids('<pad>'))
print(lang_tokenizer.convert_tokens_to_ids('<mask>'))

0
2
3
1
250001


In [16]:
MAX_LEN = 16

def tokenize(lang):
    lang_tokenizer =  AutoTokenizer.from_pretrained('xlm-roberta-base')   
    # generate a dictionary, e.g. word -> index(of the dictionary)
    

    tensor = lang_tokenizer.encode_plus(
            lang,                
            add_special_tokens = True,
            max_length = MAX_LEN,     
            pad_to_max_length = True,
            return_attention_mask = True,  
            return_tensors = 'pt',
            truncation=True)['input_ids']
    
#    print(type(tensor))
#    print(tensor)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,
                                                           padding='post')
    return tensor#, lang_tokenizer

In [17]:
def load_dataset(target, source):
    # creating cleaned input, output pairs
    # regard Chinese as source sentence, regard English as target sentence
    targ_lang, inp_lang = zip(*create_dataset(target, source))

    input_tensor = pd.Series(inp_lang).apply(lambda x: tokenize(x))
    target_tensor = pd.Series(targ_lang).apply(lambda x: tokenize(x))

    return input_tensor, target_tensor

In [19]:
def max_length(tensor):
    # padding the sentence to max_length
    return max(len(t) for t in tensor)

In [20]:
%%time

input_tensor, target_tensor = load_dataset(en_clean['translation_output'][:21],
                                           tcn_clean['product_title_processed'][:21])

print(len(input_tensor))
print(len(target_tensor))



21
21
CPU times: user 33.5 s, sys: 764 ms, total: 34.2 s
Wall time: 2min 51s


In [21]:
# Calculate max_length of the target tensors
max_length_targ, max_length_inp = max_length(
    target_tensor), max_length(input_tensor)

# Creating training and validation sets using an 95-5 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
    input_tensor, target_tensor, test_size=0.1)

# Show length of the training data and validation data
print("# training data: {:d}\n# test data: {:d}".format(len(input_tensor_train), len(input_tensor_val)))

# training data: 18
# test data: 3


In [22]:
#test cell

print(lang_tokenizer('銳'))
lang_tokenizer.decode([189250])

{'input_ids': [0, 6, 189250, 2], 'attention_mask': [1, 1, 1, 1]}


'銳'

In [23]:
def convert(lang, tensor):
    for t in pd.Series(tensor):
        if t != 0:
            print("%d ----> %s" % (t, lang.decode(t)))

print("Input Language; index to word mapping")
print(input_tensor_train[0][0])
convert(lang_tokenizer, input_tensor_train[3][0])

print("Target Language; index to word mapping")
print(target_tensor_train[0][0])
convert(lang_tokenizer, target_tensor_train[3][0])

Input Language; index to word mapping
[     0      6  14754      6 204286      6  60772 204286      6  60771
 210428      6 204286  54553   5003      2]
57960 ----> 小
80394 ----> 徑
6 ----> 
4278 ----> 文化
34891 ----> 日本
6 ----> 
113416 ----> 進口
2 ----> </s>
1 ----> <pad>
1 ----> <pad>
1 ----> <pad>
1 ----> <pad>
1 ----> <pad>
1 ----> <pad>
1 ----> <pad>
Target Language; index to word mapping
[     0   2343     31  24089   7225  32977 152131     18 127276   7225
      2      1      1      1      1      1]
70 ----> the
29394 ----> culture
77752 ----> japan
76242 ----> importe
71 ----> d
68807 ----> round
2663 ----> top
32628 ----> space
131346 ----> craft
879 ----> dia
15882 ----> mond
9023 ----> sc
6 ----> 
19201 ----> mk
2 ----> </s>


Initialise Parameters 

In [24]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 2
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 256
units = 1024
# 0 is a reserved index that won't be assigned to any word, so the size of vocabulary should add 1
vocab_size = len(lang_tokenizer) + 1
#vocab_tar_size = len(targ_lang) + 1

input_tensor_train_reshape = np.array(input_tensor_train.apply(lambda x: x[0]).tolist())
target_tensor_train_reshape = np.array(input_tensor_train.apply(lambda x: x[0]).tolist())


dataset = tf.data.Dataset.from_tensor_slices(
    (list(input_tensor_train_reshape), list(target_tensor_train_reshape))).shuffle(BUFFER_SIZE)

dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
print(dataset)

example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

<BatchDataset shapes: ((2, 16), (2, 16)), types: (tf.int32, tf.int32)>


(TensorShape([2, 16]), TensorShape([2, 16]))

In [25]:
print('vocab_size =', vocab_size)

vocab_size = 250003


Encoder class construction

In [26]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_activation='sigmoid',
                                       recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        x = self.embedding(x)
        
        output, state = self.gru(x, initial_state=hidden)
        
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

In [27]:
encoder = Encoder(vocab_size, embedding_dim, units, BATCH_SIZE)

# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

# the output and the hidden state of GRU is equal
print(sample_output[-1, -1, :] == sample_hidden[-1, :])

Encoder output shape: (batch size, sequence length, units) (2, 16, 1024)
Encoder Hidden state shape: (batch size, units) (2, 1024)
tf.Tensor([ True  True  True ...  True  True  True], shape=(1024,), dtype=bool)


Define Attention layer

In [28]:
class BahdanauAttention(tf.keras.Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        hidden_with_time_axis = tf.expand_dims(query, 1)

        score = self.V(tf.nn.tanh(self.W1(values) + self.W2(hidden_with_time_axis)))

        attention_weights = tf.nn.softmax(score, axis=1)

        context_vector = attention_weights * values
        
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights


In [29]:
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (2, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (2, 16, 1)


Decoder class construction

In [30]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)

        # used for attention
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        context_vector, attention_weights = self.attention(hidden, enc_output)
        
        x = self.embedding(x)

        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        output, state = self.gru(x)

        output = tf.reshape(output, (-1, output.shape[2]))

        x = self.fc(output)

        return x, state, attention_weights

In [31]:
decoder = Decoder(vocab_size, embedding_dim, units, BATCH_SIZE)
sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)), sample_hidden, sample_output)
print('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (2, 250003)


Define optimizer and loss function

In [32]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    """Calculate the loss value
    Args:
        real: the true label  shape == (batch_size,) -> (128,)
        pred: the probability of each word from the vocabulary, is the output from the decoder 
                 shape == (batch_size, vocab_size) -> (128, 6082)

    Returns: 
        the average loss of the data in a batch size
    """
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

Checkpoints saving

In [33]:
checkpoint_dir = './checkpoints/tcn-eng-bert'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

Define Training

In [34]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)

        dec_hidden = enc_hidden
        
        # feed the <start> as the first input of the decoder
        # dec input shape == (batch_size, 1) -> (128, 1)
        dec_input = tf.expand_dims([lang_tokenizer.convert_tokens_to_ids('<s>')] * BATCH_SIZE, 1)

        # Teacher forcing - feeding the target as the next input
        # because of the data preprocessing(add a start token to the sentence)
        # the first word is <start>, so t starts from 1(not 0)
        for t in range(1, targ.shape[1]):
            # passing enc_output to the decoder
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

            # targ[:, t] is the true label(index of the word) of every sentence(in a batch) 
            # at the current timestamp
            # like [  85   18   25   25  ···  1047   79   13], shape == (batch_size,) -> (128,)
            # predictions shape == (batch_size, vocab_size) -> (128, 6082)
            loss += loss_function(targ[:, t], predictions)

            # using teacher forcing
            dec_input = tf.expand_dims(targ[:, t], 1)

    batch_loss = (loss / int(targ.shape[1]))

    # collect all trainable variables
    variables = encoder.trainable_variables + decoder.trainable_variables

    # calculate the gradients for the whole variables
    gradients = tape.gradient(loss, variables)

    # apply the gradients on the variables
    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [None]:
EPOCHS = 1

for epoch in range(EPOCHS):
    start = time.time()

    # get the initial hidden state of gru
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                         batch,
                                                         batch_loss.numpy()))

    # saving (checkpoint) the model every 5 epochs
    if (epoch + 1) % 5 == 0:
        checkpoint.save(file_prefix=checkpoint_prefix)
        
    print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 11.6524


Evaluation (Translation)

In [64]:
def evaluate(sentence):
    """Translate a sentence
    Args:
        sentence: the test sentence        
    """
    
    # max_length_targ 38, max_length_inp 64
    attention_plot = np.zeros((max_length_targ, max_length_inp))

    sentence = preprocess(sentence)

    # convert each word to the index in the test sentence
    inputs = [lang_tokenizer.convert_tokens_to_ids(i) for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                           maxlen=max_length_inp,
                                                           padding='post')

    inputs = tf.convert_to_tensor(inputs)

    result = ''

    # hidden shape == (1, 1024)
    hidden = [tf.zeros((1, units))]
    
    # enc out shape == (1, max_length_inp, 1024) -> (1, 18, 1024)
    # enc hidden shape == (1, 1024)
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([lang_tokenizer.convert_tokens_to_ids('<s>')], 0)

    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)
        
        # storing the attention weigths to plot later on
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()

        # get the index which has the highest probability
        predicted_id = tf.argmax(predictions[0]).numpy()
        #  convert the index to the word
        print(predicted_id)
        print(type(predicted_id))
        result += lang_tokenizer.convert_ids_to_tokens(int(predicted_id)) + ' '

        # when the decoder predicts the end, stop prediction
        if lang_tokenizer.convert_ids_to_tokens(int(predicted_id)) == '</s>':
            return result, sentence, attention_plot

        # the predicted id is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence, attention_plot


def translate(sentence):
    result, sentence, attention_plot= evaluate(sentence)
    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))


Restore the latest checkpoint and test

In [48]:
checkpoint_dir = './checkpoints/tcn-eng-bert'
print(tf.train.latest_checkpoint(checkpoint_dir))

# restoring the latest checkpoint in checkpoint_dir
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

./checkpoints/tcn-eng/ckpt-1


ValueError: Shapes (250003, 256) and (2494, 256) are incompatible

In [38]:
test_clean = pd.read_csv('cleaned_test_tcn.csv')
test_clean

FileNotFoundError: [Errno 2] No such file or directory: 'cleaned_test_tcn.csv'

In [52]:
tcn_clean['product_title_processed'][0]

'手机 壳 软壳 挂绳 壳 大眼 兔 硅胶 壳'

In [65]:
translate(tcn_clean['product_title_processed'][0])

6
<class 'numpy.int64'>
Input: 手机 壳 软壳 挂绳 壳 大眼 兔 硅胶 壳
Predicted translation: ▁ 


In [72]:
translate(tcn_clean['product_title_processed'][40])

6
<class 'numpy.int64'>
Input: 溫馨 小 舖 英雄 鋼筆 專用 墨水 管入 包
Predicted translation: ▁ 
