# Implementation of translation  
- English -> Chinese

In [1]:
!cat /etc/system-release

CentOS Linux release 7.6.1810 (Core) 


In [2]:
!pwd

/root/shared/workspace/Dev/translate


In [3]:
!ls Dataset

_bak		       glove.6B.50d_gensim.txt	   sgns.baidubaike.bigram-char
enW2v_cleaned_weights  glove.6B.50d_gensim.txt.gz
enW2v_idx2Lang	       original


##### Hyper parameters

In [39]:
import re
import jieba
import pandas as pd
import tensorflow as tf
import time
import traceback
import numpy as np
from copy import deepcopy
from collections import Counter
import os
import urllib
import json
import requests
from gensim.models.keyedvectors import KeyedVectors
from sklearn.model_selection import train_test_split

In [23]:
# ---- dataset and pre-trained embedding locations ----
datasetPath = r'/root/shared/workspace/Dev/translate/Dataset'
enPath = os.path.join(datasetPath, r'original/en-zh/train.tags.en-zh.en')    # English side of the parallel corpus
chPath = os.path.join(datasetPath, r'original/en-zh/train.tags.en-zh.zh')    # Chinese side of the parallel corpus

enW2VPath = os.path.join(datasetPath, r'glove.6B.50d_gensim.txt.gz')
chW2VPath = os.path.join(datasetPath, r'sgns.baidubaike.bigram-char.bz2')

# ---- model parameters ----
maxSenLen = 20    # terms + one <tag>
hiddenUnitNum = 64    # ref: 1, 2 1000 -> 300

# Sizes used before switching to the pre-trained embeddings:
#   enVocabularySize = 10000 (51427 -> 10000), chVocabularySize = 4000 (88537 -> 4000), embeddingSize = 50

# Sizes matching the trained embeddings (vocabulary sizes exclude the 4 helper tags)
enVocabularySize, enEmbSize = 400000, 50
chVocabularySize, chEmbSize = 4000, 300

energySize = 10

# ---- training schedule ----
batchSize = 100
datasetSize = 200000    # not really the full size
stepsPerEpoch = datasetSize // batchSize

# Unused input-pipeline knobs kept for reference: bufferSize = 30000, prefetchSize = 1000, seed = 1

checkpointDir = r'/root/shared/workspace/Dev/translate/checkpoints'
# checkpointPath = os.path.join(checkpointDir, 'translator.ckpt')

In [6]:
import logging
from logging import getLogger, Formatter
from logging import handlers, StreamHandler

class Logger():
    '''
    Small factory that hands out named loggers with a console handler
    (and optionally a rotating file handler) attached.
    '''
    @classmethod
    def get_logger(cls, name = 'test', persist = False):
        '''
        Return the logger called `name`, configuring it the first time it is requested.

        name: logger name, also used as the log file name when `persist` is set
        persist: when True, additionally write to ./<name>.log (rotating, 1 MiB, 3 backups)

        A logger that already carries handlers is returned untouched, so repeated calls
        (e.g. re-running a notebook cell) do not stack duplicate handlers.
        '''
        # getLogger() always yields a real Logger. Looking the name up in
        # logging.root.manager.loggerDict instead may yield a logging.PlaceHolder
        # (present when only a child such as '<name>.x' exists), which has no .info() etc.
        lg = getLogger(name)    # ref: 23

        if not lg.handlers:
            lg.setLevel('DEBUG')

            fmt = Formatter('%(name)s %(asctime)s [%(levelname)s] - %(filename)s:%(lineno)s %(message)s')

            shandler = StreamHandler()
            shandler.setFormatter(fmt)
            lg.addHandler(shandler)

            if persist:
                fhandler = handlers.RotatingFileHandler('./' + name + '.log', mode = 'a', maxBytes = 1024 * 1024, backupCount = 3)
                fhandler.setFormatter(fmt)
                lg.addHandler(fhandler)

        return lg

##### Download data

In [None]:
# ! curl -o en-zh.tgz https://wit3.fbk.eu/archive/2015-01//texts/en/zh/en-zh.tgz

In [143]:
# ! tar zxvf en-zh.tgz -C ./

en-zh/
en-zh/IWSLT15.TED.dev2010.en-zh.en.xml
en-zh/IWSLT15.TED.dev2010.en-zh.zh.xml
en-zh/IWSLT15.TED.tst2010.en-zh.en.xml
en-zh/IWSLT15.TED.tst2010.en-zh.zh.xml
en-zh/IWSLT15.TED.tst2011.en-zh.en.xml
en-zh/IWSLT15.TED.tst2011.en-zh.zh.xml
en-zh/IWSLT15.TED.tst2012.en-zh.en.xml
en-zh/IWSLT15.TED.tst2012.en-zh.zh.xml
en-zh/IWSLT15.TED.tst2013.en-zh.en.xml
en-zh/IWSLT15.TED.tst2013.en-zh.zh.xml
en-zh/README
en-zh/train.tags.en-zh.en
en-zh/train.tags.en-zh.zh
en-zh/train.zh


### 0. Prepare the word 2 vector model 

In [7]:
def get_parent_path(curPath):
    '''
    Return the directory part of `curPath`.

    A path without any separator has no directory part and yields ''.
    (Slicing up to rfind(os.sep) would instead chop the last character off such a path,
    because rfind returns -1 when the separator is absent.)
    '''
    return os.path.dirname(curPath)

class W2V_model_wrapper():
    '''
    Wraps a word2vec-format model so that its vocabulary (index -> term map) and its
    weight matrix can be persisted next to the model file.

    w2vModel: an already loaded model; when None it is loaded from `w2vModelPath`
    w2vModelPath: location of the model file; the persisted files are written into its directory
    vocabulary: leading helper terms placed before the model's own terms
                (defaults to the four tags <pad>, <sos>, <eos>, <unk>)
    name: prefix of the persisted file names
    '''
    def __init__(self, w2vModel = None, w2vModelPath = None, vocabulary = None, name = ''):
        self.name = name
        self.w2vModelPath = w2vModelPath

        # explicit None check: do not rely on the truthiness of the model object
        if w2vModel is None:
            w2vModel = KeyedVectors.load_word2vec_format(w2vModelPath)
        self.w2vModel = w2vModel

        # this order is important: the helper tags come first and therefore get the lowest indices.
        # A copy is kept so that the caller's list is never modified.
        self._baseVocabulary = list(vocabulary) if vocabulary else ['<pad>', '<sos>', '<eos>', '<unk>']
        self.vocabulary = list(self._baseVocabulary)
        self.idx2Language = {}

    def _generate_idx2Language(self):
        '''
        (Re)build self.vocabulary and self.idx2Language as helper terms followed by the model terms.
        Rebuilt from scratch on every call, so calling it repeatedly does not duplicate the terms.
        '''
        self.vocabulary = self._baseVocabulary + list(self.w2vModel.index2word)
        self.idx2Language = dict(enumerate(self.vocabulary))

    def persist_idx2Language(self):
        '''
        Write the index -> term map as '<idx>\\t<term>' lines to '<name>_idx2Lang'
        in the directory of the model file.
        '''
        persistDir = get_parent_path(self.w2vModelPath)
        persistPath = os.path.join(persistDir, self.name + '_idx2Lang')

        self._generate_idx2Language()
        with open(persistPath, 'w', encoding = 'utf-8') as f:
            for idx, term in self.idx2Language.items():
                f.write('%d\t%s\n'%(idx, term))

        print('idx2Lang persisted')
        print(persistPath)

    def persist_weights(self):
        '''
        Write the model's weight matrix (w2vModel.syn0, without rows for the helper terms)
        as text to '<name>_cleaned_weights' in the directory of the model file.
        '''
        persistDir = get_parent_path(self.w2vModelPath)
        persistPath = os.path.join(persistDir, self.name + '_cleaned_weights')

        np.savetxt(persistPath, self.w2vModel.syn0, fmt = '%g')
        print('cleaned_weights persisted')
        print(persistPath)
        

In [145]:
# enW2v = W2V_model_wrapper(w2vModel = w2vModel, w2vModelPath = enW2VPath, vocabulary = None, name = 'enW2v')
# chW2v = W2V_model_wrapper(w2vModel = chW2vModel, w2vModelPath = chW2VPath, vocabulary = None, name = 'chW2v')

In [146]:
# enW2v.persist_idx2Language()
# enW2v.persist_weights()

idx2Lang persisted




cleaned_weights persisted


In [147]:
# chW2v.persist_idx2Language()
# chW2v.persist_weights()

idx2Lang persisted




cleaned_weights persisted


### 1. Pre-processing  
Uses **tf.dataset**. Functionalities:
1. clean illegal characters, add spaces
2. tokenize
3. add **sos** / **eos** tags
4. skip lines according to a pattern
5. indexing
6. map sentence to tensor
7. persist maps and processed data
8. persist training data (indices)

##### 1.1. Clean and add space

In [8]:
def clean_en(text):
    '''
    Normalise a raw English sentence: the substitutions below are applied in order,
    then the result is lower-cased and stripped of leading / trailing spaces.
    '''
    substitutions = (
        (r'(?<=\d),\s*(?=\d)', ''),    # drop the comma used as digit separator
        (r'((?<![\d\s])\s*-)|(-\s*(?!\d))', ' '),    # turn '-' into a space (pattern is meant to spare a minus mark)
        (r'([?!,.])+', r'\1'),    # collapse a run of punctuation marks into a single mark
        (r'([?!,. \+\-\*/\=\%\& \'])', r' \1 '),    # ref: link 2, surround marks / operators with spaces
        (r'[^ a-z0-9A-Z?!,. \+\-\*/\=\%\&]', ""),    # remove every character outside the allowed set
        (r'\s+', ' '),    # squeeze whitespace runs into one space
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.lower().strip()

In [9]:
def clean_ch(text):
    '''
    Normalise a raw Chinese sentence: the substitutions below are applied in order,
    then the result is lower-cased and stripped.
    All whitespace is removed at the end, because the jieba tokenizer would treat a space as a term.
    '''
    substitutions = (
        (r'(?<=\d)，\s*(?=\d)', ''),    # drop the full-width comma used as digit separator
        (r'((?<![\d\s])\s*-)|(-\s*(?!\d))', ' '),    # turn a non-minus '-' into a space
        (r'([？！，。、])+', r'\1'),    # collapse a run of punctuation marks into a single mark
        (r'([？！，。、 \+\-\*/\=\%\&])', r' \1 '),    # surround marks / operators with spaces
        (r'[^ \u4e00-\u9fa5a-z0-9A-Z？！，。、 \+\-\*/\=\%\&]', ""),    # ref: link 4, remove characters outside the allowed set
        (r'\s+', ''),    # remove all whitespace
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.lower().strip()

##### 1.2. Tokenize  
*Tokenizing comes first so that the tags added afterwards are not damaged by the tokenizer.*

In [10]:
def tokenize_en(text):
    '''
    Split a cleaned English sentence into terms on single spaces.

    Empty pieces are dropped, so an empty sentence gives [] instead of [''];
    this keeps the empty string from entering the vocabulary as a term.
    '''
    return [term for term in text.split(r' ') if term]

In [11]:
def tokenize_ch(text):
    '''
    Segment a cleaned Chinese sentence with jieba and return the terms as a list.
    '''
    return list(jieba.cut(text))

##### 1.3. Add_tag

In [12]:
def add_tag(termList):
    '''
    Wrap the terms in the sentence boundary tags.
    The given list itself is modified (no copy is made, to save memory) and returned.
    '''
    startTag, endTag = '<sos>', '<eos>'
    termList.insert(0, startTag)
    termList.append(endTag)
    return termList

##### 1.4 Skip line

In [13]:
def skip_line(line):
    '''
    Tell whether a corpus line should be skipped.

    Returns True for a line that starts with '<' and contains a later '>' on the same line
    (markup such as <url>...</url>), and for a line that is empty or whitespace only.
    This may waste some useful training data, like the title.
    '''
    # raw string: '\s' in a normal string literal is an invalid escape sequence
    skipPattern = r'(^<[^\n]*>)|(^\s*$)'
    return re.search(skipPattern, line) is not None

##### 1.5, 6, 7, 8. Index; Map from sentence to tensor; Persist maps, processed data; Persist training data (indices)

In [14]:
class Preprocessor():
    '''
    Text preprocessor, create one instance per language.

    Cleans and tokenizes sentences, wraps them in <sos> / <eos> tags and maps terms to indices.
    The term <-> index maps are either built from a corpus (load_language)
    or imported from a persisted map file (load_language_maps).
    '''

    def __init__(self, clean = None, tokenize = None, add_tag = None, skip_line = None, vocabularySize = None):
        '''
        clean: callable, raw sentence -> cleaned sentence
        tokenize: callable, cleaned sentence -> list of terms
        add_tag: callable, list of terms -> list of terms with boundary tags
        skip_line: callable, raw line -> True when the line must be ignored
        vocabularySize: only the topK most common terms enter the maps built by load_language
                        (not used when the maps are loaded from a file)
        '''
        self.clean = clean
        self.tokenize = tokenize
        self.add_tag = add_tag
        self.skip_line = skip_line
        self.vocabularySize = vocabularySize

        # <sos> <eos> is for distinguishing the same term in the first and last position
        self.language2Idx = {'<pad>': 0, '<sos>': 1, '<eos>': 2, '<unk>':3}
        self.idx2Language = {0: '<pad>', 1: '<sos>', 2: '<eos>', 3: '<unk>'}

        self.languageDatasetPath = None
        self.idx2LanguagePath = None    # only one map is persisted, the other one is its inverse


    def clean_tokenize_addTag(self, sentence):
        '''
        1. eliminate illegal characters
        2. cut up terms
        3. add tags
        Returns the list of terms.
        '''
        sentenceCleaned = self.clean(sentence)
        termList = self.tokenize(sentenceCleaned)
        # use the tagger handed to the constructor; fall back to the module level add_tag when none was given
        tagger = self.add_tag if self.add_tag is not None else add_tag
        return tagger(termList)


    def get_skipped_line_set(self, languageDatasetPath):
        '''
        Return the set of 1-based line numbers of the corpus for which skip_line is true.
        Meant to be unioned with the set of the other language so both corpora stay aligned.
        Also remembers the corpus path for a later load_language call.
        '''
        self.languageDatasetPath = languageDatasetPath
        skippedLineSet = set()

        with open(self.languageDatasetPath, 'r', encoding = 'utf-8') as langData:
            for lineCnt, line in enumerate(langData, 1):
                if self.skip_line(line):
                    skippedLineSet.add(lineCnt)

        return skippedLineSet


    def load_language(self, languageDatasetPath = None, persistProcessed = False, skippedLineSet = None):
        '''
        Read the corpus and build the language maps from its topK most common terms.

        languageDatasetPath: corpus path, only used when no path was set before
        persistProcessed: when True, the tokenized and tagged sentences are written,
                          one per line and space separated, to '<corpus path>_processed'
        skippedLineSet: 1-based line numbers to ignore

        Errors while reading are reported (message and traceback) but not raised.
        '''
        self.languageDatasetPath = self.languageDatasetPath or languageDatasetPath    # set the path only when it was not set
        languageDatasetProcessedPath = self.languageDatasetPath + '_processed'

        # map path
        self.idx2LanguagePath = self.languageDatasetPath + '_idx2Lang'

        # the counter provides the topK terms that form the maps
        languageCounter = Counter()
        skippedLineSet = skippedLineSet or set()

        # open the target file first; None means "do not persist"
        langDataProcessed = None
        if persistProcessed:
            langDataProcessed = open(languageDatasetProcessedPath, 'w', encoding = 'utf-8')

        try:
            with open(self.languageDatasetPath, 'r', encoding = 'utf-8') as langData:
                for lineCnt, line in enumerate(langData, 1):

                    # show the progress
                    if lineCnt % 30000 == 0:
                        print(lineCnt)

                    if lineCnt in skippedLineSet:
                        continue

                    termList = self.clean_tokenize_addTag(line.strip())
                    languageCounter.update(termList)    # count whole terms, not characters

                    # TODO: check what leads to the empty term
                    if '' in termList:
                        print(line)

                    if langDataProcessed is not None:
                        # TODO: add log
                        langDataProcessed.write(' '.join(termList) + '\n')

            # the tags already own fixed indices; a default avoids a KeyError on an empty corpus
            languageCounter.pop('<sos>', None)
            languageCounter.pop('<eos>', None)
            for term, _ in languageCounter.most_common(self.vocabularySize):
                if term in self.language2Idx:
                    continue    # never re-index a known term, that would leave a stale idx2Language entry
                idx = len(self.language2Idx)    # as count from 0
                self.language2Idx[term] = idx
                self.idx2Language[idx] = term

        except Exception as e:
            # best effort: report instead of raise, but keep the traceback visible
            print(e)
            print(traceback.format_exc())

        finally:
            if langDataProcessed is not None:
                langDataProcessed.close()


    def persist_languange_map(self):
        '''
        Save the index -> term map as '<idx>\\t<term>' lines to self.idx2LanguagePath.
        Only prints a notice when no path is known yet.
        '''
        if self.idx2LanguagePath is not None:
            with open(self.idx2LanguagePath, 'w', encoding = 'utf-8') as f:
                for idx, term in self.idx2Language.items():
                    f.write('%d\t%s\n'%(idx, term))
        else:
            print('the idx2Language map is not assigned yet, i.e. the idx2Language map is not existing yet')

        # TODO: add log


    def load_language_maps(self, idx2LanguagePath):
        '''
        Use an existing map: read '<idx>\\t<term>' lines and add them to both maps.
        Lines without a term are printed and ignored.
        '''
        self.idx2LanguagePath = idx2LanguagePath

        with open(self.idx2LanguagePath, 'r', encoding = 'utf-8') as f:
            for line in f:
                idxTermPair = line.strip().split('\t')
                if len(idxTermPair) > 1:
                    idx = int(idxTermPair[0])    # NOTE: the index type is int
                    term = idxTermPair[1]
                    self.language2Idx[term] = idx
                    self.idx2Language[idx] = term
                else:
                    print(idxTermPair)    # if there exists empty term


    # the methods below make use of the generated / loaded maps

    def index_term(self, term):
        '''
        Transform a term into its index; a term not seen before gives the index of <unk> (3).
        '''
        # a default is used instead of `or 3`, which would also replace the valid index 0 of <pad>
        return self.language2Idx.get(term, 3)


    def index_raw_sentence(self, rawSentence):
        '''
        Transform a raw sentence into a list of indices
        - requires the callables: clean, tokenize, add_tag
        '''
        termList = self.clean_tokenize_addTag(rawSentence)
        return [self.index_term(term) for term in termList]


    def index_persist_processed_data(self, languageDatasetProcessedPath):
        '''
        Index a processed (tokenized, space separated) data set and persist the indices,
        one space separated line per sentence, to '<path>_index'.
        The data set is not necessarily the one that was used to create the map.

        Prints the longest sentence length and returns the list of all sentence lengths.
        '''
        senLenList = []
        with open(languageDatasetProcessedPath, 'r', encoding = 'utf-8') as f, \
             open(languageDatasetProcessedPath + '_index', 'w', encoding = 'utf-8') as h:
            for line in f:
                termList = line.strip().split(' ')
                idxList = [str(self.index_term(term)) for term in termList]
                h.write(' '.join(idxList) + '\n')

                senLenList.append(len(termList))

        longestSenLen = max(senLenList, default = 0)    # an empty file has no sentences
        print('tensor max length: %d'%longestSenLen)
        return senLenList
        # TODO: add log


    def retrive_terms(self, idxList):
        '''
        Map the indices back to terms and join them with spaces.
        '''
        return ' '.join(self.idx2Language[idx] for idx in idxList)

##### 1.9. Run preprocessing

In [15]:
# %%time
# # load the original language data set into preprocessor to generate the language maps and intermediate processed (tokenized) data
# pEn = Preprocessor(clean_en, tokenize_en, add_tag, skip_line, enVocabularySize)
# pCh = Preprocessor(clean_ch, tokenize_ch, add_tag, skip_line, chVocabularySize)

# enSkippedLineSet = pEn.get_skipped_line_set(enPath)
# chSkippedLineSet = pCh.get_skipped_line_set(chPath)

# skippedLineSet = enSkippedLineSet.union(chSkippedLineSet)    # for align the source dataset and target dataset

# pEn.load_language(persistProcessed = True, skippedLineSet = skippedLineSet)
# pCh.load_language(persistProcessed = True, skippedLineSet = skippedLineSet)

# pEn.persist_languange_map()
# pCh.persist_languange_map()

In [16]:
# pEn.index_raw_sentence('This is a good day dssd')

In [15]:
!ls {datasetPath}

_bak		       glove.6B.50d_gensim.txt	   sgns.baidubaike.bigram-char
enW2v_cleaned_weights  glove.6B.50d_gensim.txt.gz
enW2v_idx2Lang	       original


In [23]:
%%time
# Load the persisted language maps and write the index version of both processed corpora.
# NOTE: this cell reads '<corpus>_processed' and the Chinese '<corpus>_idx2Lang' file; in this notebook
# those are produced by the (currently commented-out) preprocessing cell above, so on a fresh
# environment that cell has to be run once first.
pEn2 = Preprocessor()
pCh2 = Preprocessor()

pEn2.load_language_maps(os.path.join(datasetPath, 'enW2v_idx2Lang'))    # English: map taken from the trained embeddings
# pCh2.load_language_maps(os.path.join(datasetPath, 'chW2v_idx2Lang'))
pCh2.load_language_maps(chPath + '_idx2Lang')    # Chinese: map built from the corpus itself

# map sizes, including the 4 helper tags
print(len(pEn2.idx2Language))
print(len(pCh2.idx2Language))

# writes '<corpus>_processed_index' and returns the sentence lengths
enSenLenList = pEn2.index_persist_processed_data(enPath + '_processed')
chSenLenList = pCh2.index_persist_processed_data(chPath + '_processed')

400004
4004
tensor max length: 616
tensor max length: 600
CPU times: user 11.7 s, sys: 3.63 s, total: 15.3 s
Wall time: 16.1 s


In [104]:
# # set the padding length as 50, so that over 95% sentences are coverd
# enSenDF = pd.DataFrame(enSenLenList)
# chSenDF = pd.DataFrame(chSenLenList)
# print(enSenDF.quantile(0.95))
# print(chSenDF.quantile(0.95))

0    49.0
Name: 0.95, dtype: float64
0    43.0
Name: 0.95, dtype: float64


##### Split the training / testing set

In [55]:
# read the indexed corpora, one sentence (newline included) per list element
enL = []
with open(enPath + '_processed_index', 'r', encoding = 'utf-8') as enF:
    enL.extend(enF)

chL = []
with open(chPath + '_processed_index', 'r', encoding = 'utf-8') as chF:
    chL.extend(chF)

In [56]:
# split both languages with the same seeded shuffle so that sentence pairs stay aligned; 1% is held out
enTrain, enTest, chTrain, chTest = train_test_split(
    enL, chL,
    test_size=0.01,
    random_state=1,
)

In [65]:
# persist the English split; the lines still carry their trailing newline
with open(enPath + '_processed_index_train', 'w', encoding = 'utf-8') as enF:
    enF.writelines(enTrain)

with open(enPath + '_processed_index_test', 'w', encoding = 'utf-8') as enF:
    enF.writelines(enTest)

In [66]:
# persist the Chinese split; the lines still carry their trailing newline
with open(chPath + '_processed_index_train', 'w', encoding = 'utf-8') as chF:
    chF.writelines(chTrain)

with open(chPath + '_processed_index_test', 'w', encoding = 'utf-8') as chF:
    chF.writelines(chTest)

### 2. Define data loader

In [16]:
def get_dataset(indexFilePath, maxSenLen = 50): # ref: 1
    '''
    Build the tf.dataset that reads an index file line by line and turns every line
    into a 1-D int32 tensor of term indices. Shared by the English and the Chinese data.

    maxSenLen is accepted but not used here.
    '''
    def split_line(line):
        # strip the line, split it on single spaces and keep the split pieces
        return tf.strings.split(tf.strings.strip([line]), sep = ' ').values

    def to_indices(strList):
        return tf.strings.to_number(strList, tf.int32)

    lines = tf.data.TextLineDataset(indexFilePath)
    return lines.map(split_line).map(to_indices)


In [17]:
def get_model_input(xIdxFilePath, yIdxFilePath, maxSenLen = 50, batchSize = 250, vocabSize = 4000, shuffleBufferSize = None):
    '''
    Build the repeating, batched tf.dataset of (source indices, one-hot target) pairs.   ref: 1

    xIdxFilePath / yIdxFilePath: index files of the source / target language, aligned line by line
    maxSenLen: sentences (tags included) longer than this are dropped; shorter ones are padded up to it
    batchSize: number of sentence pairs per batch, an incomplete last batch is dropped
    vocabSize: target vocabulary size without the 4 helper tags; the one-hot depth is vocabSize + 4
    shuffleBufferSize: size of the shuffle buffer, defaults to batchSize * 10

    Each element is (x, y) with x of shape (batchSize, maxSenLen)
    and y of shape (batchSize, maxSenLen, vocabSize + 4).
    No seed is set for the shuffle, for simplicity.
    '''
    if shuffleBufferSize is None:
        shuffleBufferSize = batchSize * 10

    xDataSet = get_dataset(xIdxFilePath, maxSenLen)
    yDataSet = get_dataset(yIdxFilePath, maxSenLen)
    dataset = tf.data.Dataset.zip((xDataSet, yDataSet))

    def filter_length(xIdxList, yIdxList):
        '''
        Keep a pair only when both sentences have more than 2 and at most maxSenLen indices,
        i.e. at least one term besides the two tags.
        '''
        xSenLen = tf.size(xIdxList)
        ySenLen = tf.size(yIdxList)
        xSignal = tf.logical_and(tf.greater(xSenLen, 2), tf.less_equal(xSenLen, maxSenLen))
        ySignal = tf.logical_and(tf.greater(ySenLen, 2), tf.less_equal(ySenLen, maxSenLen))
        return tf.logical_and(xSignal, ySignal)

    def add_label_modify_tag(xIdxList, yIdxList):
        '''
        Leave the source indices as they are and one-hot encode the target indices.
        '''
        yOHList = tf.one_hot(yIdxList, depth = vocabSize + 4)
        return xIdxList, yOHList

    dataset = dataset.filter(filter_length)    # too long sentences are dropped, not cut short
    dataset = dataset.shuffle(shuffleBufferSize)
    dataset = dataset.map(add_label_modify_tag)

    # every sentence is padded to maxSenLen; the one-hot dimension is left as it is
    dataset = dataset.padded_batch(batchSize, padded_shapes = ((maxSenLen, ), (maxSenLen, None)), drop_remainder = True)

#     dataset = dataset.prefetch(batchSize * 10)

    dataset = dataset.repeat()
    return dataset

In [67]:
# %%time
# dataset = get_model_input(enPath + '_processed_index', chPath + '_processed_index', 
#                             maxSenLen = maxSenLen, batchSize = batchSize, vocabSize = chVocabularySize) 
#                                 # bufferSize = bufferSize, prefetchSize = prefetchSize, seed = seed, vocabularySize = chVocabularySize

# iterator = dataset.make_initializable_iterator()
# next = iterator.get_next()
# ini = iterator.initializer

# with tf.Session() as sess:
#     for i in range(1):
#         sess.run(ini)    # to see if the random status is reset each time the iterator is re-initialized
#         for i in range(6):
#             pair = sess.run(next)
# #             x, y = pair[0]
# #             label = pair[1]
# #             print(x.shape, y.shape, label.shape)
# #             print(type(pair), type(x), type(y), type(label))
# #             print(x, y, label)

#             x = pair[0]
#             y = pair[1]
# #             label = pair[2]
#             print(x.shape, y.shape) # , label.shape
#             print(type(pair), type(x), type(y)) # , type(label)
#             print(y)

#         print()
        

(100, 20) (100, 20, 4004)
<class 'tuple'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
[[[0. 1. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 1. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 1. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 ...

 [[0. 1. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 1. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 1. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...


### 3. Attention model 2, v2

![architecture](./architecture.png)  
**--> Fig comes from ref 1**

In [18]:
class Translator2():
    def __init__(self, 
                 logger = None, 
                 loadTrainedModel = False,     # load trained translator
                 
                 modelName = None,    # persistence information
                 version = None,
                 checkpointDir = None, 
                 
                 loadSrcLanguageModel = False,    # load source trained language model, e.g. GloVe
                 srcLanguageModelPath = None,
                 loadTarLanguageModel = False,    # load target trained language model
                 tarLanguageModelPath = None,

                 senMaxLen = None,
                 hiddenUnitNum = None,    # LSTM units
                 srcVocabularySize = None,    # source language embedding input size
                 tarVocabularySize = None,    # target language embedding input size
                 srcEmbeddingSize = None,    # embedding output size
                 tarEmbeddingSize = None,  
                 energySize = None,    # energy size of attention mechanism
                 
                 droupOutRatio = None, 
                 batchSize = None, 
                 epochNum = None, 
                 verbose = None, 
                 validationRatio = None, 
                 patience = None):    # the data feeding procedure is kept out of the model
        '''
        Store the configuration, optionally load the word2vec language models, then build the
        network through self._ini_model().

        logger: logger to use; when not given one is requested with Logger.get_logger(modelName)
        loadTrainedModel: try to restore the weights from the checkpoint after construction
        modelName, version, checkpointDir: joined into the checkpoint path
            <checkpointDir>/<modelName>/<version>/train.ckpt and the sibling 'visualization' path
        loadSrcLanguageModel / loadTarLanguageModel: when set, the word2vec model at the matching
            path is loaded here and its vocabulary size and vector size replace the
            corresponding *VocabularySize / *EmbeddingSize arguments
        the remaining arguments are stored on the instance unchanged

        NOTE(review): checkpointDir and modelName are passed to os.path.join below, so although
        they default to None they presumably have to be given as strings - confirm with callers.
        '''

        
        self.lg = logger or Logger.get_logger(modelName)
        self.loadTrainedModel = loadTrainedModel    # use the trained translator weights or not
        
        self.modelName = modelName    # persistence information
        self.version = version
        self.checkpointDir = checkpointDir
        self.checkpointPath = os.path.join(checkpointDir, modelName, str(version), 'train.ckpt')
        self.checkpointVisualPath = os.path.join(checkpointDir, modelName, str(version), 'visualization')

        self.loadSrcLanguageModel = loadSrcLanguageModel    # load source trained language model, e.g. GloVe
        self.srcLanguageModelPath = srcLanguageModelPath
        self.loadTarLanguageModel = loadTarLanguageModel    # load target trained language model
        self.tarLanguageModelPath = tarLanguageModelPath

        self.senMaxLen = senMaxLen    # input shape of the Input layer
        self.hiddenUnitNum = hiddenUnitNum    # LSTM units
        self.srcVocabularySize = srcVocabularySize
        self.tarVocabularySize = tarVocabularySize
        self.srcEmbeddingSize = srcEmbeddingSize    # embedding output size
        self.tarEmbeddingSize = tarEmbeddingSize
        self.energySize = energySize
        
        '''set the sizes according to the loaded language model'''
        if loadSrcLanguageModel:
            self.srcW2VModel = KeyedVectors.load_word2vec_format(srcLanguageModelPath)
            self.srcVocabularySize = len(self.srcW2VModel.index2word)    # overrides the srcVocabularySize argument
            self.srcEmbeddingSize = self.srcW2VModel.vector_size    # overrides the srcEmbeddingSize argument
        if loadTarLanguageModel:
            self.tarW2VModel = KeyedVectors.load_word2vec_format(tarLanguageModelPath)
            self.tarVocabularySize = len(self.tarW2VModel.index2word)    # overrides the tarVocabularySize argument
            self.tarEmbeddingSize = self.tarW2VModel.vector_size    # overrides the tarEmbeddingSize argument
        
        # Hyper parameters are kept for the convenience of reloading the trained model to continue training
        self.droupOutRatio = droupOutRatio 
        self.batchSize = batchSize
        self.epochNum = epochNum
        self.verbose = verbose
        self.validationRatio = validationRatio
        self.patience = patience
        
        # the model is built last, once every size above is final
        self.model = None
        self.predictModel = None
        self._ini_model() 
    

    def _load_model(self):
        loadedFlag = False
        try:
            self.model.load_weights(self.checkpointPath)
            loadedFlag = True
            self.lg.info('---> pre-trained translator loaded')
        except Exception as e:
            traceback.format_exc(e)
            
        return loadedFlag

    
    def _load_language_model(self, embeddingName, w2VModel):
        embeddingWeights = np.concatenate([np.zeros((4, w2VModel.vector_size)), w2VModel.syn0], axis = 0)    # add vectors for helping tags <pad> <sos> <eos> <unk>
        self.model.get_layer(embeddingName).set_weights([embeddingWeights])    # ref: 22
        self.lg.info('%s embedding is loaded'%embeddingName)
    
    
    def _ini_model(self):
        self._construct_model()
        
        if self.loadTrainedModel:
            '''
            load pre-trained model
            '''
            loadedFlag = self._load_model()
            
        if self.loadSrcLanguageModel:
            '''
            load trained w2v model into embedding layer
            '''
            self._load_language_model('srcLanguageModel', self.srcW2VModel)
            
        if self.loadTarLanguageModel:
            self._load_language_model('tarLanguageModel', self.tarW2VModel)
            
            
    def __LSTM_creator(self, hiddenUnitNum, return_sequences = True, return_state = True):    # ref: 2
        '''
        Create an LSTM layer with hiddenUnitNum units: the CuDNNLSTM layer when
        tf.test.is_gpu_available() reports a GPU, the plain LSTM layer otherwise.
        '''
        lstmClass = tf.keras.layers.CuDNNLSTM if tf.test.is_gpu_available() else tf.keras.layers.LSTM
        return lstmClass(hiddenUnitNum, return_sequences = return_sequences, return_state = return_state)
    
    
    def _construct_model(self):
            
        # layers
        srcSenIdx = tf.keras.layers.Input((self.senMaxLen, ), name = 'sourceLanguageIdx')    # the <eos> is cut
#         tarSenIdx = tf.keras.layers.Input((self.senMaxLen, ), name = 'tarLanguageIdx')    # the <eos> is cut

        hiddenUnitNum = self.hiddenUnitNum    # ref: 12
        statusIniLayer = tf.keras.layers.Lambda(lambda x:tf.zeros((tf.shape(x)[0], hiddenUnitNum)), 
                                                name = 'statusIniLayer')    # dynamic size according to batch size, ref: 10

#         srcLanguageModel = tf.keras.layers.Embedding(self.srcVocabularySize + 4, self.srcEmbeddingSize, name = 'srcLanguageModel') 
#         tarLanguageModel = tf.keras.layers.Embedding(self.tarVocabularySize + 4, self.tarEmbeddingSize, name = 'tarLanguageModel')

        srcLanguageModel = tf.keras.layers.Embedding(self.srcVocabularySize + 4, self.srcEmbeddingSize, name = 'srcLanguageModel', trainable = False) 
            # [batchSize, senMaxLen, embeddingSize]

        preAtLSTM = tf.keras.layers.Bidirectional(self.__LSTM_creator(self.hiddenUnitNum, return_state = False))    # a shape: (hiddenStateNum * 2)
        postAtLSTM = self.__LSTM_creator(self.hiddenUnitNum, return_sequences = False)    # c = tanh(w[a, x]) w ensures c has the correct length == len(a)

        concateTarContext = tf.keras.layers.Concatenate()    # for concate the target sentence and context together

        outputLayer = tf.keras.layers.Dense(self.tarVocabularySize + 4, activation = 'linear', name = 'outputLayer')    # each step output 1 index

        flatten = tf.keras.layers.Flatten()

        expandDimLayer = tf.keras.layers.Lambda(lambda x:tf.expand_dims(x, 1), name = 'expandDimLayer')

        getSosIdx = tf.keras.layers.Lambda(lambda x: tf.ones(tf.shape(x)[0], ), name = 'getSosIdx')
            # get first step <sos> indices with shape(None, )

        distribution2IndexLayer = tf.keras.layers.Lambda(lambda x: tf.math.argmax(x, axis = 1),  # tf.expand_dims(
                                                         name = 'distribution2IndexLayer')    
            # translate the ouput distribition over terms into index on terms
            # also maintain the last dim which only contains 1 element

        convertSwapLayer2D = tf.keras.layers.Lambda(lambda xList : tf.transpose(tf.convert_to_tensor(xList), perm = [1, 0]), name = 'convertSwapLayer2D')
        convertSwapLayer3D = tf.keras.layers.Lambda(lambda xList : tf.transpose(tf.convert_to_tensor(xList), perm = [1, 0, 2]), name = 'convertSwapLayer3D')

        dropoutLayer = tf.keras.layers.Dropout(0.2)

        # layers for attention
        repeatLayer = tf.keras.layers.RepeatVector(self.senMaxLen)
        concatePrePostStates = tf.keras.layers.Concatenate(axis = -1)
        dense1 = tf.keras.layers.Dense(self.energySize, activation = 'tanh')
        dense2 = tf.keras.layers.Dense(1, activation = 'relu')
        softmaxLayer = tf.keras.layers.Softmax(axis = -1)
        dotLayer = tf.keras.layers.Dot(axes = 1)


        # apply layer
        '''attention'''
        def __attention(preAtHiddenStateSeq, postAtHiddenState):    # ref: 3, 2
            '''
            Compute the attention context for one decoder step.

            The previous post-attention hidden state is repeated by repeatLayer,
            concatenated with the pre-attention states on the last axis, and fed
            through dense1 (tanh) and dense2 (relu, 1 unit) to get the energies.
            The energies go through softmaxLayer and the result is combined with
            preAtHiddenStateSeq by dotLayer (Dot with axes = 1).

            Args:
                preAtHiddenStateSeq: output of the bidirectional pre-attention
                    LSTM; assumed (m, senLen, 2 * n_a), where 2 * n_a comes from
                    the forward and backward passes -- TODO confirm.
                postAtHiddenState: hidden state of the post-attention LSTM from
                    the previous step; assumed (m, n_s) -- TODO confirm.

            Returns:
                The context tensor produced by dotLayer.

            NOTE(review): dense2 has a single unit, so the energies presumably
            have a trailing axis of size 1, while softmaxLayer is created with
            axis = -1. If so, the softmax normalises over that size-1 axis and
            every attention weight becomes 1; normalising over the time axis
            (axis = 1) looks like the intent -- confirm.
            '''

            postStates = repeatLayer(postAtHiddenState)
            concateTensors = concatePrePostStates([preAtHiddenStateSeq, postStates])
            temp = dense1(concateTensors)
            energies = dense2(temp)    # unnormalised attention scores
            attentions = softmaxLayer(energies)
            context = dotLayer([attentions, preAtHiddenStateSeq])    # there is a broadcasting with attentions in this procedure

            return context


        '''training model'''
        srcSenEmb = dropoutLayer(srcLanguageModel(srcSenIdx))    # ref: 13
#         tarSenEmb = dropoutLayer(tarLanguageModel(tarSenIdx))

        preAtHiddenStateSeq = preAtLSTM(srcSenEmb)    # Bidirectional only return the merged output

        outputSen = []

        postAtHiddenState = statusIniLayer(srcSenIdx)
        postAtCellState = statusIniLayer(srcSenIdx)

        for step in range(self.senMaxLen):    # iterate over the target sentence, no <eos> tag
            context = __attention(preAtHiddenStateSeq, postAtHiddenState)    # (hiddenStateNum*2, 1)
            postAtHiddenState, _, postAtCellState = postAtLSTM(context, initial_state = [postAtHiddenState, postAtCellState])

            outputDistribution = outputLayer(postAtHiddenState)    # word distribution
            outputSen.append(outputDistribution)    # for calculate the loss

        outputSenTranspose = convertSwapLayer3D(outputSen)    # (None, 20, vocab)

        self.model = model = tf.keras.Model(inputs = srcSenIdx, outputs = outputSenTranspose, name = 'translator_v3')    # ref: 7, 11
            # outputs convert to (batchSize, step, vocabulary)

        '''loss'''
        def __loss(real, pred):    # ref: 2
            '''
            Cross-entropy between the labels and the model's logits, reduced to a
            single value with tf.reduce_mean.

            Args:
                real: labels supplied by Keras. Originally documented as
                    (batchSize, senMaxLen).
                    NOTE(review): softmax_cross_entropy_with_logits_v2 takes
                    labels shaped like the logits, so this is presumably one-hot
                    (batchSize, senMaxLen, vocabularySize) -- confirm against the
                    input pipeline.
                pred: unscaled logits, (batchSize, senMaxLen, vocabularySize).

            Returns:
                The mean of the (masked) per-position cross-entropy.

            NOTE(review): the mask is built with np.equal, a NumPy call applied to
            what is presumably a symbolic tensor here; verify that it really
            depends on the label values (tf.equal + tf.cast may be what is
            needed) and that its shape matches the per-position loss.
            '''
            mask = 1 - np.equal(real, 0)    # intended to mask out the padded elements
            loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels = real, logits = pred) * mask)    # ref: 6, use unscaled logits
            return loss

        self.model.compile(optimizer = 'adam', loss = __loss)
        self.model.summary()

        return model
    
    
    def train(self, train_input_fn = None, val_input_fn = None, epochs = 1, steps_per_epoch = 1, validation_steps = 1, verbose = 1):
        '''
        Fit self.model on the dataset returned by train_input_fn.

        Args:
            train_input_fn: zero-argument callable returning the training
                dataset (ref: 9); presumably yields (inputs, labels) batches --
                confirm against the input pipeline. Required.
            val_input_fn: optional zero-argument callable returning the
                validation dataset. When None, validation is skipped.
            epochs: number of epochs passed to model.fit.
            steps_per_epoch: training batches drawn per epoch.
            validation_steps: validation batches drawn per epoch; ignored when
                val_input_fn is None.
            verbose: Keras verbosity level.

        Returns:
            The object returned by model.fit (the Keras training history).

        Raises:
            ValueError: if train_input_fn is None.
        '''
        if train_input_fn is None:
            # the default used to fail later with "'NoneType' object is not callable"
            raise ValueError('train_input_fn is required: pass a callable that returns the training dataset')

        hasValidation = val_input_fn is not None
        trainDataset = train_input_fn()
        valDataset = val_input_fn() if hasValidation else None

        # a checkpoint is written to self.checkpointPath and TensorBoard logs to
        # self.checkpointVisualPath; alternative: ModelCheckpoint(..., save_weights_only = True)
        return self.model.fit(trainDataset, validation_data = valDataset,
                              epochs = epochs, steps_per_epoch = steps_per_epoch,
                              verbose = verbose,
                              validation_steps = validation_steps if hasValidation else None,    # no validation data -> no validation steps
                              callbacks = [tf.keras.callbacks.ModelCheckpoint(self.checkpointPath),
                                           tf.keras.callbacks.TensorBoard(self.checkpointVisualPath, write_images = True)])
                                                                                                 
        
    def evaluate(self, xIdxList, yOHList):
        '''
        Evaluate self.model on the given inputs and labels.

        The original body called self.model.evaluate() without data and then
        evaluated a brand-new, empty tf.keras.Model(), so the trained model was
        never scored and nothing was returned.

        Args:
            xIdxList: model inputs (source sentence indices); wrapped in a list
                the same way predict() wraps its input.
            yOHList: labels matching the model output -- presumably one-hot
                target terms, TODO confirm.

        Returns:
            Whatever self.model.evaluate returns (the loss for this model's
            compile settings).
        '''
        return self.model.evaluate([xIdxList], [yOHList], batch_size = self.batchSize)
    
    
    def predict(self, srcSenIdx):
        '''
        Run self.model.predict on source sentence indices.

        Args:
            srcSenIdx: padded source index sequences; passed to the model
                wrapped in a one-element list.

        Returns:
            The raw output of self.model.predict, computed in batches of
            self.batchSize.
        '''
        return self.model.predict([srcSenIdx], batch_size = self.batchSize)

In [24]:
T2 = Translator2(
    # --- identity and persistence ---
    logger = None,
    modelName = 'translator_v3',    # v3: the attention-2 model with modified input and output
    version = 2,
    loadTrainedModel = False,    # do not load a trained translator
    checkpointDir = checkpointDir,

    # --- pretrained language models ---
    loadSrcLanguageModel = True,    # source side, e.g. GloVe
    srcLanguageModelPath = enW2VPath,
    loadTarLanguageModel = False,    # no pretrained target-side model
    tarLanguageModelPath = None,

    # --- architecture ---
    senMaxLen = maxSenLen,
    hiddenUnitNum = hiddenUnitNum,    # LSTM units
    srcVocabularySize = None,    # presumably derived from the loaded source embedding -- TODO confirm
    tarVocabularySize = chVocabularySize,    # needed for the output layer
    srcEmbeddingSize = None,    # presumably derived from the loaded source embedding -- TODO confirm
    tarEmbeddingSize = None,
    energySize = energySize,    # width of the attention energy layer

    # --- training ---
    droupOutRatio = 0.2,
    batchSize = batchSize,    # must match the input dataset
    epochNum = None,
    verbose = None,
    validationRatio = None,
    patience = None,
)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
sourceLanguageIdx (InputLayer)  (None, 20)           0                                            
__________________________________________________________________________________________________
srcLanguageModel (Embedding)    (None, 20, 50)       20000200    sourceLanguageIdx[0][0]          
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 20, 50)       0           srcLanguageModel[0][0]           
__________________________________________________________________________________________________
statusIniLayer (Lambda)         (None, 64)           0           sourceLanguageIdx[0][0]          
                                                                 sourceLanguageIdx[0][0]          
__________

translator_v3 2019-05-15 17:10:25,092 [INFO] - <ipython-input-18-02b2c533e5d5>:90 srcLanguageModel embedding is loaded


In [25]:
T2.model.output_shape

(None, 20, 4004)

In [67]:
def train_input_fn():
    '''Return the model input built from the processed training index files.'''
    srcIndexPath = enPath + '_processed_index_train'
    tarIndexPath = chPath + '_processed_index_train'
    return get_model_input(srcIndexPath, tarIndexPath, maxSenLen = maxSenLen, batchSize = batchSize)

In [68]:
def val_input_fn():
    '''Return the model input built from the processed test index files (used for validation).'''
    srcIndexPath = enPath + '_processed_index_test'
    tarIndexPath = chPath + '_processed_index_test'
    return get_model_input(srcIndexPath, tarIndexPath, maxSenLen = maxSenLen, batchSize = batchSize)

In [69]:
# Short run: only 2 training and 2 validation batches per epoch.
# stepsPerEpoch from the config cell is presumably the value for a full run.
T2.train(
    train_input_fn = train_input_fn,
    val_input_fn = val_input_fn,
    epochs = 20,
    steps_per_epoch = 2,
    validation_steps = 2,
    verbose = 2,
)

Epoch 1/20
 - 52s - loss: 3.3233 - val_loss: 3.3275
Epoch 2/20
 - 5s - loss: 3.2853 - val_loss: 3.2808
Epoch 3/20
 - 4s - loss: 3.3820 - val_loss: 3.1704
Epoch 4/20
 - 5s - loss: 3.0938 - val_loss: 3.3057
Epoch 5/20
 - 5s - loss: 3.2666 - val_loss: 3.2113
Epoch 6/20
 - 5s - loss: 3.2581 - val_loss: 3.2549
Epoch 7/20
 - 5s - loss: 3.2587 - val_loss: 3.1145
Epoch 8/20
 - 5s - loss: 3.2302 - val_loss: 3.1709
Epoch 9/20
 - 5s - loss: 3.2539 - val_loss: 3.1278
Epoch 10/20
 - 4s - loss: 3.2764 - val_loss: 3.2399
Epoch 11/20
 - 4s - loss: 3.2149 - val_loss: 3.2068
Epoch 12/20
 - 6s - loss: 3.0888 - val_loss: 3.1183
Epoch 13/20
 - 4s - loss: 3.1632 - val_loss: 3.1865
Epoch 14/20
 - 4s - loss: 3.1524 - val_loss: 3.2132
Epoch 15/20
 - 5s - loss: 3.1458 - val_loss: 3.2328
Epoch 16/20
 - 4s - loss: 3.2030 - val_loss: 3.0791
Epoch 17/20
 - 5s - loss: 3.1372 - val_loss: 3.2280
Epoch 18/20
 - 4s - loss: 3.2696 - val_loss: 3.0880
Epoch 19/20
 - 4s - loss: 3.0762 - val_loss: 3.2784
Epoch 20/20
 - 5s - 

In [70]:
# English side: language maps aligned with the pretrained embedding vocabulary
enLanguageMapPath = os.path.join(datasetPath, 'enW2v_idx2Lang')    # alternative: enPath + '_idx2Lang'
pEn3 = Preprocessor(clean_en, tokenize_en, add_tag, skip_line, enVocabularySize)
pEn3.load_language_maps(enLanguageMapPath)

# Chinese side: only the language maps are needed to turn indices back into terms
chLanguageMapPath = chPath + '_idx2Lang'    # alternative: os.path.join(datasetPath, 'chW2v_idx2Lang')
pCh3 = Preprocessor()
pCh3.load_language_maps(chLanguageMapPath)

# index one example sentence and pad it at the end up to maxSenLen
senIdxExample = pEn3.index_raw_sentence('This is a really good day')
senIdxPadExample = tf.keras.preprocessing.sequence.pad_sequences(
    [senIdxExample], maxlen = maxSenLen, padding = 'post')
# labelPadExample = tf.zeros((maxSenLen, chVocabularySize + 4))

In [71]:
# predict on a batch of one sentence and keep that sentence's output
batchDistribution = T2.model.predict(x = [senIdxPadExample])
outputSenDistribution = batchDistribution[0]

# highest-scoring term index at every step, then map the indices back to terms
transIdxList = outputSenDistribution.argmax(axis = -1)
print(transIdxList)
pCh3.retrive_terms(transIdxList)

[3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3]


'<unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>'

### Estimator

In [126]:
# strategy = tf.contrib.distribute.MirroredStrategy()
# config = how (train_distribute = strategy)

In [None]:
# tf.estimator.RunConfig()

In [1]:
# tf.reset_default_graph()
# T2_estimator = tf.keras.estimator.model_to_estimator(keras_model = T2.model)    # config = config, model_dir = checkpointDir

In [2]:
# T2_estimator.train(input_fn = model_input_fn, steps = 2)

### 4. Serving

##### 4.1. Export model

In [30]:
# export directory layout: <checkpointDir>/<model name>/<version>    (ref: 17)
exportPathParts = (checkpointDir, T2.modelName, str(T2.version))
export_path = os.path.join(*exportPathParts)
export_path

'/root/shared/workspace/Dev/translate/checkpoints/translator_v3/2'

In [31]:
# Export the graph held by the current Keras session as a SavedModel under export_path.
# The signature maps 'sourceLanguageIdx' to the model input and
# 'targetLanguageDistribution' to the model output.
# NOTE(review): the output below warns that simple_save will only remain available as
# tf.compat.v1.saved_model.simple_save.
# NOTE(review): simple_save presumably refuses to write into an existing export
# directory, so re-running this cell may need a new version or a cleaned-up
# export_path -- confirm.
tf.saved_model.simple_save(tf.keras.backend.get_session(),     # ref: 16
                           export_path, 
                           inputs = {'sourceLanguageIdx': T2.model.input}, 
                           outputs = {'targetLanguageDistribution': T2.model.output})

! ls -l {export_path}

Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.simple_save.
Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.utils.build_tensor_info or tf.compat.v1.saved_model.build_tensor_info.
INFO:tensorflow:Assets added to graph.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: /root/shared/workspace/Dev/translate/checkpoints/translator_v3/2/saved_model.pb
total 14616
-rwxrwxrwx 1 root root 14958933 May 15 17:15 saved_model.pb
drwxrwxrwx 1 root root     4096 May 15 17:15 variables


In [32]:
!saved_model_cli show --dir {export_path} --all    # ref: 16


MetaGraphDef with tag-set: 'serve' contains the following SignatureDefs:

signature_def['serving_default']:
  The given SavedModel SignatureDef contains the following input(s):
    inputs['sourceLanguageIdx'] tensor_info:
        dtype: DT_FLOAT
        shape: (-1, 20)
        name: sourceLanguageIdx_1:0
  The given SavedModel SignatureDef contains the following output(s):
    outputs['targetLanguageDistribution'] tensor_info:
        dtype: DT_FLOAT
        shape: (-1, 20, 4004)
        name: convertSwapLayer3D_1/transpose:0
  Method name is: tensorflow/serving/predict


##### 4.2. Request the serving model

In [33]:
 senIdxPadExample.tolist()

[[1, 41, 18, 11, 592, 223, 126, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

In [34]:
# JSON request body: the signature to call plus the padded index sequences as nested lists
params = {
    "signature_name": "serving_default",
    "instances": senIdxPadExample.tolist(),
}
data = bytes(json.dumps(params), 'utf-8')
print(data)

b'{"signature_name": "serving_default", "instances": [[1, 41, 18, 11, 592, 223, 126, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}'


In [35]:
import urllib.request    # `import urllib` alone does not guarantee that the `request` submodule is loaded

# NOTE: hard-coded address of the serving host; adjust it to the actual deployment
url = 'http://192.168.31.101:8501/v1/models/translator_v3:predict'
req = urllib.request.Request(url, data, headers = {'Content-Type': 'application/json'})    # data is the JSON body built above
opener = urllib.request.build_opener()

# bounded wait instead of hanging forever on an unreachable server;
# the context manager closes the connection once the body is read
with opener.open(req, timeout = 30) as resp:
    response = resp.read()

prediction = json.loads(response)['predictions']

In [36]:
np.array(prediction).shape

(1, 20, 4004)

In [37]:
# highest-scoring term index per step for the first (only) sentence in the response
predictionArray = np.asarray(prediction)
transIdxList = predictionArray.argmax(axis = -1)[0]
print(transIdxList)
pCh3.retrive_terms(transIdxList)

[3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3]


'<unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>'

### Reference
1. https://blog.csdn.net/m0_38007695/article/details/84723848
2. https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb
3. https://www.coursera.org/learn/nlp-sequence-models/home/week/3
4. https://blog.csdn.net/bmjhappy/article/details/80512917
5. https://machinelearningmastery.com/return-sequences-and-return-states-for-lstms-in-keras/
6. https://www.tensorflow.org/api_docs/python/tf/nn/softmax_cross_entropy_with_logits_v2
7. https://stackoverflow.com/questions/53580544/why-do-i-get-error-while-trying-to-build-an-architecture-with-multiple-inputs-in
8. https://github.com/keras-team/keras/issues/6263
9. https://www.tensorflow.org/guide/keras#input_tfdata_datasets
10. https://stackoverflow.com/questions/50873615/how-to-obtain-the-runtime-batch-size-of-a-keras-model
11. https://github.com/keras-team/keras/issues/4781
12. https://github.com/keras-team/keras/issues/8343#issuecomment-385103183  
13. https://github.com/CharlesWu123/SelfStudyTF/blob/master/TED_process/seq2seq_train.py  
14. https://adamtiger.github.io/NNSharp/recurrents/  (kernel & recurrent kernel, recurrent activation & activation) 
15. https://blog.csdn.net/MyArrow/article/details/53445369  (orthogonal matrix)  
16. https://colab.research.google.com/github/tensorflow/tfx/blob/master/docs/tutorials/serving/rest_simple.ipynb  
17. https://stackoverflow.com/questions/45544928/tensorflow-serving-no-versions-of-servable-model-found-under-base-path  
18. https://www.tensorflow.org/tfx/serving/api_rest  
19. Shen Li, Zhe Zhao, Renfen Hu, Wensi Li, Tao Liu, Xiaoyong Du, Analogical Reasoning on Chinese Morphological and Semantic Relations, ACL 2018.
20. https://github.com/Embedding/Chinese-Word-Vectors  
21. https://github.com/tensorflow/serving/blob/master/tensorflow_serving/g3doc/serving_basic.md  
22. https://stackoverflow.com/questions/41162876/get-weight-matrices-from-gensim-word2vec  
23. https://stackoverflow.com/questions/53249304/how-to-get-the-list-all-existing-loggers-using-python-logging-module  

In [88]:
1/3

0.3333333333333333