In [1]:
import tensorflow as tf
import sentencepiece as spm
import time
import collections
import numpy as np

## SentencePiece Tokenization

In [2]:
def _load_sentencepiece(model_path):
    """Create a SentencePieceProcessor and load the model file at ``model_path``.

    Args:
        model_path: path to a trained SentencePiece ``.model`` file.

    Returns:
        The processor with the model loaded.

    Raises:
        IOError: if ``Load`` reports failure through its return value.
    """
    processor = spm.SentencePieceProcessor()
    # Load() hands back a status flag. Previously the encoder's flag was thrown
    # away, so a wrong path could go unnoticed until encoding produced garbage.
    # NOTE(review): some sentencepiece releases may raise on failure instead of
    # returning False -- this check is a no-op in that case; confirm for the
    # installed version.
    if processor.Load(model_path) is False:
        raise IOError("failed to load SentencePiece model: %s" % model_path)
    return processor


# Tokenizer applied to the English side (./data/train.en).
sp_enc = _load_sentencepiece("sentencepiece_model/enc.model")

# Tokenizer applied to the German side (./data/train.de).
sp_dec = _load_sentencepiece("sentencepiece_model/dec.model")

True

In [4]:
def _encode_file(path, processor):
    """Tokenize every line of ``path`` into SentencePiece ids.

    Args:
        path: text file to read, one sentence per line.
        processor: loaded SentencePieceProcessor used to encode each line.

    Returns:
        A list containing one list of token ids per line, in file order.
    """
    with open(path, "r") as f:
        # Iterate the file object directly: readlines() would first build a
        # complete in-memory copy of the corpus before any line is encoded.
        # The line terminator is stripped so it is never fed to the tokenizer.
        # NOTE(review): with SentencePiece's default normalization this should
        # not change the produced ids -- confirm for these models.
        return [processor.EncodeAsIds(line.rstrip("\r\n")) for line in f]


t = time.time()
train_enc = _encode_file("./data/train.en", sp_enc)
print("%.2f minutes to process train.en" % ((time.time() - t) / 60))

t = time.time()
train_dec = _encode_file("./data/train.de", sp_dec)
print("%.2f minutes to process train.de" % ((time.time() - t) / 60))

7.96 minutes to process train.en
8.64 minutes to process train.de


## TFRecord Dataset Build

In [5]:
def create_int_feature(values):
    """Wrap an iterable of integers in a ``tf.train.Feature`` backed by an Int64List.

    Args:
        values: iterable of integers; it is copied into a plain list first.

    Returns:
        A ``tf.train.Feature`` whose ``int64_list`` holds the given values.
    """
    int_values = list(values)
    int64_list = tf.train.Int64List(value=int_values)
    return tf.train.Feature(int64_list=int64_list)

In [17]:
t = time.time()

# Every source sentence must have exactly one target sentence.
assert (len(train_enc) == len(train_dec))

enc_max_len = 128
dec_max_len = 128
num_shards = 4  # number of output .tfrecord files; examples are dealt round-robin


def _pad_or_truncate(ids, max_len):
    """Fit ``ids`` to exactly ``max_len`` entries and build the matching mask.

    Args:
        ids: list of token ids for one sentence.
        max_len: fixed length of the returned lists.

    Returns:
        ``(padded_ids, mask)``: ``padded_ids`` is ``ids`` cut to ``max_len`` and
        right-padded with 0; ``mask`` is 1 at kept-token positions and 0 at
        padding positions.
    """
    # NOTE(review): 0 doubles as the padding id -- confirm that id 0 is not a
    # meaningful token in the SentencePiece vocabularies.
    kept = list(ids[:max_len])
    pad = max_len - len(kept)
    return kept + [0] * pad, [1] * len(kept) + [0] * pad


writer = []
try:
    for f_num in range(num_shards):
        writer.append(tf.python_io.TFRecordWriter('./data/wmt2014_' + str(f_num) + '.tfrecord'))

    total = len(train_enc)
    next_percent = 10
    for i in range(total):
        # Integer arithmetic: repeatedly adding 0.1 to a float drifts, which made
        # the last progress line read "89 percent" instead of 90.
        while i * 100 > next_percent * total:
            print("%d percent proceeded" % next_percent)
            next_percent += 10

        enc_input_id, enc_input_mask = _pad_or_truncate(train_enc[i], enc_max_len)
        dec_input_id, dec_input_mask = _pad_or_truncate(train_dec[i], dec_max_len)

        features = collections.OrderedDict()

        features['enc_input_id'] = create_int_feature(enc_input_id)
        features['enc_input_mask'] = create_int_feature(enc_input_mask)
        features['dec_input_id'] = create_int_feature(dec_input_id)
        features['dec_input_mask'] = create_int_feature(dec_input_mask)

        example = tf.train.Example(features=tf.train.Features(feature=features))

        # Round-robin over the shard writers.
        writer[i % len(writer)].write(example.SerializeToString())
finally:
    # Close every writer that was opened, even if the loop above raised.
    for shard_writer in writer:
        shard_writer.close()

print("\n%.2f minutes to create TFRecord dataset" % ((time.time() - t) / 60))

10 percent proceeded
20 percent proceeded
30 percent proceeded
40 percent proceeded
50 percent proceeded
60 percent proceeded
70 percent proceeded
80 percent proceeded
89 percent proceeded

14.85 minutes to create TFRecord dataset


## Dataset Load Test

In [18]:
# Switch TensorFlow to eager execution so the dataset built in the following
# cells can be iterated with a plain Python for-loop and its tensors inspected.
# NOTE(review): TF 1.x expects this call before any graph ops are created; it
# presumably works here because the earlier cells only build protos and write
# files -- re-check if cells are reordered.
tf.enable_eager_execution()

In [19]:
def _fixed_int64(length):
    """Return the parsing spec for an int64 feature with exactly ``length`` entries."""
    return tf.FixedLenFeature([length], tf.int64)


# Feature name -> parsing spec. Names and lengths mirror the features written
# into each tf.train.Example by the TFRecord build cell.
name_to_features = {
    'enc_input_id': _fixed_int64(enc_max_len),
    'enc_input_mask': _fixed_int64(enc_max_len),
    'dec_input_id': _fixed_int64(dec_max_len),
    'dec_input_mask': _fixed_int64(dec_max_len),
}

def _decode_record(record, name_to_features) :
    example = tf.parse_single_example(record, name_to_features)
    
    return example

# One path per shard file produced by the TFRecord build cell.
filenames = ['./data/wmt2014_' + str(f_num) + '.tfrecord' for f_num in range(0, 4)]


def _parse_record(record):
    """Decode one serialized example using the notebook-level ``name_to_features``."""
    return _decode_record(record, name_to_features)


# Dataset over all shard files; each record is parsed and then wrapped in a
# batch of size 1, so every element carries a leading batch dimension.
d = tf.data.TFRecordDataset(filenames).apply(
    tf.contrib.data.map_and_batch(_parse_record, batch_size=1))

In [25]:
select = 25  # 1-based position of the example to print

# The marker stripped from each piece is U+2581, written in the original cell as
# its UTF-8 bytes. That byte literal only matches when pieces are byte strings
# (Python 2 ``str``); on a text-string runtime the code point is needed instead.
# NOTE(review): assumes IdToPiece returns the runtime's native ``str`` type.
word_marker = "\xe2\x96\x81" if isinstance("", bytes) else u"\u2581"


def _pieces_text(processor, ids, mask):
    """Turn the unpadded part of one id sequence into a space-separated string.

    Args:
        processor: SentencePieceProcessor used to map ids back to pieces.
        ids: padded id sequence of a single example.
        mask: matching mask; its sum is taken as the number of real tokens.

    Returns:
        The pieces of the real tokens, word markers removed, joined by spaces.
    """
    length = int(np.sum(mask))
    pieces = [processor.IdToPiece(int(token)).replace(word_marker, "")
              for token in ids[0:length]]
    return " ".join(pieces)


# Skip straight to the selected example instead of counting in a Python loop;
# index [0] removes the batch dimension added by batch_size=1.
for info in d.skip(select - 1).take(1):
    print(_pieces_text(sp_enc, info['enc_input_id'][0], info['enc_input_mask'][0]))
    print(_pieces_text(sp_dec, info['dec_input_id'][0], info['dec_input_mask'][0]))

Apart from being Hungary ’ s principal political , commercial , industrial and transportation centre , the city of Budapest boast s sites , monuments and spas of worldwide renown . 
Budapest ist nicht nur das politische , wirtschaftliche , industrielle und verkehrs technische Herz Ungarns , sondern rühmt sich auch weltweit bekannter Sehenswürdigkei ten , Denkmäler und Bäder . 
