<a href="https://colab.research.google.com/github/vvshyer/tensorflow2.0_learning/blob/master/seq2seq_attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the pinned tensorflow-gpu 2.0.0-beta1 package (-q keeps pip quiet), then import it.
!pip install -q tensorflow-gpu==2.0.0-beta1
import tensorflow as tf

[K     |████████████████████████████████| 348.9MB 58kB/s 
[K     |████████████████████████████████| 501kB 38.3MB/s 
[K     |████████████████████████████████| 3.1MB 39.7MB/s 
[?25h

In [3]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

# Record the interpreter and library versions this run is using.
print(tf.__version__, sys.version_info, sep='\n')
for module in (mpl, np, pd, sklearn, tf, keras):
    print(module.__name__, module.__version__)

2.0.0-beta1
sys.version_info(major=3, minor=6, micro=8, releaselevel='final', serial=0)
matplotlib 3.0.3
numpy 1.16.4
pandas 0.24.2
sklearn 0.21.3
tensorflow 2.0.0-beta1
tensorflow.python.keras.api._v2.keras 2.2.4-tf


In [0]:
# 1. preprocessing data
# 2. build model
#   2.1 encoder
#   2.2 attention
#   2.3 decoder
# 3. evaluation
#   3.1 given sentence, return translated results
#   3.2 visualize results (attention)

In [0]:
# Download the Spanish-English sentence-pair archive and extract it.
# The archive is fetched over HTTPS rather than plain HTTP.
path_to_zip = tf.keras.utils.get_file(
    'spa-eng.zip',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True)

# Build the path with os.path.join instead of string concatenation so the
# separator is always the right one for the host platform.
path_to_file = os.path.join(os.path.dirname(path_to_zip), 'spa-eng', 'spa.txt')

In [0]:
import unicodedata

def unicode_to_ascii(s):
    """Strip accent marks from ``s``.

    The text is decomposed with Unicode NFD normalization and every character
    whose category is ``Mn`` (nonspacing mark) is dropped. All other
    characters, including non-ASCII ones without a decomposition, are kept.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = []
    for ch in decomposed:
        if unicodedata.category(ch) == 'Mn':
            continue
        kept.append(ch)
    return ''.join(kept)

In [0]:
import re  # regular expressions
def preprocess_sentence(s):
    """Normalize a raw sentence and wrap it in ``<start>`` / ``<end>`` markers.

    Steps: lowercase, trim, strip accents (via ``unicode_to_ascii``), put
    spaces around the punctuation ``? . ! , ¿``, turn every other
    non-letter character into a separator, and join the remaining tokens with
    single spaces.

    Args:
        s: the raw sentence.

    Returns:
        ``'<start> tok1 tok2 ... <end>'`` with exactly one space between
        tokens; an empty sentence gives ``'<start> <end>'``.
    """
    s = unicode_to_ascii(s.lower().strip())

    # Surround punctuation with spaces so each mark becomes its own token.
    s = re.sub(r"([?.!,¿])", r" \1 ", s)

    # Replace each *run* of characters that are neither letters nor the kept
    # punctuation (spaces included) with one space. Replacing them one by one
    # would leave several consecutive spaces behind, e.g. for digits or
    # apostrophes next to a space.
    s = re.sub(r'[^a-zA-Z?.!,¿]+', " ", s)

    # split() drops leading/trailing whitespace and any empty tokens, so the
    # joined result never contains doubled spaces, even for empty input.
    return ' '.join(['<start>', *s.split(), '<end>'])

In [26]:
def parse_data(filename):
    """Read a tab-separated parallel corpus and preprocess both sides.

    Each line of ``filename`` is expected to hold an English sentence and its
    Spanish translation separated by a tab. Any additional tab-separated
    fields on a line are ignored, and lines with fewer than two fields are
    skipped instead of aborting the whole parse.

    Args:
        filename: path of the UTF-8 encoded corpus file.

    Returns:
        The iterator produced by ``zip(*pairs)``. It yields two tuples: all
        preprocessed English sentences, then all preprocessed Spanish ones.
    """
    # The context manager closes the file handle as soon as the text is read.
    with open(filename, encoding='UTF-8') as f:
        lines = f.read().strip().split('\n')

    preprocessed_sentence_pairs = []
    for line in lines:
        fields = line.split('\t')
        if len(fields) < 2:
            continue  # blank or malformed line: nothing to pair up
        en, sp = fields[:2]
        preprocessed_sentence_pairs.append(
            (preprocess_sentence(en), preprocess_sentence(sp)))
    return zip(*preprocessed_sentence_pairs)

# Parse the corpus, then show the last sentence pair as a sanity check.
en_dataset, sp_dataset = parse_data(path_to_file)
print(en_dataset[-1], sp_dataset[-1], sep='\n')

<start> if you want to sound like a native speaker , you must be willing to practice saying the same sentence over and over in the same way that banjo players practice the same phrase over and over until they can play it correctly and at the desired tempo . <end>
<start> si quieres sonar como un hablante nativo , debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un musico de banjo practica el mismo fraseo una y otra vez hasta que lo puedan tocar correctamente y en el tiempo esperado . <end>


In [25]:
# zip + argument unpacking: `*a` passes each pair as a separate argument,
# and zip then regroups the elements by position.
a = [(1, 2), (3, 4), (5, 6)]
c, d = (tuple(column) for column in zip(*a))
print(c, d)

(1, 3, 5) (2, 4, 6)


In [30]:
# Convert text sentences into sequences of integer ids
def tokenizer(lang):
    """Fit a Keras ``Tokenizer`` on ``lang`` and encode ``lang`` with it.

    Args:
        lang: the sentences to fit on and to encode.

    Returns:
        A ``(tensor, lang_tokenizer)`` pair: the id sequences for ``lang``
        after ``pad_sequences(padding='post')``, and the fitted tokenizer.
    """
    text = keras.preprocessing.text
    sequence = keras.preprocessing.sequence

    # No vocabulary cap and no character filtering; tokens are split on spaces.
    lang_tokenizer = text.Tokenizer(num_words=None, filters='', split=' ')
    lang_tokenizer.fit_on_texts(lang)

    id_sequences = lang_tokenizer.texts_to_sequences(lang)
    padded = sequence.pad_sequences(id_sequences, padding='post')
    return padded, lang_tokenizer

# Only the first 30000 sentence pairs are used: the Spanish side becomes the
# input tensors, the English side the output tensors.
input_tensor, input_tokenizer = tokenizer(lang=sp_dataset[:30000])
output_tensor, output_tokenizer = tokenizer(lang=en_dataset[:30000])

def max_length(tensor):
    """Return the length of the longest element of ``tensor``."""
    return max(map(len, tensor))

# Longest row on the input and output sides.
max_length_input, max_length_output = (
    max_length(input_tensor), max_length(output_tensor))
print(max_length_input, max_length_output)

16 11


In [31]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the pairs for evaluation. The seed is fixed so that a
# fresh "Restart & Run All" reproduces the same train/eval split.
input_train, input_eval, output_train, output_eval = train_test_split(
    input_tensor, output_tensor, test_size=0.2, random_state=42)

len(input_train), len(input_eval), len(output_train), len(output_eval)

(24000, 6000, 24000, 6000)

In [32]:
def convert(example, tokenizer):
    """Print an ``id --> word`` line for every non-zero id in ``example``.

    Ids equal to 0 are skipped; every other id is looked up in
    ``tokenizer.index_word``.
    """
    for token_id in example:
        if token_id == 0:
            continue
        word = tokenizer.index_word[token_id]
        print('%d --> %s' % (token_id, word))
            
# Show the first training pair as id --> word, input side first.
convert(example=input_train[0], tokenizer=input_tokenizer)
print()
convert(example=output_train[0], tokenizer=output_tokenizer)

1 --> <start>
85 --> ese
95 --> libro
7 --> es
14 --> de
54 --> ellos
3 --> .
2 --> <end>

1 --> <start>
20 --> that
113 --> book
8 --> is
979 --> theirs
3 --> .
2 --> <end>


In [0]:
def make_dataset(input_tensor, output_tensor, batch_size, epochs, shuffle):
    """Build a batched ``tf.data.Dataset`` of (input, output) pairs.

    Args:
        input_tensor: input examples, passed to ``from_tensor_slices``.
        output_tensor: output examples, paired with ``input_tensor``.
        batch_size: number of examples per batch; remainders are dropped.
        epochs: count handed to ``Dataset.repeat``.
        shuffle: if true, shuffle with a buffer of 30000 before repeating.

    Returns:
        The shuffled (optionally), repeated and batched dataset.
    """
    pairs = tf.data.Dataset.from_tensor_slices((input_tensor, output_tensor))
    if shuffle:
        pairs = pairs.shuffle(30000)
    repeated = pairs.repeat(epochs)
    return repeated.batch(batch_size, drop_remainder=True)

batch_size, epochs = 64, 20

# Training data is shuffled and repeated `epochs` times; evaluation data is a
# single unshuffled pass.
train_dataset = make_dataset(
    input_train, output_train,
    batch_size=batch_size, epochs=epochs, shuffle=True)
eval_dataset = make_dataset(
    input_eval, output_eval,
    batch_size=batch_size, epochs=1, shuffle=False)

In [35]:
# Peek at one training batch: both shapes first, then the id tensors.
for x, y in train_dataset.take(1):
    print(x.shape, y.shape, x, y, sep='\n')

(64, 16)
(64, 11)
tf.Tensor(
[[   1 1733   13 ...    0    0    0]
 [   1 1684   44 ...    0    0    0]
 [   1  293   10 ...    0    0    0]
 ...
 [   1    9   16 ...    0    0    0]
 [   1   13  327 ...    0    0    0]
 [   1  407  349 ...    0    0    0]], shape=(64, 16), dtype=int32)
tf.Tensor(
[[   1    4  769 2255    3    2    0    0    0    0    0]
 [   1  317   33 1353    3    2    0    0    0    0    0]
 [   1   36   15   31  165   37    2    0    0    0    0]
 [   1   19    8   13  278 1353    3    2    0    0    0]
 [   1    5    8    9 1717    3    2    0    0    0    0]
 [   1   27 1278  204    3    2    0    0    0    0    0]
 [   1   16   23 1698    3    2    0    0    0    0    0]
 [   1   14  145   39  131    3    2    0    0    0    0]
 [   1    6  108   73   68    3    2    0    0    0    0]
 [   1   16   67   29 1479    3    2    0    0    0    0]
 [   1    4   43   68    3    2    0    0    0    0    0]
 [   1   28  331   10    3    2    0    0    0    0    0]
 [   1