In [2]:
!pip3 install tensorflow_datasets dataclasses typing_extensions importlib_resources zipp tensorflow_metadata dill promise --no-deps

Collecting tensorflow_datasets
  Downloading https://files.pythonhosted.org/packages/35/b7/539d492854096df09229a7d0f373c5b4f26f896013f3e00c54172deefb9a/tensorflow_datasets-4.5.2-py3-none-any.whl (4.2MB)
[K    100% |████████████████████████████████| 4.2MB 454kB/s eta 0:00:011
[?25hCollecting dataclasses
  Downloading https://files.pythonhosted.org/packages/fe/ca/75fac5856ab5cfa51bbbcefa250182e50441074fdc3f803f6e76451fab43/dataclasses-0.8-py3-none-any.whl
Collecting typing_extensions
  Downloading https://files.pythonhosted.org/packages/05/e4/baf0031e39cf545f0c9edd5b1a2ea12609b7fcba2d58e118b11753d68cf0/typing_extensions-4.0.1-py3-none-any.whl
Collecting importlib_resources
  Downloading https://files.pythonhosted.org/packages/24/1b/33e489669a94da3ef4562938cd306e8fa915e13939d7b8277cb5569cb405/importlib_resources-5.4.0-py3-none-any.whl
Collecting zipp
  Downloading https://files.pythonhosted.org/packages/bd/df/d4a4974a3e3957fd1c1fa3082366d7fff6e428ddb55f074bf64876f8e8ad/zipp-3.6.0-py3-no

In [3]:
!pip3 install nltk

Collecting nltk
  Downloading https://files.pythonhosted.org/packages/c5/ea/84c7247f5c96c5a1b619fe822fb44052081ccfbe487a49d4c888306adec7/nltk-3.6.7-py3-none-any.whl (1.5MB)
[K    100% |████████████████████████████████| 1.5MB 901kB/s eta 0:00:01
[?25hCollecting regex>=2021.8.3 (from nltk)
  Downloading https://files.pythonhosted.org/packages/90/37/b681d58e0867ea8958dcde0458e0632ce41b29b2772505788a37e25f26b4/regex-2022.1.18-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (670kB)
[K    100% |████████████████████████████████| 675kB 2.7MB/s eta 0:00:01
[?25hCollecting tqdm (from nltk)
  Downloading https://files.pythonhosted.org/packages/63/f3/b7a1b8e40fd1bd049a34566eb353527bb9b8e9b98f8b6cf803bb64d8ce95/tqdm-4.62.3-py2.py3-none-any.whl (76kB)
[K    100% |████████████████████████████████| 81kB 7.5MB/s eta 0:00:01
[?25hCollecting joblib (from nltk)
  Downloading https://files.pythonhosted.org/packages/3e/d5/0163eb0cfa0b673aa4fe1cd3ea9d8a8

In [1]:
import tensorflow_datasets as tfds
import tensorflow_addons as tfa
import tensorflow as tf
import os
import json
import datetime
import numpy as np
import nltk
from sklearn.model_selection import train_test_split
import time
%load_ext tensorboard

  from ._conv import register_converters as _register_converters


In [2]:
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [None]:
# Download datasets manually from https://www.statmt.org/wmt14/translation-task.html
# This notebook targets ru-en translation task, so we use here appropriate datasets:
# * 1M Yandex Corpus 
# * newstest2012...newstest2017

In [4]:
datasets_dir = 'datasets'
if not os.path.exists(datasets_dir):
    os.makedirs(datasets_dir)

In [5]:
EN_TOKENIZER_PATH = 'en_tokenizer.json'
RU_TOKENIZER_PATH = 'ru_tokenizer.json'
NUM_WORDS = 30000
NO_CACHED_TOKENIZER = False
MAX_LENGTH = 50 + 2


def init_tokenizer(tokenizer_path, texts):
    if not os.path.exists(tokenizer_path) or NO_CACHED_TOKENIZER:
        print('initializing tokenizer and storing it to', tokenizer_path)
        tokenizer = tf.keras.preprocessing.text.Tokenizer(
            num_words=NUM_WORDS,
            filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n',
            lower=True, split=' ', char_level=False, oov_token='<UNK>',
            document_count=0
        )

        tokenizer.fit_on_texts(texts)
        with open(tokenizer_path, 'w') as f:
            f.write(tokenizer.to_json())
    else:
        print('loading tokenizer from', tokenizer_path)
        with open(tokenizer_path, 'r') as f:
            tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(f.read())
    return tokenizer


def load_datasets():
    config = tfds.translate.wmt.WmtConfig(
        version="0.0.1",
        language_pair=("ru", "en"),
        subsets={
            tfds.Split.TRAIN: ["yandexcorpus"],
            tfds.Split.VALIDATION: ["newstest2012", 'newstest2013', 'newstest2014', 'newstest2015', 'newstest2016', 'newstest2017'],
        },
    )
    
    data_dir = '/home/andysilv/yandexsdc/seminars/attention/tensorflow_datasets'
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    
    builder = tfds.builder("wmt_translate", config=config, data_dir=data_dir)

    download_config = tfds.download.DownloadConfig(manual_dir=datasets_dir, extract_dir=datasets_dir)
    builder.download_and_prepare(download_config=download_config, download_dir=datasets_dir)
    return builder.as_dataset(as_supervised=True)


def preprocess_sentence(s):
    return '<start> ' + s.decode('utf-8') + ' <end>'


def build_tokenizers(datasets):
    ru_texts, en_texts = zip(*[
        (preprocess_sentence(ru), preprocess_sentence(en))
        for ru, en in datasets['train'].as_numpy_iterator()])
    en_tokenizer = init_tokenizer(EN_TOKENIZER_PATH, en_texts)
    ru_tokenizer = init_tokenizer(RU_TOKENIZER_PATH, ru_texts)
    
    return ru_tokenizer, en_tokenizer, ru_texts, en_texts

In [6]:
datasets = load_datasets()



In [7]:
ru_tokenizer, en_tokenizer, ru_texts, en_texts = build_tokenizers(datasets)

loading tokenizer from en_tokenizer.json
loading tokenizer from ru_tokenizer.json


In [8]:
input_tensor = ru_tokenizer.texts_to_sequences(ru_texts)
target_tensor = en_tokenizer.texts_to_sequences(en_texts)

In [9]:
input_tensor, target_tensor = zip(*[(ru, en) for ru, en in zip(input_tensor, target_tensor)
                                    if len(ru) <= MAX_LENGTH and len(en) <= MAX_LENGTH])

In [10]:
# Creating training and validation sets using an 80-20 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

# Show length
len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val)

(764070, 764070, 191018, 191018)

In [11]:
class GruCell(tf.keras.layers.Layer):
    def __init__(self, hidden_unit):
        super(GruCell, self).__init__()
        ortho_initializer = tf.keras.initializers.Orthogonal()
        normal_initializer = tf.keras.initializers.RandomNormal(mean=0., stddev=0.01)
        self.W = tf.keras.layers.Dense(hidden_unit, kernel_initializer=normal_initializer)
        self.Wr = tf.keras.layers.Dense(hidden_unit, kernel_initializer=normal_initializer)
        self.Wz = tf.keras.layers.Dense(hidden_unit, kernel_initializer=normal_initializer)
        self.U = tf.keras.layers.Dense(hidden_unit, kernel_initializer=ortho_initializer)
        self.Ur = tf.keras.layers.Dense(hidden_unit, kernel_initializer=ortho_initializer)
        self.Uz = tf.keras.layers.Dense(hidden_unit, kernel_initializer=ortho_initializer)

    def call(self, inputs):
        prev_h, x = inputs
        
        r = tf.sigmoid(self.Wr(x) + self.Ur(prev_h))
        z = tf.sigmoid(self.Wz(x) + self.Uz(prev_h))
        h_ = tf.tanh(self.W(x) + self.U(tf.multiply(prev_h, r)))
        h = tf.multiply((1 - z), prev_h) + tf.multiply(z, h_)
        return h
        

class GruWithContextCell(GruCell):
    def __init__(self, hidden_unit):
        super(GruWithContextCell, self).__init__(hidden_unit)
        
        normal_initializer = tf.keras.initializers.RandomNormal(mean=0., stddev=0.01)
        self.C = tf.keras.layers.Dense(hidden_unit, kernel_initializer=normal_initializer)
        self.Cz = tf.keras.layers.Dense(hidden_unit, kernel_initializer=normal_initializer)
        self.Cr = tf.keras.layers.Dense(hidden_unit, kernel_initializer=normal_initializer)

    def call(self, inputs):
        prev_h, x, ctx = inputs
        r = tf.sigmoid(self.Wr(x) + self.Ur(prev_h) + self.Cr(ctx))
        z = tf.sigmoid(self.Wz(x) + self.Uz(prev_h) + self.Cz(ctx))
        h_ = tf.tanh(self.W(x) + self.U(tf.multiply(prev_h, r) + self.C(ctx)))
        h = tf.multiply((1 - z), prev_h) + tf.multiply(z, h_)
        return h

In [12]:
class GruLayer(tf.keras.layers.Layer):
    def __init__(self, hidden_unit, return_sequences, return_state, go_backwards=False):
        super(GruLayer, self).__init__()
        
        self._gru_cell = GruCell(hidden_unit)
        self._return_sequences = return_sequences
        self._return_state = return_state
        self._go_backwards = go_backwards
    
    def call(self, inputs):
        hidden, x = inputs
        all_hidden = []
        if not self._go_backwards:
            for i in range(x.shape[1]):
                hidden = self._gru_cell([hidden, x[:,i,:]])
                all_hidden.append(hidden)
        else:
            for i in range(x.shape[1] - 1, -1, -1):
                hidden = self._gru_cell([hidden, x[:,i,:]])
                all_hidden.append(hidden)
            all_hidden.reverse()
        if self._return_sequences and self._return_state:
            return all_hidden, hidden
        elif self._return_sequences:
            return all_hidden
        elif self._return_state:
            return hidden
        

        

In [13]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units):
        super(Encoder, self).__init__()
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.forward_gru = GruLayer(
            enc_units, return_sequences=True, 
            return_state=False)
        self.backward_gru = GruLayer(
            enc_units, return_sequences=True, 
            return_state=True)
        
    def call(self, x, hidden):
        x = self.embedding(x)
        forward_output = self.forward_gru((hidden, x))
        backward_output, state = self.backward_gru((hidden, x))
        output = tf.concat([forward_output, backward_output], axis=-1)
        output = tf.transpose(output, perm=[1, 0, 2])
        return output, state
    
    def initialize_hidden_state(self, batch_size):
        return tf.zeros((batch_size, self.enc_units))

In [14]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, maxout_size):
        super(Decoder, self).__init__()
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = GruWithContextCell(dec_units)
        self.fc = tf.keras.layers.Dense(vocab_size)
        
        # used for attention
        self.Ws = tf.keras.layers.Dense(self.dec_units)
        self.W0 = tf.keras.layers.Dense(self.dec_units)
        self.W1 = tf.keras.layers.Dense(self.dec_units)
        self.W2 = tf.keras.layers.Dense(self.dec_units)
        self.U0 = tf.keras.layers.Dense(2 * maxout_size)
        self.V0 = tf.keras.layers.Dense(2 * maxout_size)
        self.C0 = tf.keras.layers.Dense(2 * maxout_size)
        self.V = tf.keras.layers.Dense(1)
        self.maxout = tfa.layers.Maxout(maxout_size)
        
    def call(self, x, hidden, enc_output):
        # enc_output shape == (batch_size, max_length, hidden_size)
        
        # hidden shape == (batch_size, hidden size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden size)
        # we are doing this to perform addition to calculate the score
        
        hidden_with_time_axis = tf.expand_dims(hidden, 1)
        
        # score shape == (batch_size, max_length, hidden_size)
        score = tf.nn.tanh(self.W1(enc_output) + self.W2(hidden_with_time_axis))
        
        # attention_weights shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        attention_weights = tf.nn.softmax(self.V(score), axis=1)
        
        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * enc_output
        context_vector = tf.reduce_sum(context_vector, axis=1)
        
        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)
        x = tf.reshape(x, (-1, x.shape[2]))
        
        state = self.gru([hidden, x, context_vector])
        
        t_ = self.U0(hidden) + self.V0(x) + self.C0(context_vector)
        t = self.maxout(t_)
        x = self.fc(t)
        return x, state, attention_weights
        
        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        # x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        
        # passing the concatenated vector to the GRU
        # output, state = self.gru(x)
        
        # output shape == (batch_size * 1, hidden_size)
        # output = tf.reshape(output, (-1, output.shape[2]))
        
        # output shape == (batch_size * 1, vocab)
        # x = self.fc(output)
        
        # return x, state, attention_weights
        
    def initialize_hidden_state(self, enc_hidden):
        return tf.tanh(self.Ws(enc_hidden))

In [15]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 80
SORT_BATCH_SIZE = BATCH_SIZE * 20
N_BATCH = BUFFER_SIZE // BATCH_SIZE
WORDS_EMBEDDING_SIZE = 256
HIDDEN_STATES = 1000
INPUT_VOCAB_SIZE = NUM_WORDS + 1  # pad
TARGET_VOCAB_SIZE = NUM_WORDS + 1  # pad

In [16]:
encoder = Encoder(INPUT_VOCAB_SIZE, WORDS_EMBEDDING_SIZE, HIDDEN_STATES)
decoder = Decoder(TARGET_VOCAB_SIZE, WORDS_EMBEDDING_SIZE, HIDDEN_STATES, 500)

In [17]:
optimizer = tf.keras.optimizers.Adam()

def loss_function(real, pred):
    # ignore paddings in loss, which have 0 index
    mask = 1 - np.equal(real, 0)
    nonzeros = np.count_nonzero(real)
    assert nonzeros > 0
    loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred) * mask
    return tf.reduce_mean(loss_) * real.shape[0] / nonzeros

In [18]:
checkpoint_dir = './training_checkpoints_custom'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [19]:
latest = tf.train.load_checkpoint(checkpoint_dir)
# try:
#     checkpoint.restore(latest)

#     encoder = checkpoint.encoder
#     decoder = checkpoint.decoder
# except Excepation as e:
#     print(e)



checkpoint.restore(latest)

TypeError: Expected binary or unicode string, got <tensorflow.python.util._pywrap_checkpoint_reader.CheckpointReader object at 0x7fee8c898d88>

In [None]:
!ls training_checkpoints/ -lsh

In [18]:
train_loss = tf.keras.metrics.Mean('train_loss', dtype=tf.float32)
test_loss = tf.keras.metrics.Mean('test_loss', dtype=tf.float32)
train_bleu = tf.keras.metrics.Mean('train_bleu', dtype=tf.float32)
test_bleu = tf.keras.metrics.Mean('test_bleu', dtype=tf.float32)

In [19]:
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
train_log_dir = 'logs/gradient_tape_custom_gru/' + current_time + '/train'
test_log_dir = 'logs/gradient_tape_custom_gru/' + current_time + '/test'
train_summary_writer = tf.summary.create_file_writer(train_log_dir)
test_summary_writer = tf.summary.create_file_writer(test_log_dir)

In [20]:
def chunks(l, n):
    n = max(1, n)
    return (l[i:i+n] for i in range(0, len(l), n))

def translate_encoded(input_tensor):
    num_sentences = len(input_tensor)
    hidden = encoder.initialize_hidden_state(num_sentences)
    
    enc_output, enc_hidden = encoder(input_tensor, hidden)
    dec_hidden = decoder.initialize_hidden_state(enc_hidden)
    dec_input = np.array([[en_tokenizer.word_index['<start>']]])
    
    beam_width = 3
    max_len = 50
    initial_result = {
        "result": [en_tokenizer.word_index['<start>']],
        "log_prob": 0
    }
    
    results_by_sentence = [[initial_result] for i in range(num_sentences)]
    current_states = [dict(
        dec_hidden=dec_hidden[idx],
        dec_input=dec_input,
        sentence_index=idx,
        enc_output=enc_output[idx],
        **initial_result
    ) for idx in range(num_sentences)]
    
    for t in range(1, max_len):
        new_current_states_by_sentence = [[] for sentence_index in range(num_sentences)]
        
        for current_states_chunk in chunks(current_states, BATCH_SIZE):            
            dec_input = tf.concat([cs['dec_input'] for cs in current_states_chunk], -2)
            enc_output = tf.concat([tf.expand_dims(cs['enc_output'], 0) for cs in current_states_chunk], 0)
            dec_hidden = tf.concat([tf.expand_dims(cs['dec_hidden'], 0) for cs in current_states_chunk], 0)
            predictions_batch, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            max_prob_inds_batch = np.argpartition(predictions_batch, -beam_width, axis=-1)[:, -beam_width:]
            for idx, (current_state, max_prob_inds, predictions) in enumerate(zip(
                    current_states_chunk, max_prob_inds_batch, predictions_batch)):
                probs = np.exp(predictions) / np.sum(np.exp(predictions))
                sentence_index = current_state['sentence_index']
                for ind in max_prob_inds:
                    res = {
                        'log_prob': current_state['log_prob'] + np.log(probs[ind]),
                        'result': current_state['result'] + [ind]
                    }
                    results_by_sentence[sentence_index].append(res)

                    if ind == en_tokenizer.word_index['<end>']:
                        continue
                    new_current_states_by_sentence[sentence_index].append(dict(
                        dec_hidden=dec_hidden[idx],
                        dec_input=np.array([[ind]]),
                        enc_output=current_states_chunk[idx]['enc_output'],
                        sentence_index=sentence_index,
                        **res
                    ))
        new_current_states = []
        for new_states in new_current_states_by_sentence:
            new_states.sort(key=lambda x: x['log_prob'])
            new_current_states.extend(new_states[-beam_width:])
        current_states = new_current_states
    
    for results in results_by_sentence:
        for r in results:
            r['normalized_log_prob'] = r['log_prob'] / len(r['result'])
        results.sort(key=lambda k: -r['normalized_log_prob'])
    return results_by_sentence


def translate(texts):
    if isinstance(texts, bytes):
        texts = [texts]
    texts = [preprocess_sentence(t) for t in texts]
    input_tensor = ru_tokenizer.texts_to_sequences(texts)
    max_len = max(len(t) for t in input_tensor)
    input_tensor = tf.keras.preprocessing.sequence.pad_sequences(
        input_tensor, 
        maxlen=max_len, padding='post', value=0)
    return translate_encoded(input_tensor)
    
    
def evaluate_bleu_on_batch(ru, en):
    results_by_sentence = translate_encoded(ru)
    chencherry = nltk.bleu_score.SmoothingFunction()
    translations = []
    for results in results_by_sentence:
        translation = ''
        for r in results:
            if r['result'][-1] != en_tokenizer.word_index['<end>']:
                continue
            translation = r['result'][1:-1]
            break
        translations.append(translation)
    return nltk.bleu_score.corpus_bleu([[np.trim_zeros(ref)[1:-1]] for ref in en], translations, smoothing_function=chencherry.method7)

In [21]:
def train_step(encoder, decoder, optimizer, inp, targ):
    encoder_initial_hidden = encoder.initialize_hidden_state(inp.shape[0])
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, encoder_initial_hidden)

        dec_hidden = decoder.initialize_hidden_state(enc_hidden)
  
        dec_input = tf.expand_dims(targ[:, 0], 1)
        # Teacher forcing - feeding the target as the next input
        loss = 0
        for t in range(1, targ.shape[1]):
            # passing enc_output to the decoder
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

            loss += loss_function(targ[:, t], predictions)

            # using teacher forcing
            dec_input = tf.expand_dims(targ[:, t], 1)
    batch_loss = loss / (targ.shape[1] - 1)
    variables = encoder.variables + decoder.variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    train_loss(batch_loss)
    return batch_loss


def test_step(encoder, decoder, inp, targ):
    encoder_initial_hidden = encoder.initialize_hidden_state(inp.shape[0])
    enc_output, enc_hidden = encoder(inp, encoder_initial_hidden)
    dec_hidden = decoder.initialize_hidden_state(enc_hidden)
    dec_input = tf.expand_dims(targ[:, 0], 1)
    loss = 0
    for t in range(1, targ.shape[1]):
        # passing enc_output to the decoder
        predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
        loss += loss_function(targ[:, t], predictions)
        # using teacher forcing
        dec_input = tf.expand_dims(targ[:, t], 1)
    batch_loss = loss / (targ.shape[1] - 1)

    test_loss(batch_loss)
    return batch_loss


In [22]:
def pad_tensors(inp, tar):
    max_length_inp = max(len(x) for x in inp)
    max_length_tar = max(len(x) for x in tar)
    inp = tf.keras.preprocessing.sequence.pad_sequences(
        inp, maxlen=max_length_inp, padding='post', value=0)
    tar = tf.keras.preprocessing.sequence.pad_sequences(
        tar, maxlen=max_length_tar, padding='post', value=0)
    return inp, tar

In [None]:
EPOCHS = 10

bleu_every_n_steps = 10
log_metrics_every_n_steps = 100

val_bleu_iter = iter(zip(chunks(input_tensor_val, BATCH_SIZE),
                     chunks(target_tensor_val, BATCH_SIZE)))
val_loss_iter = iter(zip(chunks(input_tensor_val, BATCH_SIZE),
                     chunks(target_tensor_val, BATCH_SIZE)))

for epoch in range(EPOCHS):
    start = time.time()
    total_loss = 0
    train_dataset = zip(chunks(input_tensor_train, SORT_BATCH_SIZE),
                        chunks(target_tensor_train, SORT_BATCH_SIZE))
    val_dataset = zip(chunks(input_tensor_val, BATCH_SIZE),
                      chunks(target_tensor_val, BATCH_SIZE))
    
    for sort_batch, (inp_batch, targ_batch) in enumerate(train_dataset):
        lengths = [max(len(x), len(y)) for (x, y) in zip(inp_batch, targ_batch)]
        permutation = np.argsort(lengths)
        inp_batch = np.array(inp_batch)[permutation]
        targ_batch = np.array(targ_batch)[permutation]
        
        chunked_input = chunks(inp_batch, BATCH_SIZE)
        chunked_target = chunks(targ_batch, BATCH_SIZE)
        for batch, (inp, targ) in enumerate(zip(chunked_input, chunked_target)):
            inp, targ = pad_tensors(inp, targ)
            
            batch_loss = train_step(encoder, decoder, optimizer, inp, targ)
            total_loss += batch_loss
            if step % bleu_every_n_steps == 0:
                train_bleu(evaluate_bleu_on_batch(inp, targ))
                inp, tar = next(val_bleu_iter)
                inp, tar = pad_tensors(inp, tar)
                test_bleu(evaluate_bleu_on_batch(inp, tar))
                test_step(encoder, decoder, inp, tar)
            if step % log_metrics_every_n_steps == 0:
                print('Epoch {} Batch {} Loss {:.4f} bleu {:.4f}'.format(
                        epoch + 1,
                        step,
                        batch_loss.numpy(),
                        train_bleu.result().numpy()))
                log_step = step // log_metrics_every_n_steps
                with train_summary_writer.as_default():
                    tf.summary.scalar('loss', train_loss.result(), step=log_step)
                    tf.summary.scalar('bleu', train_bleu.result(), step=log_step)
                with test_summary_writer.as_default():
                    tf.summary.scalar('loss', test_loss.result(), step=log_step)
                    tf.summary.scalar('bleu', test_bleu.result(), step=log_step)
                train_loss.reset_states()
                train_bleu.reset_states()
                test_loss.reset_states()
                test_bleu.reset_states()
            step += 1


    # saving (checkpoint) the model every 2 epochs
    checkpoint.save(file_prefix = checkpoint_prefix)
    
    print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / N_BATCH))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))



Epoch 1 Batch 13700 Loss 2.5595 bleu 0.1885
Epoch 1 Batch 13800 Loss 2.4354 bleu 0.1864
Epoch 1 Batch 13900 Loss 2.2769 bleu 0.1687
Epoch 1 Batch 14000 Loss 2.4552 bleu 0.1894
Epoch 1 Batch 14100 Loss 2.2641 bleu 0.1364
Epoch 1 Batch 14200 Loss 2.9326 bleu 0.1750
Epoch 1 Batch 14300 Loss 2.2852 bleu 0.1588
Epoch 1 Batch 14400 Loss 2.3218 bleu 0.1794
Epoch 1 Batch 14500 Loss 2.3251 bleu 0.1595
Epoch 1 Batch 14600 Loss 2.8363 bleu 0.1813
Epoch 1 Batch 14700 Loss 2.3403 bleu 0.1702
Epoch 1 Batch 14800 Loss 2.4451 bleu 0.1879
Epoch 1 Batch 14900 Loss 2.6534 bleu 0.1428
Epoch 1 Batch 15000 Loss 2.7176 bleu 0.1840
Epoch 1 Batch 15100 Loss 2.0133 bleu 0.1704
Epoch 1 Batch 15200 Loss 2.3532 bleu 0.1660
Epoch 1 Batch 15300 Loss 2.9764 bleu 0.1930
Epoch 1 Batch 15400 Loss 2.2706 bleu 0.1681
Epoch 1 Batch 15500 Loss 2.3537 bleu 0.1680
Epoch 1 Batch 15600 Loss 1.9770 bleu 0.1743
Epoch 1 Batch 15700 Loss 1.8677 bleu 0.1723
Epoch 1 Batch 15800 Loss 2.4400 bleu 0.1793
Epoch 1 Batch 15900 Loss 1.8573 

In [None]:
for (ru, en) in val_dataset.as_numpy_iterator():
    x = evaluate_bleu_on_batch(ru, en)
    print(x, type(x))
    break

In [None]:
texts = [s.encode('utf-8') for s in ['Довольно странно', 'Хочется выпить чай', 'Когда уже наконец мы выиграем!']]
results_by_sentence = translate(texts)

for results in results_by_sentence:
    for r in results:
        if r['result'][-1] != en_tokenizer.word_index['<end>']:
            continue
        translaton = en_tokenizer.sequences_to_texts([r['result'][1:-1]])
        print(translaton)
    print('======')

In [None]:
tf.config.experimental.get_memory_usage('GPU:0')


In [None]:
print(ref, r['result'])

In [None]:
chencherry = nltk.bleu_score.SmoothingFunction()

In [None]:
nltk.bleu_score.corpus_bleu?

In [None]:
[ref] = en_tokenizer.texts_to_sequences([en.decode('utf-8')])
nltk.bleu_score.corpus_bleu([[ref]], [r['result'][1:-1]], smoothing_function=chencherry.method7)

In [None]:
for (ru, en) in datasets['validation'].as_numpy_iterator():
    results = translate(ru)
    for r in results:
        if r['result'][-1] != en_tokenizer.word_index['<end>']:
            continue
            
        print(en_tokenizer.sequences_to_texts([r['result']]))
        break
    print(en)    
    nltk.bleu()
    #print(preprocess_sentence(ru))
    #print(preprocess_sentence(en))
    break

In [25]:
results = translate('Он решил выйти на прогулку'.encode('utf-8'))
for r in results[0]:
    if r['result'][-1] != en_tokenizer.word_index['<end>']:
        continue
    print(en_tokenizer.sequences_to_texts([r['result']]))

['<start> <UNK> <end>']
['<start> <UNK> decided <end>']
['<start> it decided <end>']
['<start> he decided <end>']
['<start> he decided to take <end>']
['<start> he decided to walk <end>']
['<start> he decided to go <end>']
['<start> he decided to walk in <end>']
['<start> he decided to go to <end>']
['<start> he decided to go to a <end>']
['<start> he decided to go to the <end>']
['<start> he decided to take a walk <end>']
['<start> he decided to go to the <UNK> <end>']
['<start> he decided to go to the walk <end>']
['<start> he decided to go to a walk <end>']
["<start> he decided to go to the <UNK> ' <end>"]
["<start> he decided to go to a walk ' <end>"]
['<start> he decided to go to a walk in <end>']
["<start> he decided to go to a walk ' ' <end>"]
['<start> he decided to go to a walk in a <end>']
['<start> he decided to go to a walk in the <end>']
['<start> he decided to go to a walk in the <UNK> <end>']
['<start> he decided to go to a walk in the way <end>']
['<start> he decided to