Разобраться с задачей перевода с вниманием и без внимания (материалы в архиве с пометкой actual)

In [1]:
import os
import time

import re

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras import layers

In [5]:
!wget http://www.manythings.org/anki/rus-eng.zip
!mkdir rus-eng
!unzip rus-eng.zip -d rus-eng/
!ls /content/rus-eng/ -lah

--2021-05-12 16:35:09--  http://www.manythings.org/anki/rus-eng.zip
Resolving www.manythings.org (www.manythings.org)... 172.67.173.198, 104.21.55.222, 2606:4700:3031::6815:37de, ...
Connecting to www.manythings.org (www.manythings.org)|172.67.173.198|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14042100 (13M) [application/zip]
Saving to: ‘rus-eng.zip’


2021-05-12 16:35:09 (23.6 MB/s) - ‘rus-eng.zip’ saved [14042100/14042100]

Archive:  rus-eng.zip
  inflating: rus-eng/rus.txt         
  inflating: rus-eng/_about.txt      
total 67M
drwxr-xr-x 2 root root 4.0K May 12 16:35 .
drwxr-xr-x 1 root root 4.0K May 12 16:35 ..
-rw-r--r-- 1 root root 1.5K Jan 24 02:56 _about.txt
-rw-r--r-- 1 root root  67M Jan 24 02:56 rus.txt


In [9]:
PATH_TO_FILE = "/content/rus-eng/rus.txt"
NUM_EXAMPLES = 100_000

In [7]:
def preprocess_sentence(w):
    w = w.lower().strip()
    w = re.sub(r"([?.!,])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)
    w = re.sub(r"[^a-zA-Zа-яА-Я?.!,']+", " ", w)
    w = w.strip()
    w = '<start> ' + w + ' <end>'
    return w

def create_dataset(path, num_examples=None):
    with open(path, 'r', encoding='utf') as lines:
        lines = lines.read().strip().split('\n')

    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')[:2]] for l in lines[:num_examples]]
    word_pairs = zip(*word_pairs)
    return word_pairs

def tokenize(lang):
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)

    tensor = lang_tokenizer.texts_to_sequences(lang)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')

    return tensor, lang_tokenizer

def load_dataset(path, num_examples=None):
    targ_lang, inp_lang = create_dataset(path, num_examples)

    input_tensor, inp_lang_tokenizer = tokenize(inp_lang)
    target_tensor, targ_lang_tokenizer = tokenize(targ_lang)

    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

In [10]:
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(PATH_TO_FILE, NUM_EXAMPLES)

X_train, X_val, y_train, y_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

In [11]:
BUFFER_SIZE = len(X_train)
BATCH_SIZE = 64

steps_per_epoch = BUFFER_SIZE // BATCH_SIZE

embedding_dim = 256
units = 1024

vocab_inp_size = len(inp_lang.word_index) + 1
vocab_tar_size = len(targ_lang.word_index) + 1

In [12]:
dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

2. попробовать поэкспериментировать с архитектурой энкодера и декодера

In [13]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, units, batch_size):
        super(Encoder, self).__init__()
        
        self.batch_size = batch_size
        self.units = units
        
        self.embedding = layers.Embedding(vocab_size, embedding_dim)
        self.gru = layers.GRU(units, return_sequences=False, return_state=True, recurrent_initializer='glorot_uniform')
        
    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden)
        return state
    
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_size, self.units))

In [14]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, units, batch_size):
        super(Decoder, self).__init__()
        self.batch_size = batch_size
        self.units = units
        
        self.embedding = layers.Embedding(vocab_size, embedding_dim)
        self.gru = layers.GRU(units, return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')
        self.dence = layers.Dense(vocab_size)

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden)
        output = tf.reshape(output, (-1, output.shape[2]))
        x = self.dence(output)
        return x, state

In [15]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

In [16]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss.dtype)
    loss *= mask

    return tf.reduce_mean(loss)

In [17]:
checkpoint_dir = './training_nmt_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")

checkpoint = tf.train.Checkpoint(
    optimizer=optimizer,
    encoder=encoder,
    decoder=decoder
)

In [18]:
def train_step(inp, targ, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_hidden = encoder(inp, enc_hidden)
        dec_hidden = enc_hidden
        dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

        for t in range(1, targ.shape[1]):
            predictions, dec_hidden = decoder(dec_input, dec_hidden)

            loss += loss_function(targ[:, t], predictions)

            dec_input = tf.expand_dims(targ[:, t], 1)

    batch_loss = loss / int(targ.shape[1])
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [19]:
EPOCHS = 1
training = True

checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
    
if training:
    for epoch in range(EPOCHS):
        start = time.time()
        
        enc_hidden = encoder.initialize_hidden_state()
        total_loss = 0

        for batch, (inp, targ) in enumerate(dataset.take(steps_per_epoch)):
            batch_loss = train_step(inp, targ, enc_hidden)
            total_loss += batch_loss

            if batch % 200 == 0:
                print(f'Epoch {epoch + 1} Batch {batch} Loss {batch_loss.numpy():.4f}')
                checkpoint.save(file_prefix=checkpoint_prefix)
                print('checkpoint saved')

        print(f'Epoch {epoch + 1} Loss {total_loss / steps_per_epoch:.4f}')
        print(f'{round(time.time() - start)} sec./batch\n')

Epoch 1 Batch 0 Loss 4.5888
checkpoint saved
Epoch 1 Batch 200 Loss 1.8872
checkpoint saved
Epoch 1 Batch 400 Loss 1.5194
checkpoint saved
Epoch 1 Batch 600 Loss 1.2949
checkpoint saved
Epoch 1 Batch 800 Loss 1.1850
checkpoint saved
Epoch 1 Batch 1000 Loss 1.0523
checkpoint saved
Epoch 1 Batch 1200 Loss 0.9832
checkpoint saved
Epoch 1 Loss 1.4365
3402 sec./batch



In [20]:
def evaluate(sentence):
    sentence = preprocess_sentence(sentence)

    inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences(
        [inputs],
        maxlen=input_tensor.shape[1],
        padding='post'
    )
    inputs = tf.convert_to_tensor(inputs)

    result = ''

    enc_hidden = [tf.zeros((1, units))]
    enc_hidden = encoder(inputs, enc_hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

    for t in range(target_tensor.shape[1]):
        pred, dec_hidden = decoder(dec_input, dec_hidden)
        pred = tf.argmax(pred[0]).numpy()
        
        result += targ_lang.index_word[pred] + ' '

        if targ_lang.index_word[pred] == '<end>':
            return result, sentence

        dec_input = tf.expand_dims([pred], 0)

    return result, sentence

In [21]:
def translate(sentence):
    result, sentence = evaluate(sentence)
    
    print(f'Input: {sentence}')
    print(f'Predicted translation: {result}')

In [22]:
translate('Здесь хорошо.')

Input: <start> здесь хорошо . <end>
Predicted translation: it's a nice day . <end> 


In [23]:
translate('Я не смогу поехать.')

Input: <start> я не смогу поехать . <end>
Predicted translation: i can't go . <end> 


In [24]:
translate(u'Вы еще дома?')

Input: <start> вы еще дома ? <end>
Predicted translation: did you see it ? <end> 


In [25]:
translate(u'Вы все еще дома?')

Input: <start> вы все еще дома ? <end>
Predicted translation: did you tell tom why ? <end> 


In [26]:
translate(u'Попробуй сделать это.')

Input: <start> попробуй сделать это . <end>
Predicted translation: try it to be here . <end> 


In [27]:
translate(u'Я люблю, когда идет снег.')

Input: <start> я люблю , когда идет снег . <end>
Predicted translation: i like to be alone . <end> 


In [28]:
translate(u'Я никогда такого не делаю.')

Input: <start> я никогда такого не делаю . <end>
Predicted translation: i never saw that . <end> 


In [None]:
Перевод дальше первого слова не уходит, дальше не имеет отношения к инпуту.. 

Part 2

Разобраться с задачей перевода с вниманием и без внимания (материалы в архиве с пометкой actual)

In [36]:
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(PATH_TO_FILE, NUM_EXAMPLES)

X_train, X_val, y_train, y_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

In [37]:
BUFFER_SIZE = len(X_train)
BATCH_SIZE = 64

steps_per_epoch = BUFFER_SIZE // BATCH_SIZE

embedding_dim = 256
units = 1024

vocab_inp_size = len(inp_lang.word_index) + 1
vocab_tar_size = len(targ_lang.word_index) + 1

In [38]:
dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [39]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, units, batch_size):
        super(Encoder, self).__init__()
        
        self.batch_size = batch_size
        self.units = units
        
        self.embedding = layers.Embedding(vocab_size, embedding_dim)
        self.gru = layers.GRU(units, return_sequences=False, return_state=True, recurrent_initializer='glorot_uniform')
        
    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden)
        return output, state
    
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_size, self.units))

In [40]:
class BahdanauAttention(layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        
        self.w1 = layers.Dense(units)
        self.w2 = layers.Dense(units)
        self.v = layers.Dense(1)
        
    def call(self, query, values):
        query_with_time_axis = tf.expand_dims(query, 1)
        score = self.v(tf.nn.tanh(
            self.w1(query_with_time_axis) + self.w2(values)
        ))
        attention_weights = tf.nn.softmax(score, axis=1)
        
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        
        return context_vector, attention_weights

In [41]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, units, batch_size):
        super(Decoder, self).__init__()
        self.batch_size = batch_size
        self.units = units
        
        self.embedding = layers.Embedding(vocab_size, embedding_dim)
        self.gru = layers.GRU(units, return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')
        self.dence = layers.Dense(vocab_size)
        
        self.attention = BahdanauAttention(units)

    def call(self, x, hidden, enc_output):
        context_vector, attention_weights = self.attention(hidden, enc_output)
        
        x = self.embedding(x)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        
        output, state = self.gru(x)
        output = tf.reshape(output, (-1, output.shape[2]))
        
        x = self.dence(output)
        return x, state, attention_weights

In [42]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

In [43]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss.dtype)
    loss *= mask

    return tf.reduce_mean(loss)

In [44]:
checkpoint_dir = './training_attention_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")

checkpoint = tf.train.Checkpoint(
    optimizer=optimizer,
    encoder=encoder,
    decoder=decoder
)

In [45]:
def train_step(inp, targ, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        dec_hidden = enc_hidden
        dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

        for t in range(1, targ.shape[1]):
            pred, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

            loss += loss_function(targ[:, t], pred)

            dec_input = tf.expand_dims(targ[:, t], 1)

    batch_loss = loss / int(targ.shape[1])
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [46]:
EPOCHS = 1
training = True

checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
    
if training:
    for epoch in range(EPOCHS):
        start = time.time()
        
        enc_hidden = encoder.initialize_hidden_state()
        total_loss = 0

        for batch, (inp, targ) in enumerate(dataset.take(steps_per_epoch)):
            batch_loss = train_step(inp, targ, enc_hidden)
            total_loss += batch_loss

            if batch % 200 == 0:
                print(f'Epoch {epoch + 1} Batch {batch} Loss {batch_loss.numpy():.4f}')
                checkpoint.save(file_prefix=checkpoint_prefix)
                print('checkpoint saved')

        print(f'Epoch {epoch + 1} Loss {total_loss / steps_per_epoch:.4f}')
        print(f'{round(time.time() - start)} sec./batch\n')

Epoch 1 Batch 0 Loss 4.8031
checkpoint saved
Epoch 1 Batch 200 Loss 2.0797
checkpoint saved
Epoch 1 Batch 400 Loss 2.0387
checkpoint saved
Epoch 1 Batch 600 Loss 2.0424
checkpoint saved
Epoch 1 Batch 800 Loss 1.6692
checkpoint saved
Epoch 1 Batch 1000 Loss 1.6062
checkpoint saved
Epoch 1 Batch 1200 Loss 1.7264
checkpoint saved
Epoch 1 Loss 1.9411
5428 sec./batch



In [47]:
def evaluate(sentence):
    attention_plot = np.zeros((target_tensor.shape[1], input_tensor.shape[1]))
    
    sentence = preprocess_sentence(sentence)

    inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences(
        [inputs],
        maxlen=input_tensor.shape[1],
        padding='post'
    )
    inputs = tf.convert_to_tensor(inputs)

    result = ''

    enc_hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, enc_hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

    for t in range(target_tensor.shape[1]):
        pred, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)
        
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()
        
        pred = tf.argmax(pred[0]).numpy()
        
        result += targ_lang.index_word[pred] + ' '

        if targ_lang.index_word[pred] == '<end>':
            return result, sentence, attention_plot

        dec_input = tf.expand_dims([pred], 0)

    return result, sentence, attention_plot

In [48]:
def translate(sentence):
    result, sentence, attention_plot = evaluate(sentence)
    
    print(f'Input: {sentence}')
    print(f'Predicted translation: {result}')

In [53]:
import numpy as np


In [54]:
translate('Здесь хорошо.')

Input: <start> здесь хорошо . <end>
Predicted translation: i was a dog is a dog is a dog is 


In [55]:
translate('Я не смогу поехать.')

Input: <start> я не смогу поехать . <end>
Predicted translation: we we we we we we we we we we we 


In [56]:
translate(u'Вы еще дома?')

Input: <start> вы еще дома ? <end>
Predicted translation: you're a good is a good is a good is a 


In [58]:
translate(u'Вы все еще дома?')

Input: <start> вы все еще дома ? <end>
Predicted translation: you're a good is a good is a good is a 


In [59]:
translate(u'Попробуй сделать это.')

Input: <start> попробуй сделать это . <end>
Predicted translation: i was a dog is tom is tom is tom is 


In [60]:
translate(u'Я люблю, когда идет снег.')

Input: <start> я люблю , когда идет снег . <end>
Predicted translation: he . you . you . you . you . you 


In [61]:
translate(u'Я никогда такого не делаю.')

Input: <start> я никогда такого не делаю . <end>
Predicted translation: we we we we we we we we we we we 


Почему-то текст зацикливается. Перевод плохой