In [10]:
import numpy as numpy
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
import re
import os
import io
import time
import unicodedata


In [11]:
def unicode_to_ascii(s):
  """Return `s` with all combining marks removed.

  The string is first decomposed with NFD normalization, so an accented
  letter becomes a base letter followed by a combining mark; every character
  whose Unicode category is 'Mn' (nonspacing mark) is then dropped.
  Characters that do not decompose are left untouched.
  """
  decomposed = unicodedata.normalize('NFD', s)
  kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
  return ''.join(kept_chars)

def preprocess_sentence(w):
  """Normalize one sentence and wrap it in '<start>' / '<end>' markers.

  The sentence is lower-cased, stripped, passed through `unicode_to_ascii`,
  then rewritten by three regex substitutions applied in order, stripped
  again and finally surrounded by the start/end tokens.
  """
  substitutions = (
      # Put a space on both sides of each kept punctuation mark.
      (r"([?.!,¿])", r" \1 "),
      # Collapse runs of spaces / double quotes into a single space.
      (r'[" "]+', " "),
      # Replace every run of other characters (anything that is not a letter
      # or one of the kept punctuation marks) with a single space.
      (r"[^a-zA-Z?.!,¿]+", " "),
  )

  text = unicode_to_ascii(w.lower().strip())
  for pattern, replacement in substitutions:
    text = re.sub(pattern, replacement, text)

  return '<start> ' + text.strip() + ' <end>'

In [12]:
def create_dataset(path, num_examples):
  """Read a tab-separated corpus file and return its preprocessed columns.

  Args:
    path: Path of a UTF-8 text file with one example per line, fields
      separated by tabs.
    num_examples: Number of leading lines to use; `None` uses every line.

  Returns:
    A `zip` iterator that yields one tuple per field position (column), each
    holding the `preprocess_sentence` result for that field on every selected
    line. Because it is a `zip`, columns stop at the shortest line.
  """
  # Use a context manager so the file handle is closed deterministically
  # instead of being left to the garbage collector.
  with io.open(path, encoding='UTF-8') as corpus_file:
    lines = corpus_file.read().strip().split('\n')

  word_pairs = [[preprocess_sentence(w) for w in l.split('\t')]  for l in lines[:num_examples]]

  return zip(*word_pairs)

In [13]:
# path = os.path.abspath('.')+'/spa-eng/spa.txt'

In [14]:
# Colab-only step: mount Google Drive at /content/drive so that files stored
# there can be read through the regular filesystem.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
# Build the corpus path from the Drive mount point ('/content/drive', the
# location passed to drive.mount) instead of the current working directory,
# so the path stays valid even if the working directory is changed.
path = os.path.join('/content/drive', 'My Drive', 'spa.txt')

In [16]:
# Load every line of the corpus (num_examples=None); the third column is
# discarded. Then show the last sentence of the first two columns.
en, sp, _ = create_dataset(path, None)
print(en[-1], sp[-1], sep='\n')

<start> if you want to sound like a native speaker , you must be willing to practice saying the same sentence over and over in the same way that banjo players practice the same phrase over and over until they can play it correctly and at the desired tempo . <end>
<start> si quieres sonar como un hablante nativo , debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un musico de banjo practica el mismo fraseo una y otra vez hasta que lo puedan tocar correctamente y en el tiempo esperado . <end>


In [17]:
# Number of sentences in the first column, i.e. how many lines were loaded.
len(en)

124325

In [18]:
def tokenize(lang):
  """Fit a Keras tokenizer on `lang` and encode it as padded id sequences.

  Args:
    lang: Collection of sentences (strings) for one language.

  Returns:
    (tensor, lang_tokenizer): the sentences converted to integer sequences
    and zero-padded at the end ('post') to a common length, plus the fitted
    tokenizer.
  """
  keras_preprocessing = tf.keras.preprocessing

  # filters='' turns off the tokenizer's character filtering, so tokens such
  # as '<start>' and punctuation are kept as-is.
  lang_tokenizer = keras_preprocessing.text.Tokenizer(filters='')
  lang_tokenizer.fit_on_texts(lang)

  id_sequences = lang_tokenizer.texts_to_sequences(lang)
  padded = keras_preprocessing.sequence.pad_sequences(id_sequences, padding='post')

  return padded, lang_tokenizer

In [19]:
def load_dataset(path, num_examples=None):
  """Load the corpus at `path` and tokenize both languages.

  The first column returned by `create_dataset` is used as the target
  language, the second as the input language; the third is ignored.

  Args:
    path: Corpus file, forwarded to `create_dataset`.
    num_examples: Number of leading lines to use; `None` uses all of them.

  Returns:
    (input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer)
  """
  target_sentences, input_sentences, _ = create_dataset(path, num_examples)

  input_tensor, inp_lang_tokenizer = tokenize(input_sentences)
  target_tensor, targ_lang_tokenizer = tokenize(target_sentences)

  return (input_tensor, target_tensor,
          inp_lang_tokenizer, targ_lang_tokenizer)

In [20]:
# Work with the first 30000 lines of the corpus only.
num_examples = 30000
(input_tensor, target_tensor,
 inp_lang_tok, targ_lang_tok) = load_dataset(path, num_examples)

# Axis 1 of each padded tensor is the padded sequence length.
max_length_targ = target_tensor.shape[1]
max_length_inp = input_tensor.shape[1]

In [21]:
# Hold out 20% of the examples for validation.
# NOTE(review): no random_state is passed, so the split (and every output
# derived from it below) differs from run to run.
(input_tensor_train, input_tensor_val,
 target_tensor_train, target_tensor_val) = train_test_split(
    input_tensor, target_tensor, test_size=0.2)

print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))


24000 24000 6000 6000


In [22]:
def convert(lang, tensor):
  """Print an 'id ----> word' line for every non-zero id in `tensor`.

  Args:
    lang: Object exposing an `index_word` mapping from id to word.
    tensor: Iterable of integer ids; zeros are skipped.
  """
  for token_id in tensor:
    if token_id == 0:
      continue
    print("%d ----> %s" % (token_id, lang.index_word[token_id]))

In [23]:
# Decode the first training example of each language back into words.
print("Input Language; index to word mapping")
convert(inp_lang_tok, input_tensor_train[0])
print()
print("Target Language; index to word mapping")
convert(targ_lang_tok, target_tensor_train[0])

Input Language; index to word mapping
1 ----> <start>
9 ----> el
2477 ----> tamano
141 ----> si
438 ----> importa
3 ----> .
2 ----> <end>

Target Language; index to word mapping
1 ----> <start>
1259 ----> size
132 ----> does
825 ----> matter
3 ----> .
2 ----> <end>


In [24]:
# (number of examples, padded target sequence length)
target_tensor.shape

(30000, 11)

In [34]:
# Model / training hyperparameters.
BATCH_SIZE = 64
embedding_dim = 256
units = 1024

# The shuffle buffer covers the whole training split.
BUFFER_SIZE = len(input_tensor_train)
steps_per_epoch = BUFFER_SIZE // BATCH_SIZE

# One more than the number of known words: id 0 is the padding value.
vocab_inp_size = len(inp_lang_tok.word_index) + 1
vocab_tar_size = len(targ_lang_tok.word_index) + 1

# Shuffle the (input, target) pairs, then group them into fixed-size batches;
# drop_remainder=True discards a final batch smaller than BATCH_SIZE.
dataset = (
    tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [35]:
# Peek at one (input, target) batch to check shapes and padding.
print(next(iter(dataset)))

(<tf.Tensor: shape=(64, 16), dtype=int32, numpy=
array([[  1,   5,  59, ...,   0,   0,   0],
       [  1,   8, 129, ...,   0,   0,   0],
       [  1,   6,  16, ...,   0,   0,   0],
       ...,
       [  1,  18, 327, ...,   0,   0,   0],
       [  1,   5,  23, ...,   0,   0,   0],
       [  1,   8,  61, ...,   0,   0,   0]], dtype=int32)>, <tf.Tensor: shape=(64, 11), dtype=int32, numpy=
array([[   1,   67,   11,   30,  390,    6,    2,    0,    0,    0,    0],
       [   1,   17,   27,   12,   29,   10,  105,    3,    2,    0,    0],
       [   1,    7,    8, 4562,   46,    3,    2,    0,    0,    0,    0],
       [   1,    5,   25,   49,  281,    3,    2,    0,    0,    0,    0],
       [   1,    4,   77,  330,    3,    2,    0,    0,    0,    0,    0],
       [   1,   27,   12,  392,   30,  328,    3,    2,    0,    0,    0],
       [   1,   70,   80,   95,   83,    3,    2,    0,    0,    0,    0],
       [   1,    7,   95,   46,  126,    3,    2,    0,    0,    0,    0],
       [   

In [33]:
# Scratch check of shuffle + batch on a toy 4-element dataset.
x = tf.data.Dataset.from_tensor_slices(([1, 2, 3,4],[5,6,7,8])).shuffle(6)
# NOTE(review): this list is built and then discarded; only the last
# expression of a cell is displayed.
list(x.as_numpy_iterator())
# batch(3) without drop_remainder keeps the smaller final batch.
x = x.batch(3)
list(x.as_numpy_iterator())
# dataset = dataset.enumerate(start=5)
# for element in dataset.as_numpy_iterator():
#   print(element)


[(array([4, 2, 1], dtype=int32), array([8, 6, 5], dtype=int32)),
 (array([3], dtype=int32), array([7], dtype=int32))]

In [28]:
# print(next(iter(dataset)))

In [36]:
class Encoder(tf.keras.Model):
    """Embedding layer followed by a GRU that returns sequences and state.

    Args:
        vocab_size: Size of the embedding table.
        embedding_dim: Width of each embedding vector.
        enc_units: Number of GRU units.
        batch_size: Batch size used to build the zero initial state.
    """

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_size):
        super().__init__()
        self.batch_size = batch_size
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(
            self.enc_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )

    def call(self, x, hidden):
        """Embed `x` and run the GRU starting from `hidden`.

        Returns:
            (output, state): the GRU output for every position and its final
            state.
        """
        embedded = self.embedding(x)
        sequence_output, final_state = self.gru(embedded, initial_state=hidden)
        return sequence_output, final_state

    def initialize_hidden_state(self):
        """Return an all-zero state of shape (batch_size, enc_units)."""
        state_shape = (self.batch_size, self.enc_units)
        return tf.zeros(state_shape)

In [37]:
encoder = Encoder(
    vocab_size=vocab_inp_size,
    embedding_dim=embedding_dim,
    enc_units=units,
    batch_size=BATCH_SIZE,
)

# All-zero starting state for one batch.
sample_hidden = encoder.initialize_hidden_state()

In [38]:
# Expected: (BATCH_SIZE, units)
sample_hidden.shape

TensorShape([64, 1024])

In [39]:
# Run the encoder on one batch taken from the training dataset.
example_input_batch, example_target_batch = next(iter(dataset))
# NOTE(review): sample_hidden is overwritten with the encoder's returned
# state, so re-running this cell feeds that state back in as the initial
# state instead of the zero state.
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)

In [40]:
# Shape of the per-position encoder output.
print('Encoder output shape: batch size, sequence length, units {}'.format(sample_output.shape))

Encoder output shape: batch size, sequence length, units (64, 16, 1024)


In [41]:
# Shape of the state returned by the encoder.
print('Encoder Hidden state: batch_size, units {}'.format(sample_hidden.shape))

Encoder Hidden state: batch_size, units (64, 1024)


In [45]:
class BahdanauAttention(tf.keras.layers.Layer):
  """Attention layer scoring positions with V(tanh(W1(query) + W2(values))).

  Args:
    units: Output width of the two inner Dense projections (W1, W2).
  """

  def __init__(self, units):
    super().__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    """Compute the attention-weighted sum of `values` for `query`.

    Returns:
      (context_vector, attention_weights): `values` weighted and summed over
      axis 1, and the weights themselves (softmax over axis 1, last axis of
      size 1 because V is Dense(1)).
    """
    # Insert a length-1 axis at position 1 so the projected query broadcasts
    # against the projected values along that axis.
    expanded_query = tf.expand_dims(query, 1)

    combined = self.W1(expanded_query) + self.W2(values)
    score = self.V(tf.nn.tanh(combined))

    # Normalize the scores along axis 1.
    attention_weights = tf.nn.softmax(score, axis=1)

    context_vector = tf.reduce_sum(attention_weights * values, axis=1)

    return context_vector, attention_weights

In [46]:
# Smoke test: run the attention layer on the encoder state and outputs.
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch_size, units) {}".format(attention_result.shape))
# The weights keep a trailing axis of size 1 (the scoring layer is Dense(1)),
# so the label describes a rank-3 tensor.
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch_size, units) (64, 1024)
Attention weights shape: (batch_size, sequence_lengths) (64, 16, 1)


In [59]:
class Decoder(tf.keras.Model):
  """Attention + embedding + GRU decoder with a Dense output projection.

  Args:
    vocab_size: Size of the embedding table and of the output projection.
    embedding_dim: Width of each embedding vector.
    dec_units: Number of GRU units; also the width of the attention layer.
    batch_size: Stored on the instance as `batch_size`.
  """

  def __init__(self, vocab_size, embedding_dim, dec_units, batch_size):
    super().__init__()
    self.batch_size = batch_size
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(
        self.dec_units,
        return_sequences=True,
        return_state=True,
        recurrent_initializer='glorot_uniform',
    )

    self.fc = tf.keras.layers.Dense(vocab_size)

    self.attention = BahdanauAttention(self.dec_units)

  def call(self, x, hidden, enc_output):
    """Run one decoding step.

    Args:
      x: Token ids for the current step.
      hidden: State used as the attention query.
      enc_output: Encoder outputs attended over.

    Returns:
      (output, state, attention_weights): the Dense projection of the GRU
      output (one row per GRU output position), the GRU state, and the
      attention weights.
    """
    context_vector, attention_weights = self.attention(hidden, enc_output)

    embedded = self.embedding(x)
    # Prepend the context vector to the embedding along the feature axis.
    gru_input = tf.concat([tf.expand_dims(context_vector, 1), embedded], axis=-1)

    # NOTE(review): `hidden` is only used as the attention query; the GRU is
    # called without an initial_state. Confirm this is intended.
    gru_output, state = self.gru(gru_input)

    # Merge the leading axes so every position becomes one row.
    flattened = tf.reshape(gru_output, (-1, gru_output.shape[2]))

    projected = self.fc(flattened)

    return projected, state, attention_weights

In [60]:
decoder = Decoder(
    vocab_size=vocab_tar_size,
    embedding_dim=embedding_dim,
    dec_units=units,
    batch_size=BATCH_SIZE,
)

# Smoke test with a dummy (BATCH_SIZE, 1) input.
# NOTE(review): tf.random.uniform yields floats here, not real token ids;
# presumably only the output shape matters for this check. Confirm.
sample_decoder_output, _, _ = decoder(
    tf.random.uniform((BATCH_SIZE, 1)), sample_hidden, sample_output)

print('decoder output shape: (batch_size, vocab_size) {}'.format(sample_decoder_output.shape))


decoder output shape: (batch_size, vocab_size) (64, 4817)
