In [0]:
# https://blog.paperspace.com/neural-machine-translation-with-tensorflow/

In [0]:
# !pip install -q tensorflow==2.0.0-beta1

[K     |████████████████████████████████| 87.9MB 359kB/s 
[K     |████████████████████████████████| 501kB 43.2MB/s 
[K     |████████████████████████████████| 3.1MB 34.2MB/s 
[?25h

In [0]:
import tensorflow as tf
import unicodedata
import string
import numpy as np
import re
import matplotlib.pyplot as plt
from google.colab import drive

In [48]:
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [49]:
keras = tf.keras
print(tf.__version__)

2.2.0


In [50]:
!wget https://www.manythings.org/anki/fra-eng.zip
!unzip  fra-eng.zip

--2020-06-14 15:29:20--  https://www.manythings.org/anki/fra-eng.zip
Resolving www.manythings.org (www.manythings.org)... 104.24.109.196, 104.24.108.196, 172.67.173.198, ...
Connecting to www.manythings.org (www.manythings.org)|104.24.109.196|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5982778 (5.7M) [application/zip]
Saving to: ‘fra-eng.zip.1’


2020-06-14 15:29:21 (14.1 MB/s) - ‘fra-eng.zip.1’ saved [5982778/5982778]

Archive:  fra-eng.zip
replace _about.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: _about.txt              
replace fra.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: fra.txt                 


In [0]:
SOS_token = 0
EOS_token = 1

In [0]:
class Lang(object):
    def __init__(self, name):
        self.name = name
        self.word2int = {}
        self.word2count = {}
        self.int2word = {0 : "SOS", 1 : "EOS"}
        self.n_words = 2
        
    def addWord(self, word):
        if word not in self.word2int:
            self.word2int[word] = self.n_words
            self.word2count[word] = 1
            self.int2word[self.n_words] = word
            self.n_words += 1
            
        else:
            self.word2count[word] += 1
    
    def addSentence(self, sentence):
        for word in sentence.split(" "):
            self.addWord(word)

In [0]:
def unicodeToAscii(s):
    return "".join(c for c in unicodedata.normalize("NFD", s) \
                   if unicodedata.category(c) != "Mn")

In [0]:
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    
    s = re.sub(r"([!.?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z?.!]+", " ", s)
    return s

In [0]:
def load_dataset():
    with open("fra.txt",'r') as f:
        lines = f.readlines()
    
    pairs = [[normalizeString(pair) for pair in 
              line.strip().split('\t')] for line in lines]
    return pairs

In [0]:
pairs = load_dataset()

In [0]:
MAX_LENGTH = 10
def sentencetoIndexes(sentence, lang):
    indexes = [lang.word2int[word] for word in sentence.split()]
    indexes.append(EOS_token)
    return indexes

def filterPair(p):
    return len(p[0].split()) < MAX_LENGTH and \
len(p[1].split()) < MAX_LENGTH

def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

pairs = filterPairs(pairs)

In [58]:
pairs[0:5]

[['go .', 'va !', 'cc by . france attribution tatoeba .org cm wittydev '],
 ['hi .', 'salut !', 'cc by . france attribution tatoeba .org cm aiji '],
 ['hi .', 'salut .', 'cc by . france attribution tatoeba .org cm gillux '],
 ['run !',
  'cours !',
  'cc by . france attribution tatoeba .org papabear sacredceltic '],
 ['run !',
  'courez !',
  'cc by . france attribution tatoeba .org papabear sacredceltic ']]

In [0]:
def build_lang(lang1, lang2, max_length=10):
    input_lang = Lang(lang1)
    output_lang = Lang(lang2)
    input_seq = []
    output_seq = []
    
    for pair in pairs:
        input_lang.addSentence(pair[1])
        output_lang.addSentence(pair[0])
    for pair in pairs:
        input_seq.append(sentencetoIndexes(pair[1], input_lang))
        output_seq.append(sentencetoIndexes(pair[0], output_lang))
    return keras.preprocessing.sequence.pad_sequences(input_seq, maxlen=max_length, padding='post',
                                                      truncating='post'), \
keras.preprocessing.sequence.pad_sequences(output_seq, padding='post', truncating='post'), input_lang, output_lang

In [0]:
input_tensor, output_tensor, input_lang, output_lang = build_lang('fr', 'en')

In [61]:
print(input_tensor.shape)
input_tensor[0]

(123935, 10)


array([2, 3, 1, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [62]:
input_lang.name

'fr'

In [0]:
BATCH_SIZE = 16
BUFFER_SIZE = len(input_tensor)
dataset = tf.data.Dataset.from_tensor_slices((input_tensor, output_tensor)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE)

In [0]:
class Encoder(keras.models.Model):
    def __init__(self, vocab_size, num_hidden=256, num_embedding=256, batch_size=16):
        super(Encoder, self).__init__()
        self.batch_size = batch_size
        self.num_hidden = num_hidden
        self.num_embedding = num_embedding
        self.embedding = keras.layers.Embedding(vocab_size, num_embedding)
        self.gru = keras.layers.GRU(num_hidden, return_sequences=True,
                                    recurrent_initializer='glorot_uniform',
                                   return_state=True)
        
    def call(self, x, hidden):
        embedded = self.embedding(x)
        # print('embedded.shape :',embedded.shape)
        rnn_out, hidden = self.gru(embedded, initial_state=hidden)
        return rnn_out, hidden
    def init_hidden(self):
        return tf.zeros(shape=(self.batch_size, self.num_hidden))

In [0]:
inputs, outputs = next(iter(dataset))
hidden = tf.zeros((16, 256))

In [0]:
encoder = Encoder(input_lang.n_words)

In [67]:
inputs[:3]

<tf.Tensor: shape=(3, 10), dtype=int32, numpy=
array([[  88,   89,   46,   14,   77,   86, 3351,    5,    1,    0],
       [ 105,  751,    5,    1,    0,    0,    0,    0,    0,    0],
       [  98,  411, 1041,  273,  606, 3758,    5,    1,    0,    0]],
      dtype=int32)>

In [0]:
e_outputs, e_hidden = encoder(inputs, hidden)

In [94]:
e_hidden.shape, e_outputs.shape

(TensorShape([16, 256]), TensorShape([16, 10, 256]))

In [0]:
class BahdanauAttention(keras.models.Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
    
        self.W1 = keras.layers.Dense(units)
        self.W2 = keras.layers.Dense(units)
        self.V = keras.layers.Dense(1)
        
    def call(self, encoder_out, hidden):
        #shape of encoder_out : batch_size, seq_length, hidden_dim 
        #shape of encoder_hidden : batch_size, hidden_dim 
        
        hidden = tf.expand_dims(hidden, axis=1) #out: (16, 1, 256)
        
        score = self.V(tf.nn.tanh(self.W1(encoder_out) + \
                                  self.W2(hidden))) #out: (16, 10, 1)
        
        attn_weights = tf.nn.softmax(score, axis=1)
        
        context =  attn_weights * encoder_out #out: ((16,10,1) * (16,10,256))=16, 10, 256
        context = tf.reduce_sum(context, axis=1) #out: 16, 256
        return context, attn_weights
        

In [0]:
attn = BahdanauAttention(256)

In [0]:
context, attn_weights = attn(e_outputs, e_hidden)

In [97]:
attn_weights.shape

TensorShape([16, 10, 1])

In [0]:
class Decoder(keras.models.Model):
    def __init__(self, vocab_size, dec_dim=256, embedding_dim=256):
        super(Decoder, self).__init__()
        
        self.attn = BahdanauAttention(dec_dim)
        self.embedding = keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = keras.layers.GRU(dec_dim, recurrent_initializer='glorot_uniform',
                                   return_sequences=True, return_state=True)
        self.fc = keras.layers.Dense(vocab_size)
        
    def call(self, x, enc_hidden, enc_out):
        #x.shape = (16, 1)
        #enc_out.shape = (16, 10, 256)
        #enc_hidden.shape = (16, 256)
        x = self.embedding(x)
        #x.shape = (16, 1, 256)
        context, attn_weights = self.attn(enc_out, enc_hidden)
        #context.shape = (16, 256)
        x = tf.concat((tf.expand_dims(context, 1), x), -1)
        #x.shape = (16, 1, e_c_hidden_size + d_c_embedding_size)
        r_out, hidden = self.gru(x, initial_state=enc_hidden)
        out = tf.reshape(r_out,shape=(-1, r_out.shape[2]))
        # out.shape = (16, 256)
        return self.fc(out), hidden, attn_weights

In [0]:
decoder = Decoder(output_lang.n_words)

In [0]:
input_tensor, output_tensor = next(iter(dataset))

In [0]:
x = np.expand_dims(output_tensor[:,1], -1)
# print(output_tensor.shape)
# print(x.shape)

In [35]:
# output_tensor[:, 1]

<tf.Tensor: shape=(16,), dtype=int32, numpy=
array([  82,   17, 1375,  221, 2685,   97,  247,   97,   82,   97,   90,
        122, 2938,   90,   31,   29], dtype=int32)>

In [0]:
def loss_fn(real, pred):
    criterion = keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                           reduction='none')
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    _loss = criterion(real, pred)
    mask = tf.cast(mask, dtype=_loss.dtype)
    _loss *= mask
    return tf.reduce_mean(_loss)

In [0]:
optimizer = tf.optimizers.Adam()

In [0]:
def train_step(input_tensor, target_tensor, enc_hidden):
    loss = 0.0
    with tf.GradientTape() as tape:
    
        batch_size = input_tensor.shape[0] # 16
        enc_output, enc_hidden = encoder(input_tensor, # (16,10)
                                         enc_hidden) # (16,256)
        # print('enc_output :',enc_output.shape) # (16, 10, 256)
        # print('enc_hidden :', enc_hidden.shape) # (16, 256)

        SOS_tensor = np.array([SOS_token])
        dec_input = tf.squeeze(tf.expand_dims([SOS_tensor]*batch_size, 1), -1)
        dec_hidden = enc_hidden
        # print('SOS_tensor :',SOS_tensor.shape) # (1,)
        # print('dec_hidden :',dec_hidden.shape) # (16, 256)

        # print("dec_input :", dec_input.shape)

        for tx in range(target_tensor.shape[1] - 1):
          
            dec_out, dec_hidden, _ = decoder(dec_input,     
                                             dec_hidden,    # (16, 256)
                                            enc_output)     # (16, 10, 256)

            # print("dec_out : ", dec_out.shape) # (16, 10916)
            loss += loss_fn(target_tensor[:, tx], dec_out)
            dec_input = tf.expand_dims(target_tensor[:, tx], 1)
            # print('dec_input :',dec_input.shape) # (16, 1)

    batch_loss = loss / target_tensor.shape[1]
    t_variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, t_variables)
    optimizer.apply_gradients(zip(gradients, t_variables))
    return batch_loss
    

In [105]:
hidden = tf.zeros(shape=(16, 256))  # 16, 256
loss = train_step(input_tensor, output_tensor, hidden) # ip -> (16, 10), op -> (16,10)
print(loss)

tf.Tensor(6.9731355, shape=(), dtype=float32)


In [0]:
def checkpoint(model, name=None):
    if name is not None:
        model.save_weights('/content/drive/My Drive/{}.h5'.format(name))
    else:
        raise NotImplementedError

In [106]:
EPOCHS = 10
log_every = 50
steps_per_epoch = len(pairs) // BATCH_SIZE
loss_list = []

with tf.device('GPU:0'):
    for e in range(1, EPOCHS):
        
        total_loss = 0.0
        enc_hidden = encoder.init_hidden()
        
        for idx, (input_tensor, target_tensor) in enumerate(dataset.take(steps_per_epoch)):
            batch_loss = train_step(input_tensor, target_tensor, hidden)
            total_loss += batch_loss
            
            if idx % log_every == 0:
                loss_list.append(batch_loss)
                print("Epochs: {} batch_loss: {:.4f}".format(e, batch_loss))
                checkpoint(encoder, 'encoder')
                checkpoint(decoder, 'decoder')
                
        if e % 2 == 0:
            print("Epochs: {}/{} total_loss: {:.4f}".format(
            e, EPOCHS, total_loss / steps_per_epoch))

Epochs: 9 batch_loss: 0.1887
Epochs: 9 batch_loss: 0.2773
Epochs: 9 batch_loss: 0.1418
Epochs: 9 batch_loss: 0.2422
Epochs: 9 batch_loss: 0.2568
Epochs: 9 batch_loss: 0.2136
Epochs: 9 batch_loss: 0.2423
Epochs: 9 batch_loss: 0.2110
Epochs: 9 batch_loss: 0.2680
Epochs: 9 batch_loss: 0.2634
Epochs: 9 batch_loss: 0.1793
Epochs: 9 batch_loss: 0.2711
Epochs: 9 batch_loss: 0.2848
Epochs: 9 batch_loss: 0.2699
Epochs: 9 batch_loss: 0.4412
Epochs: 9 batch_loss: 0.2782
Epochs: 9 batch_loss: 0.3080
Epochs: 9 batch_loss: 0.2579
Epochs: 9 batch_loss: 0.1659
Epochs: 9 batch_loss: 0.2194
Epochs: 9 batch_loss: 0.2134
Epochs: 9 batch_loss: 0.3403
Epochs: 9 batch_loss: 0.2963
Epochs: 9 batch_loss: 0.3175
Epochs: 9 batch_loss: 0.3786
Epochs: 9 batch_loss: 0.2307
Epochs: 9 batch_loss: 0.2411
Epochs: 9 batch_loss: 0.1351
Epochs: 9 batch_loss: 0.2653
Epochs: 9 batch_loss: 0.3036
Epochs: 9 batch_loss: 0.2544
Epochs: 9 batch_loss: 0.1715
Epochs: 9 batch_loss: 0.2358
Epochs: 9 batch_loss: 0.2214
Epochs: 9 batc

In [0]:
def translate(sentence, max_length=10):
    result = ''
    attention_plot = np.zeros((10,10))https://blog.paperspace.com/neural-machine-translation-with-tensorflow/
    sentence = normalizeString(sentence)
    sentence = sentencetoIndexes(sentence, input_lang)
    sentence = keras.preprocessing.sequence.pad_sequences([sentence],padding='post',
                                                      maxlen=max_length, truncating='post')
    
    encoder_hidden = hidden = [tf.zeros((1, 256))]
    
    enc_out, enc_hidden = encoder(sentence, encoder_hidden)
    
    dec_hidden = enc_hidden
    SOS_tensor = np.array([SOS_token])
    print('SOS_tensor.shape: ', SOS_tensor.shape)
    print('dec_hidden.shape: ', dec_hidden.shape)
    print('enc_out.shape: ', enc_out.shape)
    print('expanded sos tensor: ',tf.expand_dims([SOS_tensor], 1).shape)
    dec_input = tf.squeeze(tf.expand_dims([SOS_tensor], 1), -1)
    print('dec_input.shape: ', dec_input.shape)

    for tx in range(max_length):
        dec_out, dec_hidden, attn_weights = decoder(dec_input,
                                                   dec_hidden, enc_out)
        print('dec_out.shape: ', dec_out.shape)
        print('dec_hidden.shape: ', dec_hidden.shape)
        print('attn_weights.shape: ', attn_weights.shape)
        attn_weights = tf.reshape(attn_weights, (-1, ))
        print('after reshape : attn_weights.shape: ', attn_weights.shape)
        attention_plot[tx] = attn_weights.numpy()

        pred = tf.argmax(dec_out, axis=1).numpy()
        print('pred.shape: ', pred.shape)
        result += output_lang.int2word[pred[0]] + " "
        if output_lang.int2word[pred[0]] == "EOS":
            break
        dec_input = tf.expand_dims(pred, axis=1)
        print('dec_input.shape: ', dec_input.shape)
    return result, attention_plot

In [0]:
sentence = "j'ai besoin de quelqu'un pour m'aider ?"
pred, attn_weights = translate(sentence)
print(pred)

i need somebody to help me ? EOS 


In [0]:
def plot_attention(attention, sentence, predicted_sentence):
    sentence = normalizeString(sentence)
    fig = plt.figure(figsize=(10,10))
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attention, cmap='viridis')

    fontdict = {'fontsize': 14}

    ax.set_xticklabels([''] + sentence, fontdict=fontdict, rotation=90)
    ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict)

    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()

In [0]:
from matplotlib import ticker

In [0]:
attn_weights = attn_weights[:len(pred.split(' ')), :len(sentence.split(' '))]
plot_attention(attn_weights, sentence.split(), pred.split())

AttributeError: ignored

In [0]:
encoder.load_weights('/content/gdrive/My Drive/encoder.h5')
decoder.load_weights('/content/gdrive/My Drive/decoder.h5')