<a href="https://colab.research.google.com/github/thomas-chauvet/names_transliteration/blob/master/names_translation/model/names_translation_with_nmt_with_attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Names translation - Neural machine translation with attention

This notebook trains a sequence-to-sequence (seq2seq) model to transliterate names written in Arabic characters into names written in Latin characters. This task is usually called "romanization": converting a string from another alphabet into the Latin alphabet.

After training the model in this notebook, you will be able to input an arabic name, such as *محمد‎*, and return the transliteration/translation of this name: *mohammad*.

## Libraries

In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import tokenizer_from_json

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import json
import time

from pathlib import Path
import pandas as pd

import logging
import string
from typing import List, Union

## Download and prepare the dataset

We'll use three datasets:
*   Google transliteration dataset from repository on [github](https://github.com/google/transliteration). Example: *عادل	adel*
*   ANETAC dataset on [github](https://github.com/MohamedHadjAmeur/ANETAC). Example: *PERSON Adel اديل*. For this dataset we'll filter on *PERSON* only,
*   NETransliteration COLING 2018 dataset on [github](https://github.com/steveash/NETransliteration-COLING2018/blob/master/data/wd_arabic.normalized.aligned.tokens).

We already cleaned and prepared a concatenated dataset in a dedicated [repository](https://github.com/thomas-chauvet/names_transliteration).

After downloading the datasets, we clean them and concatenate all three into a single dataset.

This notebook is based on Tensorflow tutorial [Neural machine translation with attention](https://www.tensorflow.org/tutorials/text/nmt_with_attention).

### Download data

In [2]:
# Local folder for the dataset: a "data" directory next to the current working directory.
data_path = Path.cwd().parent / "data"
data_path.mkdir(parents=True, exist_ok=True)
# Download the prepared Arabic/English name pairs and keep a local CSV copy.
# index=False keeps the DataFrame index out of the file; the header row is still written.
df = pd.read_csv("https://raw.githubusercontent.com/thomas-chauvet/names_transliteration/master/data/clean/arabic_english.csv")
df.to_csv(data_path / "ar2en.csv", index=False)
df.head()

Unnamed: 0,arabic,english
0,العالي,aal
1,اعشي,asha
2,اعثم,atham
3,اا,aa
4,ادلاند,aadland


### Prepare data for training

In [3]:
def tokenize(lang):
  """Fit a character-level Keras tokenizer on `lang` and encode it.

  Returns:
    (tensor, tokenizer): the integer sequences padded at the end ("post")
    to a common length, and the fitted tokenizer.
  """
  char_tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=True)
  char_tokenizer.fit_on_texts(lang)

  sequences = char_tokenizer.texts_to_sequences(lang)
  padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post')

  return padded, char_tokenizer


def create_dataset(path, num_examples, skip_header=False):
  """Read comma-separated lines from `path` and wrap every field in markers.

  Every field `w` becomes "!" + w + "?" ("!" marks the start of a name and
  "?" its end).

  Args:
    path: UTF-8 text file with one comma-separated record per line.
    num_examples: maximum number of lines to keep; None keeps all of them.
    skip_header: when True, drop the first line before `num_examples` is
      applied. Defaults to False, which keeps the historical behaviour of
      treating every line (including a header row) as data.

  Returns:
    A zip iterator yielding one tuple per column (all first fields, then all
    second fields, ...).
  """
  # Context manager so the file handle is closed deterministically.
  with io.open(path, encoding='UTF-8') as f:
    lines = f.read().strip().split('\n')

  if skip_header:
    lines = lines[1:]

  word_pairs = [["!" + w + "?" for w in l.split(',')] for l in lines[:num_examples]]

  return zip(*word_pairs)


def load_dataset(path, num_examples=None):
  """Load the pairs stored at `path` and tokenize both sides.

  Returns:
    (input_tensor, target_tensor, input_tokenizer, target_tokenizer)
  """
  # First column is the input language, second column the target language.
  source_names, target_names = create_dataset(path, num_examples)

  source_tensor, source_tokenizer = tokenize(source_names)
  dest_tensor, dest_tokenizer = tokenize(target_names)

  return source_tensor, dest_tensor, source_tokenizer, dest_tokenizer

In [4]:
# Try experimenting with the size of that dataset
# Try experimenting with the size of that dataset
# (None keeps every line of the file).
num_examples = None
# NOTE(review): ar2en.csv is written with its header row and create_dataset reads
# every line, so the header looks like it is loaded as one (input, target) pair — confirm.
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(data_path / "ar2en.csv", num_examples)

# Calculate max_length of the target tensors
# (second dimension of each padded tensor).
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]

print(f"Size of the longest string for arabic is {max_length_inp}")
print(f"Size of the longest string for english is {max_length_targ}")

Size of the longest string for arabic is 26
Size of the longest string for english is 27


In [5]:
# Creating training and validation sets using an 80-20 split
# Creating training and validation sets using an 80-20 split.
# A fixed random_state makes the split identical on every run.
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
    input_tensor, target_tensor, test_size=0.2, random_state=42
)

# Show length
print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

94341 94341 23586 23586


In [6]:
def convert(lang, tensor):
  """Print "<id> ----> <character>" for every non-zero id in `tensor`.

  `lang` must expose an `index_word` mapping from id to character; zeros are
  skipped.
  """
  nonzero_ids = (token_id for token_id in tensor if token_id != 0)
  for token_id in nonzero_ids:
    print("%d ----> %s" % (token_id, lang.index_word[token_id]))

In [7]:
# Decode the first training pair id by id to sanity-check both tokenizers.
print ("Input Language; index to word mapping")
convert(inp_lang, input_tensor_train[0])
print ()
print ("Target Language; index to word mapping")
convert(targ_lang, target_tensor_train[0])

Input Language; index to word mapping
1 ----> !
13 ----> م
4 ----> ا
6 ----> ن
19 ----> ج
3 ----> ي
11 ----> ت
2 ----> ?

Target Language; index to word mapping
1 ----> !
13 ----> m
3 ----> a
6 ----> n
26 ----> j
4 ----> e
4 ----> e
11 ----> t
2 ----> ?


### Create a tf.data dataset

In [8]:
# Shuffle buffer as large as the training set.
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
# Number of full batches per epoch (the incomplete last batch is dropped below).
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 256
units = 1024
# +1 leaves room for id 0, which is not in word_index — presumably the padding id; confirm.
vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1

# Shuffled dataset of (input, target) rows, grouped into fixed-size batches.
dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [9]:
# Pull one batch to inspect its shapes; it is reused by the shape checks further down.
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 26]), TensorShape([64, 27]))

## Write the encoder and decoder model

### Encoder

In [10]:
class Encoder(tf.keras.Model):
  """Embedding layer followed by a GRU that encodes a batch of id sequences."""

  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    super().__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    # The GRU returns both the per-step outputs and its final state.
    gru_options = dict(return_sequences=True,
                       return_state=True,
                       recurrent_initializer='glorot_uniform')
    self.gru = tf.keras.layers.GRU(self.enc_units, **gru_options)

  def call(self, x, hidden):
    """Embed `x` and run the GRU starting from `hidden`.

    Returns:
      (output, state): the GRU output sequence and its final state.
    """
    embedded = self.embedding(x)
    sequence_output, final_state = self.gru(embedded, initial_state=hidden)
    return sequence_output, final_state

  def initialize_hidden_state(self):
    """Return an all-zero state of shape (batch_sz, enc_units)."""
    state_shape = (self.batch_sz, self.enc_units)
    return tf.zeros(state_shape)

### Attention

In [11]:
class BahdanauAttention(tf.keras.layers.Layer):
  """Additive attention: score = V(tanh(W1(query) + W2(values)))."""

  def __init__(self, units):
    super().__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    """Return (context_vector, attention_weights) for `query` over `values`.

    Shapes:
      query:  (batch_size, hidden size)
      values: (batch_size, max_len, hidden size)
    """
    # Insert a time axis -> (batch_size, 1, hidden size) so the projected
    # query broadcasts over every time step of `values`.
    expanded_query = tf.expand_dims(query, 1)

    # Before self.V the tensor is (batch_size, max_length, units);
    # self.V reduces the last axis to 1.
    projected_query = self.W1(expanded_query)
    projected_values = self.W2(values)
    score = self.V(tf.nn.tanh(projected_query + projected_values))

    # Normalise over the time axis: (batch_size, max_length, 1).
    attention_weights = tf.nn.softmax(score, axis=1)

    # Weighted sum over time: (batch_size, hidden_size).
    context_vector = tf.reduce_sum(attention_weights * values, axis=1)

    return context_vector, attention_weights

### Decoder

In [12]:
class Decoder(tf.keras.Model):
  """One-step decoder: attention over the encoder output, GRU, then a Dense projection."""

  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    super().__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    gru_options = dict(return_sequences=True,
                       return_state=True,
                       recurrent_initializer='glorot_uniform')
    self.gru = tf.keras.layers.GRU(self.dec_units, **gru_options)
    # Projects the GRU output to one logit per vocabulary entry.
    self.fc = tf.keras.layers.Dense(vocab_size)

    # used for attention
    self.attention = BahdanauAttention(self.dec_units)

  def call(self, inputs, hidden, enc_output):
    """Run one decoding step.

    Returns:
      (logits, state, attention_weights); logits has shape (batch_size, vocab).
    """
    # enc_output shape == (batch_size, max_length, hidden_size)
    context_vector, attention_weights = self.attention(hidden, enc_output)

    # (batch_size, 1, embedding_dim)
    embedded = self.embedding(inputs)

    # Prepend the context vector on the feature axis:
    # (batch_size, 1, embedding_dim + hidden_size)
    context_with_time_axis = tf.expand_dims(context_vector, 1)
    gru_input = tf.concat([context_with_time_axis, embedded], axis=-1)

    # NOTE(review): the GRU is called without initial_state, so `hidden` only
    # feeds the attention query — confirm this is intended.
    output, state = self.gru(gru_input)

    # Collapse the length-1 time axis: (batch_size * 1, hidden_size)
    flat_output = tf.reshape(output, (-1, output.shape[2]))

    # (batch_size, vocab)
    logits = self.fc(flat_output)

    return logits, state, attention_weights

### Instantiate encoder, attention, decoder

In [13]:
def get_model(vocab_inp_size: int, vocab_tar_size: int, embedding_dim: int, units: int, batch_sz: int):
  """Build a fresh (encoder, attention, decoder) triple.

  The returned attention layer is a standalone BahdanauAttention(10); the
  decoder creates its own attention layer internally.
  """
  return (
      Encoder(vocab_inp_size, embedding_dim, units, batch_sz),
      BahdanauAttention(10),
      Decoder(vocab_tar_size, embedding_dim, units, batch_sz),
  )

In [14]:
# Build the models trained below; attention_layer is only used by the shape checks.
encoder, attention_layer, decoder = get_model(vocab_inp_size, vocab_tar_size, embedding_dim, units, BATCH_SIZE)

### Check shapes

In [15]:
# sample input
# sample input
# Run the example batch through the untrained encoder to inspect output shapes.
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print(
    "Encoder output shape: (batch size, sequence length, units) {}".format(
        sample_output.shape
    )
)
print("Encoder Hidden state shape: (batch size, units) {}".format(sample_hidden.shape))

# Standalone attention layer applied to the encoder state and outputs.
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print(
    "Attention weights shape: (batch_size, sequence_length, 1) {}".format(
        attention_weights.shape
    )
)

# One decoder step on a random (BATCH_SIZE, 1) input, only to check the output shape.
sample_decoder_output, _, _ = decoder(
    tf.random.uniform((BATCH_SIZE, 1)), sample_hidden, sample_output
)

print(
    "Decoder output shape: (batch_size, vocab size) {}".format(
        sample_decoder_output.shape
    )
)


Encoder output shape: (batch size, sequence length, units) (64, 26, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)
Attention result shape: (batch size, units) (64, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (64, 26, 1)
Decoder output shape: (batch_size, vocab size) (64, 55)


In [16]:
# Layer-by-layer parameter counts of the encoder.
encoder.summary()

Model: "encoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  14336     
_________________________________________________________________
gru (GRU)                    multiple                  3938304   
Total params: 3,952,640
Trainable params: 3,952,640
Non-trainable params: 0
_________________________________________________________________


In [17]:
# Layer-by-layer parameter counts of the decoder (including its attention layer).
decoder.summary()

Model: "decoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      multiple                  14080     
_________________________________________________________________
gru_1 (GRU)                  multiple                  7084032   
_________________________________________________________________
dense_3 (Dense)              multiple                  56375     
_________________________________________________________________
bahdanau_attention_1 (Bahdan multiple                  2100225   
Total params: 9,254,712
Trainable params: 9,254,712
Non-trainable params: 0
_________________________________________________________________


## Define the optimizer and the loss function

In [18]:
optimizer = tf.keras.optimizers.Adam()
# reduction='none' keeps one loss value per example so loss_function can mask padding itself.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
  """Cross-entropy between `real` ids and `pred` logits, ignoring id 0.

  Positions where `real` equals 0 contribute a loss of zero; the mean is
  taken over all positions, masked ones included.
  """
  per_example = loss_object(real, pred)

  not_padding = tf.math.logical_not(tf.math.equal(real, 0))
  weights = tf.cast(not_padding, dtype=per_example.dtype)

  return tf.reduce_mean(per_example * weights)

## Checkpoints (Object-based saving)

In [19]:
# Checkpoints are written as ./training_checkpoints/ckpt-<n>.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
# Track the optimizer state together with both models.
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

## Training

1. Pass the *input* through the *encoder*, which returns the *encoder output* and the *encoder hidden state*.
2. The encoder output, the encoder hidden state and the decoder input (which is the *start token*) are passed to the decoder.
3. The decoder returns the *predictions* and the *decoder hidden state*.
4. The decoder hidden state is then passed back into the model and the predictions are used to calculate the loss.
5. Use *teacher forcing* to decide the next input to the decoder.
6. *Teacher forcing* is the technique where the *target word* is passed as the *next input* to the decoder.
7. The final step is to calculate the gradients and apply it to the optimizer and backpropagate.

In [20]:
@tf.function
def train_step(inp, targ, enc_hidden):
  """Run one teacher-forced training step on a batch and update both models.

  Args:
    inp: batch of input id sequences fed to the encoder.
    targ: batch of target id sequences; column 0 is skipped as a prediction target.
    enc_hidden: initial encoder state.

  Returns:
    The summed per-step loss divided by the target sequence length.
  """
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp, enc_hidden)

    # The decoder starts from the encoder's final state.
    dec_hidden = enc_hidden

    # First decoder input: the start marker "!" for every row of the batch.
    dec_input = tf.expand_dims([targ_lang.word_index['!']] * BATCH_SIZE, 1)

    # Teacher forcing - feeding the target as the next input
    for t in range(1, targ.shape[1]):
      # passing enc_output to the decoder
      predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

      loss += loss_function(targ[:, t], predictions)

      # using teacher forcing
      dec_input = tf.expand_dims(targ[:, t], 1)

  # Reported value only; the gradients below are taken on the un-normalised sum.
  batch_loss = (loss / int(targ.shape[1]))

  variables = encoder.trainable_variables + decoder.trainable_variables

  gradients = tape.gradient(loss, variables)

  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [21]:
EPOCHS = 10

for epoch in range(EPOCHS):
  start = time.time()

  # Every batch of the epoch starts from the same all-zero encoder state.
  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    # Progress report every 100 batches.
    if batch % 100 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))
  # saving (checkpoint) the model every 2 epochs
  if (epoch + 1) % 2 == 0:
    checkpoint.save(file_prefix = checkpoint_prefix)

  # Mean batch loss over the epoch.
  print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 1.1597
Epoch 1 Batch 100 Loss 0.7472
Epoch 1 Batch 200 Loss 0.6967
Epoch 1 Batch 300 Loss 0.3040
Epoch 1 Batch 400 Loss 0.3284
Epoch 1 Batch 500 Loss 0.2817
Epoch 1 Batch 600 Loss 0.2748
Epoch 1 Batch 700 Loss 0.2781
Epoch 1 Batch 800 Loss 0.2133
Epoch 1 Batch 900 Loss 0.2316
Epoch 1 Batch 1000 Loss 0.2334
Epoch 1 Batch 1100 Loss 0.2514
Epoch 1 Batch 1200 Loss 0.2082
Epoch 1 Batch 1300 Loss 0.2301
Epoch 1 Batch 1400 Loss 0.1857
Epoch 1 Loss 0.3316
Time taken for 1 epoch 295.84784150123596 sec

Epoch 2 Batch 0 Loss 0.2388
Epoch 2 Batch 100 Loss 0.2027
Epoch 2 Batch 200 Loss 0.1792
Epoch 2 Batch 300 Loss 0.1936
Epoch 2 Batch 400 Loss 0.2057
Epoch 2 Batch 500 Loss 0.2095
Epoch 2 Batch 600 Loss 0.2284
Epoch 2 Batch 700 Loss 0.1959
Epoch 2 Batch 800 Loss 0.2328
Epoch 2 Batch 900 Loss 0.2332
Epoch 2 Batch 1000 Loss 0.2263
Epoch 2 Batch 1100 Loss 0.2026
Epoch 2 Batch 1200 Loss 0.2027
Epoch 2 Batch 1300 Loss 0.1991
Epoch 2 Batch 1400 Loss 0.1756
Epoch 2 Loss 0.2093
Time ta

## Transliterate

* The evaluate function is similar to the training loop, except we don't use *teacher forcing* here. The input to the decoder at each time step is its previous predictions along with the hidden state and the encoder output.
* Stop predicting when the model predicts the *end token*.
* And store the *attention weights for every time step*.

Note: The encoder output is calculated only once for one input.

In [22]:
# Alternation of the Arabic diacritic marks to strip. re.VERBOSE makes the
# regex ignore the whitespace and the "# ..." comments inside the pattern.
arabic_diacritics = re.compile(
    """
                             ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """,
    re.VERBOSE,
)

# Every character listed here (Arabic-specific plus ASCII punctuation) is
# deleted by `translator`.
arabic_punctuations = """`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ"""
english_punctuations = string.punctuation
punctuations_list = arabic_punctuations + english_punctuations
translator = str.maketrans("", "", punctuations_list)
# Invisible formatting characters removed by remove_problematic_unicode.
unicode_chars = ["\u200c", "\u200e", "\u200f", "\u202c"]


def remove_problematic_unicode(text: str) -> str:
    """Delete every character listed in `unicode_chars`, then strip outer whitespace."""
    cleaned = text
    for mark in unicode_chars:
        cleaned = cleaned.replace(mark, "")
    return cleaned.strip()


def normalize_arabic(text: str) -> str:
    """Collapse spelling variants of Arabic letters onto one canonical letter each."""
    # (pattern, replacement) pairs, applied in this order.
    substitutions = (
        ("[إأآاٱ]", "ا"),
        ("ى", "ي"),
        ("ؤ", "ء"),
        ("ئ", "ء"),
        ("ة", "ه"),
        ("گ", "ك"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text


def remove_diacritics(text: str) -> str:
    """Return `text` with every match of `arabic_diacritics` removed."""
    return arabic_diacritics.sub("", text)


def prepare_name(text: str) -> List[str]:
    """Clean a full name and split it into marker-wrapped tokens.

    The text is lower-cased, stripped of punctuation, diacritics and invisible
    formatting characters, and normalised. Each whitespace-separated part is
    then wrapped as "!" + part + "?".

    Args:
        text: raw name, possibly made of several words.

    Returns:
        One "!...?" string per word; an empty list when nothing is left after
        cleaning.
    """
    text = text.lower()
    text = text.translate(translator)
    text = remove_diacritics(text)
    text = normalize_arabic(text)
    text = remove_problematic_unicode(text)
    # split() without a separator collapses runs of whitespace, so repeated
    # spaces (or an empty input) no longer produce empty "!?" tokens.
    return ["!" + e + "?" for e in text.split()]


def evaluate(sentence):
    """Greedily decode one marker-wrapped name with the trained models.

    Args:
        sentence: a "!...?" string; every character must be a key of
            `inp_lang.word_index` (a KeyError is raised otherwise).

    Returns:
        (result, sentence): the predicted characters before the end marker
        "?", with spaces removed, and the unchanged input.
    """
    inputs = [inp_lang.word_index[i] for i in sentence]
    inputs = tf.keras.preprocessing.sequence.pad_sequences(
        [inputs], maxlen=max_length_inp, padding="post"
    )
    inputs = tf.convert_to_tensor(inputs)

    # Batch of one, all-zero initial encoder state.
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index["!"]], 0)

    decoded_chars = []
    for _ in range(max_length_targ):
        predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_out)

        predicted_id = tf.argmax(predictions[0]).numpy()
        predicted_char = targ_lang.index_word.get(predicted_id)

        # "?" is the end marker added by prepare_name/create_dataset. Stop
        # there (or on an id with no character, e.g. padding) instead of
        # decoding up to max_length_targ and trimming afterwards.
        if predicted_char is None or predicted_char == "?":
            break

        decoded_chars.append(predicted_char)

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    result = "".join(decoded_chars).replace(" ", "")

    return result, sentence


def transliterate(text: str) -> str:
    """Transliterate every word of `text`, print input and prediction, return the prediction."""
    predictions = []
    for name in prepare_name(text):
        predictions.append(evaluate(name)[0])
    result = " ".join(predictions)

    print("Input: %s" % (text))
    print("Predicted translation: {}".format(result))

    return result


## Restore the latest checkpoint and test

In [23]:
# restoring the latest checkpoint in checkpoint_dir
# restoring the latest checkpoint in checkpoint_dir
# (optimizer, encoder and decoder, as tracked by `checkpoint`).
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fb9d0f776d8>

In [30]:
# Latin spelling (used as ground truth) -> Arabic spelling fed to the model.
# NOTE(review): the key "Urdu" looks like a language label rather than the
# Latin spelling of the name — confirm.
names = {
    "Mohammed": "محمد‎",
    "Mamun": "مامون",
    "Urdu": "فیضان‎",
    "Thomas": "توماس",
    "Léna": "لينا",
    "Jean": "جينز",
    "Boubacar": "بوبكر",
    "Ghita": "غيتا",
    "Ezékiel": "حزقيال",
    "Gaspard": "جاسبارد",
    "Balthasar": "بالتازار",
    "Olivier": "أوليفر",
    "Jason": "جيسون",
    "Nicolas": "نيكولاس",
    "George": "جورج",
    "Joséphine": "جوزفين",
    "Cunégonde": "كونيجوند",
    "Hortense": "هورتنس",
}

# Print the prediction next to the expected Latin spelling for each name.
for latin, arabic in names.items():
    transliterate(arabic)
    print(f"Ground truth         : {latin}")
    print("-------")

Input: محمد‎
Predicted translation: mohammad
Ground truth         : Mohammed
-------
Input: مامون
Predicted translation: mamon
Ground truth         : Mamun
-------
Input: فیضان‎
Predicted translation: faizan
Ground truth         : Urdu
-------
Input: توماس
Predicted translation: tomas
Ground truth         : Thomas
-------
Input: لينا
Predicted translation: lenna
Ground truth         : Léna
-------
Input: جينز
Predicted translation: jenz
Ground truth         : Jean
-------
Input: بوبكر
Predicted translation: bobaker
Ground truth         : Boubacar
-------
Input: غيتا
Predicted translation: gita
Ground truth         : Ghita
-------
Input: حزقيال
Predicted translation: hizqial
Ground truth         : Ezékiel
-------
Input: جاسبارد
Predicted translation: jasbard
Ground truth         : Gaspard
-------
Input: بالتازار
Predicted translation: baltazar
Ground truth         : Balthasar
-------
Input: أوليفر
Predicted translation: oliver
Ground truth         : Olivier
-------
Input: جيسون
Predicte

In [25]:
# Multi-word names: Latin spelling (ground truth) -> Arabic spelling.
# transliterate handles them word by word.
famous_arabs = {
    "Boutros Boutros-Ghali": "بطرس بطرس غالي",
    "Rifa'a al-Tahtawi": "رفاعة رافع الطهطاوي",
    "Saad Zaghloul": "سعد زغلول‎",
    "Farouk El-Baz": "فاروق الباز‎",
    "Abū ʿAbdallāh Yaʿīsh ibn Ibrāhīm ibn Yūsuf ibn Simāk al-Andalusī al-Umawī": "يعيش بن إبراهيم بن يوسف بن سماك الأموي الأندلسي",
    "Ahmed Hassan Zewail": "أحمد حسن زويل‎",
    "Abdel-Wahed El-Wakil": "عبد الواحد الوكيل‎",
    "Suad Amiry": "سعاد العامري‎",
    "Aḥmad ibn Faḍlān ibn al-ʿAbbās ibn Rāšid ibn Ḥammād": "أحمد بن فضلان بن العباس بن راشد بن حماد‎",
    "Ahmad ibn Mājid": "أحمد بن ماجد",
    "Abbas Mahmoud al-Aqqad": "عباس محمود العقاد‎",
    "Imru' al-Qais Junduh bin Hujr al-Kindi ": "ٱمْرُؤ ٱلْقَيْس جُنْدُح ٱبْن حُجْر ٱلْكِنْدِيّ‎",
    "Abū al-Qāsim Khalaf ibn al-'Abbās al-Zahrāwī al-Ansari": "أبو القاسم خلف بن العباس الزهراوي",
}

# Print the prediction next to the expected Latin spelling for each name.
for latin, arabic in famous_arabs.items():
    transliterate(arabic)
    print(f"Ground truth         : {latin}")
    print("-------")

Input: بطرس بطرس غالي
Predicted translation: botros botros galli
Ground truth         : Boutros Boutros-Ghali
-------
Input: رفاعة رافع الطهطاوي
Predicted translation: refaa rafaa tahatawi
Ground truth         : Rifa'a al-Tahtawi
-------
Input: سعد زغلول‎
Predicted translation: saad zaghloul
Ground truth         : Saad Zaghloul
-------
Input: فاروق الباز‎
Predicted translation: farouq albaz
Ground truth         : Farouk El-Baz
-------
Input: يعيش بن إبراهيم بن يوسف بن سماك الأموي الأندلسي
Predicted translation: yaish ben abrahim ben yousuf ben smak alamoui andolsi
Ground truth         : Abū ʿAbdallāh Yaʿīsh ibn Ibrāhīm ibn Yūsuf ibn Simāk al-Andalusī al-Umawī
-------
Input: أحمد حسن زويل‎
Predicted translation: ahmed hassan zoel
Ground truth         : Ahmed Hassan Zewail
-------
Input: عبد الواحد الوكيل‎
Predicted translation: abder waahid alokil
Ground truth         : Abdel-Wahed El-Wakil
-------
Input: سعاد العامري‎
Predicted translation: suad amiri
Ground truth         : Suad Amiry


## Store everything to reproduce results elsewhere

In [26]:
def save_keras_tokenizer_json(tokenizer, path):
  """Write `tokenizer.to_json()` to `path` as a UTF-8 JSON document.

  The value returned by to_json() is passed through the JSON encoder once
  more, without escaping non-ASCII characters.
  """
  payload = tokenizer.to_json()
  with io.open(path, 'w', encoding='utf-8') as handle:
    json.dump(payload, handle, ensure_ascii=False)

def load_keras_tokenizer_json(path):
  """Rebuild a Keras tokenizer from a file written by save_keras_tokenizer_json.

  Args:
    path: JSON file to read.

  Returns:
    The tokenizer created by `tokenizer_from_json`.
  """
  # The file is written as UTF-8 with non-ASCII characters left unescaped, so
  # read it as UTF-8 explicitly rather than with the platform default encoding.
  with open(path, encoding='utf-8') as f:
    data = json.load(f)
  return tokenizer_from_json(data)

def save_metadata(metadata, path):
  """Serialise `metadata` to `path` as UTF-8 JSON, keeping non-ASCII characters as-is."""
  with io.open(path, 'w', encoding='utf-8') as handle:
    json.dump(metadata, handle, ensure_ascii=False)

def load_metadata(path):
  """Load the JSON metadata file at `path` and return the decoded object.

  The file is read as UTF-8, matching how save_metadata writes it.
  """
  with open(path, encoding='utf-8') as f:
    return json.load(f)


# Everything needed to rebuild the models and decode with them elsewhere.
# Values are read from the variables used for training, so the saved file
# cannot drift from the trained model.
metadata = {
    "BATCH_SIZE": BATCH_SIZE,
    "embedding_dim": embedding_dim,
    "units": units,
    "vocab_inp_size": len(inp_lang.word_index)+1,
    "vocab_tar_size": len(targ_lang.word_index)+1,
    "max_length_inp" : max_length_inp,
    # Upper bound on decoding steps; lets evaluation run without the training variables.
    "max_length_targ": max_length_targ,
}


# https://colab.research.google.com/drive/172D4jishSgE3N7AO6U2OKAA_0wNnrMOq#scrollTo=gMg87Tz01cxQ
# Model weights in TensorFlow checkpoint format, one prefix per model.
encoder.save_weights("./model/encoder/checkpoint", save_format='tf')
decoder.save_weights("./model/decoder/checkpoint", save_format='tf')

# Tokenizers for the input (Arabic) and output (Latin) sides.
save_keras_tokenizer_json(inp_lang, "./model/input_tokenizer.json")
save_keras_tokenizer_json(targ_lang, "./model/output_tokenizer.json")

save_metadata(metadata, "./model/ar2en_metadata.json")



## Recreate from scratch

In [27]:
# Check loading is working
# Check loading is working
# Rebuild the tokenizers, metadata and fresh models from the files under ./model only.
input_tokenizer = load_keras_tokenizer_json("./model/input_tokenizer.json")
output_tokenizer = load_keras_tokenizer_json("./model/output_tokenizer.json")
loaded_metadata = load_metadata("./model/ar2en_metadata.json")
# The standalone attention layer returned by get_model is not needed here.
new_encoder, _, new_decoder = get_model(
    loaded_metadata["vocab_inp_size"],
    loaded_metadata["vocab_tar_size"],
    loaded_metadata["embedding_dim"],
    loaded_metadata["units"],
    loaded_metadata["BATCH_SIZE"],
)
new_encoder.load_weights("./model/encoder/checkpoint")
new_decoder.load_weights("./model/decoder/checkpoint")


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fb962771e80>

In [28]:
def evaluate(sentence, input_tokenizer, output_tokenizer, encoder, decoder, metadata):
    """Greedily decode one marker-wrapped name with explicitly supplied artifacts.

    Args:
        sentence: a "!...?" string; every character must be a key of
            `input_tokenizer.word_index` (a KeyError is raised otherwise).
        input_tokenizer: tokenizer for the input side.
        output_tokenizer: tokenizer for the output side.
        encoder: encoder model.
        decoder: decoder model.
        metadata: dict with "max_length_inp" and "units"; "max_length_targ"
            is used as the decoding limit when present, otherwise the global
            `max_length_targ` is used.

    Returns:
        (result, sentence): the predicted characters before the end marker
        "?", with spaces removed, and the unchanged input.
    """
    inputs = [input_tokenizer.word_index[i] for i in sentence]
    inputs = tf.keras.preprocessing.sequence.pad_sequences(
        [inputs], maxlen=metadata["max_length_inp"], padding="post"
    )
    inputs = tf.convert_to_tensor(inputs)

    # Batch of one, all-zero initial encoder state.
    hidden = [tf.zeros((1, metadata["units"]))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([output_tokenizer.word_index["!"]], 0)

    # Prefer the limit stored with the model; older metadata files lack the
    # key, in which case the notebook-level variable is used.
    if "max_length_targ" in metadata:
        max_steps = metadata["max_length_targ"]
    else:
        max_steps = max_length_targ

    decoded_chars = []
    for _ in range(max_steps):
        predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_out)

        predicted_id = tf.argmax(predictions[0]).numpy()
        predicted_char = output_tokenizer.index_word.get(predicted_id)

        # "?" is the end marker; stop there (or on an id with no character,
        # e.g. padding) instead of decoding every step and trimming afterwards.
        if predicted_char is None or predicted_char == "?":
            break

        decoded_chars.append(predicted_char)

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    result = "".join(decoded_chars).replace(" ", "")

    return result, sentence


def transliterate(text: str) -> str:
    """Transliterate `text` with the artifacts reloaded from disk.

    Uses `input_tokenizer`, `output_tokenizer`, `new_encoder`, `new_decoder`
    and `loaded_metadata`, so the call exercises the saved files and not the
    models still in memory from training. Prints the input and the
    prediction, and returns the prediction.
    """
    names = prepare_name(text)
    result = " ".join(
        [
            evaluate(
                name,
                input_tokenizer,
                output_tokenizer,
                new_encoder,
                new_decoder,
                loaded_metadata,
            )[0]
            for name in names
        ]
    )

    print("Input: %s" % (text))
    print("Predicted translation: {}".format(result))

    return result

# Smoke test: the output should match the prediction obtained earlier for the same name.
transliterate("محمد‎")

Input: محمد‎
Predicted translation: mohammad


'mohammad'