# **Machine Translation Model**

In [1]:
#@title Load Imports

import tensorflow as tf
import keras
import subprocess
import os
import pandas as pd
import random
import shutil
import numpy as np
import json
import re
import pathlib

from google.colab import drive, files #if use colab
from tensorflow.nn import relu, tanh, softmax
from tensorflow.lite.python import interpreter
from keras import layers
from keras.models import Model
from keras.preprocessing.text import tokenizer_from_json
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [2]:
#@title Connect with Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
#@title Git Clone
git_dir = '/content/IOH-Chat-App'
git_url = 'https://github.com/bangkit-team/IOH-chat-app.git'

if not os.path.exists(git_dir):
  subprocess.call(['git', 'clone', git_url])

In [4]:
filedir1 = '/content/IOH-chat-app/MachineLearning/datasets/translation/result/eng-ind.csv'
filedir2 = '/content/IOH-chat-app/MachineLearning/datasets/spam/emails.csv'

In [5]:
df1 = pd.read_csv(filedir1)
df1

Unnamed: 0,English,Indonesia
0,Run!,Lari!
1,Who?,Siapa?
2,Wow!,Wow!
3,Help!,Tolong!
4,Jump!,Lompat!
...,...,...
15359,Limitation of this capability causes opportuni...,Keterbatasan kemampuan ini menyebabkan tertutu...
15360,Subjective approach evaluates poverty based on...,Pendekatan subyektif menilai kemiskinan berdas...
15361,"Limited sufficiency and food quality , seen fr...","terbatasnya kecukupan dan mutu pangan , diliha..."
15362,Around 20 percents people with the lowest inco...,Sekitar 20 persen penduduk dengan tingkat pend...


In [10]:
df2 = pd.read_csv(filedir2)
df2 = df2.drop(columns='spam')
df2 = df2.rename(columns={'text': 'English', 'teks': 'Indonesia'})
df2

Unnamed: 0,English,Indonesia
0,naturally irresistible your corporate identity...,Secara alami tak tertahankan identitas perusah...
1,the stock trading gunslinger fanny is merrill...,Fanny Gunslinger Perdagangan Saham adalah Merr...
2,unbelievable new homes made easy im wanting t...,Rumah Baru yang Luar Biasa Menjadi Mudah Saya ...
3,4 color printing special request additional i...,4 PERMINTAAN PERMINTAAN KHUSUS INFORMASI KHUSU...
4,"do not have money , get software cds from here...","Jangan punya uang, dapatkan CD perangkat lunak..."
...,...,...
5723,research and development charges to gpg here ...,Biaya penelitian dan pengembangan ke GPG di si...
5724,"receipts from visit jim , thanks again for t...","Tanda terima dari kunjungan Jim, terima kasih ..."
5725,enron case study update wow ! all on the same...,Pembaruan Studi Kasus Enron Wow! Semua pada ha...
5726,"interest david , please , call shirley crens...","Bunga David, tolong, hubungi Shirley Crenshaw ..."


In [13]:
df = pd.concat([df1, df2])
df

Unnamed: 0,English,Indonesia
0,Run!,Lari!
1,Who?,Siapa?
2,Wow!,Wow!
3,Help!,Tolong!
4,Jump!,Lompat!
...,...,...
5723,research and development charges to gpg here ...,Biaya penelitian dan pengembangan ke GPG di si...
5724,"receipts from visit jim , thanks again for t...","Tanda terima dari kunjungan Jim, terima kasih ..."
5725,enron case study update wow ! all on the same...,Pembaruan Studi Kasus Enron Wow! Semua pada ha...
5726,"interest david , please , call shirley crens...","Bunga David, tolong, hubungi Shirley Crenshaw ..."


In [15]:
start_mark = '<start>'
end_mark = '<end>'

In [55]:
class TranslatorDataset():
  def __init__(self, dataframe):
    self.dataframe = dataframe
    self.input_tokenizer = None
    self.target_tokenizer = None
    self._load_data_from_file()

  def _load_data_from_file(self):
    df = self.dataframe

    input_lang = df.English.values
    target_lang = df.Indonesia.values

    return input_lang, target_lang

  def _normalize_and_preprocess(self, text, use_mark=False):
    if use_mark:
      text = text.strip()
      text = ' '.join([start_mark, text, end_mark])
    else:
      text = text.strip()

    return text

  def _tokenize(self, sentences, num_words, maxlen):
    punctuation = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'

    tokenizer = Tokenizer(num_words=num_words, filters=punctuation)
    tokenizer.fit_on_texts(sentences)

    sequences = tokenizer.texts_to_sequences(sentences)
    sequences = pad_sequences(
        sequences, maxlen=self.maxlen, padding='post', truncating='post')

    return sequences, tokenizer

  def _create_dataset(self):
    input_lang, target_lang = self._load_data_from_file()

    input_sentence = np.array(
        list(map(lambda x: self._normalize_and_preprocess(x, False), input_lang)))
    
    target_sentence = np.array(
        list(map(lambda y: self._normalize_and_preprocess(y, True), target_lang)))
    
    return input_sentence, target_sentence

  def _load_dataset(self, num_words):
    input_lang, target_lang = self._create_dataset()

    self.maxlen = 20
    self.buffer_size = len(input_lang)

    input_sequences, input_tokenizer = self._tokenize(
        input_lang, num_words, self.maxlen)
    
    target_sequences, target_tokenizer = self._tokenize(
        target_lang, num_words, self.maxlen,)

    return (input_sequences, input_tokenizer), (target_sequences, target_tokenizer)
  
  def get(self, num_words, batch_size):
    input, target = self._load_dataset(num_words)

    input_sequences, self.input_tokenizer = input
    target_sequences, self.target_tokenizer = target

    dataset = tf.data.Dataset.from_tensor_slices((input_sequences, target_sequences))
    dataset = dataset.shuffle(self.buffer_size).batch(batch_size, drop_remainder=True)
    dataset = dataset.cache().prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    return self.input_tokenizer, self.target_tokenizer, dataset

In [72]:
num_words = 50000
batch_size = 64

In [73]:
translator_dataset = TranslatorDataset(df)

input_tokenizer, target_tokenizer, dataset = translator_dataset.get(num_words, batch_size)

In [74]:
input_batch, target_batch = next(iter(dataset))

In [75]:
input_batch.shape, target_batch.shape

(TensorShape([64, 20]), TensorShape([64, 20]))

In [76]:
input_vocab_size = len(input_tokenizer.index_word) + 1
target_vocab_size = len(target_tokenizer.index_word) + 1
input_maxlen = input_batch.shape[1]
target_maxlen = target_batch.shape[1]

input_maxlen, target_maxlen, input_vocab_size, target_vocab_size

(20, 20, 41890, 43129)

In [77]:
input_example = input_batch[-1]
input_example

<tf.Tensor: shape=(20,), dtype=int32, numpy=
array([   8, 4190,  440,   41,   76,    9,  527,   10,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0], dtype=int32)>

In [78]:
target_example = target_batch[-1]
target_example

<tf.Tensor: shape=(20,), dtype=int32, numpy=
array([    7,   212,   303, 13305,     1, 25730,     8,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0], dtype=int32)>

In [79]:
input_sentence = input_tokenizer.sequences_to_texts([input_example.numpy()])[0]
input_sentence

"you haven't given me what i asked for"

In [80]:
target_sentence = target_tokenizer.sequences_to_texts([target_example.numpy()])[0]
target_sentence

'<start> kamu belum memberiku yang kuminta <end>'

In [81]:
embed_dims = 1000
units = 1024

In [82]:
class Encoder():
  def __init__(self, input_vocab_size, embedding_dims, units):
    self.units = units
    self.batch_size = batch_size
    self.input_vocab_size = input_vocab_size
    self.embedding_dims = embedding_dims

    self.embedding = layers.Embedding(self.input_vocab_size, self.embedding_dims)
    self.lstm_layer = layers.LSTM(self.units,
                                 return_sequences=True,
                                 return_state=True,
                                 recurrent_initializer='glorot_uniform')

  def call(self, inputs):
    embedding = self.embedding(inputs)
    encoder = self.lstm_layer(embedding)

    return encoder

In [83]:
class BahdanauAttention(layers.Layer):
  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.w1 = layers.Dense(units, use_bias=True) 
    self.w2 = layers.Dense(units, use_bias=True) 
    self.fd = layers.Dense(1)

  def call(self, query, values):
    query_with_time_axis = tf.expand_dims(query, 1)
    
    score = self.fd(tanh(
        self.w1(query_with_time_axis) + self.w2(values)))

    attention_weights = softmax(score, axis=1)

    context_vector = attention_weights * values
    context_vector = tf.reduce_sum(context_vector, axis=1)

    return context_vector, attention_weights

In [84]:
class Decoder():
  def __init__(self, output_vocab_size, embedding_dims, units):
    self.units = units
    self.output_vocab_size = output_vocab_size
    self.embedding_dims = embedding_dims

    self.embedding = layers.Embedding(self.output_vocab_size, self.embedding_dims)
    self.lstm_layer = layers.LSTM(self.units,
                                  return_sequences=True,
                                  return_state=True,
                                  recurrent_initializer='glorot_uniform')
    self.attention = BahdanauAttention(self.units)
    self.concat = layers.Concatenate()
    self.dense1 = layers.Dense(self.units, activation=tanh, use_bias=False)
    self.dropout = layers.Dropout(.5)
    self.dense2 = layers.TimeDistributed(layers.Dense(self.output_vocab_size))

  def call(self, inputs, en_outputs, state):
    embedding = self.embedding(inputs)
    dec_outputs, dec_h_state, dec_c_state = self.lstm_layer(
        embedding, initial_state=state)
    
    context_vector, attention_weights = self.attention(
        query=dec_outputs, values=en_outputs)
    
    context_and_rnn_output = self.concat(
        [context_vector, dec_outputs])

    attention_vector = self.dense1(context_and_rnn_output)
    outputs = self.dropout(attention_vector)
    outputs = self.dense2(outputs)

    return outputs

In [85]:
encoder = Encoder(input_vocab_size, embed_dims, units)
en_outputs, en_h_state, en_c_state = encoder.call(input_batch)

en_outputs.shape, en_h_state.shape, en_c_state.shape

(TensorShape([64, 20, 1024]), TensorShape([64, 1024]), TensorShape([64, 1024]))

In [87]:
decoder = Decoder(target_vocab_size, embed_dims, units)
dec_outputs = decoder.call(target_batch, en_outputs, [en_h_state, en_c_state])

dec_outputs.shape

TensorShape([64, 20, 43129])

In [88]:
lr = 0.001
epochs = 30

optimizer = tf.keras.optimizers.Adam(
    learning_rate=lr, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
loss = tf.keras.losses.SparseCategoricalCrossentropy(
       from_logits=True, reduction='none')

In [89]:
class TranslatorModel():
  def __init__(self, input_vocab_size, 
               target_vocab_size, 
               embed_dims, 
               units, 
               maxlen):
    self.input_vocab_size = input_vocab_size
    self.target_vocab_size = target_vocab_size
    self.embed_dims = embed_dims
    self.units = units
    self.maxlen = maxlen

    self.encoder = Encoder(
        self.input_vocab_size, self.embed_dims, self.units)
    
    self.decoder = Decoder(
        self.target_vocab_size, self.embed_dims, self.units)
  
  def build_model(self):
    en_inputs = layers.Input(shape=(self.maxlen,))

    en_output, en_h_state, en_c_state = self.encoder.call(en_inputs)

    dec_outputs = self.decoder.call(
        en_inputs, en_output, [en_h_state, en_c_state])

    model = Model(inputs=[en_inputs], 
                  outputs=[dec_outputs])
    return model

In [90]:
translator_model = TranslatorModel(
    input_vocab_size,
    target_vocab_size,
    embed_dims,
    units,
    input_maxlen
)
model = translator_model.build_model()

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=['accuracy']
)

In [91]:
checkpoint_path = 'checkpoint/cp.ckpt'
checkpoint_dir = os.path.dirname(checkpoint_path)

callback_early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    patience=3, 
    verbose=1)

callback_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path, 
    monitor='loss', 
    verbose=1, 
    save_weights_only=True, 
    save_best_only=True)

callbacks = [callback_early_stopping,
             callback_checkpoint]

model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 20)]         0           []                               
                                                                                                  
 embedding_11 (Embedding)       (None, 20, 1000)     41890000    ['input_2[0][0]']                
                                                                                                  
 embedding_12 (Embedding)       (None, 20, 1000)     43129000    ['input_2[0][0]']                
                                                                                                  
 lstm_11 (LSTM)                 [(None, 20, 1024),   8294400     ['embedding_11[0][0]']           
                                 (None, 1024),                                              

In [92]:
model.fit(dataset,
          epochs=epochs,
          callbacks=callbacks,
          verbose=1)

Epoch 1/30
Epoch 1: loss improved from inf to 5.23998, saving model to checkpoint/cp.ckpt
Epoch 2/30
Epoch 2: loss improved from 5.23998 to 4.39015, saving model to checkpoint/cp.ckpt
Epoch 3/30
Epoch 3: loss improved from 4.39015 to 3.88074, saving model to checkpoint/cp.ckpt
Epoch 4/30
Epoch 4: loss improved from 3.88074 to 3.50607, saving model to checkpoint/cp.ckpt
Epoch 5/30
Epoch 5: loss improved from 3.50607 to 3.20443, saving model to checkpoint/cp.ckpt
Epoch 6/30
Epoch 6: loss improved from 3.20443 to 2.93823, saving model to checkpoint/cp.ckpt
Epoch 7/30
Epoch 7: loss improved from 2.93823 to 2.68346, saving model to checkpoint/cp.ckpt
Epoch 8/30
Epoch 8: loss improved from 2.68346 to 2.43242, saving model to checkpoint/cp.ckpt
Epoch 9/30
Epoch 9: loss improved from 2.43242 to 2.18344, saving model to checkpoint/cp.ckpt
Epoch 10/30
Epoch 10: loss improved from 2.18344 to 1.94064, saving model to checkpoint/cp.ckpt
Epoch 11/30
Epoch 11: loss improved from 1.94064 to 1.70868, s

<keras.callbacks.History at 0x7fe6fe375290>

In [93]:
saved_model_path  = '/content/drive/MyDrive/Company Case Bangkit/TranslationModel/saved_model'
saved_model_dir = os.path.dirname(saved_model_path)

if not os.path.exists(saved_model_dir):
  shutil.rmtree(saved_model_dir)
  
model.save(saved_model_path)



INFO:tensorflow:Assets written to: /content/drive/MyDrive/Company Case Bangkit/TranslationModel/saved_model/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Company Case Bangkit/TranslationModel/saved_model/assets


In [94]:
input_tokenizer_dir = '/content/drive/MyDrive/Company Case Bangkit/TranslationModel/input_tokenizer.json'

input_tokenizer_json = input_tokenizer.to_json()
with open(input_tokenizer_dir, 'w', encoding='utf-8') as f:
    json.dump(input_tokenizer_json, f, ensure_ascii=False)

files.download(input_tokenizer_dir)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [95]:
target_tokenizer_dir = '/content/drive/MyDrive/Company Case Bangkit/TranslationModel/target_tokenizer.json'

target_tokenizer_json = target_tokenizer.to_json()
with open(target_tokenizer_dir, 'w', encoding='utf-8') as f:
    json.dump(target_tokenizer_json, f, ensure_ascii=False)

files.download(target_tokenizer_dir)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [97]:
class Translator():
  def __init__(self, model_path, input_tokenizer_json, target_tokenizer_json, maxlen):
    self.model_path = model_path
    self.input_tokenizer_json = input_tokenizer_json
    self.target_tokenizer_json = target_tokenizer_json
    self.maxlen = maxlen

    self._load_model()
    self._load_tokenizer()

  def _load_model(self):
    self.model = tf.keras.models.load_model(self.model_path, compile=True)
  
  def _load_tokenizer(self):
    with open(self.input_tokenizer_json) as f:
      input_json = json.load(f)
      self.input_tokenizer = tokenizer_from_json(input_json)

    with open(self.target_tokenizer_json) as f:
      target_json = json.load(f)
      self.target_tokenizer = tokenizer_from_json(target_json)

  def _normalize_and_preprocess(self, text):
    punctuation = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'
    
    text = text.lower().strip()
    text = ''.join((filter(lambda x: x not in punctuation, text)))

    return text

  def __call__(self, sentence):
    index_prediction = list()

    normalize_sentence = self._normalize_and_preprocess(sentence)
    sequences = self.input_tokenizer.texts_to_sequences([normalize_sentence])
    sequences = pad_sequences(
        sequences, maxlen=self.maxlen, padding="post", truncating="post")

    predictions = self.model(sequences)

    for i in predictions[0]:
        index_prediction.append(np.argmax(i))

    marks = [start_mark, end_mark]
    result = self.target_tokenizer.sequences_to_texts([index_prediction])[0]

    result = ' '.join([word for word in result.split(' ') if word not in marks])

    return result

In [98]:
saved_model_path = '/content/drive/MyDrive/Company Case Bangkit/TranslationModel/saved_model'

translator = Translator(
    saved_model_path,
    input_tokenizer_dir,
    target_tokenizer_dir,
    input_maxlen)

In [99]:
text_input = 'i like apple'

translate = translator(text_input)
print(translate)

aku suka apel


In [None]:
# converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_path)
# converter.optimizations = [tf.lite.Optimize.DEFAULT]
# converter.target_spec.supported_ops = [
#   tf.lite.OpsSet.TFLITE_BUILTINS,
#   tf.lite.OpsSet.SELECT_TF_OPS
# ]
# converter.experimental_lower_tensor_list_ops = False

# tflite_model = converter.convert()

# tflite_model_file = pathlib.Path('translation.tflite')
# tflite_model_file.write_bytes(tflite_model)

# # files.download('translation.tflite')

In [None]:
# interpreter = tf.lite.Interpreter('translation.tflite')
# interpreter.allocate_tensors()

# input_details = interpreter.get_input_details()[0]
# output_details = interpreter.get_output_details()[0]

# input_details, output_details

In [None]:
# sequence = input_tokenizer.texts_to_sequences([text_input])
# pad_seqs = pad_sequences(
#     sequence, maxlen=input_maxlen, padding='post', truncating='post')

# input_data = pad_seqs.astype(np.float32)

# interpreter.set_tensor(input_details['index'], input_data)
# interpreter.invoke()

# predictions = interpreter.get_tensor(output_details['index'])
