In [1]:
import pandas as pd
import re
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import nltk.translate.bleu_score as bleu
import random
import string
from sklearn.model_selection import train_test_split
import os
import time

In [2]:
# Load the parallel sentence corpus from 'eng_tel.csv' (path is relative to the
# working directory) and preview the first rows.
eng_tel=pd.read_csv('eng_tel.csv')
eng_tel.head()

Unnamed: 0,English,Telugu
0,politicians do not have permission to do what ...,రాజకీయ నాయకులకు చేయవలసినది చేయడానికి అనుమతి లేదు.
1,"I'd like to tell you about one such child,",అలాంటి ఒక పిల్లల గురించి నేను మీకు చెప్పాలనుకు...
2,This percentage is even greater than the perce...,ఈ శాతం భారతదేశంలో ఉన్న శాతం కంటే ఎక్కువ.
3,what we really mean is that they're bad at not...,మేము నిజంగా అర్థం ఏమిటంటే వారు శ్రద్ధ చూపకపోవడ...
4,.The ending portion of these Vedas is called U...,.ఈ వేదాల ముగింపు భాగాన్ని ఉపనిషత్తు అంటారు.


In [3]:
# Drop every row that has a missing value in any column. The result is rebound
# to the same name instead of using inplace=True, so the cell never mutates the
# frame object produced by the loading cell.
eng_tel = eng_tel.dropna()
eng_tel.shape

(5615, 2)

In [4]:
# ASCII punctuation characters that the preprocessing helpers strip from text.
exclude = {symbol for symbol in string.punctuation}
# Translation table for str.translate: maps every ASCII digit to None (delete).
remove_digits = dict.fromkeys(map(ord, string.digits))

In [5]:
def preprocess(text):
    '''Normalise an English sentence and wrap it in <start>/<end> markers.

    The text is lower-cased; apostrophes, every character in the module-level
    ``exclude`` set and all digits (via the module-level ``remove_digits``
    table) are removed; surrounding whitespace is trimmed and runs of spaces
    are collapsed to one space.
    '''
    lowered = re.sub("'", '', text.lower())
    kept_chars = [ch for ch in lowered if ch not in exclude]
    without_digits = ''.join(kept_chars).translate(remove_digits)
    squeezed = re.sub(" +", " ", without_digits.strip())
    return '<start> ' + squeezed + ' <end>'

In [6]:
def preprocess_tel(text):
    '''Normalise a Telugu sentence and wrap it in <start>/<end> markers.

    Apostrophes and every character in the module-level ``exclude`` set are
    removed; surrounding whitespace is trimmed and runs of spaces are collapsed
    to one space. Unlike ``preprocess``, no lower-casing or digit removal is
    applied.
    '''
    unquoted = re.sub("'", '', text)
    kept_chars = [ch for ch in unquoted if ch not in exclude]
    squeezed = re.sub(" +", " ", ''.join(kept_chars).strip())
    return '<start> ' + squeezed + ' <end>'

In [7]:
# Clean each language column element-wise with its own normaliser, then preview.
eng_tel['English'] = eng_tel['English'].map(preprocess)
eng_tel['Telugu'] = eng_tel['Telugu'].map(preprocess_tel)

eng_tel.head()

Unnamed: 0,English,Telugu
0,<start> politicians do not have permission to ...,<start> రాజకీయ నాయకులకు చేయవలసినది చేయడానికి అ...
1,<start> id like to tell you about one such chi...,<start> అలాంటి ఒక పిల్లల గురించి నేను మీకు చెప...
2,<start> this percentage is even greater than t...,<start> ఈ శాతం భారతదేశంలో ఉన్న శాతం కంటే ఎక్కు...
3,<start> what we really mean is that theyre bad...,<start> మేము నిజంగా అర్థం ఏమిటంటే వారు శ్రద్ధ ...
4,<start> the ending portion of these vedas is c...,<start> ఈ వేదాల ముగింపు భాగాన్ని ఉపనిషత్తు అంట...


In [8]:
def tokenize(lang, max_len=20):
  '''Fit a word-level tokenizer on `lang` and return padded id sequences.

  Args:
    lang: iterable of sentences (strings) to fit the tokenizer on and encode.
    max_len: length every sequence is padded or truncated to. Defaults to 20,
      the value that used to be hard-coded here.

  Returns:
    (tensor, lang_tokenizer): the int32 array returned by `pad_sequences` and
    the fitted `Tokenizer`.
  '''
  # filters='' disables character filtering, so the <start>/<end> markers are
  # kept as ordinary tokens.
  lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
  lang_tokenizer.fit_on_texts(lang)

  tensor = lang_tokenizer.texts_to_sequences(lang)

  # Zero-pad at the end of short sequences.
  # NOTE(review): `truncating` is left at the pad_sequences default, so
  # over-long sequences lose tokens according to that default (the Keras docs
  # give it as 'pre', i.e. from the front) - confirm this is intended.
  tensor = tf.keras.preprocessing.sequence.pad_sequences(
      tensor, padding='post', maxlen=max_len, dtype='int32')

  return tensor, lang_tokenizer

In [9]:
def load_dataset():
  '''Tokenise the 'English' and 'Telugu' columns of the global `eng_tel` frame.

  Returns:
    input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer -
    the padded id arrays and fitted tokenizers for English (input) and
    Telugu (target), in that order.
  '''
  (input_tensor, inp_lang_tokenizer), (target_tensor, targ_lang_tokenizer) = (
      tokenize(eng_tel[column].values) for column in ('English', 'Telugu'))

  return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

In [10]:
# Build the padded id arrays and the fitted tokenizers for both languages.
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset()

In [11]:
# Padded sequence length (second axis) of the target and input arrays.
max_length_targ = target_tensor.shape[1]
max_length_inp = input_tensor.shape[1]

In [12]:
# Hold out 15% of the sentence pairs for validation. A fixed random_state makes
# the split identical on every run, so results are reproducible after a kernel
# restart.
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
    input_tensor, target_tensor, test_size=0.15, random_state=42)

print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

4772 4772 843 843


In [13]:
# Training hyper-parameters.
BATCH_SIZE = 16
embedding_dim = 128
units = 1024

# The shuffle buffer spans the whole training set.
BUFFER_SIZE = len(input_tensor_train)
N_BATCH = BUFFER_SIZE // BATCH_SIZE
steps_per_epoch = len(input_tensor_train) // BATCH_SIZE

# Number of distinct tokens known to each tokenizer.
vocab_inp_size = len(inp_lang.word_index)
vocab_tar_size = len(targ_lang.word_index)

# Shuffle the training pairs, then batch them; drop_remainder=True discards a
# final batch that would be smaller than BATCH_SIZE.
dataset = (
    tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [15]:
# Parse the GloVe text file into a {word: float32 vector} lookup. Each line is
# split on whitespace: the first field is the word, the rest are coefficients.
# The `with` block guarantees the file is closed even if parsing raises, and
# the explicit encoding avoids depending on the platform's default codec.
embeddings_index = {}
with open('glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Row i holds the vector of the word whose tokenizer index is i. Words that
# GloVe does not contain keep an all-zero row; the extra (+1) row leaves room
# for index 0, which no word in `word_index` is assigned here.
embedding_matrix = np.zeros((vocab_inp_size+1, 300))
for word, i in inp_lang.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [16]:
class Encoder(tf.keras.Model):
    '''GRU encoder over a frozen (non-trainable) embedding layer.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: width of each embedding vector.
        enc_units: number of GRU units (size of the hidden state).
        batch_sz: batch size used by `initialize_hidden_state`.
    '''

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        # The layer name is looked up elsewhere in the notebook to load
        # pre-trained weights, so it must not change.
        self.embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, name="embedding_layer_encoder",trainable=False)
        # Size the GRU from the constructor argument rather than the global
        # `units`, so the layer always matches `initialize_hidden_state`.
        self.gru = tf.keras.layers.GRU(self.enc_units, return_sequences=True, return_state=True, recurrent_activation='sigmoid', recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        '''Embed token ids `x` and run the GRU starting from `hidden`.

        Returns:
            (output, state): the GRU's per-step outputs and its final state.
        '''
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        return output, state

    def initialize_hidden_state(self):
        '''Return an all-zero initial state of shape (batch_sz, enc_units).'''
        return tf.zeros((self.batch_sz, self.enc_units))

In [17]:
class Decoder(tf.keras.Model):
    '''Single-step GRU decoder with additive attention over encoder outputs.

    Args:
        vocab_size: size of the embedding table and of the output logits.
        embedding_dim: width of each embedding vector.
        dec_units: number of GRU units and width of the attention projections.
        batch_sz: batch size used by `initialize_hidden_state`.
    '''

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # Size the GRU from the constructor argument rather than the global
        # `units`, so the layer always matches `initialize_hidden_state`.
        self.gru = tf.keras.layers.GRU(self.dec_units, return_sequences=True, return_state=True, recurrent_activation='sigmoid', recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)

        # Projections used to compute the attention score.
        self.W1 = tf.keras.layers.Dense(self.dec_units)
        self.W2 = tf.keras.layers.Dense(self.dec_units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, x, hidden, enc_output):
        '''Decode one step.

        Args:
            x: token ids fed to the decoder for this step.
            hidden: state used as the attention query.
            enc_output: encoder outputs that are attended over.

        Returns:
            (x, state, attention_weights): vocabulary logits, the GRU state
            after this step, and the attention weights (softmax over axis 1
            of the scores).
        '''
        # Add an axis at position 1 so the query broadcasts against the
        # projected encoder outputs.
        hidden_with_time_axis = tf.expand_dims(hidden, 1)

        score = self.V(tf.nn.tanh(self.W1(enc_output) + self.W2(hidden_with_time_axis)))

        attention_weights = tf.nn.softmax(score, axis=1)

        # Weighted sum of the encoder outputs along axis 1.
        context_vector = attention_weights * enc_output
        context_vector = tf.reduce_sum(context_vector, axis=1)

        x = self.embedding(x)

        # Concatenate the context vector with the embedded input on the last axis.
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # NOTE(review): `hidden` is only used for the attention score; the GRU
        # is called without `initial_state`, so it starts from its default
        # state on every step - confirm this is intended.
        output, state = self.gru(x)

        # Collapse the leading axes so the dense layer sees rows of width output.shape[2].
        output = tf.reshape(output, (-1, output.shape[2]))

        x = self.fc(output)

        return x, state, attention_weights

    def initialize_hidden_state(self):
        '''Return an all-zero state of shape (batch_sz, dec_units).'''
        return tf.zeros((self.batch_sz, self.dec_units))

In [18]:
# Reset Keras' global state before (re)building the models.
tf.keras.backend.clear_session()

# The +1 on both vocabulary sizes leaves room for index 0. The encoder's
# embedding width of 300 matches the number of columns of `embedding_matrix`.
encoder = Encoder(
    vocab_size=vocab_inp_size + 1,
    embedding_dim=300,
    enc_units=units,
    batch_sz=BATCH_SIZE,
)
decoder = Decoder(
    vocab_size=vocab_tar_size + 1,
    embedding_dim=embedding_dim,
    dec_units=units,
    batch_sz=BATCH_SIZE,
)

In [19]:
# Adam with its default settings.
optimizer = tf.keras.optimizers.Adam()
# The decoder emits raw logits (from_logits=True). reduction='none' keeps one
# loss value per example so padded positions can be masked before averaging.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                            reduction='none')


def loss_function(real, pred):
  '''Cross-entropy between `real` ids and `pred` logits, ignoring id 0.

  Per-example losses at positions where `real` equals 0 are zeroed, and the
  mean is then taken over all positions (zeroed ones included).
  '''
  per_example = loss_object(real, pred)

  is_token = tf.math.not_equal(real, 0)
  per_example *= tf.cast(is_token, dtype=per_example.dtype)

  return tf.reduce_mean(per_example)

In [20]:
@tf.function
def train_step(inp, targ, enc_hidden):
  '''Run one teacher-forced optimisation step on a single batch.

  Uses the module-level `encoder`, `decoder`, `optimizer`, `targ_lang`,
  `embedding_matrix`, `BATCH_SIZE` and `loss_function`.

  Args:
    inp: batch of source token ids fed to the encoder.
    targ: batch of target token ids; column t is the label of decoding step t.
    enc_hidden: initial hidden state passed to the encoder.

  Returns:
    The summed per-step loss divided by the target sequence length.
  '''
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp, enc_hidden)
    # Copy the pre-computed embedding_matrix into the encoder's non-trainable
    # embedding layer.
    # NOTE(review): this sits inside the step and after the encoder call above,
    # so it appears to be part of every step rather than a one-off
    # initialisation, and the first forward pass would run before the weights
    # are loaded - consider doing it once after the layer is built; confirm.
    encoder.get_layer('embedding_layer_encoder').set_weights([embedding_matrix])
    # The decoder starts from the encoder's final state.
    dec_hidden = enc_hidden

    # First decoder input: one '<start>' id per example, with a trailing axis.
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

    # Column 0 of targ is never used as a label; labels start at column 1.
    for t in range(1, targ.shape[1]):
      predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

      loss += loss_function(targ[:, t], predictions)

      # Teacher forcing: feed the ground-truth token as the next input.
      dec_input = tf.expand_dims(targ[:, t], 1)

  # Reported value only; gradients below are taken on the un-normalised `loss`.
  batch_loss = (loss / int(targ.shape[1]))

  variables = encoder.trainable_variables + decoder.trainable_variables

  gradients = tape.gradient(loss, variables)

  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [21]:
# Number of passes over the training batches.
EPOCHS = 25

for epoch in range(EPOCHS):
  start = time.time()

  # A fresh all-zero encoder state is created once per epoch and passed
  # unchanged to every batch of that epoch.
  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    # Progress report every 100 batches.
    if batch % 100 == 0:
      print(f'Epoch {epoch+1} Batch {batch} Loss {batch_loss.numpy():.4f}')

  # Mean of the per-batch losses for this epoch, plus wall-clock time.
  print(f'Epoch {epoch+1} Loss {total_loss/steps_per_epoch:.4f}')
  print(f'Time taken for 1 epoch {time.time()-start:.2f} sec\n')

Epoch 1 Batch 0 Loss 4.5265
Epoch 1 Batch 100 Loss 4.7038
Epoch 1 Batch 200 Loss 4.1336
Epoch 1 Loss 4.4060
Time taken for 1 epoch 49.53 sec

Epoch 2 Batch 0 Loss 4.2796
Epoch 2 Batch 100 Loss 4.5317
Epoch 2 Batch 200 Loss 3.3027
Epoch 2 Loss 4.0758
Time taken for 1 epoch 27.89 sec

Epoch 3 Batch 0 Loss 4.2790
Epoch 3 Batch 100 Loss 3.2386
Epoch 3 Batch 200 Loss 4.0803
Epoch 3 Loss 3.8952
Time taken for 1 epoch 27.90 sec

Epoch 4 Batch 0 Loss 3.6541
Epoch 4 Batch 100 Loss 3.5791
Epoch 4 Batch 200 Loss 4.0321
Epoch 4 Loss 3.7250
Time taken for 1 epoch 27.92 sec

Epoch 5 Batch 0 Loss 2.8685
Epoch 5 Batch 100 Loss 4.3204
Epoch 5 Batch 200 Loss 3.0883
Epoch 5 Loss 3.5277
Time taken for 1 epoch 27.81 sec

Epoch 6 Batch 0 Loss 2.6236
Epoch 6 Batch 100 Loss 3.3322
Epoch 6 Batch 200 Loss 3.0510
Epoch 6 Loss 3.2344
Time taken for 1 epoch 27.82 sec

Epoch 7 Batch 0 Loss 3.2872
Epoch 7 Batch 100 Loss 2.8411
Epoch 7 Batch 200 Loss 2.4500
Epoch 7 Loss 2.7985
Time taken for 1 epoch 27.96 sec

Epoch 

In [22]:
def evaluate(sentence):
  '''Greedily translate one English sentence with the trained encoder/decoder.

  Uses the module-level `encoder`, `decoder`, `inp_lang`, `targ_lang`,
  `units`, `max_length_inp` and `max_length_targ`.

  Args:
    sentence: raw English text; it is normalised with `preprocess` first.

  Returns:
    (result, attention_plot): `result` is the predicted tokens, each followed
    by a space (ending with '<end> ' when the decoder emitted it);
    `attention_plot` is a (max_length_targ, max_length_inp) array whose row t
    holds the attention weights of decoding step t.
  '''
  attention_plot = np.zeros((max_length_targ, max_length_inp))

  sentence = preprocess(sentence)

  # Encode with the fitted tokenizer instead of indexing word_index directly:
  # words the tokenizer has never seen are dropped rather than raising
  # KeyError. Pad to the same length the encoder was trained on.
  inputs = inp_lang.texts_to_sequences([sentence])
  inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs, maxlen=max_length_inp, padding='post')
  inputs = tf.convert_to_tensor(inputs)

  result = ''

  # Batch of one, all-zero initial encoder state.
  hidden = [tf.zeros((1, units))]
  enc_out, enc_hidden = encoder(inputs, hidden)

  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

  for t in range(max_length_targ):
    predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                         dec_hidden,
                                                         enc_out)
    # storing the attention weights to plot later on
    attention_weights = tf.reshape(attention_weights, (-1, ))
    attention_plot[t] = attention_weights.numpy()
    predicted_id = tf.argmax(predictions[0]).numpy()

    # The output layer has one more logit than there are words (index 0), so
    # an id without a word is possible; skip it instead of raising KeyError.
    predicted_word = targ_lang.index_word.get(predicted_id)
    if predicted_word is not None:
      result += predicted_word + ' '

    if predicted_word == '<end>':
      return result,attention_plot

    # the predicted ID is fed back into the model
    dec_input = tf.expand_dims([predicted_id], 0)

  return result,attention_plot

In [23]:
# Translate a sample sentence and print the source next to the prediction.
input_sentence= 'please ensure that you use the appropriate form '
print('Input sentence in english : ',input_sentence)
predicted_output,attention_plot=evaluate(input_sentence)
print('Predicted sentence in telugu : ',predicted_output)

Input sentence in english :  please ensure that you use the appropriate form 
Predicted sentence in telugu :  దయచేసి మీరు తగిన ఫారమ్‌ను ఉపయోగిస్తున్నారని నిర్ధారించుకోండి <end> 


In [24]:
# Translate a second sample sentence and print the source next to the prediction.
input_sentence='and do something with it to change the world '
print('Input sentence in english : ',input_sentence)
predicted_output,attention_plot=evaluate(input_sentence)
print('Predicted sentence in telugu : ',predicted_output)

Input sentence in english :  and do something with it to change the world 
Predicted sentence in telugu :  మరియు ప్రపంచాన్ని మార్చడానికి దానితో ఏదైనా చేయండి <end> 
