In [None]:
# Mount Google Drive so that files under /content/gdrive are reachable from
# this Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')
# The former `!cd gdrive/My\ Drive` line was dropped: `!` commands run in a
# throwaway subshell, so it never changed the notebook's working directory.
# The paths visible in this notebook are absolute, so nothing relied on it.
# Use the `%cd` magic if a persistent change of directory is ever needed.

In [1]:
import os
import re
import string

import numpy as np
import pandas as pd
import tensorflow as tf 
from sklearn.model_selection import train_test_split
from tensorflow.keras import backend as K
from tensorflow.keras import layers
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import plot_model

In [2]:
# Read the English/Hindi sentence-pair CSV from the mounted Drive.
data_path = '/content/gdrive/My Drive/Hindi_English_Truncated_Corpus.csv'
df = pd.read_csv(filepath_or_buffer=data_path)

In [3]:
# Index labels of every row that has at least one missing value ...
has_missing = df.isnull().any(axis=1)
nan_values = df.loc[has_missing].index
# ... which are then removed from the frame.
df = df.drop(index=nan_values)

In [4]:
# Set of ASCII punctuation characters, used for fast membership tests.
punc = {symbol for symbol in string.punctuation}

In [5]:
# Text normalisation for both languages.
#
# NOTE: this cell rewrites the sentence columns in place and wraps each
# sentence in 'start_ ' / ' _end', so it must run exactly once on a freshly
# loaded `df`; re-running it would wrap the sentences a second time.


def _clean_english(sentence):
    """Lower-case an English sentence and reduce it to space-separated letters.

    Every character outside a-z (digits, punctuation, ...) becomes a space,
    then leading/trailing spaces are stripped and runs of spaces collapsed.
    """
    sentence = re.sub("[^a-zA-Z]", " ", sentence.lower())
    return re.sub(" +", " ", sentence.strip())


def _clean_hindi(sentence):
    """Remove digits and ASCII punctuation from a Hindi sentence, then tidy spaces.

    Punctuation is removed BEFORE whitespace is normalised. Doing it afterwards
    (as this cell previously did) leaves double or trailing spaces wherever a
    free-standing punctuation mark was deleted, which produces empty tokens
    when the sentence is later split on ' '.

    NOTE(review): `punc` holds ASCII punctuation only, so Devanagari marks such
    as the danda are left in place - confirm that this is intended.
    """
    sentence = sentence.lower()
    sentence = re.sub("[२३०८१५७९४६]", "", sentence)  # Devanagari digits
    sentence = re.sub("[0-9]", "", sentence)  # ASCII digits
    sentence = ''.join(char for char in sentence if char not in punc)
    return re.sub(" +", " ", sentence.strip())


def _add_markers(sentence):
    """Wrap a cleaned sentence in the start/end-of-sequence tokens."""
    return 'start_ ' + sentence + ' _end'


df['english_sentence'] = df['english_sentence'].apply(_clean_english).apply(_add_markers)
df['hindi_sentence'] = df['hindi_sentence'].apply(_clean_hindi).apply(_add_markers)

In [6]:
# Plain Python lists of the cleaned sentences, one entry per row of `df`.
english_data = df['english_sentence'].tolist()
hindi_data = df['hindi_sentence'].tolist()

In [7]:
# Number of space-separated tokens in every sentence of each language.
eng = [len(sentence.split(' ')) for sentence in english_data]
hin = [len(sentence.split(' ')) for sentence in hindi_data]

# Fixed sequence lengths used when padding; the data-driven alternatives
# would be max(eng) and max(hin) respectively.
max_eng_sentence_length = 50
max_hindi_sentence_length = 60

In [8]:
# Word-level tokenizers. filters='' disables Keras' own character filtering
# so that the 'start_' / '_end' markers keep their underscores.
english_tokenizer = Tokenizer(filters='')
hindi_tokenizer = Tokenizer(filters='')

english_tokenizer.fit_on_texts(english_data)
hindi_tokenizer.fit_on_texts(hindi_data)

# Vocabulary sizes; +1 because word indices start at 1 and 0 is the padding id.
english_vocab = len(english_tokenizer.word_index)+1
hindi_vocab = len(hindi_tokenizer.word_index)+1

# Sentences -> lists of integer word ids.
english_encoded_text = english_tokenizer.texts_to_sequences(english_data)
hindi_encoded_text = hindi_tokenizer.texts_to_sequences(hindi_data)

# Pad (and, for over-long sentences, truncate) to fixed lengths.
# truncating='post' is set explicitly: the default is 'pre', which would cut
# tokens from the FRONT of long sentences and drop their leading 'start_'
# marker. With 'post' the tail of an over-long sentence (including '_end') is
# cut instead.
english_padded_text = pad_sequences(english_encoded_text, maxlen=max_eng_sentence_length,
                                    padding='post', truncating='post')
hindi_padded_text = pad_sequences(hindi_encoded_text, maxlen=max_hindi_sentence_length,
                                  padding='post', truncating='post')

# 80/20 split; random_state pins the shuffle so the split is reproducible
# across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    english_padded_text, hindi_padded_text, test_size=0.2, random_state=42)

In [9]:
# Model sizes shared by the cells below.
dims = 512                       # width of embeddings and attention projections
units = 1024                     # hidden width of the feed-forward sub-layers
max_positional_encoding = 10000  # number of positions in the encoding table

In [10]:
#positional encoding
def get_angles(position, i, dims):
    """Return position * 1 / 10000**(2*(i//2)/dims), broadcast over the inputs.

    position: position index (scalar or array).
    i: embedding-dimension index (scalar or array); i and i+1 share a rate
       for even i because of the floor division by two.
    dims: total number of embedding dimensions.
    """
    exponent = (2*(i//2))/np.float32(dims)
    inverse_frequency = 1/np.power(10000, exponent)
    return position*inverse_frequency

def positional_encoding(position, dims):
    """Build a sinusoidal encoding table as a float32 tensor of shape (1, position, dims).

    Even-numbered columns hold the sine of the angles from get_angles and
    odd-numbered columns hold the cosine.
    """
    row_positions = np.arange(position)[:, np.newaxis]
    column_indices = np.arange(dims)[np.newaxis, :]
    table = get_angles(row_positions, column_indices, dims)
    table[:, 0::2] = np.sin(table[:, 0::2])  # even columns
    table[:, 1::2] = np.cos(table[:, 1::2])  # odd columns
    # Prepend a length-1 axis so the table broadcasts over the batch dimension.
    return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

In [11]:
#Transformer architecture
def Encoder(vocab_size, dims, input_):
    """Build one encoder block of the Transformer on top of `input_`.

    vocab_size: size of the source vocabulary (rows of the embedding table).
    dims: width of the embeddings and of every attention projection.
    input_: Keras tensor of source token ids.

    Returns the output tensor of the final layer normalisation; its last
    axis has `dims` entries. The feed-forward width and the size of the
    positional table are read from the module-level `units` and
    `max_positional_encoding`.
    """
    # Token embeddings scaled by sqrt(dims), plus the positional signal
    # trimmed to the length of the incoming sequence.
    token_embeddings = layers.Embedding(vocab_size, dims)(input_)
    embedding_scale = K.constant(np.sqrt(dims), shape=(1, 1, 1))
    hidden = layers.Multiply()([token_embeddings, embedding_scale])
    position_table = positional_encoding(max_positional_encoding, dims)
    sequence_length = tf.shape(hidden)[1]
    hidden = layers.Add()([hidden, position_table[:, :sequence_length, :]])

    # Self-attention sub-layer: three independent projections of the same
    # input, created in query, key, value order.
    query_vector, key_vector, value_vector = [
        layers.Dense(dims)(hidden) for _ in range(3)
    ]
    # layers.Attention expects its inputs as [query, value, key].
    self_attended = layers.Attention()([query_vector, value_vector, key_vector])
    self_attended = layers.Dense(dims)(self_attended)
    hidden = layers.Add()([hidden, self_attended])  # residual connection
    hidden = layers.LayerNormalization(epsilon=1e-6)(hidden)

    # Position-wise feed-forward sub-layer.
    feed_forward = layers.Dense(units=units, activation='relu')(hidden)
    feed_forward = layers.Dense(units=dims)(feed_forward)
    hidden = layers.Add()([hidden, feed_forward])  # residual connection
    return layers.LayerNormalization(epsilon=1e-6)(hidden)

def Decoder(vocab_size, dims, encoder, target):
    """Build one decoder block of the Transformer and its output projection.

    vocab_size: size of the target vocabulary; also the width of the
        returned logits.
    dims: width of the embeddings and of every attention projection.
    encoder: output tensor of the encoder, attended to by the
        encoder-decoder attention sub-layer.
    target: Keras tensor of target token ids fed to the decoder.

    Returns the un-normalised scores produced by the final Dense layer (no
    activation is applied). The feed-forward width and the size of the
    positional table are read from the module-level `units` and
    `max_positional_encoding`.
    """
    # Token embeddings scaled by sqrt(dims), plus the positional signal
    # trimmed to the length of the incoming sequence.
    token_embeddings = layers.Embedding(vocab_size, dims)(target)
    embedding_scale = K.constant(np.sqrt(dims), shape=(1, 1, 1))
    hidden = layers.Multiply()([token_embeddings, embedding_scale])
    position_table = positional_encoding(max_positional_encoding, dims)
    sequence_length = tf.shape(hidden)[1]
    hidden = layers.Add()([hidden, position_table[:, :sequence_length, :]])

    # Causal self-attention sub-layer: projections created in query, key,
    # value order; layers.Attention expects [query, value, key].
    query_vector, key_vector, value_vector = [
        layers.Dense(dims)(hidden) for _ in range(3)
    ]
    self_attended = layers.Attention(causal=True)([query_vector, value_vector, key_vector])
    self_attended = layers.Dense(dims)(self_attended)
    hidden = layers.Add()([hidden, self_attended])  # residual connection
    hidden = layers.LayerNormalization(epsilon=1e-6)(hidden)

    # Encoder-decoder attention sub-layer: queries come from the decoder
    # state, keys and values from the encoder output.
    query = layers.Dense(dims)(hidden)
    key = layers.Dense(dims)(encoder)
    value = layers.Dense(dims)(encoder)
    cross_attended = layers.Attention()([query, value, key])
    cross_attended = layers.Dense(dims)(cross_attended)
    hidden = layers.Add()([hidden, cross_attended])  # residual connection
    hidden = layers.LayerNormalization(epsilon=1e-6)(hidden)

    # Position-wise feed-forward sub-layer.
    feed_forward = layers.Dense(units=units, activation='relu')(hidden)
    feed_forward = layers.Dense(dims)(feed_forward)
    hidden = layers.Add()([hidden, feed_forward])  # residual connection
    hidden = layers.LayerNormalization(epsilon=1e-6)(hidden)

    # Project every position onto the target vocabulary.
    return layers.Dense(units=vocab_size)(hidden)

In [12]:
# Variable-length inputs: source token ids and decoder-side target token ids.
input_ = layers.Input(shape=(None,))
target = layers.Input(shape=(None,))

# Wire the encoder over the source and the decoder over the encoder + target.
encoder = Encoder(vocab_size=english_vocab, dims=dims, input_=input_)
decoder = Decoder(vocab_size=hindi_vocab, dims=dims, encoder=encoder, target=target)

In [13]:
# Two-input model: (source ids, decoder input ids) -> decoder output scores.
model = Model(
    outputs=decoder,
    inputs=[input_, target],
)

In [None]:
# Render the layer graph of the model to an image file on Drive.
plot_model(model=model, to_file='/content/gdrive/My Drive/model.png')

In [14]:
# Optimiser, loss and metric handed to model.compile.
optimizer = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
# from_logits=True because the decoder's last Dense layer has no activation.
loss = SparseCategoricalCrossentropy(reduction='none', from_logits=True)
metrics = SparseCategoricalAccuracy()

In [15]:
# Attach the optimiser, loss and accuracy metric to the model.
model.compile(
    loss=loss,
    optimizer=optimizer,
    metrics=[metrics],
)

In [20]:
# Keep only the best model seen so far, judged by validation loss.
# monitor='val_loss' is only available when model.fit is given validation
# data; without it this callback never saves.
checkpoint_path = '/content/gdrive/My Drive/checkpoints/transformer.h5'
# Make sure the target directory exists; otherwise the first save fails.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

checkpoint = ModelCheckpoint(checkpoint_path,
                             monitor='val_loss',
                             verbose=1,
                             save_best_only=True)

callbacks = [checkpoint]

In [None]:
# Teacher forcing: the decoder sees the target without its last token and is
# trained to predict the target shifted left by one position.
# validation_data is passed so that 'val_loss' exists for the ModelCheckpoint
# callback; without it save_best_only never writes a checkpoint.
# NOTE(review): this uses the held-out split from train_test_split for model
# selection - carve out a separate validation split if that set must remain
# an untouched test set.
hist = model.fit(x=[x_train, y_train[:, :-1]], y=y_train[:, 1:],
                 validation_data=([x_test, y_test[:, :-1]], y_test[:, 1:]),
                 batch_size=128, epochs=20, callbacks=callbacks)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20