In [82]:
import pandas as pd
import numpy as np
import tensorflow as tf
ks = tf.keras

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive; the dataset
# path used by the loading cell lives under that mount point.
from google.colab import drive
drive.mount("/content/drive")

In [84]:
basepath = "/content/drive/MyDrive/Sem_8/dl-datasets/dakshina_dataset_v1.0/hi/lexicons"

# Names given to the three tab-separated columns of a lexicon file, in file
# order. Only the first two are returned by read_as_array; the third
# ('att.') is presumably a per-pair count -- TODO confirm against the dataset.
col_names = ['Dev.','Roman','att.']
# Sentinel characters wrapped around every 'Dev.' word so each target
# sequence carries an explicit start and end symbol.
STARTCHAR = '\t'
ENDCHAR   = '\n'

def read_as_array(path):
  """Load one lexicon TSV and return its word pairs as a 2-column array.

  Processing steps:
    1. Read the tab-separated file using ``col_names`` as the header.
    2. Drop rows that have an empty field.
    3. Keep only the first remaining row for each distinct 'Dev.' word.
    4. Wrap every 'Dev.' word in STARTCHAR / ENDCHAR.

  Args:
    path: Location of the TSV file (anything ``pd.read_csv`` accepts).

  Returns:
    Object ndarray of shape (n_rows, 2); column 0 is the wrapped 'Dev.'
    word, column 1 is the 'Roman' word.
  """
  data = pd.read_csv(
      path,
      sep='\t',
      names=col_names,
      # Words are text: never let pandas guess another dtype for them.
      dtype={'Dev.': str, 'Roman': str},
      # pandas' default NA tokens include "nan", "null", "NA", ... which are
      # perfectly valid romanized words. Treat only an empty field as missing
      # so those pairs are not silently thrown away by dropna().
      keep_default_na=False,
      na_values=[''],
  )
  # Remove incomplete rows *before* de-duplicating; otherwise a word whose
  # first occurrence is incomplete would disappear even though a later,
  # complete row for the same word exists.
  data = data.dropna().drop_duplicates(subset="Dev.")
  # assign() builds a new frame, avoiding chained-assignment warnings.
  data = data.assign(**{'Dev.': STARTCHAR + data['Dev.'] + ENDCHAR})
  return data[['Dev.', 'Roman']].to_numpy(dtype=object)

# Read the train / dev / test lexicons, in that order, with the same loader.
train_data, val_data, test_data = [
    read_as_array(basepath + filename)
    for filename in (
        "/hi.translit.sampled.train.tsv",
        "/hi.translit.sampled.dev.tsv",
        "/hi.translit.sampled.test.tsv",
    )
]

In [85]:
# Character inventories are gathered from the training split only:
# column 1 holds the input words, column 0 the target words.
input_vocab = {ch for word in train_data[:, 1] for ch in word}
target_vocab = {ch for word in train_data[:, 0] for ch in word}

# Slot 0 is the empty string, so real characters start at index 1 and the
# value 0 is left free for the zero padding used when encoding.
input_vocab = [''] + sorted(input_vocab)
target_vocab = [''] + sorted(target_vocab)

len_input_vocab, len_target_vocab = len(input_vocab), len(target_vocab)

# Character -> integer index lookup tables.
input_dict = {char: idx for idx, char in enumerate(input_vocab)}
target_dict = {char: idx for idx, char in enumerate(target_vocab)}

# Longest word over all three splits, so every split fits the same padded width.
all_splits = (train_data, val_data, test_data)
max_len_input = max(len(word) for split in all_splits for word in split[:, 1])
max_len_target = max(len(word) for split in all_splits for word in split[:, 0])

In [93]:
def encode_char_to_num(data):
  """Convert (target_word, input_word) rows into zero-padded index arrays.

  Uses the module-level lookup tables ``input_dict`` / ``target_dict`` and
  the sizes ``max_len_input``, ``max_len_target`` and ``len_target_vocab``.

  Args:
    data: 2-D array whose rows unpack as (target_word, input_word).

  Returns:
    A 3-tuple of float32 arrays:
      * encoder input,  shape (rows, max_len_input): input character indices;
      * decoder input,  shape (rows, max_len_target): target character indices;
      * decoder target, shape (rows, max_len_target, len_target_vocab):
        one-hot encoding of the decoder input shifted one step to the left.
    Positions past the end of a word hold index 0 (class 0 in the one-hot).

  Raises:
    KeyError: if a word contains a character missing from the lookup table.
  """
  n_rows = data.shape[0]
  enc_inp = np.zeros((n_rows, max_len_input), dtype="float32")
  dec_inp = np.zeros((n_rows, max_len_target), dtype="float32")

  for row, (target_word, input_word) in enumerate(data):
    for pos, ch in enumerate(input_word):
      enc_inp[row, pos] = input_dict[ch]
    for pos, ch in enumerate(target_word):
      dec_inp[row, pos] = target_dict[ch]

  # The target at step t is the decoder input at step t+1; the final step
  # has no successor and therefore stays 0.
  dec_tgt = np.zeros_like(dec_inp)
  dec_tgt[:, :-1] = dec_inp[:, 1:]

  # One-hot via broadcasting: compare every index against all class ids.
  class_ids = np.arange(len_target_vocab)
  dec_tgt_onehot = (dec_tgt[:, :, np.newaxis] == class_ids).astype("float32")

  return enc_inp, dec_inp, dec_tgt_onehot

# Encode every split with the same lookup tables; each call yields
# (encoder input, decoder input, one-hot decoder target).
(
    (enc_in_train_idxd, dec_in_train_idxd, dec_targ_train_idxd),
    (enc_in_val_idxd, dec_in_val_idxd, dec_targ_val_idxd),
    (enc_in_test_idxd, dec_in_test_idxd, dec_targ_test_idxd),
) = [encode_char_to_num(split) for split in (train_data, val_data, test_data)]

In [94]:
# Hyper-parameters for training and for the layer sizes below.
epochs = 100
latent_dim = 256
batch_size = 32
embed_size = 16

# ---- Model definition: character-level encoder/decoder ----

# Encoder: embed the input indices (0 is masked as padding) and keep only
# the final LSTM hidden/cell states.
encoder_input = ks.Input(shape=(None,))
# NOTE(review): input_dim is vocab size + 1 although the vocab list already
# contains the padding entry -- the extra row looks unused; confirm.
encoder_embedding_layer = ks.layers.Embedding(
    input_dim=len_input_vocab + 1, output_dim=embed_size, mask_zero=True
)
encoder_embedded = encoder_embedding_layer(encoder_input)
encoder_lstm_layer = ks.layers.LSTM(units=latent_dim, return_state=True)
encoder_output, state_h, state_c = encoder_lstm_layer(encoder_embedded)

encoder_states = [state_h, state_c]

# Decoder: embed the target indices and run an LSTM that starts from the
# encoder's final states, producing one output per time step.
decoder_input = ks.Input(shape=(None,))
decoder_embedding_layer = ks.layers.Embedding(
    input_dim=len_target_vocab + 1, output_dim=embed_size, mask_zero=True
)
decoder_embedded = decoder_embedding_layer(decoder_input)
decoder_lstm_layer = ks.layers.LSTM(
    units=latent_dim, return_sequences=True, return_state=True
)
# `decoder_lstm` is the per-step output sequence; the returned states are discarded.
decoder_lstm, _, _ = decoder_lstm_layer(
    decoder_embedded, initial_state=encoder_states
)
# Softmax over the target vocabulary at every time step.
decoder_dense_layer = ks.layers.Dense(units=len_target_vocab, activation="softmax")
decoder_output = decoder_dense_layer(decoder_lstm)

# Full training model: (encoder indices, decoder indices) -> per-step distribution.
model = ks.Model(inputs=[encoder_input, decoder_input], outputs=decoder_output)

In [95]:
# Print the layer-by-layer table (output shapes, parameter counts, connections).
model.summary()

Model: "model_8"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_20 (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
input_21 (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_19 (Embedding)        (None, None, 16)     448         input_20[0][0]                   
__________________________________________________________________________________________________
embedding_20 (Embedding)        (None, None, 16)     1072        input_21[0][0]                   
____________________________________________________________________________________________

In [None]:
# Compile and train the model. Targets are one-hot, hence categorical
# cross-entropy; the dev split is evaluated after every epoch.
model.compile(
    loss="categorical_crossentropy",
    optimizer="rmsprop",
    metrics=["accuracy"],
)

model.fit(
    x=[enc_in_train_idxd, dec_in_train_idxd],
    y=dec_targ_train_idxd,
    validation_data=([enc_in_val_idxd, dec_in_val_idxd], dec_targ_val_idxd),
    epochs=epochs,
    batch_size=batch_size,
)