In [12]:
import pandas as pd
import numpy as np
import tensorflow as tf
ks = tf.keras

In [2]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
path = "/content/drive/MyDrive/Sem_8/dl-datasets/dakshina_dataset_v1.0/hi/lexicons"

col_names = ['Dev.','Roman','att.']
train_data = np.array(pd.read_csv(path+"/hi.translit.sampled.train.tsv", sep='\t', names=col_names).drop_duplicates(subset="Dev.").dropna())[:,:2]
val_data = np.array(pd.read_csv(path+"/hi.translit.sampled.dev.tsv", sep='\t', names=col_names).drop_duplicates(subset="Dev.").dropna())[:,:2]
test_data = np.array(pd.read_csv(path+"/hi.translit.sampled.test.tsv", sep='\t', names=col_names).drop_duplicates(subset="Dev.").dropna())[:,:2]

In [14]:
input_vocab = set();
target_vocab = set();

for i in range(train_data.shape[0]):
    for char in train_data[i,0]:
        if char not in input_vocab:
            input_vocab.add(char)
    for char in train_data[i,1]:
        if char not in target_vocab:
            target_vocab.add(char)

input_vocab = sorted(list(input_vocab))
target_vocab = sorted(list(target_vocab))

len_input_vocab = len(input_vocab)
len_target_vocab = len(target_vocab)

input_dict = dict([ (char, i+1) for i, char in enumerate(input_vocab)])
target_dict = dict([ (char, i+1) for i, char in enumerate(target_vocab)])

max_len_input = max([ len(word) for data in [train_data[:,0], val_data[:,0], test_data[:,0]] for word in data ])
max_len_target = max([ len(word) for data in [train_data[:,1], val_data[:,1], test_data[:,1]] for word in data ])

In [4]:
enc_in_train_idxd = tf.ragged.constant([ [ input_dict[ch] for ch in word ] for word in train_data[:,0] ])
dec_in_train_idxd = [ [ target_dict[ch] for ch in word ] for word in train_data[:,1] ]
dec_targ_train_idxd = tf.ragged.constant([ word[1:] for word in dec_in_train_idxd])
dec_in_train_idxd = tf.ragged.constant(dec_in_train_idxd)

enc_in_val_idxd = tf.ragged.constant([ [ input_dict[ch] for ch in word ] for word in val_data[:,0] ])
dec_in_val_idxd = [ [ target_dict[ch] for ch in word ] for word in val_data[:,1] ]
dec_targ_val_idxd = tf.ragged.constant([ word[1:] for word in dec_in_val_idxd])
dec_in_val_idxd = tf.ragged.constant(dec_in_val_idxd)

enc_in_test_idxd = tf.ragged.constant([ [ input_dict[ch] for ch in word ] for word in test_data[:,0] ])
dec_in_test_idxd = [ [ target_dict[ch] for ch in word ] for word in test_data[:,1] ]
dec_targ_test_idxd = tf.ragged.constant([ word[1:] for word in dec_in_test_idxd])
dec_in_test_idxd = tf.ragged.constant(dec_in_test_idxd)

In [19]:
enc_in_train_idxd = np.zeros((train_data.shape[0], max_len_input), dtype="float32")
dec_in_train_idxd = np.zeros((train_data.shape[0], max_len_target), dtype="float32")
dec_targ_train_idxd = np.zeros((train_data.shape[0], max_len_target), dtype="float32")

enc_in_val_idxd = np.zeros((val_data.shape[0], max_len_input), dtype="float32")
dec_in_val_idxd = np.zeros((val_data.shape[0], max_len_target), dtype="float32")
dec_targ_val_idxd = np.zeros((val_data.shape[0], max_len_target), dtype="float32")

enc_in_test_idxd = np.zeros((test_data.shape[0], max_len_input), dtype="float32")
dec_in_test_idxd = np.zeros((test_data.shape[0], max_len_target), dtype="float32")
dec_targ_test_idxd = np.zeros((test_data.shape[0], max_len_target), dtype="float32")

for i, (input_word, target_word) in enumerate(zip(train_data[:,0], train_data[:,1])):
    for j, ch in enumerate(input_word):
        enc_in_train_idxd[i,j] = input_dict[ch]
    for j, ch in enumerate(target_word):
        dec_in_train_idxd[i,j] = target_dict[ch]
dec_targ_train_idxd[:,:-1] = dec_in_train_idxd[:,1:]

for i, (input_word, target_word) in enumerate(zip(val_data[:,0], val_data[:,1])):
    for j, ch in enumerate(input_word):
        enc_in_val_idxd[i,j] = input_dict[ch]
    for j, ch in enumerate(target_word):
        dec_in_val_idxd[i,j] = target_dict[ch]
dec_targ_val_idxd[:,:-1] = dec_in_val_idxd[:,1:]

for i, (input_word, target_word) in enumerate(zip(test_data[:,0], test_data[:,1])):
    for j, ch in enumerate(input_word):
        enc_in_test_idxd[i,j] = input_dict[ch]
    for j, ch in enumerate(target_word):
        dec_in_test_idxd[i,j] = target_dict[ch]
dec_targ_test_idxd[:,:-1] = dec_in_test_idxd[:,1:]

In [16]:
epochs = 100
latent_dim = 256
batch_size = 32
embed_size = 16

#Building the model

# Encoder
encoder_input = ks.Input(shape=(None,))
encoder_embedded = ks.layers.Embedding(len_input_vocab+1, embed_size, mask_zero=True)(encoder_input)
encoder_output, state_h, state_c = ks.layers.LSTM(latent_dim, return_state=True)(encoder_embedded)

encoder_states = [state_h, state_c]

# Decoder
decoder_input = ks.Input(shape=(None,))
decoder_embedded = ks.layers.Embedding(len_target_vocab+1, embed_size, mask_zero=True)(decoder_input)
decoder_lstm, _, _ = ks.layers.LSTM(latent_dim, return_sequences=True, return_state=True)(
    decoder_embedded, initial_state=encoder_states
)
decoder_output = ks.layers.Dense(embed_size, activation="softmax")(decoder_lstm)

# The Model
model = ks.Model([encoder_input, decoder_input], decoder_output)

In [19]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_7 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, None, 16)     1008        input_6[0][0]                    
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, None, 16)     416         input_7[0][0]                    
______________________________________________________________________________________________

In [20]:
#Training the model
model.compile(
    optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"]
)

model.fit(
    [enc_in_train_idxd, dec_in_train_idxd],
    dec_targ_train_idxd,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=( [enc_in_val_idxd, dec_in_val_idxd], dec_targ_val_idxd )
)

Epoch 1/100


ValueError: ignored