In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
# Short alias for the Keras API; later cells build layers and models through `ks`.
ks = tf.keras

In [None]:
# Colab-only step: mount Google Drive at /content/drive so that the dataset
# files referenced by later cells (under /content/drive/MyDrive/...) are readable.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Folder (inside the mounted Drive) that holds the three lexicon TSV files.
path = "/content/drive/MyDrive/Sem_8/dl-datasets/dakshina_dataset_v1.0/hi/lexicons"

# Explicit column names; because they are supplied, read_csv treats the first
# line of each file as data rather than as a header.
col_names = ['Dev.','Roman','att.']


def _load_lexicon(split):
    """Read one lexicon split and return its first two columns as an array.

    Parameters
    ----------
    split : str
        Split tag embedded in the file name ("train", "dev" or "test").

    Returns
    -------
    numpy.ndarray
        Object array of shape (n_rows, 2): column 0 is 'Dev.', column 1 is
        'Roman'. Rows are de-duplicated on 'Dev.' (first occurrence kept) and
        rows with a missing field are removed afterwards.
    """
    frame = pd.read_csv(
        f"{path}/hi.translit.sampled.{split}.tsv",
        sep='\t',
        names=col_names,
        # By default pandas parses tokens such as "nan", "null" or "NA" as
        # missing values, so real words spelled that way would be dropped by
        # dropna() below. Only a genuinely empty field counts as missing here.
        keep_default_na=False,
        na_values=[""],
    )
    deduped = frame.drop_duplicates(subset="Dev.").dropna()
    # Keep only the two text columns; the third ('att.') is not used.
    return np.array(deduped)[:, :2]


train_data = _load_lexicon("train")
val_data = _load_lexicon("dev")
test_data = _load_lexicon("test")

In [None]:
# Distinct characters found in the training pairs, in sorted order.
# Column 0 of train_data feeds the input vocabulary, column 1 the target one.
input_vocab = sorted({symbol for word in train_data[:, 0] for symbol in word})
target_vocab = sorted({symbol for word in train_data[:, 1] for symbol in word})

len_input_vocab = len(input_vocab)
len_target_vocab = len(target_vocab)

# Character -> integer id lookup; ids follow each vocabulary's sorted order.
input_dict = {symbol: idx for idx, symbol in enumerate(input_vocab)}
target_dict = {symbol: idx for idx, symbol in enumerate(target_vocab)}

In [None]:
def _encode_words(words, char_to_id):
    """Map every word to its list of character ids and pack them as a ragged tensor.

    A character that is not a key of `char_to_id` raises KeyError.
    """
    id_rows = [[char_to_id[ch] for ch in word] for word in words]
    return tf.ragged.constant(id_rows)


# Column 0 is encoded with the input mapping, column 1 with the target mapping.
X_train_idd = _encode_words(train_data[:, 0], input_dict)
Y_train_idd = _encode_words(train_data[:, 1], target_dict)

X_val_idd = _encode_words(val_data[:, 0], input_dict)
Y_val_idd = _encode_words(val_data[:, 1], target_dict)

X_test_idd = _encode_words(test_data[:, 0], input_dict)
Y_test_idd = _encode_words(test_data[:, 1], target_dict)

In [None]:
batch_id = 0
batch_size = 32
embed_size = 16

# Rows of the training set that make up the batch selected by batch_id.
batch_rows = slice(batch_id * batch_size, (batch_id + 1) * batch_size)

# Input side: a one-layer model holding only an Embedding. No fit() is called
# in this cell, so predict() returns the layer's initial embedding vectors.
model_embed_input = ks.Sequential([ks.layers.Embedding(len_input_vocab, embed_size)])
model_embed_input.compile(optimizer='rmsprop', loss='mse')

X_train_embed = model_embed_input.predict(X_train_idd[batch_rows])

# Target side: same construction, sized for the target vocabulary.
model_embed_target = ks.Sequential([ks.layers.Embedding(len_target_vocab, embed_size)])
model_embed_target.compile(optimizer='rmsprop', loss='mse')

Y_train_embed = model_embed_target.predict(Y_train_idd[batch_rows])

In [None]:
epochs = 100
latent_dim = 256

# Building the model

# Encoder: reads a variable-length sequence of embed_size-dimensional vectors
# and keeps only the final LSTM hidden and cell states.
encoder_inputs = ks.Input(shape=(None, embed_size))
encoder = ks.layers.LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)

encoder_states = [state_h, state_c]

# Decoder: an LSTM started from the encoder's final states that returns an
# output for every time step; its own final states are discarded here.
decoder_inputs = ks.Input(shape=(None, embed_size))
decoder_lstm = ks.layers.LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
# The softmax must produce one probability per target character, so the layer
# width is the target vocabulary size. It was previously embed_size, which
# gave a distribution over embedding dimensions rather than characters.
decoder_dense = ks.layers.Dense(len_target_vocab, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

# The Model: maps [encoder input, decoder input] to per-step character probabilities.
model = ks.Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [None]:
# Configure the model for training: RMSprop optimizer, categorical
# cross-entropy loss, accuracy reported as a metric. No training runs here.
model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])
