In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
import pandas as pd
import numpy as np

file_path = '/kaggle/input/word-level/dict.csv'
file_path2='/kaggle/input/word-level/output_file.csv'
df1 = pd.read_csv(file_path)
df2=pd.read_csv(file_path2)

df1['Sanskrit'] = df1['Sanskrit'].apply(lambda x: x.lower().strip())
df1['English'] = df1['English'].apply(lambda x: x.lower().strip())

df2['Sanskrit'] = df2['Sanskrit'].apply(lambda x: x.lower().strip())
df2['English'] = df2['English'].apply(lambda x: x.lower().strip())



In [3]:
df = pd.concat([df1[['Sanskrit', 'English']], df2[['Sanskrit', 'English']]], ignore_index=True)

In [4]:
df

Unnamed: 0,Sanskrit,English
0,अहम्,i
1,माम्,me
2,त्वम्,you
3,गच्छ,go
4,अगच्छत्,went
...,...,...
110348,युयुजुह्,"offered , came down"
110349,युयुन्क्ससि,you want to pierce
110350,युयुत्सतम्,of those who are belligerent
110351,युयुत्सुह्,"the son of dhrtarastra by his vaisya wife , s..."


In [6]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer


In [7]:

sanskrit_tokenizer = Tokenizer()
english_tokenizer = Tokenizer()


In [8]:

sanskrit_tokenizer.fit_on_texts(df['Sanskrit'])
english_tokenizer.fit_on_texts(df['English'])

In [9]:
sanskrit_sequences = sanskrit_tokenizer.texts_to_sequences(df['Sanskrit'])
english_sequences = english_tokenizer.texts_to_sequences(df['English'])

In [10]:
# english_sequences

In [11]:
#To create padding of sequences
max_len_sanskrit = max(len(seq) for seq in sanskrit_sequences)
max_len_english = max(len(seq) for seq in english_sequences)

In [12]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

sanskrit_padded = pad_sequences(sanskrit_sequences, maxlen=max_len_sanskrit, padding='post')
english_padded = pad_sequences(english_sequences, maxlen=max_len_english, padding='post')

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(sanskrit_padded, english_padded, test_size=0.2)


In [14]:
sanskrit_vocab_size = len(sanskrit_tokenizer.word_index) + 1
english_vocab_size = len(english_tokenizer.word_index) + 1

print(sanskrit_vocab_size, english_vocab_size)

40028 15158


In [15]:
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Attention,Concatenate
from tensorflow.keras.models import Model

In [16]:
latent_dim = 256

In [20]:
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(sanskrit_vocab_size, latent_dim)(encoder_inputs)
encoder_lstm, state_h, state_c = LSTM(latent_dim, return_sequences=True, return_state=True)(encoder_embedding)
encoder_states = [state_h, state_c]

In [21]:
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(english_vocab_size, latent_dim)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

In [22]:
attention_layer = Attention(name='attention_layer')
attention_result = attention_layer([decoder_outputs, encoder_lstm])

In [24]:
decoder_concat_input = Concatenate(axis=-1)([decoder_outputs, attention_result])

In [25]:
decoder_dense = Dense(english_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_concat_input)

In [26]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [27]:
model

<Functional name=functional, built=True>

In [28]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

In [29]:
model.summary()

In [30]:
import numpy as np

y_train = np.expand_dims(y_train, -1)
y_test = np.expand_dims(y_test, -1)

In [31]:
model.fit([X_train, y_train[:, :-1]], y_train[:, 1:], batch_size=64, epochs=10, validation_split=0.2)

Epoch 1/10
[1m1104/1104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1451s[0m 1s/step - loss: 0.3943 - val_loss: 0.0479
Epoch 2/10
[1m1104/1104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1456s[0m 1s/step - loss: 0.0463 - val_loss: 0.0422
Epoch 3/10
[1m1104/1104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1460s[0m 1s/step - loss: 0.0408 - val_loss: 0.0381
Epoch 4/10
[1m1104/1104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1463s[0m 1s/step - loss: 0.0356 - val_loss: 0.0351
Epoch 5/10
[1m1104/1104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1526s[0m 1s/step - loss: 0.0316 - val_loss: 0.0329
Epoch 6/10
[1m1104/1104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1464s[0m 1s/step - loss: 0.0285 - val_loss: 0.0312
Epoch 7/10
[1m1104/1104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1466s[0m 1s/step - loss: 0.0250 - val_loss: 0.0297
Epoch 8/10
[1m1104/1104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1485s[0m 1s/step - loss: 0.0219 - val_loss: 0.0286
Epoch 9/

<keras.src.callbacks.history.History at 0x7f6925543f40>

In [34]:
model.save('/kaggle/working/Word_level_LSTM.h5')