In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/word-level/dict.csv
/kaggle/input/word-level/output_file.csv
/kaggle/input/data-vedic/vedic.csv


In [2]:
import pandas as pd
import numpy as np

# Locations of the three source CSV files.
file_path = '/kaggle/input/word-level/dict.csv'
file_path2 = '/kaggle/input/word-level/output_file.csv'
file_path3 = '/kaggle/input/data-vedic/vedic.csv'


def _clean_pairs(frame, sanskrit_col, english_col):
    """Return a copy of ``frame`` with normalised 'Sanskrit' and 'English' columns.

    Args:
        frame: DataFrame as read from one of the source CSVs.
        sanskrit_col: Name of the column holding the Sanskrit text.
        english_col: Name of the column holding the English text.

    Returns:
        A new DataFrame containing only the rows where both source columns are
        present, with 'Sanskrit' and 'English' columns holding the lower-cased,
        whitespace-stripped text. All other columns are kept as they were.
    """
    # read_csv turns empty cells into float NaN, and calling .lower() on a
    # float raises AttributeError; incomplete pairs are useless for training
    # anyway, so drop them up front.
    frame = frame.dropna(subset=[sanskrit_col, english_col]).copy()
    # astype(str) guards against cells that were parsed as numbers.
    frame['Sanskrit'] = frame[sanskrit_col].astype(str).str.lower().str.strip()
    frame['English'] = frame[english_col].astype(str).str.lower().str.strip()
    return frame


df1 = _clean_pairs(pd.read_csv(file_path), 'Sanskrit', 'English')
df2 = _clean_pairs(pd.read_csv(file_path2), 'Sanskrit', 'English')
# The Vedic file uses different column names for the same two roles.
df3 = _clean_pairs(pd.read_csv(file_path3), 'nagari', 'description')

In [3]:
# Stack the Sanskrit/English pairs of the three sources (order: df1, df3, df2)
# into one frame with a fresh 0..n-1 index.
df = pd.concat(
    [source[['Sanskrit', 'English']] for source in (df1, df3, df2)],
    ignore_index=True,
)

In [4]:
# Display the combined Sanskrit/English frame (rendered truncated by the notebook).
df

Unnamed: 0,Sanskrit,English
0,अहम्,i
1,माम्,me
2,त्वम्,you
3,गच्छ,go
4,अगच्छत्,went
...,...,...
111841,युयुजुह्,"offered , came down"
111842,युयुन्क्ससि,you want to pierce
111843,युयुत्सतम्,of those who are belligerent
111844,युयुत्सुह्,"the son of dhrtarastra by his vaisya wife , s..."


In [5]:
!pip install indic_transliteration

Collecting indic_transliteration
  Downloading indic_transliteration-2.3.61-py3-none-any.whl.metadata (1.4 kB)
Collecting backports.functools-lru-cache (from indic_transliteration)
  Downloading backports.functools_lru_cache-2.0.0-py2.py3-none-any.whl.metadata (3.5 kB)
Collecting roman (from indic_transliteration)
  Downloading roman-4.2-py3-none-any.whl.metadata (3.6 kB)
Downloading indic_transliteration-2.3.61-py3-none-any.whl (153 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.6/153.6 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading backports.functools_lru_cache-2.0.0-py2.py3-none-any.whl (6.7 kB)
Downloading roman-4.2-py3-none-any.whl (5.5 kB)
Installing collected packages: roman, backports.functools-lru-cache, indic_transliteration
Successfully installed backports.functools-lru-cache-2.0.0 indic_transliteration-2.3.61 roman-4.2


In [6]:
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate

def transliterate_text(sanskrit_text):
    """Convert Devanagari text to its IAST romanisation.

    Args:
        sanskrit_text: Text written in Devanagari script.

    Returns:
        The result of ``transliterate`` from DEVANAGARI to IAST for that text.
    """
    return transliterate(sanskrit_text, sanscript.DEVANAGARI, sanscript.IAST)

In [7]:
# Add an IAST-romanised copy of every Sanskrit entry as a new column.
df['Transliterated_Sanskrit'] = df['Sanskrit'].map(transliterate_text)

In [8]:
# Display the frame again, now including the Transliterated_Sanskrit column.
df

Unnamed: 0,Sanskrit,English,Transliterated_Sanskrit
0,अहम्,i,aham
1,माम्,me,mām
2,त्वम्,you,tvam
3,गच्छ,go,gaccha
4,अगच्छत्,went,agacchat
...,...,...,...
111841,युयुजुह्,"offered , came down",yuyujuh
111842,युयुन्क्ससि,you want to pierce,yuyunksasi
111843,युयुत्सतम्,of those who are belligerent,yuyutsatam
111844,युयुत्सुह्,"the son of dhrtarastra by his vaisya wife , s...",yuyutsuh


In [20]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Conv1D, GRU, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd

# Hyperparameters
embedding_dim = 256
filters = 64
kernel_size = 3
gru_units = 256
dropout_rate = 0.5  # NOTE(review): defined but not applied by any layer in this cell
max_seq_length = 100  # every input and target is padded/truncated to this many characters

transliterated_sanskrit_texts = df['Transliterated_Sanskrit'].values
english_texts = df['English'].values


def build_char_tokenizer(texts):
    """Fit and return a character-level Keras ``Tokenizer`` on ``texts``.

    Args:
        texts: Iterable of strings to learn the character vocabulary from.

    Returns:
        The fitted tokenizer. ``filters=''`` means no characters are removed
        before the vocabulary is built.
    """
    tokenizer = Tokenizer(char_level=True, filters='')
    tokenizer.fit_on_texts(texts)
    return tokenizer


# One tokenizer per side: romanised Sanskrit (model input) and English (target).
transliterated_sanskrit_tokenizer = build_char_tokenizer(transliterated_sanskrit_texts)
english_tokenizer = build_char_tokenizer(english_texts)

# Vocabulary sizes come from the fitted tokenizers rather than a hard-coded
# guess: tokenizer indices start at 1 and 0 is used for padding, hence the +1.
# A fixed size smaller than the real vocabulary would yield out-of-range
# embedding indices / labels.
vocab_size_sanskrit = len(transliterated_sanskrit_tokenizer.word_index) + 1
vocab_size_english = len(english_tokenizer.word_index) + 1

# Convert texts to sequences of character indices
transliterated_sanskrit_sequences = transliterated_sanskrit_tokenizer.texts_to_sequences(transliterated_sanskrit_texts)
english_sequences = english_tokenizer.texts_to_sequences(english_texts)

# Pad (with zeros, on the right) to a common length
transliterated_sanskrit_padded = pad_sequences(transliterated_sanskrit_sequences, maxlen=max_seq_length, padding='post')
english_padded = pad_sequences(english_sequences, maxlen=max_seq_length, padding='post')

# Build the model: one integer character index per position
input_layer = Input(shape=(max_seq_length,))

# Embedding layer
embedding_layer = Embedding(input_dim=vocab_size_sanskrit, output_dim=embedding_dim)(input_layer)

# CNN layer; padding='same' keeps the sequence length unchanged
cnn_layer = Conv1D(filters=filters, kernel_size=kernel_size, activation='relu', padding='same')(embedding_layer)

# Bidirectional GRU returning an output for every position
gru_layer = Bidirectional(GRU(gru_units, return_sequences=True))(cnn_layer)

# Per-position softmax over the English character vocabulary
output_layer = TimeDistributed(Dense(vocab_size_english, activation='softmax'))(gru_layer)

# Create and compile the model
# NOTE(review): no masking is applied, so loss and accuracy are averaged over
# all max_seq_length positions, zero padding included; the reported accuracy
# therefore overstates how well real characters are predicted.
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()

# Train the model
# Convert target sequences to 3D array of shape (batch_size, sequence_length, 1)
english_padded = np.expand_dims(english_padded, -1)
model.fit(transliterated_sanskrit_padded, english_padded, batch_size=64, epochs=20, validation_split=0.2)


Epoch 1/20
[1m1399/1399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 46ms/step - accuracy: 0.6733 - loss: 1.4816 - val_accuracy: 0.6797 - val_loss: 1.3689
Epoch 2/20
[1m1399/1399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 45ms/step - accuracy: 0.6818 - loss: 1.3583 - val_accuracy: 0.6797 - val_loss: 1.3644
Epoch 3/20
[1m1399/1399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 45ms/step - accuracy: 0.6809 - loss: 1.3544 - val_accuracy: 0.6808 - val_loss: 1.3582
Epoch 4/20
[1m1399/1399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 45ms/step - accuracy: 0.6842 - loss: 1.3368 - val_accuracy: 0.6813 - val_loss: 1.3535
Epoch 5/20
[1m1399/1399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 45ms/step - accuracy: 0.6823 - loss: 1.3367 - val_accuracy: 0.6821 - val_loss: 1.3472
Epoch 6/20
[1m1399/1399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 45ms/step - accuracy: 0.6852 - loss: 1.3189 - val_accuracy: 0.6823 - val_loss: 1.3480
Epoc

<keras.src.callbacks.history.History at 0x7fc6a8482b30>

In [21]:
# Write the trained model to 'hybrid_gru1.h5' in the current working directory.
# NOTE(review): the two character tokenizers are not saved in this cell, yet
# they are needed to encode inputs and decode predictions — confirm they are
# persisted elsewhere before relying on this file alone.
model.save('hybrid_gru1.h5')

In [18]:
# Sample word to translate
input_text = "gacchat"

# Encode it character by character with the tokenizer fitted on the model inputs
input_sequence = transliterated_sanskrit_tokenizer.texts_to_sequences([input_text])

# Right-pad with zeros up to the fixed sequence length
input_padded = pad_sequences(
    input_sequence,
    maxlen=max_seq_length,
    padding='post',
)


In [19]:
# Make predictions
# Run the model on the padded input
predictions = model.predict(input_padded)

# Highest-scoring character index at each position of the first (only) row
predicted_sequence = predictions.argmax(axis=-1)[0]

# Map indices back to characters, skipping index 0 (padding)
predicted_text = ''.join(
    english_tokenizer.index_word.get(index, '')
    for index in predicted_sequence
    if index > 0
)

print("Predicted Translation:", predicted_text)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
Predicted Translation: fouthe      
