In [13]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the dataset
data = pd.read_excel('/content/dict.xlsx')

# Preprocess the data
english_sentences = data['English'].values
sanskrit_sentences = data['Sanskrit'].values

# Tokenize input sequences
english_tokenizer = Tokenizer()
english_tokenizer.fit_on_texts(english_sentences)
english_vocab_size = len(english_tokenizer.word_index) + 1
english_max_len = max([len(sentence.split()) for sentence in english_sentences])
english_sequences = english_tokenizer.texts_to_sequences(english_sentences)
english_padded_sequences = pad_sequences(english_sequences, maxlen=english_max_len, padding='post')

# Tokenize target sequences
sanskrit_tokenizer = Tokenizer()
sanskrit_tokenizer.fit_on_texts(sanskrit_sentences)
sanskrit_vocab_size = len(sanskrit_tokenizer.word_index) + 1
sanskrit_max_len = max([len(sentence.split()) for sentence in sanskrit_sentences])
sanskrit_sequences = sanskrit_tokenizer.texts_to_sequences(sanskrit_sentences)
sanskrit_padded_sequences = pad_sequences(sanskrit_sequences, maxlen=sanskrit_max_len, padding='post')

# Define the model architecture
latent_dim = 256

# Encoder
encoder_inputs = Input(shape=(english_max_len,))
encoder_embedding = Embedding(english_vocab_size, latent_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(sanskrit_vocab_size, latent_dim, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(sanskrit_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Compile the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(english_padded_sequences, sanskrit_padded_sequences, test_size=0.1, random_state=42)

# Train the model
model.fit([X_train, y_train[:, :-1]], y_train[:, 1:], validation_data=([X_val, y_val[:, :-1]], y_val[:, 1:]), batch_size=64, epochs=20)

# Save the trained model
model.save('english_to_sanskrit_translation_model.h5')

# Save the tokenizers
import pickle
with open('english_tokenizer.pickle', 'wb') as handle:
    pickle.dump(english_tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('sanskrit_tokenizer.pickle', 'wb') as handle:
    pickle.dump(sanskrit_tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


  saving_api.save_model(


In [22]:
import numpy as np
import pickle
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the trained model
model = load_model('english_to_sanskrit_translation_model.h5')

# Load the tokenizers
with open('english_tokenizer.pickle', 'rb') as handle:
    english_tokenizer = pickle.load(handle)

with open('sanskrit_tokenizer.pickle', 'rb') as handle:
    sanskrit_tokenizer = pickle.load(handle)

# Define a function for translation
def translate_sentence(sentence):
    # Tokenize the input sentence
    seq = english_tokenizer.texts_to_sequences([sentence])
    padded_seq = pad_sequences(seq, maxlen=english_max_len, padding='post')

    # Initialize the decoder input sequence with zeros
    decoder_input = np.zeros((1, 1))

    # Decode the input sequence
    output_sentence = []
    for _ in range(sanskrit_max_len):
        output_tokens = model.predict([padded_seq, decoder_input])
        token_index = np.argmax(output_tokens[0, -1, :])
        if token_index == 0:  # Padding token
            break
        output_sentence.append(sanskrit_tokenizer.index_word[token_index])

        # Update decoder input with predicted token for next iteration
        decoder_input[0, 0] = token_index

    return ' '.join(output_sentence)

# Example translation
english_sentence = "thought"
translated_sentence = translate_sentence(english_sentence)
print("Translated Sentence:", translated_sentence)





Translated Sentence: करोति
