<a href="https://colab.research.google.com/github/sutharimanikanta/-technity-tasks-/blob/main/Language_Translation_LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!pip install --upgrade transformers

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# from transformers import pipeline

# # Load translation pipeline
# translator = pipeline("translation", model="Telugu-LLM-Labs/gemma_2b_hi_finetuned", tokenizer="Telugu-LLM-Labs/gemma_2b_hi_finetuned")

# # Example Telugu sentence
# telugu_text = "నేను ఊహించలేను."

# # Translate Telugu to English
# english_translation = translator(telugu_text, src_lang="te", tgt_lang="en")

# print(english_translation)


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import os
import tensorflow as tf
from tensorflow import keras

In [None]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tqdm import tqdm
import os
import nltk
from nltk.tokenize import word_tokenize
from transformers import AutoTokenizer

# Download the NLTK 'punkt' resource at import time (needed by the
# word_tokenize function imported above).
nltk.download('punkt')

class Encoder(tf.keras.layers.Layer):
    """Embed token ids and run the embeddings through a GRU.

    Args:
        vocab_size: Number of entries in the embedding table.
        embedding_size: Width of each embedding; also used as the GRU unit
            count.
    """

    def __init__(self, vocab_size=1000, embedding_size=128):
        # Must be the double-underscore ``__init__``: a method named ``_init_``
        # is never invoked by Python, so the keyword arguments would go
        # straight to ``Layer.__init__`` and be rejected.
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size

    def build(self, input_shape):
        """Create the embedding and GRU sub-layers on first use."""
        self.embedding_layer = tf.keras.layers.Embedding(self.vocab_size, self.embedding_size)
        # return_sequences + return_state: the GRU yields the per-step outputs
        # and its final state.
        self.gru = tf.keras.layers.GRU(self.embedding_size, return_sequences=True, return_state=True)

    def call(self, inputs):
        """Encode ``inputs``.

        Args:
            inputs: Token ids fed to the embedding layer
                (assumes shape (batch, time) — TODO confirm against callers).

        Returns:
            Tuple ``(output, state)`` exactly as returned by the GRU.
        """
        embeddings = self.embedding_layer(inputs)
        output, state = self.gru(embeddings)
        return output, state

class BahdanauAttention(tf.keras.layers.Layer):
    """Additive attention: score = V(tanh(W1(query) + W2(values))).

    Args:
        units: Output width of the two projection layers ``W1`` and ``W2``.
    """

    def __init__(self, units=128):
        # Must be the double-underscore ``__init__``; with ``_init_`` the
        # projection layers below were never created.
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        """Return the context vector and the attention weights.

        Args:
            query: Decoder state; an axis is inserted at position 1 so it
                broadcasts against ``values``.
            values: Encoder outputs; axis 1 is the axis that is normalised and
                then summed over.

        Returns:
            Tuple ``(context_vector, attention_weights)``.
        """
        # Insert an axis at position 1 so the addition below broadcasts.
        query_with_time_axis = tf.expand_dims(query, 1)

        # One score per position along axis 1, normalised over that axis.
        scores = self.V(tf.nn.tanh(self.W1(query_with_time_axis) + self.W2(values)))
        attention_weights = tf.nn.softmax(scores, axis=1)

        # Weighted sum of the values over axis 1.
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

class Decoder(tf.keras.layers.Layer):
    """One decoding step: attend, embed, GRU, project to the vocabulary.

    Args:
        vocab_size: Embedding table size and width of the output projection.
        embedding_size: Width of the token embeddings.
        units: GRU unit count; also passed to the attention layer.
    """

    def __init__(self, vocab_size=1000, embedding_size=128, units=128):
        # Must be the double-underscore ``__init__``; with ``_init_`` none of
        # the sub-layers below were ever created.
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
        self.gru = tf.keras.layers.GRU(units, return_sequences=True, return_state=True)
        self.fc = tf.keras.layers.Dense(vocab_size)

        # Used for attention
        self.attention = BahdanauAttention(units)

    def call(self, x, hidden, enc_output):
        """Run a single decoding step.

        Args:
            x: Token ids for this step (the shape comments below assume
                (batch_size, 1)).
            hidden: Previous decoder state, used as the attention query.
            enc_output: Encoder outputs the attention layer attends over.

        Returns:
            Tuple ``(x, state, attention_weights)``: the dense projection of
            the GRU output, the new GRU state and the attention weights.
        """
        context_vector, attention_weights = self.attention(hidden, enc_output)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # NOTE(review): the GRU is called without an initial state, so `hidden`
        # only influences the step through the attention context — confirm
        # this is intended.
        output, state = self.gru(x)

        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab)
        x = self.fc(output)

        return x, state, attention_weights
class AdditiveAttentionTranslator:
    """Encoder/decoder translator trained step by step with a GradientTape.

    Hyper-parameters are class attributes; the models, the training data and
    the loss history are stored on the instance by the methods below.
    """

    encoder_input_words = 20  # length of every encoder input sequence
    vocab_size = 1000  # token ids are drawn from [0, vocab_size)
    embedding_size = 128
    epochs = 10
    batch_size = 200
    optimizer = tf.keras.optimizers.Adam()
    loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits = True)
    loss_history = []

    def __init__(self):
        # Pretrained tokenizer used by the pre/post-processing helpers.
        self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    def get_enc_dec(self):
        """Build ``self.encoder`` and ``self.decoder`` as functional models.

        Returns:
            The pair of values returned by the two ``summary()`` calls (the
            summaries themselves are printed).
        """
        x_encoder_input = tf.keras.layers.Input(self.encoder_input_words)

        encode = Encoder(vocab_size = self.vocab_size, embedding_size = self.embedding_size)(x_encoder_input)
        self.encoder = tf.keras.Model(inputs=x_encoder_input, outputs=encode)

        x_decoder_input = tf.keras.layers.Input(1)
        x_decoder = tf.keras.layers.Embedding(self.vocab_size, self.embedding_size)(x_decoder_input)
        x_state_input = tf.keras.layers.Input(self.embedding_size)
        x_states_input = tf.keras.layers.Input((self.encoder_input_words, self.embedding_size))

        # NOTE(review): Decoder is constructed with a `words` keyword and
        # called with a single tuple argument; confirm that the Decoder
        # definition in scope accepts both.
        decode = Decoder(embedding_size = self.embedding_size, vocab_size = self.vocab_size, words = self.encoder_input_words)((x_decoder[:,0], x_state_input, x_states_input))
        self.decoder = tf.keras.Model(inputs=[x_decoder_input, x_state_input, x_states_input], outputs = decode)
        return self.encoder.summary(), self.decoder.summary()

    def generate_random_data(self, instances = 1000, decoder_words = 10):
        """Create random token-id inputs and random one-hot targets.

        Args:
            instances: Number of samples to generate.
            decoder_words: Number of decoder steps per sample.

        Returns:
            ``(X1, X2, Y)`` where ``X1`` is (instances, encoder_input_words),
            ``X2`` is (instances, decoder_words) and ``Y`` is one-hot with
            shape (instances, decoder_words, vocab_size). The arrays are also
            stored on the instance.
        """
        X1 = np.random.randint(self.vocab_size, size=(instances, self.encoder_input_words))
        X2 = np.random.randint(self.vocab_size, size=(instances, decoder_words))
        # Pick random rows of the identity matrix to get one-hot targets.
        Y = np.eye(self.vocab_size)[np.random.choice(self.vocab_size, instances * decoder_words)].reshape(instances, decoder_words, self.vocab_size)
        self.X1, self.X2, self.Y = X1, X2, Y
        return X1, X2, Y

    def train_translator(self):
        """Train on ``self.X1``/``self.X2``/``self.Y``.

        The models are rebuilt at the start of every call. The summed loss of
        each epoch is printed and appended to ``self.loss_history``.
        """
        tf.get_logger().setLevel('ERROR')

        optimizer, loss_fn = self.optimizer, self.loss_fn

        epochs, batch_size = self.epochs, self.batch_size
        total_instances = tf.shape(self.Y)[0]

        X1, X2, Y = self.X1, self.X2, self.Y

        self.get_enc_dec()
        self.loss_history = []

        for epoch in range(epochs):
            batch_loss = tf.constant(0.0)
            for batch in tqdm(range(0, total_instances, batch_size)):

                with tf.GradientTape() as tape:
                    loss_count = tf.constant(0.0)
                    x1_train = X1[batch : batch + batch_size]
                    x2_train = X2[batch : batch + batch_size]
                    y_train = Y[batch : batch + batch_size]

                    H, state = self.encoder(x1_train)

                    # One decoder call per target position; the state is
                    # carried from one step to the next.
                    for query_number in range(x2_train.shape[-1]):
                        output, state = self.decoder((x2_train[:, query_number], state, H))
                        loss_count = loss_count + loss_fn(y_train[:, query_number], output)

                # Build the combined weight list once per step instead of twice.
                weights = self.encoder.trainable_weights + self.decoder.trainable_weights
                grads = tape.gradient(loss_count, weights)
                optimizer.apply_gradients(zip(grads, weights))
                batch_loss = batch_loss + loss_count

            epoch_loss = batch_loss.numpy()
            print(f"Epoch: {epoch + 1}/{epochs} : Error {epoch_loss}")
            self.loss_history.append(epoch_loss)

    def translate_sentence(self, keys, query_start, query_size = None):
        """Greedily decode ``query_size`` tokens for the encoder input ``keys``.

        Args:
            keys: Encoder input passed to ``self.encoder``.
            query_start: First decoder input; ``query_start[0][0]`` is recorded
                as the first output token.
            query_size: Number of decoding steps; defaults to the number of
                columns in ``self.X2``.

        Returns:
            ``(value, state_steps)``: the token ids (start token first) and the
            decoder state after every step.
        """
        if query_size is None:
            query_size = self.X2.shape[-1]
        H, state = self.encoder(keys)

        value = [int(query_start[0][0])]
        state_steps = []

        for query_number in range(query_size):
            output, state = self.decoder((query_start, state, H))
            # Feed the arg-max prediction back in as the next decoder input.
            query_start = np.argmax(output.numpy(), axis = -1)
            value.append(query_start[0])
            state_steps.append(state)

        return value, state_steps

    def preprocess_input_sentence(self, input_sentence):
        """Tokenize a sentence and map the tokens to ids.

        The sentence is split with nltk's ``word_tokenize``, cut to at most
        ``encoder_input_words`` tokens and converted with the pretrained
        tokenizer's ``convert_tokens_to_ids``. No padding is applied.
        """
        tokens = word_tokenize(input_sentence)

        # Limit the number of tokens to the maximum input length
        max_input_length = self.encoder_input_words
        tokens = tokens[:max_input_length]

        return self.tokenizer.convert_tokens_to_ids(tokens)

    def postprocess_output(self, translated_indices):
        """Map ids back to tokens and join them with single spaces."""
        translated_tokens = self.tokenizer.convert_ids_to_tokens(translated_indices)
        return ' '.join(translated_tokens)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


*A Gated Recurrent Unit (GRU) layer is a type of recurrent neural network (RNN) layer used for processing sequential data, such as text, time series, and audio. It is designed to address some of the limitations of traditional RNNs, particularly regarding the vanishing gradient problem and the difficulty in capturing long-range dependencies in sequences.

* i want use Transformer Encoder Layers

* Bi-directional RNNs:
Purpose: Bi-directional RNNs process sequences in both forward and backward directions.
Strengths:
Capture context from both past and future tokens.
Useful for tasks like part-of-speech tagging, named entity recognition, and sentiment analysis.
Handle sequential dependencies well.
Weaknesses:
Computationally expensive due to bidirectional processing.
Still suffer from vanishing gradient problems.
Example: Bidirectional LSTMs or GRUs.
* Transformer Networks:
Purpose: Transformers revolutionized NLP by introducing self-attention mechanisms.
Strengths:
Parallelizable, making them faster than RNNs.
Capture global context effectively.
State-of-the-art performance on various NLP benchmarks.
Weaknesses:
Require large amounts of data and computational resources.
Lack inherent sequential processing (no recurrence).
Example: BERT, GPT, and other transformer-based models.
When to Choose:

Bi-directional RNNs: Use when you need fine-grained sequential context and have limited data.
Transformers: Opt for transformers when you have abundant data, want to handle long-range dependencies, and aim for top-tier performance.

In [None]:
class Encoder(tf.keras.layers.Layer):
    """Custom Keras layer: token embedding followed by a bidirectional GRU.

    Args:
        vocab_size: Number of entries in the embedding table.
        embedding_size: Dimension of the word embeddings; also used as the GRU
            unit count.
    """

    def __init__(self, vocab_size=1000, embedding_size=128):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size

    def build(self, input_shape):
        """Create the embedding, GRU and bidirectional wrapper on first use."""
        self.embedding_layer = tf.keras.layers.Embedding(self.vocab_size, self.embedding_size)
        # The GRU keyword is `return_sequences` (plural); the singular
        # `return_sequence` is not accepted. With both flags set the GRU
        # returns the full output sequence as well as its final state.
        self.gru = tf.keras.layers.GRU(self.embedding_size, return_sequences=True, return_state=True)
        self.bi = tf.keras.layers.Bidirectional(self.gru)

    def call(self, inputs):
        """Encode ``inputs``.

        Returns:
            Tuple ``(output_sequence, forward_state, backward_state)`` as
            unpacked from the bidirectional layer. Note that this is three
            values, not two.
        """
        embeddings = self.embedding_layer(inputs)
        output_sequence, forward_state, backward_state = self.bi(embeddings)
        return (output_sequence, forward_state, backward_state)
class BahdanauAttention(tf.keras.layers.Layer):
    """Attention layer built from explicit weight matrices and einsum.

    Args:
        words: Number of encoder positions; first dimension of W2/W3/W4.
        embedding_size: Feature width; last dimension of every weight.
    """

    def __init__(self, words=20, embedding_size=128):
        super().__init__()
        self.words = words
        self.embedding_size = embedding_size

    def build(self, input_shapes):
        """Create the trainable weights, randomly initialised.

        Random (rather than constant) initial values keep the weights from
        starting in a symmetric or all-zero state.
        """
        self.W1 = self.add_weight(shape=(1, self.embedding_size), initializer="random_uniform")
        self.W2 = self.add_weight(shape=(self.words, self.embedding_size), initializer="random_uniform")
        self.W3 = self.add_weight(shape=(self.words, self.embedding_size), initializer="random_uniform")
        # W4 is created but not used in call(); kept so the layer's weight
        # list does not change.
        self.W4 = self.add_weight(shape=(self.words, self.embedding_size), initializer="random_uniform")

    def call(self, inputs):
        """Compute a context vector for a ``(query, value)`` pair.

        Args:
            inputs: Tuple ``(query, value)``. The einsum subscripts and weight
                shapes imply ``query`` is (batch, embedding_size) and ``value``
                is (batch, words, embedding_size).

        Returns:
            ``context``: ``value`` weighted by the attention distribution and
            summed over axis 1.
        """
        query, value = inputs

        # `self.W1` (capital W) is the attribute created in build().
        regressed_query = tf.einsum("bi,ci->bi", query, self.W1)
        regressed_value = tf.einsum("bij,ij->bij", value, self.W2)

        # Combine the rank-2 query with the rank-3 values: "bi" indexes the
        # query and "bji" the values, so the query is broadcast across every
        # position j. (The subscripts must match the operand ranks.)
        sum_query_value = tf.einsum("bi,bji->bji", regressed_query, regressed_value)
        sum_of_query_value = tf.nn.tanh(sum_query_value)

        # One score per position: weight element-wise, sum over the feature
        # axis, then normalise over the remaining (last) axis.
        a = tf.einsum("bij,ij->bij", sum_of_query_value, self.W3)
        a = tf.math.reduce_sum(a, axis=-1)
        a = tf.nn.softmax(a)

        # Weight every value by its score and sum over axis 1.
        context = tf.einsum("bi,bij->bij", a, value)
        context = tf.reduce_sum(context, axis=1)

        return context
class AdditiveAttentionTranslator:
    """Encoder/decoder translator trained step by step with a GradientTape.

    Hyper-parameters are class attributes; the models, the training data and
    the loss history are stored on the instance by the methods below.
    """

    encoder_input_words = 20  # Number of words in the input sequence
    vocab_size = 1000  # Size of the vocabulary
    embedding_size = 128  # Size of the word embeddings
    epochs = 30  # Number of training epochs
    batch_size = 200  # Number of samples processed in each training iteration
    optimizer = tf.keras.optimizers.Adam()  # Optimizer used for training
    loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=True)  # Loss function used for training
    loss_history = []  # Replaced by a fresh list on every training run

    def get_enc_dec(self):
        """Build ``self.encoder`` and ``self.decoder`` as functional models.

        Returns:
            The pair of values returned by the two ``summary()`` calls (the
            summaries themselves are printed).
        """
        x_encoder_input = tf.keras.layers.Input(self.encoder_input_words)
        encode = Encoder(vocab_size=self.vocab_size, embedding_size=self.embedding_size)(x_encoder_input)
        self.encoder = tf.keras.Model(inputs=x_encoder_input, outputs=encode)

        # Decoder inputs: current token, previous state, encoder outputs.
        x_decoder_input = tf.keras.layers.Input(1)
        x_decoder = tf.keras.layers.Embedding(self.vocab_size, self.embedding_size)(x_decoder_input)
        x_state_input = tf.keras.layers.Input(self.embedding_size)
        x_states_input = tf.keras.layers.Input((self.encoder_input_words, self.embedding_size))

        decode = Decoder(embedding_size=self.embedding_size, vocab_size=self.vocab_size, words=self.encoder_input_words)(
            (x_decoder[:, 0], x_state_input, x_states_input))
        self.decoder = tf.keras.Model(inputs=[x_decoder_input, x_state_input, x_states_input], outputs=decode)
        return self.encoder.summary(), self.decoder.summary()

    def generate_random_data(self, instances=1000, decoder_words=10):
        """Create random token-id inputs and random one-hot targets.

        Args:
            instances: Number of samples to generate.
            decoder_words: Number of decoder steps per sample.

        Returns:
            ``(X1, X2, Y)`` where ``X1`` is (instances, encoder_input_words),
            ``X2`` is (instances, decoder_words) and ``Y`` is one-hot with
            shape (instances, decoder_words, vocab_size). The arrays are also
            stored on the instance.
        """
        X1 = np.random.randint(self.vocab_size, size=(instances, self.encoder_input_words))
        X2 = np.random.randint(self.vocab_size, size=(instances, decoder_words))
        # Pick random rows of the identity matrix to get one-hot labels.
        Y = np.eye(self.vocab_size)[
            np.random.choice(self.vocab_size, instances * decoder_words)].reshape(instances, decoder_words,
                                                                                   self.vocab_size)
        self.X1, self.X2, self.Y = X1, X2, Y
        return X1, X2, Y

    def train_translator(self):
        """Train on ``self.X1``/``self.X2``/``self.Y``.

        The models are built on demand if ``get_enc_dec`` has not been called
        yet; already-built models are reused. The summed loss of each epoch is
        printed and appended to ``self.loss_history``.
        """
        tf.get_logger().setLevel('ERROR')

        optimizer, loss_fn = self.optimizer, self.loss_fn

        epochs, batch_size = self.epochs, self.batch_size
        total_instances = tf.shape(self.Y)[0]

        X1, X2, Y = self.X1, self.X2, self.Y

        # Nothing else creates the models, so training straight after
        # generate_random_data() would otherwise fail with AttributeError.
        if getattr(self, "encoder", None) is None or getattr(self, "decoder", None) is None:
            self.get_enc_dec()

        self.loss_history = []

        for epoch in range(epochs):
            batch_loss = tf.constant(0.0)
            for batch in tqdm(range(0, total_instances, batch_size)):
                with tf.GradientTape() as tape:
                    loss_count = tf.constant(0.0)
                    x1_train = X1[batch:batch + batch_size]
                    x2_train = X2[batch:batch + batch_size]
                    y_train = Y[batch:batch + batch_size]

                    # NOTE(review): this unpacks exactly two encoder outputs;
                    # confirm the Encoder in scope returns (outputs, state).
                    H, state = self.encoder(x1_train)

                    # One decoder call per target position; the state is
                    # carried from one step to the next.
                    for query_number in range(x2_train.shape[-1]):
                        output, state = self.decoder((x2_train[:, query_number], state, H))
                        loss_count = loss_count + loss_fn(y_train[:, query_number], output)

                # Build the combined weight list once per step instead of twice.
                weights = self.encoder.trainable_weights + self.decoder.trainable_weights
                grads = tape.gradient(loss_count, weights)
                optimizer.apply_gradients(zip(grads, weights))
                batch_loss = batch_loss + loss_count

            epoch_loss = batch_loss.numpy()
            print(f"Epoch: {epoch + 1}/{epochs} : Error {epoch_loss}")
            self.loss_history.append(epoch_loss)

    def translate_sentence(self, keys, query_start, query_size=None):
        """Greedily decode ``query_size`` tokens for the encoder input ``keys``.

        Args:
            keys: Encoder input passed to ``self.encoder``.
            query_start: First decoder input; ``query_start[0][0]`` is recorded
                as the first output token.
            query_size: Number of decoding steps; defaults to the number of
                columns in ``self.X2``.

        Returns:
            ``(value, state_steps)``: the token ids (start token first) and the
            decoder state after every step.
        """
        if query_size is None:
            query_size = self.X2.shape[-1]
        H, state = self.encoder(keys)

        value = [int(query_start[0][0])]
        state_steps = []

        for query_number in range(query_size):
            output, state = self.decoder((query_start, state, H))
            # Feed the arg-max prediction back in as the next decoder input.
            query_start = np.argmax(output.numpy(), axis=-1)
            value.append(query_start[0])
            state_steps.append(state)

        return value, state_steps

class Decoder(tf.keras.layers.Layer):
    """One decoding step: attention, bidirectional GRU, three dense layers.

    Args:
        embedding_size: Feature width used by the attention layer and the GRU.
        vocab_size: Width of the final dense layer.
        words: Number of encoder positions, forwarded to the attention layer.
    """

    def __init__(self, embedding_size=128, vocab_size=1000, words=20):
        # `__init__` needs its trailing double underscore; `super().__init()`
        # raises AttributeError.
        super().__init__()
        self.embedding_size = embedding_size
        self.vocab_size = vocab_size
        self.words = words

    def build(self, input_shapes):
        """Create the attention, recurrent and dense sub-layers on first use."""
        self.attention = BahdanauAttention(words=self.words, embedding_size=self.embedding_size)
        self.gru = tf.keras.layers.GRU(self.embedding_size)
        self.bi = tf.keras.layers.Bidirectional(self.gru)
        self.op1 = tf.keras.layers.Dense(self.embedding_size * 10, activation='tanh')
        self.op2 = tf.keras.layers.Dense(self.embedding_size * 10, activation='tanh')
        # No softmax here: every loss in this file is built with
        # from_logits=True, so the layer must emit raw logits. Arg-max
        # decoding is unaffected.
        self.op3 = tf.keras.layers.Dense(self.vocab_size)

    def call(self, inputs):
        """Run a single decoding step.

        Args:
            inputs: Tuple ``(y, state, encode)``: the current token
                representation, the previous decoder state and the encoder
                outputs.

        Returns:
            Tuple ``(g_output, new_state)``: vocabulary logits and the output
            of the bidirectional GRU.
        """
        y, state, encode = inputs
        context = self.attention((state, encode))

        # Stack state, context and token along axis 1 so the recurrent layer
        # consumes them as a three-step sequence.
        state_expanded = tf.expand_dims(state, axis=1)
        context_expanded = tf.expand_dims(context, axis=1)
        y_expanded = tf.expand_dims(y, axis=1)
        gru1_input = tf.concat([state_expanded, context_expanded], axis=1)
        gru1_input2 = tf.concat([gru1_input, y_expanded], axis=1)

        # NOTE(review): with the default merge mode the bidirectional wrapper
        # presumably concatenates both directions, so new_state would be twice
        # embedding_size wide — confirm callers that feed it back in as
        # `state` expect that.
        new_state = self.bi(gru1_input2)

        g_input = tf.concat([tf.concat([y, context], axis=-1), new_state], axis=-1)
        g_output = self.op3(self.op2(self.op1(g_input)))

        return g_output, new_state



In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.layers import Embedding

class EnglishToTeluguTranslator:
    """Encoder/decoder translator from English token ids to Telugu token ids.

    Args:
        encoder_input_words: Length of every (padded) English input sequence.
        english_vocab_size: Size of the encoder embedding table.
        telugu_vocab_size: Size of the decoder embedding table and output.
        embedding_size: Embedding / state width.
        epochs: Number of passes over the training data.
        batch_size: Samples per gradient step.
        optimizer: Anything ``tf.keras.optimizers.get`` accepts (a name such
            as ``'adam'`` or an optimizer instance).
    """

    def __init__(self, encoder_input_words=20, english_vocab_size=1000, telugu_vocab_size=1000, embedding_size=128,
                 epochs=30, batch_size=200, optimizer='adam'):
        self.encoder_input_words = encoder_input_words
        self.english_vocab_size = english_vocab_size
        self.telugu_vocab_size = telugu_vocab_size
        self.embedding_size = embedding_size
        self.epochs = epochs
        self.batch_size = batch_size
        self.optimizer = optimizer
        # Targets are integer token ids (the same array feeds the decoder's
        # embedding), so the sparse variant of cross-entropy is required.
        self.loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        self.loss_history = []
        # Both models are created by get_enc_dec(); `encoder_model` was never
        # defined, so referencing it here raised NameError.
        self.encoder = None
        self.decoder = None

    def get_enc_dec(self):
        """Build ``self.encoder`` and ``self.decoder`` as functional models.

        Returns:
            The pair of values returned by the two ``summary()`` calls (the
            summaries themselves are printed).
        """
        x_encoder_input = tf.keras.layers.Input(self.encoder_input_words)
        encode = Encoder(vocab_size=self.english_vocab_size, embedding_size=self.embedding_size)(x_encoder_input)
        self.encoder = tf.keras.Model(inputs=x_encoder_input, outputs=encode)

        x_decoder_input = tf.keras.layers.Input(1)
        x_decoder = Embedding(self.telugu_vocab_size, self.embedding_size)(x_decoder_input)
        x_state_input = tf.keras.layers.Input(self.embedding_size)
        x_states_input = tf.keras.layers.Input((self.encoder_input_words, self.embedding_size))

        decode = Decoder(embedding_size=self.embedding_size, vocab_size=self.telugu_vocab_size,
                         words=self.encoder_input_words)((x_decoder[:, 0], x_state_input, x_states_input))
        self.decoder = tf.keras.Model(inputs=[x_decoder_input, x_state_input, x_states_input], outputs=decode)
        return self.encoder.summary(), self.decoder.summary()

    def train_translator(self, X_english, X_telugu):
        """Train on parallel arrays of padded token-id sequences.

        Args:
            X_english: Integer array, one padded English sequence per row.
            X_telugu: Integer array, one padded Telugu sequence per row.

        The summed loss of each epoch is printed and appended to
        ``self.loss_history``.
        """
        # Build the models on demand so training works on a fresh instance.
        if self.encoder is None or self.decoder is None:
            self.get_enc_dec()

        # Honour the constructor's `optimizer` argument; the default 'adam'
        # resolves to the same Adam() that used to be hard-coded here.
        optimizer = tf.keras.optimizers.get(self.optimizer)
        loss_fn = self.loss_fn

        epochs, batch_size = self.epochs, self.batch_size
        total_instances = len(X_english)

        self.loss_history = []

        for epoch in range(epochs):
            batch_loss = tf.constant(0.0)
            for batch in tqdm(range(0, total_instances, batch_size)):

                with tf.GradientTape() as tape:
                    loss_count = tf.constant(0.0)
                    x1_train = X_english[batch:batch + batch_size]
                    x2_train = X_telugu[batch:batch + batch_size]

                    # NOTE(review): this unpacks exactly two encoder outputs;
                    # confirm the Encoder in scope returns (outputs, state).
                    H, state = self.encoder(x1_train)

                    # NOTE(review): the token fed to the decoder at each step
                    # is also that step's target. Teacher forcing normally
                    # predicts the *next* token — confirm and shift the
                    # targets once start/end tokens exist in the data.
                    for query_number in range(x2_train.shape[-1]):
                        output, state = self.decoder((x2_train[:, query_number], state, H))
                        loss_count = loss_count + loss_fn(x2_train[:, query_number], output)

                weights = self.encoder.trainable_weights + self.decoder.trainable_weights
                grads = tape.gradient(loss_count, weights)
                optimizer.apply_gradients(zip(grads, weights))
                batch_loss = batch_loss + loss_count

            epoch_loss = batch_loss.numpy()
            print(f"Epoch: {epoch + 1}/{epochs} : Error {epoch_loss}")
            self.loss_history.append(epoch_loss)

    def translate_sentence(self, english_sentence, max_length=10, start_index=2, end_index=3):
        """Greedily decode Telugu token ids for one English sentence.

        Args:
            english_sentence: English token ids, either a flat sequence or a
                batch of one (e.g. the result of ``texts_to_sequences``). The
                ids are padded/truncated to ``encoder_input_words``.
            max_length: Maximum number of tokens to generate.
            start_index: Id fed to the decoder at the first step. The default
                is a placeholder; pass the real start-token id.
            end_index: Id that stops decoding. The default is a placeholder;
                pass the real end-token id.

        Returns:
            List of predicted token ids (including ``end_index`` when it is
            produced).

        Raises:
            TypeError: If raw text is passed instead of token ids.
        """
        if isinstance(english_sentence, str):
            raise TypeError(
                "translate_sentence expects token ids; tokenize the sentence first")

        # Accept both [ids...] and [[ids...]], then bring the sequence to the
        # fixed length the encoder model was built with.
        sequences = english_sentence if np.ndim(english_sentence) > 1 else [english_sentence]
        english_indices = tf.keras.preprocessing.sequence.pad_sequences(
            sequences, maxlen=self.encoder_input_words, padding='post', truncating='post')

        H, state = self.encoder(english_indices)

        telugu_sentence = []
        decoder_token = np.array([[start_index]])  # shape (1, 1), like the decoder input

        for _ in range(max_length):
            output, state = self.decoder((decoder_token, state, H))
            # argmax over the vocabulary axis leaves one id per batch row;
            # flatten so the single prediction can be read regardless of rank.
            predicted_index = int(np.argmax(output.numpy(), axis=-1).reshape(-1)[0])
            telugu_sentence.append(predicted_index)

            if predicted_index == end_index:
                break

            decoder_token = np.array([[predicted_index]])

        return telugu_sentence


* Luong Attention (Scaled Dot-Product Attention):
* Self-Attention (Scaled Dot-Product Attention):

In [None]:
# Step 1: Parse the text data
# Every usable line has the form "<english>++++$++++<telugu>".
with open("english_telugu_data.txt", "r", encoding="utf-8") as file:
    lines = file.readlines()

english_sentences = []
telugu_sentences = []
for line in lines:
    english, separator, telugu = line.strip().partition("++++$++++")
    if not separator:
        # Blank or malformed line: skip it instead of failing the unpack.
        continue
    english_sentences.append(english.strip())
    telugu_sentences.append(telugu.strip())

if not english_sentences:
    raise ValueError("english_telugu_data.txt contains no '++++$++++'-separated sentence pairs")

# Step 2: Tokenization
english_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
telugu_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')

english_tokenizer.fit_on_texts(english_sentences)
telugu_tokenizer.fit_on_texts(telugu_sentences)

english_sequences = english_tokenizer.texts_to_sequences(english_sentences)
telugu_sequences = telugu_tokenizer.texts_to_sequences(telugu_sentences)

# Step 3: Padding
# Both languages are padded to the same length: the longest sequence overall.
max_length = max(max(len(seq) for seq in english_sequences), max(len(seq) for seq in telugu_sequences))
english_sequences_padded = tf.keras.preprocessing.sequence.pad_sequences(english_sequences, maxlen=max_length, padding='post')
telugu_sequences_padded = tf.keras.preprocessing.sequence.pad_sequences(telugu_sequences, maxlen=max_length, padding='post')

# Step 4: Data Splitting
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(english_sequences_padded, telugu_sequences_padded, test_size=0.2, random_state=42)

# Step 5: Model Training
# Size the model from the data rather than the constructor defaults (20 words,
# 1000-word vocabularies). Tokenizer ids start at 1 and 0 is the padding id,
# so each embedding table needs len(word_index) + 1 rows.
translator = EnglishToTeluguTranslator(
    encoder_input_words=max_length,
    english_vocab_size=len(english_tokenizer.word_index) + 1,
    telugu_vocab_size=len(telugu_tokenizer.word_index) + 1,
)
translator.get_enc_dec()  # the encoder/decoder models must exist before training
translator.train_translator(X_train, y_train)

# Step 6: Evaluation (Optional)
# Evaluate your model's performance on the validation set using metrics like BLEU score or simply by inspecting translations manually.

# Step 7: Inference
english_sentence = "His legs are long."
english_sequence = english_tokenizer.texts_to_sequences([english_sentence])
# Pad to the same fixed length the encoder was trained on.
english_sequence = tf.keras.preprocessing.sequence.pad_sequences(
    english_sequence, maxlen=max_length, padding='post', truncating='post')
translated_sequence = translator.translate_sentence(english_sequence)
translated_sentence = telugu_tokenizer.sequences_to_texts([translated_sequence])[0]
print("Translated Sentence:", translated_sentence)


FileNotFoundError: [Errno 2] No such file or directory: 'english_telugu_data.txt'

In [None]:
# import numpy as np
# import tensorflow as tf
# from tensorflow.keras.layers import Embedding, GRU, Dense
# from tensorflow.keras.losses import CategoricalCrossentropy
# from tqdm import tqdm

# class Encoder(tf.keras.layers.Layer):
#     def __init__(self, vocab_size=1000, embedding_size=128):
#         super(Encoder, self).__init__()
#         self.vocab_size = vocab_size
#         self.embedding_size = embedding_size

#     def build(self, input_shapes):
#         self.embedding_layer = Embedding(self.vocab_size, self.embedding_size)
#         self.gru = GRU(self.embedding_size, return_sequences=True, return_state=True)

#     def call(self, inputs):
#         words = inputs
#         embeddings = self.embedding_layer(words)
#         output, state = self.gru(embeddings)
#         return output, state

# class BahdanauAttention(tf.keras.layers.Layer):
#     def __init__(self, words=20, embedding_size=128):
#         super(BahdanauAttention, self).__init__()
#         self.words = words
#         self.embedding_size = embedding_size

#     def build(self, input_shapes):
#         self.W1 = self.add_weight(shape=(1, self.embedding_size), initializer="random_uniform")
#         self.W2 = self.add_weight(shape=(self.words, self.embedding_size), initializer="random_uniform")
#         self.W3 = self.add_weight(shape=(self.words, self.embedding_size), initializer="random_uniform")
#         self.W4 = self.add_weight(shape=(self.words, self.embedding_size), initializer="random_uniform")

#     def call(self, inputs):
#         query, value = inputs

#         regressed_query = tf.einsum("bi,ci -> bi", query, self.W1)
#         regressed_value = tf.einsum("bij, ij -> bij", value, self.W2)

#         sum_query_value = tf.einsum("bi, bji -> bji", regressed_query, regressed_value)
#         sum_of_query_value = tf.nn.tanh(sum_query_value)

#         a = tf.einsum("bij, ij -> bij", sum_of_query_value, self.W3)
#         a = tf.math.reduce_sum(a, axis=-1)
#         a = tf.nn.softmax(a)

#         context = tf.einsum("bi, bij -> bij", a, value)
#         context = tf.reduce_sum(context, axis=1)

#         return context

# class Decoder(tf.keras.layers.Layer):
#     def __init__(self, embedding_size=128, vocab_size=1000, words=20):
#         super(Decoder, self).__init__()
#         self.embedding_size = embedding_size
#         self.vocab_size = vocab_size
#         self.words = words

#     def build(self, input_shapes):
#         self.attention = BahdanauAttention(words=self.words, embedding_size=self.embedding_size)
#         self.gru = GRU(self.embedding_size)
#         self.op1 = Dense(self.embedding_size * 10, activation='tanh')
#         self.op2 = Dense(self.embedding_size * 10, activation='tanh')
#         self.op3 = Dense(self.vocab_size, activation='softmax')

#     def call(self, inputs):
#         y, state, encode = inputs

#         context = self.attention((state, encode))

#         state_expanded = tf.expand_dims(state, axis=1)
#         context_expanded = tf.expand_dims(context, axis=1)
#         y_expanded = tf.expand_dims(y, axis=1)

#         gru1_input = tf.concat([state_expanded, context_expanded], axis=1)
#         gru1_input2 = tf.concat([gru1_input, y_expanded], axis=1)

#         new_state = self.gru(gru1_input2)

#         g_input = tf.concat([tf.concat([y, context], axis=-1), new_state], axis=-1)
#         g_output = self.op3(self.op2(self.op1(g_input)))

#         return g_output, new_state

# class EnglishToTeluguTranslator:
#     def __init__(self, encoder_input_words=20, english_vocab_size=1000, telugu_vocab_size=1000, embedding_size=128,
#                  epochs=30, batch_size=200, optimizer='adam'):
#         self.encoder_input_words = encoder_input_words
#         self.english_vocab_size = english_vocab_size
#         self.telugu_vocab_size = telugu_vocab_size
#         self.embedding_size = embedding_size
#         self.epochs = epochs
#         self.batch_size = batch_size
#         self.optimizer = optimizer
#         self.loss_fn = CategoricalCrossentropy(from_logits=True)
#         self.loss_history = []
#         self.encoder = None
#         self.decoder = None

#     def get_enc_dec(self):
#         x_encoder_input = tf.keras.layers.Input(self.encoder_input_words)
#         encode = Encoder(vocab_size=self.english_vocab_size, embedding_size=self.embedding_size)(x_encoder_input)
#         self.encoder = tf.keras.Model(inputs=x_encoder_input, outputs=encode)

#         x_decoder_input = tf.keras.layers.Input(1)
#         x_decoder = Embedding(self.telugu_vocab_size, self.embedding_size)(x_decoder_input)
#         x_state_input = tf.keras.layers.Input(self.embedding_size)
#         x_states_input = tf.keras.layers.Input((self.encoder_input_words, self.embedding_size))

#         decode = Decoder(embedding_size=self.embedding_size, vocab_size=self.telugu_vocab_size,
#                          words=self.encoder_input_words)((x_decoder[:, 0], x_state_input, x_states_input))
#         self.decoder = tf.keras.Model(inputs=[x_decoder_input, x_state_input, x_states_input], outputs=decode)
#         return self.encoder.summary(), self.decoder.summary()

#     def train_translator(self, X_english, X_telugu):
#         optimizer = tf.keras.optimizers.Adam()  # You can change the optimizer here if needed
#         loss_fn = self.loss_fn

#         epochs, batch_size = self.epochs, self.batch_size
#         total_instances = len(X_english)

#         self.loss_history = []

#         for epoch in range(epochs):
#             batch_loss = tf.constant(0.0)
#             for batch in tqdm(range(0, total_instances, batch_size)):

#                 with tf.GradientTape() as tape:
#                     loss_count = tf.constant(0.0)
#                     x1_train = X_english[batch:batch + batch_size]
#                     x2_train = X_telugu[batch:batch + batch_size]

#                     H, state = self.encoder(x1_train)

#                     for query_number in range(x2_train.shape[-1]):
#                         output, state = self.decoder((x2_train[:, query_number], state, H))
#                         loss_count = loss_count + loss_fn(x2_train[:, query_number], output)

#                 grads = tape.gradient(loss_count, self.encoder.trainable_weights + self.decoder.trainable_weights)
#                 optimizer.apply_gradients(zip(grads, self.encoder.trainable_weights + self.decoder.trainable_weights))
#                 batch_loss = batch_loss + loss_count

#             print("Epoch: " + str(epoch + 1) + "/" + str(epochs) + " : Error " + str(batch_loss.numpy()))
#             self.loss_history.append(batch_loss.numpy())

#     def translate_sentence(self, english_sentence):
#         english_indices = your_tokenizer.texts_to_sequences([english_sentence])[0]
#         english_indices = np.array([english_indices])
#         H, state = self.encoder(x1_train, initial_state=self.encoder.initial_state)
#         telugu_sentence = []

#         start_token = np.array([2])  # Replace 2 with the actual index of the start token in the Telugu vocabulary
#         start_token = tf.expand_dims(start_token, axis=0)

#         for _ in range(10):  # Replace 10 with the actual maximum length of the Telugu sentence you want to generate
#             output, state = self.decoder((start_token, state, H))
#             predicted_word_index = np.argmax(output.numpy(), axis=-1)
#             telugu_sentence.append(predicted_word_index[0][0])

#             if predicted_word_index[0][0] == 3:  # Replace 3 with the actual index of the end token in the Telugu vocabulary
#                 break

#             start_token = predicted_word_index

#         return telugu_sentence



In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.preprocessing.text import Tokenizer
from tqdm import tqdm

class Encoder(tf.keras.layers.Layer):
    """Embed source token ids and run them through a bidirectional GRU.

    The two directions are merged by summation so the per-step outputs and the
    returned state both keep ``embedding_size`` features.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_size: Width of the embeddings and of the GRU state.
    """

    def __init__(self, vocab_size=1000, embedding_size=128):
        super(Encoder, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size

    def build(self, input_shape):
        self.embedding_layer = tf.keras.layers.Embedding(self.vocab_size, self.embedding_size)
        self.gru = tf.keras.layers.GRU(self.embedding_size, return_sequences=True, return_state=True)
        # The default merge mode ("concat") would double the feature width;
        # "sum" keeps it at embedding_size.
        self.bi = tf.keras.layers.Bidirectional(self.gru, merge_mode="sum")

    def call(self, inputs):
        """Return ``(output_sequence, state)`` for a batch of token ids."""
        embeddings = self.embedding_layer(inputs)
        output_sequence, forward_state, backward_state = self.bi(embeddings)
        # Callers in this file unpack two values (``H, state = encoder(x)``), so
        # the two directional states are collapsed into a single one.
        state = forward_state + backward_state
        return (output_sequence, state)

class BahdanauAttention(tf.keras.layers.Layer):
    """Pool a fixed-length sequence of encoder outputs into one context vector.

    Args:
        words: Number of encoder time steps. ``W2`` and ``W3`` hold one row per
            step, so ``value`` must have exactly this many steps.
        embedding_size: Feature width of ``query`` and ``value``.
    """

    def __init__(self, words=20, embedding_size=128):
        super(BahdanauAttention, self).__init__()
        self.words = words
        self.embedding_size = embedding_size

    def build(self, input_shapes):
        # Element-wise scaling weights; the unused W4 weight was removed.
        self.W1 = self.add_weight(shape=(1, self.embedding_size), initializer="random_uniform")
        self.W2 = self.add_weight(shape=(self.words, self.embedding_size), initializer="random_uniform")
        self.W3 = self.add_weight(shape=(self.words, self.embedding_size), initializer="random_uniform")

    def call(self, inputs):
        """Return the attention context.

        Args:
            inputs: ``(query, value)``. The einsum signatures require a rank-2
                query ``(batch, embedding_size)`` and a rank-3 value
                ``(batch, words, embedding_size)``.

        Returns:
            Tensor of shape ``(batch, embedding_size)``.
        """
        query, value = inputs
        regressed_query = tf.einsum("bi,ci->bi", query, self.W1)
        regressed_value = tf.einsum("bij,ij->bij", value, self.W2)
        # Broadcast the rank-2 query over every time step of the rank-3 values.
        # The previous signature ("bij,ij->bij") did not match the operand ranks.
        # NOTE(review): this combines by multiplication although the variable
        # names suggest a sum was intended - confirm.
        sum_query_value = tf.einsum("bi,bji->bji", regressed_query, regressed_value)
        sum_of_query_value = tf.nn.tanh(sum_query_value)
        # One score per time step, normalised over the time axis.
        scores = tf.einsum("bij,ij->bi", sum_of_query_value, self.W3)
        weights = tf.nn.softmax(scores)
        # Weighted sum of the values over the time axis.
        context = tf.einsum("bi,bij->bj", weights, value)
        return context

class Decoder(tf.keras.layers.Layer):
    """One decoding step: attend over the encoder outputs, update the state,
    and predict a distribution over the target vocabulary.

    Args:
        embedding_size: Width of the token embedding, state and context.
        vocab_size: Size of the output distribution.
        words: Number of encoder time steps the attention layer is built for.
    """

    def __init__(self, embedding_size=128, vocab_size=1000, words=20):
        super(Decoder, self).__init__()
        self.embedding_size = embedding_size
        self.vocab_size = vocab_size
        self.words = words

    def build(self, input_shapes):
        self.attention = BahdanauAttention(words=self.words, embedding_size=self.embedding_size)
        self.gru = tf.keras.layers.GRU(self.embedding_size)
        # new_state is fed back in as `state` on the next step, so it must stay
        # embedding_size wide; the default "concat" merge would double it.
        self.bi = tf.keras.layers.Bidirectional(self.gru, merge_mode="sum")
        self.op1 = tf.keras.layers.Dense(self.embedding_size * 10, activation='tanh')
        self.op2 = tf.keras.layers.Dense(self.embedding_size * 10, activation='tanh')
        self.op3 = tf.keras.layers.Dense(self.vocab_size, activation='softmax')

    def call(self, inputs):
        """Run one step.

        Args:
            inputs: ``(y, state, encode)`` - the embedded previous token, the
                previous state, and the encoder output sequence.

        Returns:
            ``(g_output, new_state)``: softmax probabilities over the
            vocabulary and the updated state.
        """
        y, state, encode = inputs
        context = self.attention((state, encode))
        # The recurrent layer sees a three-step sequence: state, context, token.
        steps = tf.stack([state, context, y], axis=1)
        new_state = self.bi(steps)
        g_input = tf.concat([y, context, new_state], axis=-1)
        g_output = self.op3(self.op2(self.op1(g_input)))
        return g_output, new_state

# Define the Encoder class and other custom layers as given in the code snippet

# Define the EnglishToTeluguTranslator class with corrections
class EnglishToTeluguTranslator:
    """Attention-based GRU encoder/decoder mapping English text to Telugu word ids.

    Intended call order: ``get_enc_dec()`` -> ``tokenize_sentences()`` ->
    ``train_translator()`` -> ``translate_sentence()``.

    Args:
        encoder_input_words: Length every English sequence is padded or
            truncated to (the decoder's attention is built for this many steps).
        english_vocab_size: ``num_words`` of the English tokenizer and size of
            the encoder embedding table.
        telugu_vocab_size: Same, for the Telugu tokenizer and the decoder.
        embedding_size: Width of embeddings and GRU states.
        epochs: Number of passes over the training data.
        batch_size: Sentence pairs per gradient step.
        optimizer: Keras optimizer name or instance.
    """

    # Markers wrapped around every Telugu sentence so that decoding has a real
    # start symbol and a real stop condition.
    START_TOKEN = 'sos'
    END_TOKEN = 'eos'

    def __init__(self, encoder_input_words=20, english_vocab_size=1000, telugu_vocab_size=1000, embedding_size=128,
                 epochs=30, batch_size=200, optimizer='adam'):
        self.encoder_input_words = encoder_input_words
        self.english_vocab_size = english_vocab_size
        self.telugu_vocab_size = telugu_vocab_size
        self.embedding_size = embedding_size
        self.epochs = epochs
        self.batch_size = batch_size
        self.optimizer = optimizer
        # Targets are integer word ids and the decoder already applies softmax,
        # so the loss must be sparse and must not expect logits.
        self.loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
        self.loss_history = []
        self.encoder = None
        self.decoder = None
        self.english_tokenizer = Tokenizer(num_words=english_vocab_size, oov_token='<OOV>')
        self.telugu_tokenizer = Tokenizer(num_words=telugu_vocab_size, oov_token='<OOV>')

    def _pad_source(self, sequences):
        """Pad/truncate English id sequences to ``encoder_input_words``."""
        return tf.keras.preprocessing.sequence.pad_sequences(
            sequences, maxlen=self.encoder_input_words, padding='post', truncating='post')

    def tokenize_sentences(self, english_sentences, telugu_sentences):
        """Fit both tokenizers and convert the sentences to lists of word ids.

        Telugu sentences are wrapped in ``START_TOKEN`` / ``END_TOKEN`` first.

        Returns:
            ``(X_english, X_telugu)`` as lists of (variable-length) id lists.
        """
        english_sentences = [str(sentence) for sentence in english_sentences]
        telugu_sentences = [f"{self.START_TOKEN} {sentence} {self.END_TOKEN}" for sentence in telugu_sentences]

        self.english_tokenizer.fit_on_texts(english_sentences)
        self.telugu_tokenizer.fit_on_texts(telugu_sentences)

        X_english = self.english_tokenizer.texts_to_sequences(english_sentences)
        X_telugu = self.telugu_tokenizer.texts_to_sequences(telugu_sentences)

        return X_english, X_telugu

    def get_enc_dec(self):
        """Build ``self.encoder`` and ``self.decoder`` and print their summaries."""
        x_encoder_input = tf.keras.layers.Input(shape=(None,))
        encode = Encoder(vocab_size=self.english_vocab_size, embedding_size=self.embedding_size)(x_encoder_input)
        self.encoder = tf.keras.Model(inputs=x_encoder_input, outputs=encode)

        # The decoder model takes one target token id per example and embeds it.
        x_decoder_input = tf.keras.layers.Input(shape=(1,))
        x_decoder = Embedding(self.telugu_vocab_size, self.embedding_size)(x_decoder_input)
        x_state_input = tf.keras.layers.Input(shape=(self.embedding_size,))
        x_states_input = tf.keras.layers.Input(shape=(self.encoder_input_words, self.embedding_size))

        decode = Decoder(embedding_size=self.embedding_size, vocab_size=self.telugu_vocab_size,
                         words=self.encoder_input_words)((x_decoder[:, 0], x_state_input, x_states_input))
        self.decoder = tf.keras.Model(inputs=[x_decoder_input, x_state_input, x_states_input], outputs=decode)
        return self.encoder.summary(), self.decoder.summary()

    def train_translator(self, X_english, X_telugu):
        """Train with teacher forcing; appends one summed loss per epoch to
        ``self.loss_history``.

        Args:
            X_english: English id sequences (ragged lists or a padded array).
            X_telugu: Telugu id sequences aligned with ``X_english``.

        Raises:
            RuntimeError: If ``get_enc_dec()`` has not been called.
        """
        if self.encoder is None or self.decoder is None:
            raise RuntimeError("Call get_enc_dec() before train_translator().")

        # Batches must be rectangular arrays; id 0 is the padding value.
        X_english = self._pad_source(X_english)
        X_telugu = tf.keras.preprocessing.sequence.pad_sequences(X_telugu, padding='post')

        optimizer = tf.keras.optimizers.get(self.optimizer)
        loss_fn = self.loss_fn

        epochs, batch_size = self.epochs, self.batch_size
        total_instances = len(X_english)
        variables = self.encoder.trainable_weights + self.decoder.trainable_weights

        self.loss_history = []

        for epoch in range(epochs):
            batch_loss = tf.constant(0.0)
            for batch in tqdm(range(0, total_instances, batch_size)):
                x1_train = X_english[batch:batch + batch_size]
                x2_train = X_telugu[batch:batch + batch_size]

                with tf.GradientTape() as tape:
                    loss_count = tf.constant(0.0)
                    H, state = self.encoder(x1_train)

                    # Feed token t, predict token t + 1.
                    for query_number in range(x2_train.shape[-1] - 1):
                        output, state = self.decoder([x2_train[:, query_number:query_number + 1], state, H])
                        # Accept (batch, vocab) as well as (batch, 1, vocab) outputs.
                        probabilities = tf.reshape(output, (-1, self.telugu_vocab_size))
                        target = x2_train[:, query_number + 1]
                        # Padded positions do not contribute to the loss.
                        mask = tf.cast(target != 0, tf.float32)
                        loss_count = loss_count + loss_fn(target, probabilities, sample_weight=mask)

                grads = tape.gradient(loss_count, variables)
                optimizer.apply_gradients(zip(grads, variables))
                batch_loss = batch_loss + loss_count

            print("Epoch: " + str(epoch + 1) + "/" + str(epochs) + " : Error " + str(batch_loss.numpy()))
            self.loss_history.append(batch_loss.numpy())

    def translate_sentence(self, english_sentence, max_length=10):
        """Greedily decode one English sentence.

        Args:
            english_sentence: Raw English text.
            max_length: Maximum number of Telugu tokens to generate.

        Returns:
            List of predicted Telugu word ids (the end id is included when it
            is produced).
        """
        english_indices = self.english_tokenizer.texts_to_sequences([str(english_sentence)])
        english_indices = self._pad_source(english_indices)

        H, state = self.encoder(english_indices)

        # Fall back to the historical placeholder ids when the tokenizer was
        # fitted without the start/end markers.
        word_index = self.telugu_tokenizer.word_index
        start_id = word_index.get(self.START_TOKEN, 2)
        end_id = word_index.get(self.END_TOKEN, 3)

        telugu_sentence = []
        current_token = np.array([[start_id]])  # shape (1, 1): one example, one token

        for _ in range(max_length):
            output, state = self.decoder([current_token, state, H])
            probabilities = tf.reshape(output, (-1, self.telugu_vocab_size))
            predicted_word_index = int(np.argmax(probabilities.numpy(), axis=-1)[0])
            telugu_sentence.append(predicted_word_index)

            if predicted_word_index == end_id:
                break

            current_token = np.array([[predicted_word_index]])

        return telugu_sentence

# Load the dataset (columns "english" and "telugu") and hold out 20% of the pairs
data = pd.read_excel("/content/engtotel.xlsx")
X_train_eng, X_test_eng, X_train_tel, X_test_tel = train_test_split(
    data["english"].values, data["telugu"].values, test_size=0.2, random_state=42)

# Initialize the translator
translator = EnglishToTeluguTranslator()

# Initialize encoder and decoder
translator.get_enc_dec()

# Tokenize sentences (this also fits the translator's own tokenizers)
X_train_eng_seq, X_train_tel_seq = translator.tokenize_sentences(X_train_eng, X_train_tel)

# Train the translator
translator.train_translator(X_train_eng_seq, X_train_tel_seq)

# Translate a sample English sentence
sample_english_sentence = "Hello, how are you?"
translated_telugu_sentence = translator.translate_sentence(sample_english_sentence)
print("Translated Telugu sentence:", translated_telugu_sentence)


NameError: name 'Tokenizer' is not defined

In [None]:
# Load the dataset
data = pd.read_excel("/content/engtotel.xlsx")

# Split the dataset into English and Telugu sentences
english_sentences = data["english"].values
telugu_sentences = data["telugu"].values

# Initialize the translator and build its encoder/decoder before training
translator = EnglishToTeluguTranslator()
translator.get_enc_dec()

# Convert both languages to id sequences with the translator's own tokenizers,
# so translate_sentence() later uses the vocabulary the model was trained on
X_english, X_telugu = translator.tokenize_sentences(english_sentences, telugu_sentences)

# Split the data into training and testing sets
X_train_eng, X_test_eng, X_train_tel, X_test_tel = train_test_split(X_english, X_telugu, test_size=0.2, random_state=42)

# Train the translator
translator.train_translator(X_train_eng, X_train_tel)

# Translate a sample English sentence
sample_english_sentence = "Hello, how are you?"
translated_telugu_sentence = translator.translate_sentence(sample_english_sentence)
print("Translated Telugu sentence:", translated_telugu_sentence)

In [None]:
# Read the sentence-pair spreadsheet and pull out both language columns.
data = pd.read_excel("/content/engtotel.xlsx")
english_sentences, telugu_sentences = (data[column].values for column in ("english", "telugu"))

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.layers import Embedding
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer




In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from tqdm import tqdm

class Encoder(tf.keras.layers.Layer):
    """Embed source token ids and run them through a bidirectional GRU.

    The two directions are merged by summation so the per-step outputs and the
    returned state both keep ``embedding_size`` features.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_size: Width of the embeddings and of the GRU state.
    """

    def __init__(self, vocab_size=1000, embedding_size=128):
        super(Encoder, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size

    def build(self, input_shape):
        self.embedding_layer = tf.keras.layers.Embedding(self.vocab_size, self.embedding_size)
        self.gru = tf.keras.layers.GRU(self.embedding_size, return_sequences=True, return_state=True)
        # The default merge mode ("concat") would double the feature width;
        # "sum" keeps it at embedding_size.
        self.bi = tf.keras.layers.Bidirectional(self.gru, merge_mode="sum")

    def call(self, inputs):
        """Return ``(output_sequence, state)`` for a batch of token ids."""
        embeddings = self.embedding_layer(inputs)
        output_sequence, forward_state, backward_state = self.bi(embeddings)
        # Callers in this file unpack two values (``H, state = encoder(x)``), so
        # the two directional states are collapsed into a single one.
        state = forward_state + backward_state
        return (output_sequence, state)

class BahdanauAttention(tf.keras.layers.Layer):
    """Pool a fixed-length sequence of encoder outputs into one context vector.

    Args:
        words: Number of encoder time steps. ``W2`` and ``W3`` hold one row per
            step, so ``value`` must have exactly this many steps.
        embedding_size: Feature width of ``query`` and ``value``.
    """

    def __init__(self, words=20, embedding_size=128):
        super(BahdanauAttention, self).__init__()
        self.words = words
        self.embedding_size = embedding_size

    def build(self, input_shapes):
        self.W1 = self.add_weight(shape=(1, self.embedding_size), initializer="random_uniform")
        self.W2 = self.add_weight(shape=(self.words, self.embedding_size), initializer="random_uniform")
        self.W3 = self.add_weight(shape=(self.words, self.embedding_size), initializer="random_uniform")

    def call(self, inputs):
        """Return the attention context.

        Args:
            inputs: ``(query, value)``. The einsum signatures require a rank-2
                query ``(batch, embedding_size)`` and a rank-3 value
                ``(batch, words, embedding_size)``.

        Returns:
            Tensor of shape ``(batch, embedding_size)``.
        """
        query, value = inputs
        regressed_query = tf.einsum("bi,ci->bi", query, self.W1)
        regressed_value = tf.einsum("bij,ij->bij", value, self.W2)
        # Broadcast the rank-2 query over every time step of the rank-3 values.
        # The previous signature ("bij,ij->bij") did not match the operand ranks.
        # NOTE(review): this combines by multiplication although the variable
        # names suggest a sum was intended - confirm.
        sum_query_value = tf.einsum("bi,bji->bji", regressed_query, regressed_value)
        sum_of_query_value = tf.nn.tanh(sum_query_value)
        # One score per time step, normalised over the time axis.
        scores = tf.einsum("bij,ij->bi", sum_of_query_value, self.W3)
        weights = tf.nn.softmax(scores)
        # Weighted sum of the values over the time axis.
        context = tf.einsum("bi,bij->bj", weights, value)
        return context

class Decoder(tf.keras.layers.Layer):
    """One decoding step: attend over the encoder outputs, run a GRU over a
    single time step, and predict a distribution over the target vocabulary.

    Args:
        embedding_size: Width of the token embedding, state and context.
        vocab_size: Size of the output distribution.
        words: Number of encoder time steps the attention layer is built for.
    """

    def __init__(self, embedding_size=128, vocab_size=1000, words=20):
        super(Decoder, self).__init__()
        self.embedding_size = embedding_size
        self.vocab_size = vocab_size
        self.words = words

    def build(self, input_shapes):
        self.attention = BahdanauAttention(words=self.words, embedding_size=self.embedding_size)
        self.gru = tf.keras.layers.GRU(self.embedding_size, return_sequences=True, return_state=True)
        self.op1 = tf.keras.layers.Dense(self.embedding_size * 10, activation='tanh')
        self.op2 = tf.keras.layers.Dense(self.embedding_size * 10, activation='tanh')
        self.op3 = tf.keras.layers.Dense(self.vocab_size, activation='softmax')

    def call(self, inputs):
        """Run one step.

        Args:
            inputs: ``(y, state, encode)``. ``y`` and ``state`` must be rank-2
                ``(batch, features)`` tensors (``y`` is expected to be the
                embedded previous token); ``encode`` is the encoder output
                sequence.

        Returns:
            ``(output, new_state)``: softmax probabilities with a length-1 time
            axis, and the updated GRU state.
        """
        y, state, encode = inputs
        context = self.attention((state, encode))
        # All three pieces are rank 2, so join them on the feature axis first
        # and only then add the single time step the GRU expects. (Concatenating
        # an already expanded `y` with the rank-2 context cannot work.)
        step_features = tf.concat([y, context, state], axis=-1)
        gru_input = step_features[:, None]
        gru_output, new_state = self.gru(gru_input)
        output = self.op3(self.op2(self.op1(gru_output)))
        return output, new_state

class EnglishToTeluguTranslator:
    """Attention-based GRU encoder/decoder mapping English text to Telugu word ids.

    Intended call order: ``get_enc_dec()`` -> ``tokenize_sentences()`` ->
    ``train_translator()`` -> ``translate_sentence()``.

    Args:
        encoder_input_words: Length every English sequence is padded or
            truncated to (the decoder's attention is built for this many steps).
        english_vocab_size: ``num_words`` of the English tokenizer and size of
            the encoder embedding table.
        telugu_vocab_size: Same, for the Telugu tokenizer and the decoder.
        embedding_size: Width of embeddings and GRU states.
        epochs: Number of passes over the training data.
        batch_size: Sentence pairs per gradient step.
        optimizer: Keras optimizer name or instance.
    """

    # Markers wrapped around every Telugu sentence so that decoding has a real
    # start symbol and a real stop condition.
    START_TOKEN = 'sos'
    END_TOKEN = 'eos'

    def __init__(self, encoder_input_words=20, english_vocab_size=1000, telugu_vocab_size=1000, embedding_size=128,
                 epochs=30, batch_size=200, optimizer='adam'):
        self.encoder_input_words = encoder_input_words
        self.english_vocab_size = english_vocab_size
        self.telugu_vocab_size = telugu_vocab_size
        self.embedding_size = embedding_size
        self.epochs = epochs
        self.batch_size = batch_size
        self.optimizer = optimizer
        # Targets are integer word ids and the decoder already applies softmax,
        # so the loss must be sparse and must not expect logits.
        self.loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
        self.loss_history = []
        self.encoder = None
        self.decoder = None
        self.english_tokenizer = Tokenizer(num_words=english_vocab_size, oov_token='<OOV>')
        self.telugu_tokenizer = Tokenizer(num_words=telugu_vocab_size, oov_token='<OOV>')

    def _pad_source(self, sequences):
        """Pad/truncate English id sequences to ``encoder_input_words``."""
        return tf.keras.preprocessing.sequence.pad_sequences(
            sequences, maxlen=self.encoder_input_words, padding='post', truncating='post')

    def tokenize_sentences(self, english_sentences, telugu_sentences):
        """Fit both tokenizers and convert the sentences to lists of word ids.

        Telugu sentences are wrapped in ``START_TOKEN`` / ``END_TOKEN`` first.

        Returns:
            ``(X_english, X_telugu)`` as lists of (variable-length) id lists.
        """
        english_sentences = [str(sentence) for sentence in english_sentences]
        telugu_sentences = [f"{self.START_TOKEN} {sentence} {self.END_TOKEN}" for sentence in telugu_sentences]

        self.english_tokenizer.fit_on_texts(english_sentences)
        self.telugu_tokenizer.fit_on_texts(telugu_sentences)

        X_english = self.english_tokenizer.texts_to_sequences(english_sentences)
        X_telugu = self.telugu_tokenizer.texts_to_sequences(telugu_sentences)

        return X_english, X_telugu

    def get_enc_dec(self):
        """Build ``self.encoder`` and ``self.decoder`` and print their summaries."""
        x_encoder_input = tf.keras.layers.Input(shape=(None,))
        encode = Encoder(vocab_size=self.english_vocab_size, embedding_size=self.embedding_size)(x_encoder_input)
        self.encoder = tf.keras.Model(inputs=x_encoder_input, outputs=encode)

        # The decoder layer works on embedded tokens, so the model takes one
        # target token id per example and embeds it before the Decoder call.
        x_decoder_input = tf.keras.layers.Input(shape=(1,))
        x_decoder = Embedding(self.telugu_vocab_size, self.embedding_size)(x_decoder_input)
        x_state_input = tf.keras.layers.Input(shape=(self.embedding_size,))
        x_states_input = tf.keras.layers.Input(shape=(self.encoder_input_words, self.embedding_size))

        decode = Decoder(embedding_size=self.embedding_size, vocab_size=self.telugu_vocab_size,
                         words=self.encoder_input_words)((x_decoder[:, 0], x_state_input, x_states_input))
        self.decoder = tf.keras.Model(inputs=[x_decoder_input, x_state_input, x_states_input], outputs=decode)
        return self.encoder.summary(), self.decoder.summary()

    def train_translator(self, X_english, X_telugu):
        """Train with teacher forcing; appends one summed loss per epoch to
        ``self.loss_history``.

        Args:
            X_english: English id sequences (ragged lists or a padded array).
            X_telugu: Telugu id sequences aligned with ``X_english``.

        Raises:
            RuntimeError: If ``get_enc_dec()`` has not been called.
        """
        if self.encoder is None or self.decoder is None:
            raise RuntimeError("Call get_enc_dec() before train_translator().")

        # Batches must be rectangular arrays; id 0 is the padding value.
        X_english = self._pad_source(X_english)
        X_telugu = tf.keras.preprocessing.sequence.pad_sequences(X_telugu, padding='post')

        optimizer = tf.keras.optimizers.get(self.optimizer)
        loss_fn = self.loss_fn

        epochs, batch_size = self.epochs, self.batch_size
        total_instances = len(X_english)
        variables = self.encoder.trainable_weights + self.decoder.trainable_weights

        self.loss_history = []

        for epoch in range(epochs):
            batch_loss = tf.constant(0.0)
            for batch in tqdm(range(0, total_instances, batch_size)):
                x1_train = X_english[batch:batch + batch_size]
                x2_train = X_telugu[batch:batch + batch_size]

                with tf.GradientTape() as tape:
                    loss_count = tf.constant(0.0)
                    H, state = self.encoder(x1_train)

                    # Feed token t, predict token t + 1.
                    for query_number in range(x2_train.shape[-1] - 1):
                        output, state = self.decoder([x2_train[:, query_number:query_number + 1], state, H])
                        # Accept (batch, vocab) as well as (batch, 1, vocab) outputs.
                        probabilities = tf.reshape(output, (-1, self.telugu_vocab_size))
                        target = x2_train[:, query_number + 1]
                        # Padded positions do not contribute to the loss.
                        mask = tf.cast(target != 0, tf.float32)
                        loss_count = loss_count + loss_fn(target, probabilities, sample_weight=mask)

                grads = tape.gradient(loss_count, variables)
                optimizer.apply_gradients(zip(grads, variables))
                batch_loss = batch_loss + loss_count

            print("Epoch: " + str(epoch + 1) + "/" + str(epochs) + " : Error " + str(batch_loss.numpy()))
            self.loss_history.append(batch_loss.numpy())

    def translate_sentence(self, english_sentence, max_length=10):
        """Greedily decode one English sentence.

        Args:
            english_sentence: Raw English text.
            max_length: Maximum number of Telugu tokens to generate.

        Returns:
            List of predicted Telugu word ids (the end id is included when it
            is produced).
        """
        english_indices = self.english_tokenizer.texts_to_sequences([str(english_sentence)])
        english_indices = self._pad_source(english_indices)

        H, state = self.encoder(english_indices)

        # Fall back to the historical placeholder ids when the tokenizer was
        # fitted without the start/end markers.
        word_index = self.telugu_tokenizer.word_index
        start_id = word_index.get(self.START_TOKEN, 2)
        end_id = word_index.get(self.END_TOKEN, 3)

        telugu_sentence = []
        current_token = np.array([[start_id]])  # shape (1, 1): one example, one token

        for _ in range(max_length):
            output, state = self.decoder([current_token, state, H])
            probabilities = tf.reshape(output, (-1, self.telugu_vocab_size))
            predicted_word_index = int(np.argmax(probabilities.numpy(), axis=-1)[0])
            telugu_sentence.append(predicted_word_index)

            if predicted_word_index == end_id:
                break

            current_token = np.array([[predicted_word_index]])

        return telugu_sentence


IndentationError: unexpected indent (<ipython-input-14-b71613ca68a6>, line 143)

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from tensorflow.keras.preprocessing.text import Tokenizer

class Encoder(tf.keras.layers.Layer):
    """Embed source token ids and run them through a bidirectional GRU.

    The two directions are merged by summation so the per-step outputs and the
    returned state both keep ``embedding_size`` features.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_size: Width of the embeddings and of the GRU state.
    """

    def __init__(self, vocab_size=1000, embedding_size=128):
        super(Encoder, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size

    def build(self, input_shape):
        self.embedding_layer = tf.keras.layers.Embedding(self.vocab_size, self.embedding_size)
        self.gru = tf.keras.layers.GRU(self.embedding_size, return_sequences=True, return_state=True)
        # The default merge mode ("concat") would double the feature width;
        # "sum" keeps it at embedding_size.
        self.bi = tf.keras.layers.Bidirectional(self.gru, merge_mode="sum")

    def call(self, inputs):
        """Return ``(output_sequence, state)`` for a batch of token ids."""
        embeddings = self.embedding_layer(inputs)
        output_sequence, forward_state, backward_state = self.bi(embeddings)
        # Callers in this file unpack two values (``H, state = encoder(x)``), so
        # the two directional states are collapsed into a single one.
        state = forward_state + backward_state
        return (output_sequence, state)

class BahdanauAttention(tf.keras.layers.Layer):
    """Pool a fixed-length sequence of encoder outputs into one context vector.

    Args:
        words: Number of encoder time steps. ``W2`` and ``W3`` hold one row per
            step, so ``value`` must have exactly this many steps.
        embedding_size: Feature width of ``query`` and ``value``.
    """

    def __init__(self, words=20, embedding_size=128):
        super(BahdanauAttention, self).__init__()
        self.words = words
        self.embedding_size = embedding_size

    def build(self, input_shapes):
        self.W1 = self.add_weight(shape=(1, self.embedding_size), initializer="random_uniform")
        self.W2 = self.add_weight(shape=(self.words, self.embedding_size), initializer="random_uniform")
        self.W3 = self.add_weight(shape=(self.words, self.embedding_size), initializer="random_uniform")

    def call(self, inputs):
        """Return the attention context.

        Args:
            inputs: ``(query, value)``. The einsum signatures require a rank-2
                query ``(batch, embedding_size)`` and a rank-3 value
                ``(batch, words, embedding_size)``.

        Returns:
            Tensor of shape ``(batch, embedding_size)``.
        """
        query, value = inputs
        regressed_query = tf.einsum("bi,ci->bi", query, self.W1)
        # `value` is already rank 3; expanding it first made it rank 4, which
        # the "bij" signature cannot accept.
        regressed_value = tf.einsum("bij,ij->bij", value, self.W2)
        # Broadcast the rank-2 query over every time step of the rank-3 values.
        # The previous signature ("bij,ij->bij") did not match the operand ranks.
        # NOTE(review): this combines by multiplication although the variable
        # names suggest a sum was intended - confirm.
        sum_query_value = tf.einsum("bi,bji->bji", regressed_query, regressed_value)
        sum_of_query_value = tf.nn.tanh(sum_query_value)
        # One score per time step, normalised over the time axis.
        scores = tf.einsum("bij,ij->bi", sum_of_query_value, self.W3)
        weights = tf.nn.softmax(scores)
        # Weighted sum of the values over the time axis.
        context = tf.einsum("bi,bij->bj", weights, value)
        return context

class Decoder(tf.keras.layers.Layer):
    """One decoding step: attend over the encoder outputs, run a GRU over a
    single time step, and predict a distribution over the target vocabulary.

    Args:
        embedding_size: Width of the token embedding, state and context.
        vocab_size: Size of the output distribution.
        words: Number of encoder time steps the attention layer is built for.
    """

    def __init__(self, embedding_size=128, vocab_size=1000, words=20):
        super(Decoder, self).__init__()
        self.embedding_size = embedding_size
        self.vocab_size = vocab_size
        self.words = words

    def build(self, input_shapes):
        self.attention = BahdanauAttention(words=self.words, embedding_size=self.embedding_size)
        self.gru = tf.keras.layers.GRU(self.embedding_size, return_sequences=True, return_state=True)
        self.op1 = tf.keras.layers.Dense(self.embedding_size * 10, activation='tanh')
        self.op2 = tf.keras.layers.Dense(self.embedding_size * 10, activation='tanh')
        self.op3 = tf.keras.layers.Dense(self.vocab_size, activation='softmax')

    def call(self, inputs):
        """Run one step.

        Args:
            inputs: ``(y, state, encode)``. ``y`` and ``state`` must be rank-2
                ``(batch, features)`` tensors (``y`` is expected to be the
                embedded previous token); ``encode`` is the encoder output
                sequence.

        Returns:
            ``(output, new_state)``: softmax probabilities with a length-1 time
            axis, and the updated GRU state.
        """
        y, state, encode = inputs
        context = self.attention((state, encode))
        # All three pieces are rank 2, so join them on the feature axis first
        # and only then add the single time step the GRU expects. (Concatenating
        # an already expanded `y` with the rank-2 context cannot work.)
        step_features = tf.concat([y, context, state], axis=-1)
        gru_input = step_features[:, None]
        gru_output, new_state = self.gru(gru_input)
        output = self.op3(self.op2(self.op1(gru_output)))
        return output, new_state

class EnglishToTeluguTranslator:
    """Attention-based GRU encoder/decoder mapping English text to Telugu word ids.

    Intended call order: ``get_enc_dec()`` -> ``tokenize_sentences()`` ->
    ``train_translator()`` -> ``translate_sentence()``.

    Args:
        encoder_input_words: Length every English sequence is padded or
            truncated to (the decoder's attention is built for this many steps).
        english_vocab_size: ``num_words`` of the English tokenizer and size of
            the encoder embedding table.
        telugu_vocab_size: Same, for the Telugu tokenizer and the decoder.
        embedding_size: Width of embeddings and GRU states.
        epochs: Number of passes over the training data.
        batch_size: Sentence pairs per gradient step.
        optimizer: Keras optimizer name or instance.
    """

    # Markers wrapped around every Telugu sentence so that decoding has a real
    # start symbol and a real stop condition.
    START_TOKEN = 'sos'
    END_TOKEN = 'eos'

    def __init__(self, encoder_input_words=20, english_vocab_size=1000, telugu_vocab_size=1000, embedding_size=128,
                 epochs=30, batch_size=200, optimizer='adam'):
        self.encoder_input_words = encoder_input_words
        self.english_vocab_size = english_vocab_size
        self.telugu_vocab_size = telugu_vocab_size
        self.embedding_size = embedding_size
        self.epochs = epochs
        self.batch_size = batch_size
        self.optimizer = optimizer
        # Targets are integer word ids and the decoder already applies softmax,
        # so the loss must be sparse and must not expect logits.
        self.loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
        self.loss_history = []
        self.encoder = None
        self.decoder = None
        self.english_tokenizer = Tokenizer(num_words=english_vocab_size, oov_token='<OOV>')
        self.telugu_tokenizer = Tokenizer(num_words=telugu_vocab_size, oov_token='<OOV>')

    def _pad_source(self, sequences):
        """Pad/truncate English id sequences to ``encoder_input_words``."""
        return tf.keras.preprocessing.sequence.pad_sequences(
            sequences, maxlen=self.encoder_input_words, padding='post', truncating='post')

    def tokenize_sentences(self, english_sentences, telugu_sentences):
        """Fit both tokenizers and convert the sentences to lists of word ids.

        Telugu sentences are wrapped in ``START_TOKEN`` / ``END_TOKEN`` first.

        Returns:
            ``(X_english, X_telugu)`` as lists of (variable-length) id lists.
        """
        english_sentences = [str(sentence) for sentence in english_sentences]
        telugu_sentences = [f"{self.START_TOKEN} {sentence} {self.END_TOKEN}" for sentence in telugu_sentences]

        self.english_tokenizer.fit_on_texts(english_sentences)
        self.telugu_tokenizer.fit_on_texts(telugu_sentences)

        X_english = self.english_tokenizer.texts_to_sequences(english_sentences)
        X_telugu = self.telugu_tokenizer.texts_to_sequences(telugu_sentences)

        return X_english, X_telugu

    def get_enc_dec(self):
        """Build ``self.encoder`` and ``self.decoder`` and print their summaries."""
        x_encoder_input = tf.keras.layers.Input(shape=(None,))
        encode = Encoder(vocab_size=self.english_vocab_size, embedding_size=self.embedding_size)(x_encoder_input)
        self.encoder = tf.keras.Model(inputs=x_encoder_input, outputs=encode)

        # The decoder layer works on embedded tokens, so the model takes one
        # target token id per example and embeds it before the Decoder call.
        x_decoder_input = tf.keras.layers.Input(shape=(1,))
        x_decoder = Embedding(self.telugu_vocab_size, self.embedding_size)(x_decoder_input)
        x_state_input = tf.keras.layers.Input(shape=(self.embedding_size,))
        x_states_input = tf.keras.layers.Input(shape=(self.encoder_input_words, self.embedding_size))

        decode = Decoder(embedding_size=self.embedding_size, vocab_size=self.telugu_vocab_size,
                         words=self.encoder_input_words)((x_decoder[:, 0], x_state_input, x_states_input))
        self.decoder = tf.keras.Model(inputs=[x_decoder_input, x_state_input, x_states_input], outputs=decode)
        return self.encoder.summary(), self.decoder.summary()

    def train_translator(self, X_english, X_telugu):
        """Train with teacher forcing; appends one summed loss per epoch to
        ``self.loss_history``.

        Args:
            X_english: English id sequences (ragged lists or a padded array).
            X_telugu: Telugu id sequences aligned with ``X_english``.

        Raises:
            RuntimeError: If ``get_enc_dec()`` has not been called.
        """
        if self.encoder is None or self.decoder is None:
            raise RuntimeError("Call get_enc_dec() before train_translator().")

        # Batches must be rectangular arrays; id 0 is the padding value.
        X_english = self._pad_source(X_english)
        X_telugu = tf.keras.preprocessing.sequence.pad_sequences(X_telugu, padding='post')

        optimizer = tf.keras.optimizers.get(self.optimizer)
        loss_fn = self.loss_fn

        epochs, batch_size = self.epochs, self.batch_size
        total_instances = len(X_english)
        variables = self.encoder.trainable_weights + self.decoder.trainable_weights

        self.loss_history = []

        for epoch in range(epochs):
            batch_loss = tf.constant(0.0)
            for batch in tqdm(range(0, total_instances, batch_size)):
                x1_train = X_english[batch:batch + batch_size]
                x2_train = X_telugu[batch:batch + batch_size]

                with tf.GradientTape() as tape:
                    loss_count = tf.constant(0.0)
                    H, state = self.encoder(x1_train)

                    # Feed token t, predict token t + 1.
                    for query_number in range(x2_train.shape[-1] - 1):
                        output, state = self.decoder([x2_train[:, query_number:query_number + 1], state, H])
                        # Accept (batch, vocab) as well as (batch, 1, vocab) outputs.
                        probabilities = tf.reshape(output, (-1, self.telugu_vocab_size))
                        target = x2_train[:, query_number + 1]
                        # Padded positions do not contribute to the loss.
                        mask = tf.cast(target != 0, tf.float32)
                        loss_count = loss_count + loss_fn(target, probabilities, sample_weight=mask)

                grads = tape.gradient(loss_count, variables)
                optimizer.apply_gradients(zip(grads, variables))
                batch_loss = batch_loss + loss_count

            print("Epoch: " + str(epoch + 1) + "/" + str(epochs) + " : Error " + str(batch_loss.numpy()))
            self.loss_history.append(batch_loss.numpy())

    def translate_sentence(self, english_sentence, max_length=10):
        """Greedily decode one English sentence.

        Args:
            english_sentence: Raw English text.
            max_length: Maximum number of Telugu tokens to generate.

        Returns:
            List of predicted Telugu word ids (the end id is included when it
            is produced).
        """
        english_indices = self.english_tokenizer.texts_to_sequences([str(english_sentence)])
        english_indices = self._pad_source(english_indices)

        H, state = self.encoder(english_indices)

        # Fall back to the historical placeholder ids when the tokenizer was
        # fitted without the start/end markers.
        word_index = self.telugu_tokenizer.word_index
        start_id = word_index.get(self.START_TOKEN, 2)
        end_id = word_index.get(self.END_TOKEN, 3)

        telugu_sentence = []
        current_token = np.array([[start_id]])  # shape (1, 1): one example, one token

        for _ in range(max_length):
            output, state = self.decoder([current_token, state, H])
            probabilities = tf.reshape(output, (-1, self.telugu_vocab_size))
            predicted_word_index = int(np.argmax(probabilities.numpy(), axis=-1)[0])
            telugu_sentence.append(predicted_word_index)

            if predicted_word_index == end_id:
                break

            current_token = np.array([[predicted_word_index]])

        return telugu_sentence

# Load the dataset
data = pd.read_excel("/content/engtotel.xlsx")

# Split the dataset into English and Telugu sentences.
# Both columns are coerced to strings: the tokenizer cannot process
# non-string cells (the Telugu column is known to contain some).
english_sentences = data["english"].values.astype(str)
telugu_sentences = data["telugu"].values.astype(str)

# Fit the English tokenizer and convert the sentences to sequences of indices
english_tokenizer = Tokenizer(num_words=1000, oov_token='<OOV>')
english_tokenizer.fit_on_texts(english_sentences)
X_english = english_tokenizer.texts_to_sequences(english_sentences)

# Same for Telugu (previously the tokenizer was fitted but never applied)
telugu_tokenizer = Tokenizer(num_words=1000, oov_token='<OOV>')
telugu_tokenizer.fit_on_texts(telugu_sentences)
X_telugu = telugu_tokenizer.texts_to_sequences(telugu_sentences)



In [None]:
# Initialize the translator
translator = EnglishToTeluguTranslator()

# Initialize encoder and decoder
translator.get_enc_dec()

# Tokenize with the translator's own tokenizers so that translate_sentence()
# uses the same vocabulary the model is trained on, then hold out 20%
X_english, X_telugu = translator.tokenize_sentences(english_sentences, telugu_sentences)
X_train_eng, X_test_eng, X_train_tel, X_test_tel = train_test_split(X_english, X_telugu, test_size=0.2, random_state=42)

# Train the translator
translator.train_translator(X_train_eng, X_train_tel)

# Translate a sample English sentence
sample_english_sentence = "Hello, how are you?"
translated_telugu_sentence = translator.translate_sentence(sample_english_sentence)
print("Translated Telugu sentence:", translated_telugu_sentence)


In [None]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tqdm import tqdm
import os
import nltk
from nltk.tokenize import word_tokenize
from transformers import AutoTokenizer
# Download nltk resources (if not already downloaded)
nltk.download('punkt')
class Encoder(tf.keras.layers.Layer):
    """Embed source token ids and encode them with a GRU.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_size: Width of the embeddings and of the GRU state.
    """

    def __init__(self, vocab_size = 1000, embedding_size = 128):
        super(Encoder, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size

    def build(self, input_shapes):
        # Sub-layers are created here; the stray debug print() was removed.
        self.embedding_layer = tf.keras.layers.Embedding(self.vocab_size, self.embedding_size)
        self.gru = tf.keras.layers.GRU(self.embedding_size, return_sequences = True, return_state = True)

    def call(self, inputs):
        """Return ``(output, state)``: the per-step GRU outputs and final state."""
        embeddings = self.embedding_layer(inputs)
        output, state = self.gru(embeddings)
        return (output, state)


class BahdanauAttention(tf.keras.layers.Layer):
    """Pool a fixed-length sequence of encoder outputs into one context vector.

    Args:
        words: Number of encoder time steps. ``W2`` and ``W3`` hold one row per
            step, so ``value`` must have exactly this many steps.
        embedding_size: Feature width of ``query`` and ``value``.
    """

    def __init__(self, words = 20, embedding_size = 128):
        super(BahdanauAttention, self).__init__()
        self.words = words
        self.embedding_size = embedding_size

    def build(self, input_shapes):
        # W4 was created but never used in call(), leaving a trainable weight
        # without a gradient; it and the stray debug print() were removed.
        self.W1 = self.add_weight(shape = (1, self.embedding_size), initializer = "random_uniform")
        self.W2 = self.add_weight(shape = (self.words, self.embedding_size), initializer = "random_uniform")
        self.W3 = self.add_weight(shape = (self.words, self.embedding_size), initializer = "random_uniform")

    def call(self, inputs):
        """Return the attention context.

        Args:
            inputs: ``(query, value)``. The einsum signatures require a rank-2
                query ``(batch, embedding_size)`` and a rank-3 value
                ``(batch, words, embedding_size)``.

        Returns:
            Tensor of shape ``(batch, embedding_size)``.
        """
        query, value = inputs

        regressed_query = tf.einsum("bi,ci -> bi", query, self.W1)
        regressed_value = tf.einsum("bij, ij -> bij", value, self.W2)

        # Broadcast the query over every time step of the values.
        # NOTE(review): this combines by multiplication although the variable
        # names suggest a sum was intended - confirm.
        sum_query_value = tf.einsum("bi, bji -> bji", regressed_query, regressed_value)
        sum_of_query_value = tf.nn.tanh(sum_query_value)

        # One score per time step, normalised over the time axis.
        a = tf.einsum("bij, ij -> bi", sum_of_query_value, self.W3)
        a = tf.nn.softmax(a)

        # Weighted sum of the values over the time axis.
        context = tf.einsum("bi, bij -> bj", a, value)

        return context

class Decoder(tf.keras.layers.Layer):
    """One decoding step: attend over the encoder outputs, update the state,
    and predict a distribution over the target vocabulary.

    Args:
        embedding_size: Width of the token embedding, state and context.
        vocab_size: Size of the output distribution.
        words: Number of encoder time steps the attention layer is built for.
    """

    def __init__(self, embedding_size = 128, vocab_size = 1000, words = 20):
        super(Decoder, self).__init__()
        self.embedding_size = embedding_size
        self.vocab_size = vocab_size
        self.words = words

    def build(self, input_shapes):
        # Sub-layers are created here; the stray debug print() was removed.
        self.attention = BahdanauAttention(words = self.words, embedding_size = self.embedding_size)
        self.gru = tf.keras.layers.GRU(self.embedding_size)
        self.op1 = tf.keras.layers.Dense(self.embedding_size * 10, activation = 'tanh')
        self.op2 = tf.keras.layers.Dense(self.embedding_size * 10, activation = 'tanh')
        self.op3 = tf.keras.layers.Dense(self.vocab_size, activation = 'softmax')

    def call(self, inputs):
        """Run one step.

        Args:
            inputs: ``(y, state, encode)`` - the embedded previous token, the
                previous state, and the encoder output sequence.

        Returns:
            ``(g_output, new_state)``: softmax probabilities over the
            vocabulary and the updated state.
        """
        y, state, encode = inputs

        context = self.attention((state, encode))

        # The GRU sees a three-step sequence: state, context, token.
        steps = tf.stack([state, context, y], axis = 1)
        new_state = self.gru(steps)

        g_input = tf.concat([y, context, new_state], axis = -1)
        g_output = self.op3(self.op2(self.op1(g_input)))

        return g_output, new_state

class AdditiveAttentionTranslator:
    """Encoder/decoder translator whose decoder attends over the encoder outputs.

    Hyper-parameters are class attributes so they can be overridden on the
    class or on an instance before :meth:`train_translator` is called.
    """

    encoder_input_words = 20
    vocab_size = 1000
    embedding_size = 128
    epochs = 10
    batch_size = 200
    optimizer = tf.keras.optimizers.Adam()
    # The decoder's last Dense layer already applies a softmax, so the loss
    # receives probabilities and must not treat them as logits.
    loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits = False)
    loss_history = []

    def __init__(self):
        self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    def get_enc_dec(self):
        """Build ``self.encoder`` and ``self.decoder`` and print their summaries.

        Returns:
            A tuple holding the return values of the two ``summary()`` calls.
        """
        # Shapes are passed as tuples via the `shape` keyword instead of bare ints.
        x_encoder_input = tf.keras.layers.Input(shape = (self.encoder_input_words,))
        encode = Encoder(vocab_size = self.vocab_size, embedding_size = self.embedding_size)(x_encoder_input)
        self.encoder = tf.keras.Model(inputs=x_encoder_input, outputs=encode)

        # The decoder consumes one token id per step together with the
        # previous state and the full encoder output sequence.
        x_decoder_input = tf.keras.layers.Input(shape = (1,))
        x_decoder = tf.keras.layers.Embedding(self.vocab_size, self.embedding_size)(x_decoder_input)
        x_state_input = tf.keras.layers.Input(shape = (self.embedding_size,))
        x_states_input = tf.keras.layers.Input(shape = (self.encoder_input_words, self.embedding_size))

        decode = Decoder(embedding_size = self.embedding_size, vocab_size = self.vocab_size, words = self.encoder_input_words)((x_decoder[:,0], x_state_input, x_states_input))
        self.decoder = tf.keras.Model(inputs=[x_decoder_input, x_state_input, x_states_input], outputs = decode)
        return self.encoder.summary(), self.decoder.summary()

    def generate_random_data(self, instances = 1000, decoder_words = 10):
        """Create random token ids and random one-hot targets and store them.

        Args:
            instances: number of random sentence pairs.
            decoder_words: number of decoder steps per pair.

        Returns:
            ``(X1, X2, Y)``: encoder ids of shape ``(instances,
            encoder_input_words)``, decoder ids of shape ``(instances,
            decoder_words)`` and one-hot targets of shape ``(instances,
            decoder_words, vocab_size)``. The same arrays are stored on
            ``self``.
        """
        X1 = np.random.randint(self.vocab_size, size=(instances, self.encoder_input_words))
        X2 = np.random.randint(self.vocab_size, size=(instances, decoder_words))
        random_classes = np.random.choice(self.vocab_size, instances * decoder_words)
        Y = np.eye(self.vocab_size)[random_classes].reshape(instances, decoder_words, self.vocab_size)
        self.X1, self.X2, self.Y = X1, X2, Y
        return X1, X2, Y

    def train_translator(self):
        """Build fresh models and train them on ``self.X1``/``self.X2``/``self.Y``.

        The summed loss of every epoch is printed and appended to
        ``self.loss_history``, which is reset at the start of each call.
        """
        tf.get_logger().setLevel('ERROR')

        optimizer, loss_fn = self.optimizer, self.loss_fn
        epochs, batch_size = self.epochs, self.batch_size
        X1, X2, Y = self.X1, self.X2, self.Y
        # A plain Python int is used to drive `range`, not a tensor.
        total_instances = int(Y.shape[0])

        self.get_enc_dec()
        self.loss_history = []

        for epoch in range(epochs):
            batch_loss = tf.constant(0.0)
            for batch in tqdm(range(0, total_instances, batch_size)):
                x1_train = X1[batch : batch + batch_size]
                x2_train = X2[batch : batch + batch_size]
                y_train = Y[batch : batch + batch_size]

                with tf.GradientTape() as tape:
                    loss_count = tf.constant(0.0)
                    H, state = self.encoder(x1_train)

                    # The decoder is fed the given token at every step and the
                    # per-step losses are accumulated.
                    for query_number in range(x2_train.shape[-1]):
                        output, state = self.decoder((x2_train[:, query_number], state, H))
                        loss_count = loss_count + loss_fn(y_train[:, query_number], output)

                weights = self.encoder.trainable_weights + self.decoder.trainable_weights
                grads = tape.gradient(loss_count, weights)
                optimizer.apply_gradients(zip(grads, weights))
                batch_loss = batch_loss + loss_count

            print(f"Epoch: {epoch + 1}/{epochs} : Error {batch_loss.numpy()}")
            self.loss_history.append(batch_loss.numpy())

    def translate_sentence(self, keys, query_start, query_size = None):
        """Greedily decode starting from ``query_start``.

        Args:
            keys: encoder input passed straight to ``self.encoder``
                (presumably a batch of token ids — confirm against callers).
            query_start: first decoder token; it is read as
                ``query_start[0][0]`` and fed to the decoder unchanged.
            query_size: number of decoding steps; defaults to the decoder
                length of the stored training data (``self.X2.shape[-1]``).

        Returns:
            ``(value, state_steps)``: the start token followed by the token
            chosen at each step for the first batch entry, and the decoder
            state after each step.
        """
        if query_size is None:
            query_size = self.X2.shape[-1]
        H, state = self.encoder(keys)

        value = [int(query_start[0][0])]
        state_steps = []

        for _ in range(query_size):
            output, state = self.decoder((query_start, state, H))
            # The most likely token becomes the next decoder input.
            query_start = np.argmax(output.numpy(), axis = -1)
            value.append(query_start[0])
            state_steps.append(state)

        return value, state_steps

    def preprocess_input_sentence(self, input_sentence):
        """Tokenize a sentence and map its tokens to tokenizer ids.

        The sentence is split with nltk's ``word_tokenize``, truncated to
        ``encoder_input_words`` tokens and converted with the pretrained
        tokenizer. Returns the list of ids.
        """
        tokens = word_tokenize(input_sentence)[:self.encoder_input_words]
        return self.tokenizer.convert_tokens_to_ids(tokens)

    def postprocess_output(self, translated_indices):
        """Convert ids back to tokens with the same tokenizer and join them with spaces."""
        translated_tokens = self.tokenizer.convert_ids_to_tokens(translated_indices)
        return ' '.join(translated_tokens)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
import pandas as pd

# Read the sentence pairs from the Excel workbook.
data = pd.read_excel('/content/engtotel.xlsx')

# Columns are taken by position: column 0 is assumed to hold the inputs and
# column 1 the outputs.
input_sentences, output_sentences = (
    data.iloc[:, column].tolist() for column in (0, 1)
)


In [None]:
# Create the translator here, fill it with randomly generated token data and
# train it on that data (no real sentences are involved in this cell).
translator = AdditiveAttentionTranslator()
translator.generate_random_data()  # stores X1, X2 and Y on the instance
translator.train_translator()  # builds the models and runs the training loop


In [None]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tqdm import tqdm
import os
import nltk
from nltk.tokenize import word_tokenize
from transformers import AutoTokenizer

# Fetch the NLTK 'punkt' resource (presumably needed by `word_tokenize`, which
# the translator class below uses to split input sentences).
nltk.download('punkt')

class Encoder(tf.keras.layers.Layer):
    """Embed token ids and run them through a GRU.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_size: embedding width, also used as the GRU unit count.
    """

    def __init__(self, vocab_size=1000, embedding_size=128):
        super().__init__()
        self.vocab_size, self.embedding_size = vocab_size, embedding_size

    def build(self, input_shapes):
        # Both sub-layers share the same width.
        width = self.embedding_size
        self.embedding_layer = tf.keras.layers.Embedding(self.vocab_size, width)
        self.gru = tf.keras.layers.GRU(width, return_sequences=True, return_state=True)

    def call(self, inputs):
        """Return ``(output, state)``: the GRU's per-step outputs and its final state."""
        sequence_output, final_state = self.gru(self.embedding_layer(inputs))
        return (sequence_output, final_state)


class BahdanauAttention(tf.keras.layers.Layer):
    """Attention layer producing a weighted summary of a sequence of values.

    Args:
        words: number of positions in the attended sequence; it is the leading
            dimension of the position-wise weights ``W2`` and ``W3``.
        embedding_size: size of the last axis of both the query and the value.
    """

    def __init__(self, words=20, embedding_size=128):
        super().__init__()
        self.words = words
        self.embedding_size = embedding_size

    def build(self, input_shapes):
        # W1 scales the query; W2 and W3 hold one weight per (position, feature).
        # The former W4 was never read in call(), so it is no longer created:
        # it only added a trainable variable that could never receive a gradient.
        self.W1 = self.add_weight(shape=(1, self.embedding_size), initializer="random_uniform")
        self.W2 = self.add_weight(shape=(self.words, self.embedding_size), initializer="random_uniform")
        self.W3 = self.add_weight(shape=(self.words, self.embedding_size), initializer="random_uniform")

    def call(self, inputs):
        """Return the context vector for a ``(query, value)`` pair.

        The einsum subscripts require ``query`` to be rank 2 (``b, i``) and
        ``value`` to be rank 3 (``b, j, i``); the result has the ``j`` axis
        summed out.
        """
        query, value = inputs

        # Element-wise scaling of the query (by W1) and of the value (by W2).
        regressed_query = tf.einsum("bi,ci -> bi", query, self.W1)
        regressed_value = tf.einsum("bij, ij -> bij", value, self.W2)

        # NOTE(review): query and value are combined by multiplication here,
        # not by addition — confirm this is the intended score function.
        combined = tf.nn.tanh(tf.einsum("bi, bji -> bji", regressed_query, regressed_value))

        # One score per position, normalised over the positions.
        a = tf.einsum("bij, ij -> bij", combined, self.W3)
        a = tf.math.reduce_sum(a, axis=-1)
        a = tf.nn.softmax(a)

        # Weighted sum of the values over the position axis.
        context = tf.einsum("bi, bij -> bij", a, value)
        context = tf.reduce_sum(context, axis=1)

        return context

class Decoder(tf.keras.layers.Layer):
    """One decoding step built from attention, a GRU and a dense head.

    Args:
        embedding_size: width of the token embedding, GRU state and context.
        vocab_size: number of classes of the final softmax layer.
        words: number of encoder positions given to the attention layer.
    """

    def __init__(self, embedding_size=128, vocab_size=1000, words=20):
        super().__init__()
        self.embedding_size, self.vocab_size, self.words = embedding_size, vocab_size, words

    def build(self, input_shapes):
        hidden_units = self.embedding_size * 10
        self.attention = BahdanauAttention(words=self.words, embedding_size=self.embedding_size)
        self.gru = tf.keras.layers.GRU(self.embedding_size)
        self.op1 = tf.keras.layers.Dense(hidden_units, activation='tanh')
        self.op2 = tf.keras.layers.Dense(hidden_units, activation='tanh')
        self.op3 = tf.keras.layers.Dense(self.vocab_size, activation='softmax')

    def call(self, inputs):
        """Map ``(y, state, encode)`` to ``(g_output, new_state)``."""
        y, state, encode = inputs
        context = self.attention((state, encode))

        # The GRU consumes the three-step sequence [state, context, y].
        steps = [tf.expand_dims(step, axis=1) for step in (state, context, y)]
        new_state = self.gru(tf.concat(steps, axis=1))

        # Dense head over the token, the context and the new state.
        hidden = self.op1(tf.concat([y, context, new_state], axis=-1))
        hidden = self.op2(hidden)
        return self.op3(hidden), new_state

class AdditiveAttentionTranslator:
    """Encoder/decoder translator whose decoder attends over the encoder outputs.

    Hyper-parameters are class attributes so they can be overridden on the
    class or on an instance before :meth:`train_translator` is called.
    """

    encoder_input_words = 20
    vocab_size = 1000
    embedding_size = 128
    epochs = 10
    batch_size = 200
    optimizer = tf.keras.optimizers.Adam()
    # The decoder's last Dense layer already applies a softmax, so the loss
    # receives probabilities and must not treat them as logits.
    loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
    loss_history = []
    # Placeholder only: every instance installs its own tokenizer in __init__.
    tokenizer = None

    def __init__(self):
        self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        # Both models are created by get_enc_dec(), which train_translator() calls.
        self.encoder = None
        self.decoder = None

    def get_enc_dec(self):
        """Build ``self.encoder`` and ``self.decoder`` and print their summaries.

        Returns:
            A tuple holding the return values of the two ``summary()`` calls.
        """
        x_encoder_input = tf.keras.layers.Input(shape=(self.encoder_input_words,))
        encode = Encoder(vocab_size=self.vocab_size, embedding_size=self.embedding_size)(x_encoder_input)
        self.encoder = tf.keras.Model(inputs=x_encoder_input, outputs=encode)

        # Every decoder input must exist before the decoder model is assembled:
        # one token id, the previous state and the encoder output sequence.
        x_decoder_input = tf.keras.layers.Input(shape=(1,))
        x_decoder = tf.keras.layers.Embedding(self.vocab_size, self.embedding_size)(x_decoder_input)
        x_state_input = tf.keras.layers.Input(shape=(self.embedding_size,))
        x_states_input = tf.keras.layers.Input(shape=(self.encoder_input_words, self.embedding_size))

        decode = Decoder(embedding_size=self.embedding_size, vocab_size=self.vocab_size, words=self.encoder_input_words)((x_decoder[:, 0], x_state_input, x_states_input))
        self.decoder = tf.keras.Model(inputs=[x_decoder_input, x_state_input, x_states_input], outputs=decode)
        return self.encoder.summary(), self.decoder.summary()

    def generate_random_data(self, instances=1000, decoder_words=10):
        """Create random token ids and random one-hot targets and store them.

        Returns:
            ``(X1, X2, Y)`` with shapes ``(instances, encoder_input_words)``,
            ``(instances, decoder_words)`` and ``(instances, decoder_words,
            vocab_size)``; the same arrays are stored on ``self``.
        """
        X1 = np.random.randint(self.vocab_size, size=(instances, self.encoder_input_words))
        X2 = np.random.randint(self.vocab_size, size=(instances, decoder_words))
        Y = np.eye(self.vocab_size)[np.random.choice(self.vocab_size, instances * decoder_words)].reshape(instances, decoder_words, self.vocab_size)
        self.X1, self.X2, self.Y = X1, X2, Y
        return X1, X2, Y

    def train_translator(self):
        """Build fresh models and train them on ``self.X1``/``self.X2``/``self.Y``.

        The summed loss of every epoch is printed and appended to
        ``self.loss_history``, which is reset at the start of each call.
        """
        tf.get_logger().setLevel('ERROR')
        optimizer, loss_fn = self.optimizer, self.loss_fn
        epochs, batch_size = self.epochs, self.batch_size
        X1, X2, Y = self.X1, self.X2, self.Y
        # A plain Python int is used to drive `range`, not a tensor.
        total_instances = int(Y.shape[0])

        self.get_enc_dec()
        self.loss_history = []

        for epoch in range(epochs):
            batch_loss = tf.constant(0.0)
            for batch in tqdm(range(0, total_instances, batch_size)):
                x1_train = X1[batch: batch + batch_size]
                x2_train = X2[batch: batch + batch_size]
                y_train = Y[batch: batch + batch_size]

                with tf.GradientTape() as tape:
                    loss_count = tf.constant(0.0)
                    H, state = self.encoder(x1_train)

                    # The decoder is fed the given token at every step and the
                    # per-step losses are accumulated.
                    for query_number in range(x2_train.shape[-1]):
                        output, state = self.decoder((x2_train[:, query_number], state, H))
                        loss_count = loss_count + loss_fn(y_train[:, query_number], output)

                weights = self.encoder.trainable_weights + self.decoder.trainable_weights
                grads = tape.gradient(loss_count, weights)
                optimizer.apply_gradients(zip(grads, weights))
                batch_loss = batch_loss + loss_count

            print(f"Epoch: {epoch + 1}/{epochs} : Error {batch_loss.numpy()}")
            self.loss_history.append(batch_loss.numpy())

    def preprocess_input_sentence(self, input_sentence):
        """Tokenize a sentence and map its tokens to tokenizer ids.

        The sentence is split with nltk's ``word_tokenize``, truncated to
        ``encoder_input_words`` tokens and converted with the pretrained
        tokenizer. Returns the list of ids.
        """
        # NOTE(review): ids from the pretrained tokenizer may exceed
        # `vocab_size`, which sizes the embedding tables — confirm they match
        # before translating real sentences.
        tokens = word_tokenize(input_sentence)[:self.encoder_input_words]
        return self.tokenizer.convert_tokens_to_ids(tokens)

    def translate_sentence(self, input_indices):
        """Greedily decode one sentence from its token ids.

        This single definition replaces two same-named methods, of which only
        the later one was ever reachable.

        Args:
            input_indices: flat sequence of integer token ids of one sentence.

        Returns:
            ``(translated_ids, state_steps)``: the id predicted at each of the
            ``encoder_input_words`` decoding steps and the decoder state after
            each step. Both lists are empty when ``input_indices`` is empty.

        Raises:
            RuntimeError: if the models have not been built yet.
        """
        if self.encoder is None or self.decoder is None:
            raise RuntimeError(
                "The encoder/decoder are not built; call train_translator() first."
            )

        token_ids = [int(index) for index in input_indices]
        if not token_ids:
            return [], []

        # The encoder model takes a batch of exactly `encoder_input_words`
        # ids, so wrap the sentence in a batch of one and right-pad with zeros.
        width = self.encoder_input_words
        token_ids = token_ids[:width]
        padded = token_ids + [0] * (width - len(token_ids))
        H, state = self.encoder(tf.constant([padded]))

        translated_sentence = []
        state_steps = []
        # As before, decoding is seeded with the first source token.
        previous_index = token_ids[0]

        for _ in range(width):
            # The decoder model expects a (batch, 1) tensor of token ids.
            output, state = self.decoder((tf.constant([[previous_index]]), state, H))
            previous_index = int(tf.argmax(output, axis=-1).numpy()[0])
            translated_sentence.append(previous_index)
            state_steps.append(state)

        return translated_sentence, state_steps

    def postprocess_output(self, translated_indices):
        """Convert ids back to tokens with the same tokenizer and join them with spaces."""
        translated_tokens = self.tokenizer.convert_ids_to_tokens(translated_indices)
        return ' '.join(translated_tokens)

    def translate_input(self, input_sentence):
        """Translate a raw sentence: preprocess, decode, then post-process."""
        input_indices = self.preprocess_input_sentence(input_sentence)
        translated_indices, _ = self.translate_sentence(input_indices)
        return self.postprocess_output(translated_indices)




In [None]:


# `translate_input` is a method of AdditiveAttentionTranslator, not a module
# level function, so it is called on the `translator` instance created in an
# earlier cell.
sample_english_sentence = "Hello, how are you?"
translated_telugu_sentence = translator.translate_input(sample_english_sentence)
print("Translated Telugu sentence:", translated_telugu_sentence)


In [None]:
# Requires the AdditiveAttentionTranslator class from the cell above.
# NOTE(review): this instance is never trained. `__init__` leaves `encoder`
# and `decoder` as None and only `train_translator()` builds them, so the
# model path of `translate_input` cannot work here — confirm whether
# `generate_random_data()` / `train_translator()` should be called first.

# Create an instance of AdditiveAttentionTranslator
translator = AdditiveAttentionTranslator()

# Sample English sentence
sample_english_sentence = "Hello, how are you?"

# Translate the input sentence
translated_telugu_sentence = translator.translate_input(sample_english_sentence)

# Print the translated Telugu sentence
print("Translated Telugu sentence:", translated_telugu_sentence)


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

# Read the parallel sentence table from the Excel workbook.
file_path = "/content/engtotel.xlsx"
df = pd.read_excel(file_path)

# Exact-match lookup table: English sentence -> Telugu sentence.
english_to_telugu_map = {
    english: telugu for english, telugu in zip(df['english'], df['telugu'])
}

def translate_english_to_telugu(english_sentence):
    """Return the stored translation of an exact English sentence.

    Falls back to the string "Translation Not Available" when the sentence is
    not a key of ``english_to_telugu_map``.
    """
    if english_sentence in english_to_telugu_map:
        return english_to_telugu_map[english_sentence]
    return "Translation Not Available"

# Example usage: exact-match lookup of one sentence.
english_sentence = "Smoke filled the room"
translated_sentence = translate_english_to_telugu(english_sentence)
print("Translated Sentence:", translated_sentence)

# Preprocess the data for model training.
# Adjust the two names below if the workbook uses different column headers.
english_column_name = 'english'
telugu_column_name = 'telugu'

# Every cell is forced to a string before tokenization.
english_sentences = df[english_column_name].apply(str).tolist()
telugu_sentences = df[telugu_column_name].apply(str).tolist()

# One tokenizer per language; filters='' disables character filtering.
english_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
english_tokenizer.fit_on_texts(english_sentences)
english_sequences = english_tokenizer.texts_to_sequences(english_sentences)

telugu_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
telugu_tokenizer.fit_on_texts(telugu_sentences)
telugu_sequences = telugu_tokenizer.texts_to_sequences(telugu_sentences)

# Both languages are padded at the end to the longest sequence of either one.
max_length = max(
    max(map(len, english_sequences)),
    max(map(len, telugu_sequences)),
)
english_sequences_padded = tf.keras.preprocessing.sequence.pad_sequences(
    english_sequences, maxlen=max_length, padding='post'
)
telugu_sequences_padded = tf.keras.preprocessing.sequence.pad_sequences(
    telugu_sequences, maxlen=max_length, padding='post'
)

# 80/20 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    english_sequences_padded,
    telugu_sequences_padded,
    test_size=0.2,
    random_state=42,
)

class TranslatorModel(tf.keras.Model):
    """Sequence-to-sequence model with embeddings, one shared GRU and a softmax head.

    Args:
        input_vocab_size: number of rows of the source embedding table.
        output_vocab_size: number of rows of the target embedding table and
            number of classes of the output layer.
        embedding_size: embedding width, also used as the GRU unit count.
    """

    # The constructor must be spelled `__init__`. With the previous `_init_`
    # it was never run, and the two vocabulary sizes were passed straight to
    # `tf.keras.Model`, which took them for functional-model inputs/outputs.
    def __init__(self, input_vocab_size, output_vocab_size, embedding_size=128):
        super().__init__()
        self.encoder = tf.keras.layers.Embedding(input_vocab_size, embedding_size)
        self.decoder = tf.keras.layers.Embedding(output_vocab_size, embedding_size)
        # The same GRU instance is applied to both the source and the target side.
        self.gru = tf.keras.layers.GRU(embedding_size, return_sequences=True, return_state=True)
        self.dense = tf.keras.layers.Dense(output_vocab_size, activation='softmax')

    def call(self, inputs, training=False):
        """Return per-step target-vocabulary probabilities.

        ``inputs`` is an ``(encoder_inputs, decoder_inputs)`` pair of token-id
        batches. The final GRU state of the source pass initialises the
        target pass.
        """
        encoder_inputs, decoder_inputs = inputs
        encoder_embeddings = self.encoder(encoder_inputs)
        decoder_embeddings = self.decoder(decoder_inputs)

        # Only the final state of the source pass is used further on.
        _, encoder_states = self.gru(encoder_embeddings)
        decoder_outputs, _ = self.gru(decoder_embeddings, initial_state=encoder_states)

        return self.dense(decoder_outputs)

# Vocabulary sizes are one larger than the number of known words
# (presumably to leave room for the padding index 0 — confirm).
input_vocab_size = 1 + len(english_tokenizer.word_index)
output_vocab_size = 1 + len(telugu_tokenizer.word_index)

translator_model = TranslatorModel(input_vocab_size, output_vocab_size)

# Compile and train the model.
translator_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
# The decoder is fed the target without its last token and is trained to
# predict the target without its first token.
translator_model.fit(
    [X_train, y_train[:, :-1]],
    y_train[:, 1:],
    epochs=10,
    validation_data=([X_val, y_val[:, :-1]], y_val[:, 1:]),
)

# Function to translate English sentence to Telugu using the trained model
def translate_using_model(english_sentence):
    """Translate one English sentence with ``translator_model``.

    The sentence is tokenized and padded like the training data, the model is
    run once, and the most likely id at every position is converted back to
    text with the Telugu tokenizer.
    """
    encoder_input = tf.keras.preprocessing.sequence.pad_sequences(
        english_tokenizer.texts_to_sequences([english_sentence]),
        maxlen=max_length,
        padding='post',
    )
    # The decoder side is fed an all-zero sequence of length max_length - 1.
    decoder_input = np.zeros((len(encoder_input), max_length - 1))

    probabilities = translator_model.predict([encoder_input, decoder_input])
    predicted_ids = np.argmax(probabilities, axis=-1)
    return telugu_tokenizer.sequences_to_texts(predicted_ids)[0]

# Example usage: translate one sentence with the trained model (not with the
# exact-match lookup table) and print the result.
english_sentence = "His legs are long."
translated_sentence = translate_using_model(english_sentence)
print("Translated Sentence:", translated_sentence)

Translated Sentence: పొగ గదిని నింపింది


ValueError: Found unexpected instance while processing input tensors for keras functional model. Expecting KerasTensor which is from tf.keras.Input() or output from keras layer call(). Got: 15292

In [None]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tqdm import tqdm
import os
import nltk
from nltk.tokenize import word_tokenize
from transformers import AutoTokenizer
import pandas as pd  # Import pandas

# Read the parallel sentence table and build an exact-match lookup table
# (English sentence -> Telugu sentence) from its two named columns.
file_path = "/content/engtotel.xlsx"
df = pd.read_excel(file_path)
english_to_telugu_map = {
    english: telugu for english, telugu in zip(df['english'], df['telugu'])
}

class Encoder(tf.keras.layers.Layer):
    """Token-id encoder: an embedding lookup followed by a GRU.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_size: embedding width, also used as the GRU unit count.
    """

    def __init__(self, vocab_size=1000, embedding_size=128):
        super().__init__()
        self.embedding_size = embedding_size
        self.vocab_size = vocab_size

    def build(self, input_shapes):
        # Sub-layers are created lazily, on first use of the layer.
        self.embedding_layer = tf.keras.layers.Embedding(
            self.vocab_size, self.embedding_size
        )
        self.gru = tf.keras.layers.GRU(
            self.embedding_size, return_sequences=True, return_state=True
        )

    def call(self, inputs):
        """Return ``(output, state)`` of the GRU applied to the embedded ids."""
        embedded = self.embedding_layer(inputs)
        per_step_output, last_state = self.gru(embedded)
        return (per_step_output, last_state)


class BahdanauAttention(tf.keras.layers.Layer):
    """Attention layer producing a weighted summary of a sequence of values.

    Args:
        words: number of positions in the attended sequence; it is the leading
            dimension of the position-wise weights ``W2`` and ``W3``.
        embedding_size: size of the last axis of both the query and the value.
    """

    def __init__(self, words=20, embedding_size=128):
        super().__init__()
        self.words = words
        self.embedding_size = embedding_size

    def build(self, input_shapes):
        # W1 scales the query; W2 and W3 hold one weight per (position, feature).
        # The former W4 was never read in call(), so it is no longer created:
        # it only added a trainable variable that could never receive a gradient.
        position_shape = (self.words, self.embedding_size)
        self.W1 = self.add_weight(shape=(1, self.embedding_size), initializer="random_uniform")
        self.W2 = self.add_weight(shape=position_shape, initializer="random_uniform")
        self.W3 = self.add_weight(shape=position_shape, initializer="random_uniform")

    def call(self, inputs):
        """Return the context vector for a ``(query, value)`` pair.

        The einsum subscripts require ``query`` to be rank 2 (``b, i``) and
        ``value`` to be rank 3 (``b, j, i``); the result has the ``j`` axis
        summed out.
        """
        query, value = inputs

        # Element-wise scaling of the query (by W1) and of the value (by W2).
        regressed_query = tf.einsum("bi,ci -> bi", query, self.W1)
        regressed_value = tf.einsum("bij, ij -> bij", value, self.W2)

        # NOTE(review): query and value are combined by multiplication here,
        # not by addition — confirm this is the intended score function.
        combined = tf.nn.tanh(tf.einsum("bi, bji -> bji", regressed_query, regressed_value))

        # One score per position, normalised over the positions.
        a = tf.einsum("bij, ij -> bij", combined, self.W3)
        a = tf.math.reduce_sum(a, axis=-1)
        a = tf.nn.softmax(a)

        # Weighted sum of the values over the position axis.
        context = tf.einsum("bi, bij -> bij", a, value)
        context = tf.reduce_sum(context, axis=1)

        return context

class Decoder(tf.keras.layers.Layer):
    """Single decoding step: attention context, GRU update and softmax head.

    Args:
        embedding_size: width of the token embedding, GRU state and context.
        vocab_size: number of classes of the final softmax layer.
        words: number of encoder positions given to the attention layer.
    """

    def __init__(self, embedding_size=128, vocab_size=1000, words=20):
        super().__init__()
        self.words = words
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size

    def build(self, input_shapes):
        dense = tf.keras.layers.Dense
        wide = self.embedding_size * 10
        self.attention = BahdanauAttention(words=self.words, embedding_size=self.embedding_size)
        self.gru = tf.keras.layers.GRU(self.embedding_size)
        self.op1 = dense(wide, activation='tanh')
        self.op2 = dense(wide, activation='tanh')
        self.op3 = dense(self.vocab_size, activation='softmax')

    def call(self, inputs):
        """Map ``(y, state, encode)`` to ``(g_output, new_state)``."""
        y, state, encode = inputs
        context = self.attention((state, encode))

        # Stack state, context and token along a new axis 1 as GRU time steps.
        gru_sequence = tf.concat(
            [
                tf.expand_dims(state, axis=1),
                tf.expand_dims(context, axis=1),
                tf.expand_dims(y, axis=1),
            ],
            axis=1,
        )
        new_state = self.gru(gru_sequence)

        # Token, context and new state are concatenated feature-wise.
        head_input = tf.concat([y, context, new_state], axis=-1)
        g_output = self.op3(self.op2(self.op1(head_input)))
        return g_output, new_state

class AdditiveAttentionTranslator:
    """Translator that answers from a lookup table first and a model second.

    :meth:`translate_input` returns the stored translation when the sentence
    is a key of the module-level ``english_to_telugu_map``; otherwise it falls
    back to the encoder/decoder built by :meth:`train_translator`.

    The hyper-parameters, optimizer and loss below are read by the methods of
    this class and therefore have to be defined on it.
    """

    encoder_input_words = 20
    vocab_size = 1000
    embedding_size = 128
    epochs = 10
    batch_size = 200
    optimizer = tf.keras.optimizers.Adam()
    # The decoder's last Dense layer already applies a softmax, so the loss
    # receives probabilities and must not treat them as logits.
    loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
    loss_history = []

    def __init__(self):
        self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        # Both models are created by get_enc_dec(), which train_translator() calls.
        self.encoder = None
        self.decoder = None

    def get_enc_dec(self):
        """Build ``self.encoder`` and ``self.decoder`` and print their summaries.

        Returns:
            A tuple holding the return values of the two ``summary()`` calls.
        """
        x_encoder_input = tf.keras.layers.Input(shape=(self.encoder_input_words,))
        encode = Encoder(vocab_size=self.vocab_size, embedding_size=self.embedding_size)(x_encoder_input)
        self.encoder = tf.keras.Model(inputs=x_encoder_input, outputs=encode)

        # Decoder inputs: one token id, the previous state and the encoder outputs.
        x_decoder_input = tf.keras.layers.Input(shape=(1,))
        x_decoder = tf.keras.layers.Embedding(self.vocab_size, self.embedding_size)(x_decoder_input)
        x_state_input = tf.keras.layers.Input(shape=(self.embedding_size,))
        x_states_input = tf.keras.layers.Input(shape=(self.encoder_input_words, self.embedding_size))

        decode = Decoder(embedding_size=self.embedding_size, vocab_size=self.vocab_size, words=self.encoder_input_words)((x_decoder[:, 0], x_state_input, x_states_input))
        self.decoder = tf.keras.Model(inputs=[x_decoder_input, x_state_input, x_states_input], outputs=decode)
        return self.encoder.summary(), self.decoder.summary()

    def generate_random_data(self, instances=1000, decoder_words=10):
        """Create random token ids and random one-hot targets and store them.

        Returns:
            ``(X1, X2, Y)`` with shapes ``(instances, encoder_input_words)``,
            ``(instances, decoder_words)`` and ``(instances, decoder_words,
            vocab_size)``; the same arrays are stored on ``self``.
        """
        X1 = np.random.randint(self.vocab_size, size=(instances, self.encoder_input_words))
        X2 = np.random.randint(self.vocab_size, size=(instances, decoder_words))
        Y = np.eye(self.vocab_size)[np.random.choice(self.vocab_size, instances * decoder_words)].reshape(instances, decoder_words, self.vocab_size)
        self.X1, self.X2, self.Y = X1, X2, Y
        return X1, X2, Y

    def train_translator(self):
        """Build fresh models and train them on ``self.X1``/``self.X2``/``self.Y``.

        The summed loss of every epoch is printed and appended to
        ``self.loss_history``, which is reset at the start of each call.
        """
        tf.get_logger().setLevel('ERROR')
        optimizer, loss_fn = self.optimizer, self.loss_fn
        epochs, batch_size = self.epochs, self.batch_size
        X1, X2, Y = self.X1, self.X2, self.Y
        # A plain Python int is used to drive `range`, not a tensor.
        total_instances = int(Y.shape[0])

        self.get_enc_dec()
        self.loss_history = []

        for epoch in range(epochs):
            batch_loss = tf.constant(0.0)
            for batch in tqdm(range(0, total_instances, batch_size)):
                x1_train = X1[batch: batch + batch_size]
                x2_train = X2[batch: batch + batch_size]
                y_train = Y[batch: batch + batch_size]

                with tf.GradientTape() as tape:
                    loss_count = tf.constant(0.0)
                    H, state = self.encoder(x1_train)

                    # The decoder is fed the given token at every step and the
                    # per-step losses are accumulated.
                    for query_number in range(x2_train.shape[-1]):
                        output, state = self.decoder((x2_train[:, query_number], state, H))
                        loss_count = loss_count + loss_fn(y_train[:, query_number], output)

                weights = self.encoder.trainable_weights + self.decoder.trainable_weights
                grads = tape.gradient(loss_count, weights)
                optimizer.apply_gradients(zip(grads, weights))
                batch_loss = batch_loss + loss_count

            print(f"Epoch: {epoch + 1}/{epochs} : Error {batch_loss.numpy()}")
            self.loss_history.append(batch_loss.numpy())

    def translate_sentence(self, input_indices):
        """Greedily decode one sentence from its token ids.

        Args:
            input_indices: flat sequence of integer token ids of one sentence.

        Returns:
            ``(translated_ids, state_steps)``: the id predicted at each of the
            ``encoder_input_words`` decoding steps and the decoder state after
            each step. Both lists are empty when ``input_indices`` is empty.

        Raises:
            RuntimeError: if the models have not been built yet.
        """
        if self.encoder is None or self.decoder is None:
            raise RuntimeError(
                "The encoder/decoder are not built; call train_translator() first."
            )

        token_ids = [int(index) for index in input_indices]
        if not token_ids:
            return [], []

        # The encoder model takes a batch of exactly `encoder_input_words`
        # ids, so wrap the sentence in a batch of one and right-pad with zeros.
        width = self.encoder_input_words
        token_ids = token_ids[:width]
        padded = token_ids + [0] * (width - len(token_ids))
        H, state = self.encoder(tf.constant([padded]))

        translated_sentence = []
        state_steps = []
        # As before, decoding is seeded with the first source token.
        previous_index = token_ids[0]

        for _ in range(width):
            # The decoder model expects a (batch, 1) tensor of token ids.
            output, state = self.decoder((tf.constant([[previous_index]]), state, H))
            previous_index = int(tf.argmax(output, axis=-1).numpy()[0])
            translated_sentence.append(previous_index)
            state_steps.append(state)

        return translated_sentence, state_steps

    def translate_input(self, input_sentence):
        """Translate a raw sentence, preferring the exact-match lookup table."""
        # Sentences present in the module-level mapping never reach the model.
        if input_sentence in english_to_telugu_map:
            return english_to_telugu_map[input_sentence]

        # Otherwise: preprocess, decode with the model, then post-process.
        input_indices = self.preprocess_input_sentence(input_sentence)
        translated_indices, _ = self.translate_sentence(input_indices)
        return self.postprocess_output(translated_indices)

    def preprocess_input_sentence(self, input_sentence):
        """Tokenize a sentence and map its tokens to tokenizer ids.

        The sentence is split with nltk's ``word_tokenize``, truncated to
        ``encoder_input_words`` tokens and converted with the pretrained
        tokenizer. Returns the list of ids.
        """
        # NOTE(review): ids from the pretrained tokenizer may exceed
        # `vocab_size`, which sizes the embedding tables — confirm they match
        # before translating real sentences.
        tokens = word_tokenize(input_sentence)[:self.encoder_input_words]
        return self.tokenizer.convert_tokens_to_ids(tokens)

    def postprocess_output(self, translated_indices):
        """Convert ids back to tokens with the same tokenizer and join them with spaces."""
        translated_tokens = self.tokenizer.convert_ids_to_tokens(translated_indices)
        return ' '.join(translated_tokens)

# Example usage:
# NOTE(review): this instance is never trained. `translate_input` checks
# `english_to_telugu_map` first, so a sentence present in the workbook is
# answered from the table and the encoder/decoder are never invoked.
translator = AdditiveAttentionTranslator()

# Sample English sentence. Other sentences tried here:
#   All beginnings are difficult
#   Smoke filled the room
sample_english_sentence = "All beginnings are difficult"

# Translate the input sentence
translated_telugu_sentence = translator.translate_input(sample_english_sentence)

# Print the translated Telugu sentence
print("Translated Telugu sentence:", translated_telugu_sentence)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Translated Telugu sentence: అన్ని ప్రారంభాలు కష్టం


That's what I wanted to tell you
Tom was in my store just this morning
I know that's not what you want
I'm the one who wrote Tom's speech
Anyone can use this dictionary
Give me a chance to prove it to you
Can't you see Tom is trying to help you


