<a href="https://colab.research.google.com/github/sutharimanikanta/Golab/blob/main/BahdanauAttention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
# Read the parallel corpus from the spreadsheet uploaded to the Colab runtime.
data = pd.read_excel("/content/engtotel.xlsx")

# Pull the "english" and "telugu" columns out as arrays of raw sentences.
english_sentences, telugu_sentences = data["english"].values, data["telugu"].values


In [None]:
import nltk
# Download the Punkt tokenizer data; nltk.word_tokenize is called later in
# preprocess() and needs tokenizer data to be present.
# NOTE(review): recent NLTK releases reportedly load the 'punkt_tab' resource
# instead of 'punkt' — confirm against the NLTK version installed in this runtime.
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
import re
import nltk
import numpy as np
import tensorflow as tf
from nltk.probability import FreqDist
from tqdm import tqdm

# Define the preprocess function
def preprocess(text):
    """Split *text* into word and punctuation tokens.

    The input is coerced to ``str``, split on whitespace, ``--`` and a fixed
    set of punctuation characters (the punctuation itself is kept as separate
    pieces), blank pieces are dropped, and the remaining pieces are re-joined
    with single spaces and passed through ``nltk.word_tokenize``.

    Returns the list of tokens produced by ``nltk.word_tokenize``.
    """
    pieces = re.split(r'([,.?_!"()\']|--|\s)', str(text))

    # Keep only pieces that still contain something after trimming whitespace.
    cleaned = []
    for piece in pieces:
        trimmed = piece.strip()
        if trimmed:
            cleaned.append(trimmed)

    return nltk.word_tokenize(' '.join(cleaned))

# Tokenize every sentence of both sides of the corpus.
tokens_en = list(map(preprocess, english_sentences))
tokens_te = list(map(preprocess, telugu_sentences))

# One flat token stream per language (all sentences concatenated).
tokens_en_flat = [tok for sentence in tokens_en for tok in sentence]
tokens_te_flat = [tok for sentence in tokens_te for tok in sentence]

# Token frequency tables per language.
freq_dist_en = FreqDist(tokens_en_flat)
freq_dist_te = FreqDist(tokens_te_flat)

# Sorted unique tokens; a token's position in this list becomes its id.
all_words_en = sorted(set(tokens_en_flat))
all_words_te = sorted(set(tokens_te_flat))

# token -> integer id lookup tables (ids start at 0).
vocab_en = dict(zip(all_words_en, range(len(all_words_en))))
vocab_te = dict(zip(all_words_te, range(len(all_words_te))))

# Sentences re-expressed as lists of token ids.
token_ids_en = [[vocab_en[tok] for tok in sentence] for sentence in tokens_en]
token_ids_te = [[vocab_te[tok] for tok in sentence] for sentence in tokens_te]



In [None]:
class Encoder(tf.keras.layers.Layer):
    """Embed a sequence of token ids and run it through a GRU.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_size: Width of the embedding vectors; also used as the
            number of GRU units.
    """

    def __init__(self, vocab_size=1000, embedding_size=128):
        super().__init__()
        self.embedding_size = embedding_size
        self.vocab_size = vocab_size

    def build(self, input_shapes):
        """Create the embedding and GRU sub-layers."""
        self.embedding_layer = tf.keras.layers.Embedding(
            self.vocab_size, self.embedding_size
        )
        # return_sequences + return_state: the GRU yields both the per-step
        # outputs and its final state.
        self.gru = tf.keras.layers.GRU(
            self.embedding_size, return_sequences=True, return_state=True
        )

    def call(self, inputs):
        """Return ``(per_step_outputs, final_state)`` for the token ids in *inputs*."""
        sequence_output, final_state = self.gru(self.embedding_layer(inputs))
        return (sequence_output, final_state)

class BahdanauAttention(tf.keras.layers.Layer):
    """Additive (Bahdanau-style) attention over a fixed-length sequence.

    The layer scores each of the ``words`` positions of ``value`` against a
    single ``query`` vector as ``sum(W3 * tanh(W1 * query + W2 * value))``,
    normalises the scores with a softmax over positions and returns the
    score-weighted sum of ``value``.

    All three weights act element-wise (they scale features rather than mix
    them), and ``W2``/``W3`` hold one row per sequence position, so the layer
    only accepts sequences of exactly ``words`` steps.

    Args:
        words: Number of positions in the attended sequence.
        embedding_size: Feature width of both the query and the values.
    """

    def __init__(self, words=20, embedding_size=128):
        super().__init__()
        self.words = words
        self.embedding_size = embedding_size

    def build(self, input_shapes):
        """Create the three element-wise scaling weights."""
        # Explicit names keep the three variables distinguishable when the
        # model's weights are listed or saved.
        self.W1 = self.add_weight(
            name="W1", shape=(1, self.embedding_size), initializer="random_uniform"
        )
        self.W2 = self.add_weight(
            name="W2", shape=(self.words, self.embedding_size), initializer="random_uniform"
        )
        self.W3 = self.add_weight(
            name="W3", shape=(self.words, self.embedding_size), initializer="random_uniform"
        )

    def call(self, inputs):
        """Return the attention context vector.

        Args:
            inputs: ``(query, value)`` pair. The einsum equations below require
                ``query`` to be rank 2 ``(batch, embedding_size)`` and
                ``value`` to be rank 3 ``(batch, words, embedding_size)``.

        Returns:
            Tensor of shape ``(batch, embedding_size)``.
        """
        query, value = inputs
        regressed_query = tf.einsum("bi,ci->bi", query, self.W1)
        regressed_value = tf.einsum("bij,ij->bij", value, self.W2)

        # Additive attention: the projected query is *added* to every projected
        # value (broadcast over the position axis). The previous einsum
        # multiplied the two terms, which is not the additive score this layer
        # is named after.
        sum_query_value = tf.expand_dims(regressed_query, axis=1) + regressed_value
        sum_of_query_value = tf.nn.tanh(sum_query_value)

        # One scalar score per position, normalised over the position axis.
        a = tf.einsum("bij,ij->bij", sum_of_query_value, self.W3)
        a = tf.reduce_sum(a, axis=-1)
        a = tf.nn.softmax(a, axis=-1)

        # Weighted sum of the values over positions.
        context = tf.einsum("bi,bij->bj", a, value)
        return context

class Decoder(tf.keras.layers.Layer):
    """Single decoding step: attention, GRU state update and vocabulary softmax.

    Args:
        embedding_size: Feature width; used for the GRU units and (times 10)
            for the two hidden Dense layers.
        vocab_size: Size of the output distribution.
        words: Number of encoder positions the attention layer attends over.
    """

    def __init__(self, embedding_size=128, vocab_size=1000, words=20):
        super().__init__()
        self.words = words
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size

    def build(self, input_shapes):
        """Create the attention, GRU and output sub-layers."""
        hidden_units = self.embedding_size * 10
        self.attention = BahdanauAttention(
            words=self.words, embedding_size=self.embedding_size
        )
        self.gru = tf.keras.layers.GRU(self.embedding_size)
        self.op1 = tf.keras.layers.Dense(hidden_units, activation='tanh')
        self.op2 = tf.keras.layers.Dense(hidden_units, activation='tanh')
        self.op3 = tf.keras.layers.Dense(self.vocab_size, activation='softmax')

    def call(self, inputs):
        """Run one decoding step.

        Args:
            inputs: ``(y, state, encode)`` — the previous token's embedding,
                the previous decoder state and the encoder outputs.
                Assumes ``y`` and ``state`` share the same 2-D shape, since
                they are stacked together below — confirm against the caller.

        Returns:
            ``(g_output, new_state)``: the softmax distribution over the
            vocabulary and the updated decoder state.
        """
        prev_token, prev_state, encoder_outputs = inputs
        attended = self.attention((prev_state, encoder_outputs))

        # The previous state, the context vector and the previous token are
        # presented to the GRU as a three-step sequence; its final output is
        # the new decoder state.
        gru_sequence = tf.stack([prev_state, attended, prev_token], axis=1)
        next_state = self.gru(gru_sequence)

        # Output head sees the token, the context and the new state side by side.
        features = tf.concat([prev_token, attended, next_state], axis=-1)
        hidden = self.op1(features)
        hidden = self.op2(hidden)
        return self.op3(hidden), next_state

class AdditiveAttentionTranslator:
    """Attention-based encoder/decoder translator with training and greedy decoding.

    Typical use: ``get_enc_dec()`` to build the Keras models,
    ``generate_data_from_tokens()`` to turn token-id lists into arrays,
    ``train_translator()`` to fit, ``translate_sentence()`` to decode.

    Token id 0 is used as the padding value and as the replacement for ids
    that fall outside ``[0, vocab_size)``.
    """

    def __init__(self):
        self.encoder_input_words = 20   # fixed source length fed to the encoder
        self.vocab_size = 1000          # embedding rows / output classes
        self.embedding_size = 128
        self.epochs = 5
        self.batch_size = 200
        self.optimizer = tf.keras.optimizers.Adam()
        # The Decoder's last Dense layer already applies softmax, so the loss
        # must consume probabilities; from_logits=True would apply a second
        # softmax on top of them.
        self.loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
        self.loss_history = []

    def get_enc_dec(self):
        """Build ``self.encoder`` and ``self.decoder`` and print their summaries.

        Returns:
            The pair of values returned by the two ``summary()`` calls.
        """
        # Shapes are passed as explicit tuples rather than bare ints.
        x_encoder_input = tf.keras.layers.Input(shape=(self.encoder_input_words,))
        encode = Encoder(vocab_size=self.vocab_size, embedding_size=self.embedding_size)(x_encoder_input)
        self.encoder = tf.keras.Model(inputs=x_encoder_input, outputs=encode)

        x_decoder_input = tf.keras.layers.Input(shape=(1,))
        x_decoder = tf.keras.layers.Embedding(self.vocab_size, self.embedding_size)(x_decoder_input)
        x_state_input = tf.keras.layers.Input(shape=(self.embedding_size,))
        x_states_input = tf.keras.layers.Input(shape=(self.encoder_input_words, self.embedding_size))

        # x_decoder[:, 0] drops the length-1 time axis of the embedded token.
        decode = Decoder(
            embedding_size=self.embedding_size,
            vocab_size=self.vocab_size,
            words=self.encoder_input_words,
        )((x_decoder[:, 0], x_state_input, x_states_input))
        self.decoder = tf.keras.Model(
            inputs=[x_decoder_input, x_state_input, x_states_input], outputs=decode
        )
        return self.encoder.summary(), self.decoder.summary()

    def _pad_ids(self, sequences, width):
        """Return an int64 array ``(len(sequences), width)`` of truncated/zero-padded ids.

        Ids outside ``[0, vocab_size)`` are replaced with 0 so that Embedding
        lookups can never go out of range.
        """
        padded = np.zeros((len(sequences), width), dtype=np.int64)
        for row, seq in enumerate(sequences):
            ids = np.asarray(seq[:width], dtype=np.int64)
            ids = np.where((ids >= 0) & (ids < self.vocab_size), ids, 0)
            padded[row, :len(ids)] = ids
        return padded

    def generate_data_from_tokens(self, tokens_en, tokens_te, decoder_words=10):
        """Convert token-id lists into encoder input, decoder input and one-hot targets.

        Args:
            tokens_en: List of source sentences, each a list of token ids.
            tokens_te: List of target sentences, each a list of token ids.
            decoder_words: Width of the decoder input array ``X2``.

        Returns:
            ``(X1, X2, Y)`` where ``X1`` is ``(N, encoder_input_words)``,
            ``X2`` is ``(N, decoder_words)`` and ``Y`` is
            ``(N, longest_target - 1, vocab_size)``. ``Y[i, j - 1]`` is the
            one-hot encoding of target token ``j`` of sentence ``i``; rows for
            padding or out-of-vocabulary tokens stay all-zero.
        """
        X1 = self._pad_ids(tokens_en, self.encoder_input_words)
        X2 = self._pad_ids(tokens_te, decoder_words)
        # default=1 keeps an empty corpus from raising; the clamp keeps the
        # time dimension non-negative.
        max_seq_length = max((len(seq) for seq in tokens_te), default=1)
        Y = np.zeros((len(tokens_en), max(max_seq_length - 1, 0), self.vocab_size))
        for i, token_ids in enumerate(tokens_te):
            for j in range(1, len(token_ids)):
                token_id = token_ids[j]
                if 0 <= token_id < self.vocab_size:
                    Y[i, j - 1, token_id] = 1
        return X1, X2, Y

    def train_translator(self, X1, X2, Y):
        """Train encoder and decoder with teacher forcing.

        At decoding step ``t`` the decoder is fed target token ``X2[:, t]`` and
        is scored against ``Y[:, t]``. The per-epoch summed loss is printed and
        appended to ``self.loss_history``.

        Args:
            X1: Encoder input ids, ``(N, encoder_input_words)``.
            X2: Decoder input ids, ``(N, decoder_words)``.
            Y: One-hot targets, ``(N, steps, vocab_size)``.

        Raises:
            ValueError: If there is no decoding step to train on.
        """
        tf.get_logger().setLevel('ERROR')
        optimizer, loss_fn = self.optimizer, self.loss_fn
        epochs, batch_size = self.epochs, self.batch_size
        self.loss_history = []

        X2, Y = np.asarray(X2), np.asarray(Y)
        total_instances = len(Y)
        # Never step past the decoder input or past the targets.
        decode_steps = min(X2.shape[1], Y.shape[1])
        if decode_steps == 0:
            raise ValueError("X2 and Y must each contain at least one decoding step")

        for epoch in range(epochs):
            batch_loss = tf.constant(0.0)
            for batch in tqdm(range(0, total_instances, batch_size)):
                x1_train = X1[batch : batch + batch_size]
                x2_train = X2[batch : batch + batch_size]
                y_train = Y[batch : batch + batch_size]
                with tf.GradientTape() as tape:
                    loss_count = tf.constant(0.0)
                    H, state = self.encoder(x1_train)
                    for query_number in range(decode_steps):
                        # Teacher forcing: feed this step's ground-truth token
                        # (the old code always fed column 0).
                        step_token = tf.expand_dims(x2_train[:, query_number], axis=-1)
                        output, state = self.decoder([step_token, state, H])
                        loss_count = loss_count + loss_fn(y_train[:, query_number, :], output)
                trainable = self.encoder.trainable_weights + self.decoder.trainable_weights
                grads = tape.gradient(loss_count, trainable)
                optimizer.apply_gradients(zip(grads, trainable))
                batch_loss = batch_loss + loss_count
            print("Epoch: " + str(epoch + 1) + "/" + str(epochs) + " : Error " + str(batch_loss.numpy()))
            self.loss_history.append(batch_loss.numpy())

    def translate_sentence(self, keys, query_start, query_size=None):
        """Greedily decode a translation for one source sentence.

        Args:
            keys: Source token ids — a flat list or any array-like; it is
                flattened, truncated/zero-padded to ``encoder_input_words``
                and fed to the encoder as a batch of one.
            query_start: Start token. Any array-like is accepted (e.g. a
                ``(1, 1)`` array or a list of ids); only its first element is
                used.
            query_size: Number of decoding steps. When ``None`` it resolves to
                the width of the decoder's token input, which ``get_enc_dec``
                fixes at 1 — pass an explicit value to generate longer output.

        Returns:
            ``(value, state_steps)``: the list of token ids (start token
            first, then one id per step) and the decoder state after each step.

        Raises:
            ValueError: If ``query_start`` is empty.
        """
        if query_size is None:
            query_size = self.decoder.input_shape[0][1]

        start_ids = np.asarray(query_start).reshape(-1)
        if start_ids.size == 0:
            raise ValueError("query_start must contain at least one token id")

        source_ids = np.asarray(keys).reshape(-1).tolist()
        encoder_input = self._pad_ids([source_ids], self.encoder_input_words)
        H, state = self.encoder(encoder_input)

        value = [int(start_ids[0])]
        state_steps = []
        # Keep the token fed to the Embedding inside the vocabulary.
        current_token = value[0] if 0 <= value[0] < self.vocab_size else 0
        for _ in range(query_size):
            # The decoder's token input is (batch, 1); rebuild that shape on
            # every step because argmax returns a rank-1 result.
            decoder_token = np.array([[current_token]])
            output, state = self.decoder([decoder_token, state, H])
            current_token = int(np.argmax(output.numpy(), axis=-1)[0])
            value.append(current_token)
            state_steps.append(state)
        return value, state_steps

# Create the translator and build its encoder/decoder Keras models;
# get_enc_dec() also calls summary() on both models.
translator = AdditiveAttentionTranslator()
translator.get_enc_dec()


# HI

In [None]:


# Translate the first sentence of the corpus.
# translate_sentence works on a batch of one: the encoder input must be
# (1, encoder_input_words) and the start token (1, 1), so the raw id lists are
# truncated/zero-padded and wrapped in arrays before the call.
# NOTE(review): no call to train_translator is visible before this cell; unless
# training happens elsewhere, the output comes from untrained weights.
max_source_len = translator.encoder_input_words
source_ids = list(token_ids_en[0][:max_source_len])
source_ids += [0] * (max_source_len - len(source_ids))
keys = np.array([source_ids])  # shape (1, encoder_input_words)
query_start = np.array([[token_ids_te[0][0]]])  # first target token as the start token
translated_sentence, state_steps = translator.translate_sentence(keys, query_start)

# Print the translated sentence
print("Translated sentence:", translated_sentence)


In [None]:
# NOTE(review): the AdditiveAttentionTranslator class defined in this notebook
# has no `save_model` method, so this call raises AttributeError unless the
# method is provided elsewhere — confirm. Saving the weights of
# `translator.encoder` and `translator.decoder` is one possible replacement.
translator.save_model('translater.yaml')
