<a href="https://colab.research.google.com/github/yelagampragathi/NLP_16/blob/main/CRAZYCATS_ASS_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Use a simple dataset for English-to-French translation. You can either use a small hand-written dataset like the one below, or download a more extensive one, such as the tab-delimited bilingual sentence pairs from Tatoeba or the European Parliament parallel corpus.

(a) Data Preprocessing


In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Toy parallel corpus: each entry is an (English question, French translation) pair
data = [
    ("What is your name?", "Quel est votre nom ?"),
    ("Where do you live?", "Où habitez-vous ?"),
    ("How old are you?", "Quel âge avez-vous ?"),
    ("What do you do for a living?", "Que faites-vous dans la vie ?"),
    ("What is your favorite color?", "Quelle est votre couleur préférée ?"),
    ("Do you like sports?", "Aimez-vous le sport ?"),
    ("Can you speak English?", "Pouvez-vous parler anglais ?"),
    ("What is your profession?", "Quelle est votre profession ?"),
    ("What is your favorite food?", "Quel est votre plat préféré ?"),
    ("Where do you work?", "Où travaillez-vous ?")
]

# Unzip the pairs into two parallel lists (same order as `data`)
english_sentences = [english for english, _ in data]
french_sentences = [french for _, french in data]

# English side: fit a word-level tokenizer, then map every sentence to integer ids
english_tokenizer = Tokenizer()
english_tokenizer.fit_on_texts(english_sentences)
english_sequences = english_tokenizer.texts_to_sequences(english_sentences)

# French side: same treatment with its own, independent vocabulary
french_tokenizer = Tokenizer()
french_tokenizer.fit_on_texts(french_sentences)
french_sequences = french_tokenizer.texts_to_sequences(french_sentences)

# Longest sentence (in tokens) per language; used as the padding target length
max_english_seq_len = max(map(len, english_sequences))
max_french_seq_len = max(map(len, french_sequences))

# Right-pad with zeros so every row has the same length
english_padded = pad_sequences(english_sequences, maxlen=max_english_seq_len, padding='post')
french_padded = pad_sequences(french_sequences, maxlen=max_french_seq_len, padding='post')

# Vocabulary sizes: number of distinct words plus one slot for the padding index 0
english_vocab_size = len(english_tokenizer.word_index) + 1
french_vocab_size = len(french_tokenizer.word_index) + 1

# Report every intermediate artefact, heading first and value on the next line
print("English Sentences (Original Text):", english_sentences, sep="\n")
print("\nFrench Sentences (Original Text):", french_sentences, sep="\n")
print("\nTokenized English Sequences:", english_sequences, sep="\n")
print("\nTokenized French Sequences:", french_sequences, sep="\n")
print("\nPadded English Sequences:", english_padded, sep="\n")
print("\nPadded French Sequences:", french_padded, sep="\n")

print(f"\nEnglish Vocabulary Size: {english_vocab_size}")
print(f"French Vocabulary Size: {french_vocab_size}")


English Sentences (Original Text):
['What is your name?', 'Where do you live?', 'How old are you?', 'What do you do for a living?', 'What is your favorite color?', 'Do you like sports?', 'Can you speak English?', 'What is your profession?', 'What is your favorite food?', 'Where do you work?']

French Sentences (Original Text):
['Quel est votre nom ?', 'Où habitez-vous ?', 'Quel âge avez-vous ?', 'Que faites-vous dans la vie ?', 'Quelle est votre couleur préférée ?', 'Aimez-vous le sport ?', 'Pouvez-vous parler anglais ?', 'Quelle est votre profession ?', 'Quel est votre plat préféré ?', 'Où travaillez-vous ?']

Tokenized English Sequences:
[[2, 4, 5, 8], [6, 3, 1, 9], [10, 11, 12, 1], [2, 3, 1, 3, 13, 14, 15], [2, 4, 5, 7, 16], [3, 1, 17, 18], [19, 1, 20, 21], [2, 4, 5, 22], [2, 4, 5, 7, 23], [6, 3, 1, 24]]

Tokenized French Sequences:
[[4, 2, 3, 7], [5, 8, 1], [4, 9, 10, 1], [11, 12, 1, 13, 14, 15], [6, 2, 3, 16, 17], [18, 1, 19, 20], [21, 1, 22, 23], [6, 2, 3, 24], [4, 2, 3, 25, 26],

(b) Build Seq2Seq Model

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

# Define model parameters
embedding_dim = 64   # size of each learned word vector
lstm_units = 128     # size of the LSTM hidden/cell state shared by encoder and decoder

# Encoder: embeds the English token ids and keeps only the final LSTM states.
# The time dimension is left open (None) so the model is not tied to one padded
# length and can be fed sequences of any length.
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(english_vocab_size, embedding_dim)(encoder_inputs)
# `encoder_lstm` is the LSTM's output tensor (unused); the states are what the decoder needs.
encoder_lstm, encoder_state_h, encoder_state_c = LSTM(lstm_units, return_state=True)(encoder_embedding)
encoder_states = [encoder_state_h, encoder_state_c]

# Decoder: starts from the encoder states and predicts one French token per time step.
# The input length must also stay open: teacher-forcing inputs are the padded French
# sequences with one time step removed, i.e. shorter than max_french_seq_len.
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(french_vocab_size, embedding_dim)(decoder_inputs)
# `decoder_lstm` is the per-time-step output tensor; the returned states are not needed for training.
decoder_lstm, _, _ = LSTM(lstm_units, return_sequences=True, return_state=True)(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(french_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_lstm)

# Define the model: (English ids, shifted French ids) -> distribution over French words
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model (sparse loss: targets are integer word ids, not one-hot vectors)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Example dataset (English to French pairs)
data = [
    ("What is your name?", "Quel est votre nom ?"),
    ("Where do you live?", "Où habitez-vous ?"),
    ("How old are you?", "Quel âge avez-vous ?"),
    ("What do you do for a living?", "Que faites-vous dans la vie ?"),
    ("What is your favorite color?", "Quelle est votre couleur préférée ?"),
    ("Do you like sports?", "Aimez-vous le sport ?"),
    ("Can you speak English?", "Pouvez-vous parler anglais ?"),
    ("What is your profession?", "Quelle est votre profession ?"),
    ("What is your favorite food?", "Quel est votre plat préféré ?"),
    ("Where do you work?", "Où travaillez-vous ?")
]

# Separate into English and French sentences
english_sentences = [pair[0] for pair in data]
french_sentences = [pair[1] for pair in data]

# Wrap every target sentence in explicit boundary markers. Without them the shifted
# teacher-forcing pairs below never ask the decoder to predict the FIRST French word
# and give it no token that means "the sentence is finished".
START_TOKEN = '<start>'
END_TOKEN = '<end>'
french_sentences_tagged = [f"{START_TOKEN} {sentence} {END_TOKEN}" for sentence in french_sentences]

# Tokenize the English and French sentences
english_tokenizer = Tokenizer()
# Same punctuation filter as the Tokenizer default, minus '<' and '>', so the
# boundary markers survive as the words '<start>' and '<end>'.
french_tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')

english_tokenizer.fit_on_texts(english_sentences)
french_tokenizer.fit_on_texts(french_sentences_tagged)

# Convert text to sequences of integers
english_sequences = english_tokenizer.texts_to_sequences(english_sentences)
french_sequences = french_tokenizer.texts_to_sequences(french_sentences_tagged)

# Pad the sequences to have uniform length (French length now includes both markers)
max_english_seq_len = max(len(seq) for seq in english_sequences)
max_french_seq_len = max(len(seq) for seq in french_sequences)

english_padded = pad_sequences(english_sequences, maxlen=max_english_seq_len, padding='post')
french_padded = pad_sequences(french_sequences, maxlen=max_french_seq_len, padding='post')

# Prepare decoder input and output data (teacher forcing):
#   input  = <start> w1 w2 ... wn      (last column dropped)
#   output = w1 w2 ... wn <end>        (first column dropped)
# so at every step the target is the token that follows the input token.
decoder_input_data = french_padded[:, :-1]
decoder_output_data = french_padded[:, 1:]

# Expand decoder output to 3D shape for sparse_categorical_crossentropy
decoder_output_data = np.expand_dims(decoder_output_data, -1)

# Output
print("Decoder Input Data:")
print(decoder_input_data)

print("\nDecoder Output Data (3D):")
print(decoder_output_data)


Decoder Input Data:
[[ 4  2  3  7  0]
 [ 5  8  1  0  0]
 [ 4  9 10  1  0]
 [11 12  1 13 14]
 [ 6  2  3 16 17]
 [18  1 19 20  0]
 [21  1 22 23  0]
 [ 6  2  3 24  0]
 [ 4  2  3 25 26]
 [ 5 27  1  0  0]]

Decoder Output Data (3D):
[[[ 2]
  [ 3]
  [ 7]
  [ 0]
  [ 0]]

 [[ 8]
  [ 1]
  [ 0]
  [ 0]
  [ 0]]

 [[ 9]
  [10]
  [ 1]
  [ 0]
  [ 0]]

 [[12]
  [ 1]
  [13]
  [14]
  [15]]

 [[ 2]
  [ 3]
  [16]
  [17]
  [ 0]]

 [[ 1]
  [19]
  [20]
  [ 0]
  [ 0]]

 [[ 1]
  [22]
  [23]
  [ 0]
  [ 0]]

 [[ 2]
  [ 3]
  [24]
  [ 0]
  [ 0]]

 [[ 2]
  [ 3]
  [25]
  [26]
  [ 0]]

 [[27]
  [ 1]
  [ 0]
  [ 0]
  [ 0]]]


(d) Train the model on the dataset

In [None]:
# Train the model
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

# Example dataset (English to French pairs)
data = [
    ("What is your name?", "Quel est votre nom ?"),
    ("Where do you live?", "Où habitez-vous ?"),
    ("How old are you?", "Quel âge avez-vous ?"),
    ("What do you do for a living?", "Que faites-vous dans la vie ?"),
    ("What is your favorite color?", "Quelle est votre couleur préférée ?"),
    ("Do you like sports?", "Aimez-vous le sport ?"),
    ("Can you speak English?", "Pouvez-vous parler anglais ?"),
    ("What is your profession?", "Quelle est votre profession ?"),
    ("What is your favorite food?", "Quel est votre plat préféré ?"),
    ("Where do you work?", "Où travaillez-vous ?")
]

# Separate into English and French sentences
english_sentences = [pair[0] for pair in data]
french_sentences = [pair[1] for pair in data]

# Wrap every target sentence in explicit boundary markers. The decoder is trained on
# shifted sequences; without markers it is never asked to predict the first French
# word and never sees an end-of-sentence token, so step-by-step decoding has neither
# a valid first input nor a stopping signal.
START_TOKEN = '<start>'
END_TOKEN = '<end>'
french_sentences_tagged = [f"{START_TOKEN} {sentence} {END_TOKEN}" for sentence in french_sentences]

# Tokenize the English and French sentences
english_tokenizer = Tokenizer()
# Same punctuation filter as the Tokenizer default, minus '<' and '>', so the
# markers are kept as the vocabulary words '<start>' and '<end>'.
french_tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')

english_tokenizer.fit_on_texts(english_sentences)
french_tokenizer.fit_on_texts(french_sentences_tagged)

# Convert text to sequences of integers
english_sequences = english_tokenizer.texts_to_sequences(english_sentences)
french_sequences = french_tokenizer.texts_to_sequences(french_sentences_tagged)

# Pad the sequences to have uniform length (French length now includes both markers)
max_english_seq_len = max(len(seq) for seq in english_sequences)
max_french_seq_len = max(len(seq) for seq in french_sequences)

english_padded = pad_sequences(english_sequences, maxlen=max_english_seq_len, padding='post')
french_padded = pad_sequences(french_sequences, maxlen=max_french_seq_len, padding='post')

# Vocabulary sizes: distinct words plus one slot for the padding index 0
english_vocab_size = len(english_tokenizer.word_index) + 1
french_vocab_size = len(french_tokenizer.word_index) + 1

# Prepare decoder input and output data (teacher forcing):
#   input  = <start> w1 ... wn   (last column dropped)
#   output = w1 ... wn <end>     (first column dropped)
decoder_input_data = french_padded[:, :-1]
decoder_output_data = french_padded[:, 1:]

# Expand decoder output to 3D shape for sparse_categorical_crossentropy
decoder_output_data = np.expand_dims(decoder_output_data, -1)

# Define the parameters for the model
embedding_dim = 128  # size of each learned word vector
units = 256          # size of the LSTM hidden/cell state

# Define the encoder: only its final hidden/cell states are passed on to the decoder
encoder_inputs = Input(shape=(None,))  # English input shape (variable length)
encoder_embedding = Embedding(input_dim=english_vocab_size, output_dim=embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Define the decoder. `decoder_embedding` is the embedded-input tensor, while
# `decoder_lstm` and `decoder_dense` are the layer objects themselves.
decoder_inputs = Input(shape=(None,))  # French input shape (variable length)
decoder_embedding = Embedding(input_dim=french_vocab_size, output_dim=embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(french_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(
    [english_padded, decoder_input_data],
    decoder_output_data,
    batch_size=2,  # You can adjust the batch size
    epochs=10,    # Number of epochs for training
    validation_split=0.2,  # holds out the last 20% of the pairs for validation
    verbose=1      # Verbosity level
)

# Output the final metrics (note: accuracy also counts the padded positions)
print("Final Training Loss:", history.history['loss'][-1])
print("Final Training Accuracy:", history.history['accuracy'][-1])
print("Final Validation Loss:", history.history['val_loss'][-1])
print("Final Validation Accuracy:", history.history['val_accuracy'][-1])

Epoch 1/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 187ms/step - accuracy: 0.3200 - loss: 3.3115 - val_accuracy: 0.4000 - val_loss: 3.2131
Epoch 2/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.2967 - loss: 3.1685 - val_accuracy: 0.4000 - val_loss: 2.8731
Epoch 3/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - accuracy: 0.3100 - loss: 2.7231 - val_accuracy: 0.4000 - val_loss: 2.4734
Epoch 4/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - accuracy: 0.3467 - loss: 2.5356 - val_accuracy: 0.4000 - val_loss: 2.3194
Epoch 5/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.2900 - loss: 2.3896 - val_accuracy: 0.4000 - val_loss: 2.3823
Epoch 6/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - accuracy: 0.3700 - loss: 2.1635 - val_accuracy: 0.4000 - val_loss: 2.3252
Epoch 7/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━

(e) Inference Setup for Translation

In [None]:
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, TimeDistributed
from tensorflow.keras.models import Model

# The inference decoder must run on the TRAINED weights. Creating fresh Embedding/LSTM
# layers here would give randomly initialised weights, so the graph below is assembled
# only from objects produced by the training cell.
assert isinstance(decoder_lstm, LSTM), (
    "decoder_lstm must be the trained LSTM layer; run the training cell first"
)

# Read the state size from the trained decoder LSTM instead of repeating the number
lstm_units = decoder_lstm.units

# Encoder inference model: English token ids -> [state_h, state_c]
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder inference model: the states are fed in explicitly so decoding can proceed
# one token at a time, carrying the states from step to step.
decoder_state_input_h = Input(shape=(lstm_units,))
decoder_state_input_c = Input(shape=(lstm_units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Reuse the embedding from the training model: `decoder_embedding` is already the
# trained embedding layer applied to `decoder_inputs`.
decoder_embedding_inf = decoder_embedding

# Pass the embedding through the trained LSTM layer, seeded with the supplied states
decoder_lstm_inf, decoder_state_h_inf, decoder_state_c_inf = decoder_lstm(
    decoder_embedding_inf, initial_state=decoder_states_inputs)

# Connect the trained Dense layer to the LSTM output
decoder_outputs_inf = decoder_dense(decoder_lstm_inf)

# Create the decoder model: (token ids, h, c) -> (word probabilities, new h, new c)
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs_inf] + [decoder_state_h_inf, decoder_state_c_inf])

# Print the model summaries
encoder_model.summary()
decoder_model.summary()


(f) Translate New Sentences

In [None]:
# Function to decode sequence (translation)
def decode_sequence(input_seq, max_decoder_steps=None):
    """Greedily translate one encoded English sentence into French words.

    The sentence is run through ``encoder_model`` once; ``decoder_model`` is then
    called one token at a time, each step being fed the previously predicted
    token together with the LSTM states returned by the previous step.

    Args:
        input_seq: Padded array of English token ids, one sentence per call
            (a batch of size 1), as produced by ``pad_sequences``.
        max_decoder_steps: Upper bound on the number of decoding steps, and
            therefore on the number of words returned. Defaults to
            ``max_french_seq_len``.

    Returns:
        The predicted French words joined by single spaces. The string is empty
        when the very first prediction is the end marker or the padding index.
    """
    if max_decoder_steps is None:
        max_decoder_steps = max_french_seq_len

    # Encode the input as state vectors (verbose=0: no progress bar per call)
    states_value = list(encoder_model.predict(input_seq, verbose=0))

    # Target sequence of length 1 holding the start token. If the vocabulary has
    # no '<start>' word, fall back to 0, the padding index.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = french_tokenizer.word_index.get('<start>', 0)

    decoded_words = []

    # A bounded loop guarantees termination. The number of emitted words cannot be
    # used as the bound, because predicting the padding index adds no word.
    for _ in range(max_decoder_steps):
        # Predict the next token
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)

        # Greedy choice: most probable word at the last time step
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = french_tokenizer.index_word.get(sampled_token_index, '')

        # Stop on the end marker, or on an index with no word (0 = padding, which
        # follows the end of every padded target sentence). Neither is emitted.
        if not sampled_word or sampled_word == '<end>':
            break

        decoded_words.append(sampled_word)

        # Feed the prediction and the updated states back in for the next step
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ' '.join(decoded_words)

# Example translation.
# The sentence is taken from the training vocabulary: a sentence made only of unknown
# words (e.g. "hello") tokenizes to an empty sequence, so the encoder would receive
# nothing but padding and the "translation" would not depend on the input at all.
test_sentence = "What is your name?"
test_sequence = english_tokenizer.texts_to_sequences([test_sentence])
if not test_sequence[0]:
    print(f"Warning: no word of {test_sentence!r} is in the English vocabulary; "
          "the encoder input is all padding.")
test_padded = pad_sequences(test_sequence, maxlen=max_english_seq_len, padding='post')
translated_sentence = decode_sequence(test_padded)
print("Translated sentence:", translated_sentence)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 164ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 173ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Translated sentence: est est est est est est est
