<a href="https://colab.research.google.com/github/tanu26062006/Assignment-2/blob/main/Task34567.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np
import os
import re
import zipfile
import requests
from io import BytesIO
import matplotlib.pyplot as plt
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate
from tensorflow.keras.layers import Attention # Keras's built-in Attention layer

# --- Tunable hyperparameters ---
NUM_SAMPLES = 20000    # cap on the number of sentence pairs read from the corpus
BATCH_SIZE = 64        # mini-batch size used for training
LATENT_DIM = 256       # width of the LSTM hidden / cell state
EMBEDDING_DIM = 100    # width of each word-embedding vector
EPOCHS = 20            # training epochs (Task 4 requires at least 10)

# --- Dataset location ---
DATA_DIR = "data"
ZIP_FILE_NAME = "fra-eng.zip"
# NOTE: this is an absolute path, so os.path.join(DATA_DIR, TEXT_FILE_NAME)
# returns it unchanged and DATA_DIR does not appear in the joined result.
TEXT_FILE_NAME = "/content/fra.txt"
DATA_URL = f"http://www.manythings.org/anki/{ZIP_FILE_NAME}"

# --- Placeholders so later tasks can test for "not prepared" instead of
# --- hitting a NameError when Task 3 did not complete ---
input_texts, target_texts = [], []
input_vocab_size = target_vocab_size = 0
max_encoder_seq_length = max_decoder_seq_length = 0
encoder_input_data, decoder_input_data, decoder_target_data = (
    np.array([]) for _ in range(3)
)
input_tokenizer = target_tokenizer = None
input_word_index, target_word_index = {}, {}
reverse_input_word_index, reverse_target_word_index = {}, {}
model = None                 # basic encoder-decoder built in Task 4
model_with_attention = None  # attention variant built in Task 6
history = None               # training history from Task 4 (or Task 6 if enabled)

# ==============================================================================
# --- Task 3: Data Preparation for Sequence Learning ---
# ==============================================================================
print("--- Task 3: Data Preparation ---")

# 1. Download and Extract Dataset
os.makedirs(DATA_DIR, exist_ok=True)

# os.path.join discards DATA_DIR when TEXT_FILE_NAME is absolute, so every
# later step works from the resolved path instead of assuming DATA_DIR.
data_file_path = os.path.join(DATA_DIR, TEXT_FILE_NAME)
# The archive is expected to store the corpus under its bare file name, so the
# member lookup must not use a directory-qualified (or absolute) path.
zip_member_name = os.path.basename(TEXT_FILE_NAME)

if not os.path.exists(data_file_path):
    print(f"Downloading {ZIP_FILE_NAME} from {DATA_URL}...")
    try:
        # A timeout keeps the notebook from hanging forever on a stalled connection.
        response = requests.get(DATA_URL, timeout=60)
        response.raise_for_status() # Raise an exception for HTTP errors
        with zipfile.ZipFile(BytesIO(response.content)) as z:
            # Read the member fully before touching the destination so a
            # missing or corrupt member never leaves a partial file behind.
            corpus_bytes = z.read(zip_member_name)
        os.makedirs(os.path.dirname(data_file_path) or ".", exist_ok=True)
        with open(data_file_path, 'wb') as corpus_file:
            corpus_file.write(corpus_bytes)
        print(f"Extracted {zip_member_name} to {data_file_path}")
    except requests.exceptions.RequestException as e:
        print(f"Error downloading data: {e}")
        print("Please ensure you have an active internet connection or manually download and place 'fra.txt' in a 'data' folder.")
    except (zipfile.BadZipFile, KeyError) as e:
        # BadZipFile: the response was not a valid archive.
        # KeyError: the archive has no member named zip_member_name.
        print(f"Error extracting '{zip_member_name}' from {ZIP_FILE_NAME}: {e}")
    else:
        if not os.path.exists(data_file_path):
             print(f"File '{TEXT_FILE_NAME}' not found after extraction. Please check the zip content.")
else:
    print(f"'{data_file_path}' already exists. Skipping download.")

# Load Data

# Patterns are compiled once rather than on every sentence.
_PUNCTUATION_RE = re.compile(r"([?.!,¿])")
_QUOTE_SPACE_RUN_RE = re.compile(r'[" "]+')
# Accented letters are allowed through: a plain a-z filter turns every French
# "é", "ç", "à", ... into a space and splits words apart.
_DISALLOWED_RE = re.compile(r"[^a-zA-Zàâäæçéèêëîïôœùûüÿ?.!,¿]+")


def _normalize_sentence(text):
    """Return *text* lowercased, with punctuation spaced out and noise removed.

    Steps: lowercase; surround ``? . ! , ¿`` with spaces; collapse runs of
    spaces and double quotes; replace every run of characters that is neither
    a (possibly accented) letter nor one of those punctuation marks with a
    single space; strip the ends.
    """
    text = text.lower()
    text = _PUNCTUATION_RE.sub(r" \1 ", text)
    text = _QUOTE_SPACE_RUN_RE.sub(" ", text)
    text = _DISALLOWED_RE.sub(" ", text)
    return text.strip()


if os.path.exists(data_file_path):
    with open(data_file_path, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')

    for line in lines:
        if len(input_texts) >= NUM_SAMPLES:
            break

        # Expected layout: english<TAB>french[<TAB>attribution]. Blank or
        # malformed lines are skipped instead of aborting the whole load.
        fields = line.split('\t')
        if len(fields) < 2:
            continue

        # 2. Preprocess the text
        input_text = _normalize_sentence(fields[0])
        # <start>/<end> delimit the target so the decoder knows where to begin and stop.
        target_text = '<start> ' + _normalize_sentence(fields[1]) + ' <end>'

        input_texts.append(input_text)
        target_texts.append(target_text)

    print(f"Number of sentence pairs loaded: {len(input_texts)}")
else:
    print("No data loaded. Cannot proceed with tokenization and model building.")

# Tokenization and tensor construction need at least one loaded sentence pair
if not input_texts:
    print("Data preparation skipped due to no data loaded.")
else:
    # English (input) side: fit a word-level tokenizer and index the sentences
    input_tokenizer = Tokenizer(filters='')
    input_tokenizer.fit_on_texts(input_texts)
    input_sequences = input_tokenizer.texts_to_sequences(input_texts)
    input_word_index = input_tokenizer.word_index
    input_vocab_size = 1 + len(input_word_index) # index 0 is kept free for padding

    # French (target) side: same procedure on the <start>/<end>-wrapped sentences
    target_tokenizer = Tokenizer(filters='')
    target_tokenizer.fit_on_texts(target_texts)
    target_sequences = target_tokenizer.texts_to_sequences(target_texts)
    target_word_index = target_tokenizer.word_index
    target_vocab_size = 1 + len(target_word_index) # index 0 is kept free for padding

    print(f"English (Input) Vocabulary Size: {input_vocab_size}")
    print(f"French (Target) Vocabulary Size: {target_vocab_size}")

    # id -> word lookups, needed to turn predicted ids back into text
    reverse_input_word_index = {idx: word for word, idx in input_word_index.items()}
    reverse_target_word_index = {idx: word for word, idx in target_word_index.items()}

    # Longest sentence on each side decides the padded width
    max_encoder_seq_length = max(map(len, input_sequences))
    max_decoder_seq_length = max(map(len, target_sequences))

    print(f"Max English (Encoder) sequence length: {max_encoder_seq_length}")
    print(f"Max French (Decoder) sequence length: {max_decoder_seq_length}")

    # Right-pad every sentence with zeros up to the longest one
    encoder_input_data = pad_sequences(input_sequences, maxlen=max_encoder_seq_length, padding='post')
    decoder_input_data = pad_sequences(target_sequences, maxlen=max_decoder_seq_length, padding='post')

    # The decoder target is the decoder input moved one step to the left:
    # <start> falls off the front and the last column stays zero (padding).
    decoder_target_data = np.zeros(
        (len(target_sequences), max_decoder_seq_length),
        dtype='int32'
    )
    decoder_target_data[:, :-1] = decoder_input_data[:, 1:]

    print("\n--- Data Preparation Summary ---")
    print(f"Shape of Encoder Input Data: {encoder_input_data.shape}")
    print(f"Shape of Decoder Input Data: {decoder_input_data.shape}")
    print(f"Shape of Decoder Target Data: {decoder_target_data.shape}")


# ==============================================================================
# --- Task 4: Build Encoder and Decoder using LSTM (Keras) ---
# ==============================================================================
# Builds a teacher-forced encoder-decoder LSTM, compiles it, and trains it on
# the arrays prepared in Task 3. The result is stored in the module-level
# `model` and `history` variables that later sections read.
print("\n--- Task 4: Building and Training Model (Basic Encoder-Decoder) ---")

# A vocabulary size of 0 is the placeholder value, i.e. Task 3 produced no data.
if input_vocab_size == 0 or target_vocab_size == 0:
    print("SKIPPING Task 4: Vocabulary sizes not set. Please ensure Task 3 ran successfully.")
else:
    # Encoder Definition
    # shape=(None,) leaves the sequence length unspecified.
    encoder_inputs = Input(shape=(None,), name='encoder_input')
    # mask_zero=True treats id 0 (the padding value) as masked.
    enc_emb = Embedding(input_vocab_size, EMBEDDING_DIM, mask_zero=True, name='encoder_embedding')(encoder_inputs)
    # return_state=True also yields the final hidden and cell states.
    encoder_lstm = LSTM(LATENT_DIM, return_state=True, name='encoder_lstm')
    encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
    encoder_states = [state_h, state_c] # Encoder outputs its final states to be used as decoder initial states

    # Decoder Definition
    decoder_inputs = Input(shape=(None,), name='decoder_input')
    dec_emb = Embedding(target_vocab_size, EMBEDDING_DIM, mask_zero=True, name='decoder_embedding')(decoder_inputs)
    # return_sequences=True: one output per timestep, so every position gets a prediction.
    decoder_lstm = LSTM(LATENT_DIM, return_sequences=True, return_state=True, name='decoder_lstm')
    # The decoder's own final states are not needed during training, hence `_`.
    decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
    # Softmax over the whole target vocabulary at each timestep.
    decoder_dense = Dense(target_vocab_size, activation='softmax', name='decoder_output')
    final_decoder_outputs = decoder_dense(decoder_outputs)

    # Define the model that will turn
    # `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
    model = Model([encoder_inputs, decoder_inputs], final_decoder_outputs)

    # Compile and train the model
    # The sparse loss takes integer word ids as targets, matching the int32
    # `decoder_target_data` array (no one-hot encoding required).
    model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    print("\nBasic Encoder-Decoder Model Summary:")
    model.summary()

    print(f"\nTraining the basic model for {EPOCHS} epochs...")
    # The placeholder arrays are empty, so .size > 0 means Task 3 filled them.
    if encoder_input_data.size > 0 and decoder_input_data.size > 0 and decoder_target_data.size > 0:
        history = model.fit(
            [encoder_input_data, decoder_input_data],
            decoder_target_data,
            batch_size=BATCH_SIZE,
            epochs=EPOCHS,
            validation_split=0.2 # Use 20% of data for validation
        )

        # Replay the per-epoch metrics recorded by fit() as one line per epoch.
        print("\nTraining complete. Loss history:")
        for epoch, loss in enumerate(history.history['loss']):
            print(f"Epoch {epoch+1}/{EPOCHS} - Loss: {loss:.4f}, Accuracy: {history.history['accuracy'][epoch]:.4f}, Val Loss: {history.history['val_loss'][epoch]:.4f}, Val Accuracy: {history.history['val_accuracy'][epoch]:.4f}")
    else:
        print("SKIPPING Training: Data not prepared from Task 3.")


# ==============================================================================
# --- Task 5: Inference and Evaluation (Basic Encoder-Decoder) ---
# ==============================================================================
# Rebuilds the trained layers of `model` into two step-wise models (encoder and
# single-step decoder) and greedily translates a handful of training sentences.
print("\n--- Task 5: Inference and Evaluation (Basic Encoder-Decoder) ---")

# The lookup tables start out as empty dicts (never None), so emptiness is the
# meaningful "not prepared" test for them.
if model is None or not reverse_target_word_index or not target_word_index or \
   not input_texts or not target_texts or \
   encoder_input_data.size == 0 or max_decoder_seq_length == 0:
    print("SKIPPING Task 5: Dependencies from Task 3 (data, tokenizers) or Task 4 (trained model) not found or are empty.")
else:
    # Build inference (sampling) models
    # Encoder inference model: source ids -> final [h, c] states
    encoder_inputs_inf = model.get_layer('encoder_input').input
    encoder_lstm_layer_inf = model.get_layer('encoder_lstm')
    enc_emb_output_inf = model.get_layer('encoder_embedding')(encoder_inputs_inf)
    _, state_h_enc_inf, state_c_enc_inf = encoder_lstm_layer_inf(enc_emb_output_inf)
    encoder_model_inf = Model(encoder_inputs_inf, [state_h_enc_inf, state_c_enc_inf])

    # Decoder inference model: (previous token, [h, c]) -> (word distribution, new [h, c])
    decoder_inputs_inf = model.get_layer('decoder_input').input
    dec_emb_layer_inf = model.get_layer('decoder_embedding')
    decoder_lstm_layer_inf = model.get_layer('decoder_lstm')
    decoder_dense_layer_inf = model.get_layer('decoder_output')

    # At inference time the states are fed in explicitly on every step.
    decoder_state_input_h_inf = Input(shape=(LATENT_DIM,), name='decoder_state_input_h_inf_task5')
    decoder_state_input_c_inf = Input(shape=(LATENT_DIM,), name='decoder_state_input_c_inf_task5')
    decoder_states_inputs_inf = [decoder_state_input_h_inf, decoder_state_input_c_inf]

    dec_emb2_inf = dec_emb_layer_inf(decoder_inputs_inf)
    decoder_outputs2_inf, state_h2_inf, state_c2_inf = decoder_lstm_layer_inf(dec_emb2_inf, initial_state=decoder_states_inputs_inf)
    decoder_states2_inf = [state_h2_inf, state_c2_inf]
    decoder_outputs2_inf = decoder_dense_layer_inf(decoder_outputs2_inf)

    decoder_model_inference = Model(
        [decoder_inputs_inf] + decoder_states_inputs_inf,
        [decoder_outputs2_inf] + decoder_states2_inf
    )

    def decode_sequence(input_seq):
        """Greedily translate one padded source sentence.

        Args:
            input_seq: array of source word ids, one sentence per row; the
                caller below passes a single-row slice of `encoder_input_data`.

        Returns:
            The predicted target words joined by single spaces, without the
            '<start>'/'<end>' markers. Returns "" when '<start>' is missing
            from the target vocabulary.
        """
        start_token_index = target_word_index.get('<start>', 0)
        if start_token_index == 0:
            print("Warning: '<start>' token not found in target vocabulary. Decoding might fail.")
            return ""

        # verbose=0: predict() runs once per generated token, and its default
        # progress bar would otherwise flood the output.
        states_value = list(encoder_model_inf.predict(input_seq, verbose=0))
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = start_token_index

        decoded_words = []
        while True:
            output_tokens, h, c = decoder_model_inference.predict(
                [target_seq] + states_value, verbose=0
            )
            # Greedy choice: most probable word at the last (only) timestep.
            sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
            sampled_word = reverse_target_word_index.get(sampled_token_index, '<unk>')

            # Stop on the end marker, or once the output is as long as the
            # longest training target (guards against never emitting <end>).
            if sampled_word == '<end>' or len(decoded_words) >= max_decoder_seq_length - 1:
                break
            decoded_words.append(sampled_word)

            # Feed the chosen word and the updated states into the next step.
            target_seq = np.zeros((1, 1))
            target_seq[0, 0] = sampled_token_index
            states_value = [h, c]
        return ' '.join(decoded_words)

    print("\n--- Translating 5 Test Sentences (Basic Model) ---")
    num_loaded_samples = len(input_texts)
    test_indices = [10, 50, 100, 150, 200]
    # Drop indices beyond the loaded data so small NUM_SAMPLES values still work.
    test_indices = [idx for idx in test_indices if idx < num_loaded_samples]

    if not test_indices:
        print("Not enough samples loaded to perform testing. Try increasing NUM_SAMPLES in Task 3.")
    else:
        for i in test_indices:
            input_seq = encoder_input_data[i:i+1]  # slice keeps the batch dimension
            translated_sentence = decode_sequence(input_seq)
            original_english = input_texts[i]
            original_french = target_texts[i].replace('<start> ', '').replace(' <end>', '')

            print(f"\nInput English: {original_english}")
            print(f"Original French: {original_french}")
            print(f"Translated French: {translated_sentence}")

# ==============================================================================
# --- Task 6: Add Basic Attention Mechanism (Optional - Bonus) ---
# ==============================================================================
# Builds an encoder-decoder in which every decoder step attends over all
# encoder outputs, plus matching step-wise inference models. Training is left
# commented out, so the models built here are untrained unless it is enabled.
print("\n--- Task 6: Add Basic Attention Mechanism (Optional - Bonus) ---")

if input_vocab_size == 0 or target_vocab_size == 0:
    print("SKIPPING Task 6: Vocabulary sizes not set. Please ensure Task 3 ran successfully.")
else:
    # Encoder Model Definition (with return_sequences=True for attention)
    encoder_inputs_att = Input(shape=(None,), name='encoder_input_attention')
    enc_emb_att = Embedding(input_vocab_size, EMBEDDING_DIM, mask_zero=True, name='encoder_embedding_attention')(encoder_inputs_att)
    encoder_lstm_att = LSTM(LATENT_DIM, return_sequences=True, return_state=True, name='encoder_lstm_attention')
    encoder_outputs_att, state_h_enc_att, state_c_enc_att = encoder_lstm_att(enc_emb_att)
    encoder_states_att = [state_h_enc_att, state_c_enc_att]

    encoder_model_attention_train = Model(encoder_inputs_att, [encoder_outputs_att, state_h_enc_att, state_c_enc_att])
    print("\nEncoder Model for Attention (Training) Summary:")
    encoder_model_attention_train.summary()

    # Decoder Model Definition with Attention
    decoder_inputs_att = Input(shape=(None,), name='decoder_input_attention')
    dec_emb_layer_att = Embedding(target_vocab_size, EMBEDDING_DIM, mask_zero=True, name='decoder_embedding_attention')
    dec_emb_att = dec_emb_layer_att(decoder_inputs_att)

    decoder_lstm_att = LSTM(LATENT_DIM, return_sequences=True, return_state=True, name='decoder_lstm_attention')
    decoder_outputs_att, _, _ = decoder_lstm_att(dec_emb_att, initial_state=encoder_states_att)

    # The layer objects are kept in variables so the inference graph further
    # down can reuse exactly the same layers as the training graph.
    attention_layer_att = Attention(name='attention_layer')
    concat_layer_att = Concatenate(axis=-1, name='concat_attention')

    # Query = decoder outputs, value = encoder outputs.
    attention_output = attention_layer_att([decoder_outputs_att, encoder_outputs_att])
    decoder_concat_input = concat_layer_att([decoder_outputs_att, attention_output])

    decoder_dense_layer_att = Dense(target_vocab_size, activation='softmax', name='decoder_output_attention')
    final_decoder_outputs_att = decoder_dense_layer_att(decoder_concat_input)

    model_with_attention = Model([encoder_inputs_att, decoder_inputs_att], final_decoder_outputs_att)

    model_with_attention.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    print("\nModel with Attention Summary (for training):")
    model_with_attention.summary()

    print(f"\nTraining the model with Attention for {EPOCHS} epochs (if encoder_input_data and decoder_input_data are available)...")
    if encoder_input_data.size > 0 and decoder_input_data.size > 0 and decoder_target_data.size > 0:
        # Uncomment the following block to train the model with attention
        # Note: This training will overwrite the 'history' variable if you want Task 7 to plot this history
        # history = model_with_attention.fit(
        #     [encoder_input_data, decoder_input_data],
        #     decoder_target_data,
        #     batch_size=BATCH_SIZE,
        #     epochs=EPOCHS,
        #     validation_split=0.2
        # )
        print("Training for model with attention is commented out. Uncomment to run.")
    else:
        print("SKIPPING Training with attention: Data not prepared from Task 3.")


    # Inference Models for Attention (Conceptual Adaptation of Task 5)
    # Encoder: source ids -> (all encoder outputs, final h, final c)
    encoder_inf_model_attention = Model(encoder_inputs_att, [encoder_outputs_att, state_h_enc_att, state_c_enc_att])

    # Decoder states are supplied explicitly on each decoding step.
    decoder_state_input_h_inf_att = Input(shape=(LATENT_DIM,), name='decoder_state_input_h_inf_att')
    decoder_state_input_c_inf_att = Input(shape=(LATENT_DIM,), name='decoder_state_input_c_inf_att')
    decoder_states_inputs_inf_att = [decoder_state_input_h_inf_att, decoder_state_input_c_inf_att]

    # The encoder outputs are computed once per sentence and fed in as an input.
    encoder_outputs_as_input_inf_att = Input(shape=(None, LATENT_DIM,), name='encoder_outputs_inf_att')

    dec_emb2_inf_att = dec_emb_layer_att(decoder_inputs_att)

    decoder_outputs2_inf_att, state_h2_inf_att, state_c2_inf_att = decoder_lstm_att(dec_emb2_inf_att, initial_state=decoder_states_inputs_inf_att)
    decoder_states2_inf_att = [state_h2_inf_att, state_c2_inf_att]

    # Reuse the training graph's attention and concat layers instead of
    # creating fresh ones, so inference always runs through the same layers
    # (and any weights they may hold) as the trained model.
    attention_output_inf_att = attention_layer_att([decoder_outputs2_inf_att, encoder_outputs_as_input_inf_att])
    decoder_concat_input_inf_att = concat_layer_att([decoder_outputs2_inf_att, attention_output_inf_att])

    decoder_outputs2_inf_att = decoder_dense_layer_att(decoder_concat_input_inf_att)

    decoder_model_attention_inference = Model(
        [decoder_inputs_att] + decoder_states_inputs_inf_att + [encoder_outputs_as_input_inf_att],
        [decoder_outputs2_inf_att] + decoder_states2_inf_att
    )
    print("\nDecoder Model with Attention Summary (for inference):")
    decoder_model_attention_inference.summary()
    print("\nNote: `decode_sequence` function for attention is more complex and not included in this consolidated block for simplicity. It would need to pass encoder_outputs through the decoding loop.")


# ==============================================================================
# --- Task 7: Plotting Loss and Accuracy ---
# ==============================================================================
# Draws training-vs-validation accuracy and loss side by side from the recorded
# training history, then prints generic guidance for reading the curves.
print("\n--- Task 7: Plotting Loss and Accuracy ---")

if history is not None:
    recorded = history.history

    # One row, two panels: accuracy on the left, loss on the right
    fig, (accuracy_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 5))
    panels = (
        (accuracy_ax, 'accuracy', 'val_accuracy', 'Model Accuracy', 'Accuracy'),
        (loss_ax, 'loss', 'val_loss', 'Model Loss', 'Loss'),
    )
    for axis, train_key, val_key, panel_title, y_label in panels:
        axis.plot(recorded[train_key])
        axis.plot(recorded[val_key])
        axis.set_title(panel_title)
        axis.set_ylabel(y_label)
        axis.set_xlabel('Epoch')
        axis.legend(['Train', 'Validation'], loc='upper left')
        axis.grid(True)

    plt.tight_layout()
    plt.show()

    print("\n--- Observations on Training Performance ---")
    print("Based on the plots:")
    print("1. Overfitting: Occurs if validation loss starts increasing while training loss continues to decrease significantly, or if validation accuracy plateaus/decreases while training accuracy continues to rise. This indicates the model is memorizing training data.")
    print("2. Underfitting: Occurs if both training and validation loss remain high, or accuracy remains low, suggesting the model hasn't learned enough from the data (e.g., due to insufficient epochs, simple model, or high learning rate).")
    print("3. Training Stability: Indicated by smooth curves in both loss and accuracy plots. Erratic or noisy curves might suggest issues like an unstable learning rate.")
    print("\n(Please observe your generated plots and fill in specific observations based on your model's performance.)")
else:
    # No fit() call has populated `history`, so there is nothing to plot
    print("SKIPPING Task 7: Model training history ('history' object) not found.")
    print("Please ensure Task 4 (or Task 6 training) has been run successfully to generate training history.")

--- Task 3: Data Preparation ---
'/content/fra.txt' already exists in 'data/'. Skipping download.
Number of sentence pairs loaded: 20000
English (Input) Vocabulary Size: 3275
French (Target) Vocabulary Size: 5498
Max English (Encoder) sequence length: 7
Max French (Decoder) sequence length: 17

--- Data Preparation Summary ---
Shape of Encoder Input Data: (20000, 7)
Shape of Decoder Input Data: (20000, 17)
Shape of Decoder Target Data: (20000, 17)

--- Task 4: Building and Training Model (Basic Encoder-Decoder) ---

Basic Encoder-Decoder Model Summary:



Training the basic model for 20 epochs...
Epoch 1/20
250/250 ━━━━━━━━━━━━━━━━━━━━ 233s 916ms/step - accuracy: 0.5224 - loss: 5.3841 - val_accuracy: 0.2366 - val_loss: 3.7448
Epoch 2/20
 89/250 ━━━━━━━━━━━━━━━━━━━━ 2:13 826ms/step - accuracy: 0.2379 - loss: 3.4016