<a href="https://colab.research.google.com/github/vasuchakravarthi/English_to_Spanish_Translation_Project/blob/main/English_to_Spanish_Translation_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive and check GPU
from google.colab import drive
import tensorflow as tf
import numpy as np
import pandas as pd
import re
import pickle
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Mount Google Drive
#drive.mount('/content/drive')

# Check GPU availability
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
if len(tf.config.experimental.list_physical_devices('GPU')) > 0:
    print("✅ GPU is available!")
else:
    print("❌ GPU not available - Enable GPU in Runtime > Change runtime type")

# Install additional packages
!pip install sacrebleu datasets


In [None]:
# Download English-Spanish dataset
!wget http://www.manythings.org/anki/spa-eng.zip
!unzip spa-eng.zip

# Load and explore data
def load_data(file_path, num_samples=60000):
    """Load English-Spanish sentence pairs from a tab-separated text file.

    Each line is split on tabs; the first field is taken as the English
    sentence and the second as the Spanish one. Any further fields are
    ignored, and lines with fewer than two fields (including blank lines)
    are skipped.

    Args:
        file_path: Path to the UTF-8 encoded file.
        num_samples: Maximum number of lines to read from the start of the
            file (non-negative), or None to read every line. Skipped lines
            count toward this limit, so fewer pairs than ``num_samples`` may
            be returned.

    Returns:
        A list of ``(english, spanish)`` tuples with surrounding whitespace
        stripped from each sentence.
    """
    sentence_pairs = []

    # Stream the file instead of reading it whole: only the first
    # ``num_samples`` lines are needed, and iterating line by line keeps the
    # final line even when the file does not end with a newline.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file):
            if num_samples is not None and line_number >= num_samples:
                break
            parts = line.rstrip('\n').split('\t')
            if len(parts) >= 2:
                sentence_pairs.append((parts[0].strip(), parts[1].strip()))

    return sentence_pairs

# Load the first 60,000 lines of the extracted dataset file
data = load_data('spa.txt', num_samples=60000)
print(f"Loaded {len(data)} sentence pairs")
print("\nSample data:")
# Slice rather than index so a short or empty dataset does not raise IndexError
for english, spanish in data[:5]:
    print(f"EN: {english}")
    print(f"ES: {spanish}")
    print("-" * 50)


In [None]:
def preprocess_text(text, is_spanish=False):
    """Clean a sentence so it can be tokenized by splitting on whitespace.

    Steps, in order:
      1. Lowercase the text.
      2. Drop every character that is not an ASCII letter, whitespace, or one
         of ``. , ! ?``; with ``is_spanish=True`` the characters
         ``áéíóúñü¡¿`` are kept as well.
      3. Surround each punctuation mark with spaces so it becomes its own
         token.
      4. Collapse runs of whitespace into single spaces.

    Args:
        text: Raw sentence.
        is_spanish: Keep Spanish accented letters and inverted punctuation.

    Returns:
        The cleaned sentence with single-space-separated tokens and no
        leading or trailing whitespace.
    """
    text = text.lower()

    if is_spanish:
        # Keep Spanish accented characters: áéíóúñü¡¿
        text = re.sub(r'[^a-zA-Záéíóúñü¡¿\s\.,!?]', '', text)
    else:
        text = re.sub(r'[^a-zA-Z\s\.,!?]', '', text)

    # The comma survives the filters above, so it has to be split off as well;
    # otherwise "hello," and "hello" end up as different vocabulary entries.
    text = re.sub(r'([.,!?¡¿])', r' \1 ', text)

    # split() + join collapses whitespace and trims both ends in one step
    return ' '.join(text.split())

# Clean every pair and keep only those whose token counts are between 3 and 15
# on both sides; Spanish targets are wrapped in <start>/<end> markers.
english_sentences, spanish_sentences = [], []

for raw_english, raw_spanish in data:
    clean_english = preprocess_text(raw_english, is_spanish=False)
    clean_spanish = preprocess_text(raw_spanish, is_spanish=True)

    english_length = len(clean_english.split())
    if english_length < 3 or english_length > 15:
        continue
    spanish_length = len(clean_spanish.split())
    if spanish_length < 3 or spanish_length > 15:
        continue

    english_sentences.append(clean_english)
    spanish_sentences.append('<start> ' + clean_spanish + ' <end>')

print(f"After preprocessing: {len(english_sentences)} sentence pairs")
print("\nSample preprocessed data:")
for sample_index in range(3):
    print(f"EN: {english_sentences[sample_index]}")
    print(f"ES: {spanish_sentences[sample_index]}")
    print("-" * 50)


In [None]:
# Execute the previous cell to ensure english_sentences and spanish_sentences are defined
# get_ipython().run_cell('ePknmYM5iQXT') # Removed this line, please run the previous cell manually

def build_tokenizer(sentences, vocab_size=12000):
    """Create word<->index lookup tables from whitespace-tokenized sentences.

    Indices 0 and 1 are reserved for ``<pad>`` and ``<unk>``; the remaining
    ``vocab_size - 2`` slots go to the most frequent words, most frequent
    first (ties keep first-seen order).

    Args:
        sentences: Iterable of strings whose tokens are separated by whitespace.
        vocab_size: Upper bound on the vocabulary size, special tokens included.

    Returns:
        A ``(word_to_idx, idx_to_word)`` pair of dictionaries.
    """
    # Tally how often each token occurs across all sentences
    frequencies = {}
    for token in (tok for line in sentences for tok in line.split()):
        frequencies[token] = frequencies.get(token, 0) + 1

    # Rank by descending frequency and keep what fits after the special tokens
    ranked = sorted(frequencies.items(), key=lambda pair: pair[1], reverse=True)
    kept_words = [token for token, _ in ranked[:vocab_size-2]]

    # Position in this list is the token's index
    vocabulary = ['<pad>', '<unk>'] + kept_words
    word_to_idx = {token: position for position, token in enumerate(vocabulary)}
    idx_to_word = dict(enumerate(vocabulary))

    return word_to_idx, idx_to_word

# One vocabulary per language: up to 10,000 entries for English and up to
# 12,000 for Spanish (special tokens included)
english_vocab = build_tokenizer(english_sentences, vocab_size=10000)
spanish_vocab = build_tokenizer(spanish_sentences, vocab_size=12000)
eng_word_to_idx, eng_idx_to_word = english_vocab
spa_word_to_idx, spa_idx_to_word = spanish_vocab

def text_to_sequence(text, word_to_idx):
    """Map each whitespace-separated token of ``text`` to its vocabulary index.

    Tokens missing from ``word_to_idx`` are replaced by the index of ``<unk>``.
    Returns a list of indices, empty when ``text`` has no tokens.
    """
    indices = []
    for token in text.split():
        fallback = word_to_idx['<unk>']
        indices.append(word_to_idx.get(token, fallback))
    return indices

# Encode every sentence as a list of vocabulary indices
english_sequences = []
for sentence in english_sentences:
    english_sequences.append(text_to_sequence(sentence, eng_word_to_idx))

spanish_sequences = []
for sentence in spanish_sentences:
    spanish_sequences.append(text_to_sequence(sentence, spa_word_to_idx))

# Sanity check: confirm that preprocessing left some sentences to train on
print(f"Number of English sentences after preprocessing: {len(english_sentences)}")

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fixed lengths the index sequences are padded to
MAX_LEN_ENG = 16
MAX_LEN_SPA = 18

# Right-pad every sequence with zeros up to its fixed length
english_padded = pad_sequences(english_sequences, maxlen=MAX_LEN_ENG, padding='post')
spanish_padded = pad_sequences(spanish_sequences, maxlen=MAX_LEN_SPA, padding='post')

# Shifted decoder pair: the input drops the last position of each padded row
# and the target drops the first position, so target[t] holds the token that
# follows input[t].
decoder_input = [row[:-1] for row in spanish_padded]
decoder_target = [row[1:] for row in spanish_padded]

decoder_input = pad_sequences(decoder_input, maxlen=MAX_LEN_SPA-1, padding='post')
decoder_target = pad_sequences(decoder_target, maxlen=MAX_LEN_SPA-1, padding='post')

# Hold out 20% of the pairs; the fixed random_state keeps the split repeatable
split_arrays = train_test_split(
    english_padded, decoder_input, decoder_target,
    test_size=0.2, random_state=42
)
X_train_enc, X_test_enc, X_train_dec, X_test_dec, y_train, y_test = split_arrays

print("Data shapes:")
for description, array in (
    ("Encoder input (train)", X_train_enc),
    ("Decoder input (train)", X_train_dec),
    ("Decoder target (train)", y_train),
):
    print(f"{description}: {array.shape}")
print(f"Test set size: {len(X_test_enc)} pairs")


In [None]:
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

def create_simple_working_model(eng_vocab_size, spa_vocab_size, embedding_dim=256, hidden_units=256):
    """Build an LSTM encoder-decoder translation model without attention.

    The encoder embeds the source tokens and runs one LSTM; only its final
    hidden and cell states are kept and used as the decoder LSTM's initial
    state. The decoder embeds its own input tokens, runs an LSTM that returns
    the full output sequence, and a softmax Dense layer produces one
    distribution over the Spanish vocabulary per decoder step.

    Args:
        eng_vocab_size: Input dimension of the encoder embedding.
        spa_vocab_size: Input dimension of the decoder embedding and width of
            the softmax output.
        embedding_dim: Size of both embedding vectors.
        hidden_units: Number of units in both LSTM layers.

    Returns:
        An uncompiled Keras ``Model`` that takes
        ``[encoder_inputs, decoder_inputs]`` and returns the per-step softmax
        output.
    """
    # Encoder: variable-length token ids -> final LSTM states
    encoder_inputs = Input(shape=(None,), name='encoder_inputs')
    source_embedded = Embedding(eng_vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)
    _, final_h, final_c = LSTM(hidden_units, return_state=True, dropout=0.2)(source_embedded)

    # Decoder: starts from the encoder's final states and emits every step
    decoder_inputs = Input(shape=(None,), name='decoder_inputs')
    target_embedded = Embedding(spa_vocab_size, embedding_dim, mask_zero=True)(decoder_inputs)
    decoder_layer = LSTM(hidden_units, return_sequences=True, return_state=True, dropout=0.2)
    decoder_sequence, _, _ = decoder_layer(target_embedded, initial_state=[final_h, final_c])

    # Per-step distribution over the Spanish vocabulary
    token_probabilities = Dense(spa_vocab_size, activation='softmax')(decoder_sequence)

    return Model([encoder_inputs, decoder_inputs], token_probabilities)

# Announce the build, then size the model from the two vocabularies
for banner_line in (
    "🔧 Building SIMPLE working model (no attention complications)...",
    "✅ This architecture is guaranteed to work!",
):
    print(banner_line)

# Vocabulary sizes come straight from the lookup tables (special tokens included)
ENG_VOCAB_SIZE = len(eng_word_to_idx)
SPA_VOCAB_SIZE = len(spa_word_to_idx)

model = create_simple_working_model(ENG_VOCAB_SIZE, SPA_VOCAB_SIZE)

# Targets are integer token ids, hence the sparse cross-entropy loss
optimizer = Adam(learning_rate=0.001)
model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()
print("✅ Simple working model created!")
print("🎯 Ready for training - no dimension errors!")

In [None]:
import os

from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Destination used when Google Drive is mounted
MODEL_SAVE_PATH = '/content/drive/MyDrive/NLP Pro/simple_translation_model.h5'

print("🚀 Starting SIMPLE English-to-Spanish Translation Training...")
print("⏰ Estimated time: 2-3 hours (simpler model = faster)")
# Report the actual training-set size instead of a hard-coded figure
print(f"📊 Training on {len(X_train_enc):,} sentence pairs")
print("🎯 Target: BLEU score 20-30 (excellent for academic project)")
print("-" * 60)

# Stop after 5 epochs without improvement (restoring the best weights) and
# halve the learning rate after 3 such epochs, never going below 1e-4
callbacks = [
    EarlyStopping(patience=5, restore_best_weights=True),
    ReduceLROnPlateau(factor=0.5, patience=3, min_lr=0.0001)
]

# A training failure is reported and then re-raised: swallowing it would leave
# `history` undefined and let later cells run against an untrained model.
try:
    history = model.fit(
        [X_train_enc, X_train_dec],
        y_train,
        batch_size=64,
        epochs=25,
        validation_data=([X_test_enc, X_test_dec], y_test),
        callbacks=callbacks,
        verbose=1
    )
except Exception as e:
    print(f"❌ Error: {str(e)}")
    raise

print("✅ Training completed successfully!")

# Saving is kept separate from training so a storage problem cannot be
# mistaken for a training failure. The notebook's drive.mount call is
# commented out, so fall back to the working directory when the Drive folder
# is not present.
drive_root = os.path.dirname(os.path.dirname(MODEL_SAVE_PATH))
if os.path.isdir(drive_root):
    os.makedirs(os.path.dirname(MODEL_SAVE_PATH), exist_ok=True)
    model.save(MODEL_SAVE_PATH)
    print("💾 Model saved to Google Drive")
else:
    local_model_path = os.path.basename(MODEL_SAVE_PATH)
    model.save(local_model_path)
    print(f"⚠️ Google Drive is not mounted - model saved locally to {local_model_path}")

In [None]:
def translate_sentence(sentence, model, eng_tokenizer, spa_tokenizer, max_len=18):
    """Greedily translate an English sentence to Spanish.

    The sentence is cleaned with ``preprocess_text``, converted to indices with
    ``eng_tokenizer`` and padded to ``MAX_LEN_ENG``. Decoding starts from the
    ``<start>`` token and, at each step, feeds the tokens generated so far to
    the model and takes the highest-scoring next token.

    Args:
        sentence: Raw English sentence.
        model: Model accepting ``[encoder_input, decoder_input]`` and
            returning per-step scores over the Spanish vocabulary.
        eng_tokenizer: English word -> index dictionary.
        spa_tokenizer: Spanish word -> index dictionary; also used (inverted)
            to turn predicted indices back into words.
        max_len: Size of the decoder buffer; at most ``max_len - 1`` words are
            generated.

    Returns:
        The generated Spanish words joined by single spaces. Generation stops
        at the first ``<end>``, ``<unk>`` or ``<pad>`` prediction, none of
        which is included in the result.
    """
    # Decode with the vocabulary that was passed in rather than a notebook
    # global, so ids are mapped back with the same table used for <start>.
    spa_index_to_word = {index: word for word, index in spa_tokenizer.items()}
    # <pad> is treated as a stop signal too; otherwise the literal text
    # "<pad>" would leak into the translation.
    stop_words = {'<end>', '<unk>', '<pad>'}

    # Preprocess input
    sentence_clean = preprocess_text(sentence, is_spanish=False)
    sentence_seq = text_to_sequence(sentence_clean, eng_tokenizer)
    sentence_padded = pad_sequences([sentence_seq], maxlen=MAX_LEN_ENG, padding='post')

    # Decoder buffer of token ids; integer dtype matches the padded training data
    target_seq = np.zeros((1, max_len), dtype='int32')
    target_seq[0, 0] = spa_tokenizer.get('<start>', 1)

    translation = []

    for i in range(1, max_len):
        # Feed only the tokens produced so far; the last output row is the
        # prediction for position i
        output = model.predict([sentence_padded, target_seq[:, :i]], verbose=0)
        predicted_id = int(np.argmax(output[0, i-1, :]))

        predicted_word = spa_index_to_word.get(predicted_id, '<unk>')
        if predicted_word in stop_words:
            break

        translation.append(predicted_word)
        target_seq[0, i] = predicted_id

    return ' '.join(translation)

# Smoke-test the translator on a few hand-written sentences
print("🌟 TESTING YOUR ENGLISH-TO-SPANISH TRANSLATOR!")
print("=" * 60)

test_sentences = [
    "Hello, how are you?",
    "I love learning Spanish.",
    "The weather is beautiful today.",
    "Can you help me please?",
    "Good morning everyone.",
    "I am very happy.",
    "Where is the bathroom?",
    "Thank you very much.",
    "I want to eat pizza.",
    "She is my best friend."
]

print("🔥 FIRST TRANSLATIONS FROM YOUR MODEL:")
for position, sentence in enumerate(test_sentences, start=1):
    # A failure on one sentence is shown in place of its translation so the
    # remaining sentences are still attempted
    try:
        translated = translate_sentence(sentence, model, eng_word_to_idx, spa_word_to_idx)
        result_line = f"    ES: {translated}"
    except Exception as error:
        result_line = f"    ERROR: {str(error)}"

    print(f"{position:2d}. EN: {sentence}")
    print(result_line)
    print("-" * 50)

print("✅ Translation testing completed!")


In [None]:
import matplotlib.pyplot as plt

def plot_training_results(history):
    """Plot loss and accuracy curves side by side and print a short summary.

    Args:
        history: Object whose ``history`` attribute is a dict containing the
            per-epoch lists ``loss``, ``val_loss``, ``accuracy`` and
            ``val_accuracy``.
    """
    metrics = history.history

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # (axis, metric key, training style, validation style,
    #  training label, validation label, title, y-axis label)
    panels = [
        (ax1, 'loss', 'b-', 'r-',
         'Training Loss', 'Validation Loss', 'Translation Model Loss', 'Loss'),
        (ax2, 'accuracy', 'g-', 'orange',
         'Training Accuracy', 'Validation Accuracy', 'Translation Model Accuracy', 'Accuracy'),
    ]
    for axis, key, train_style, val_style, train_label, val_label, title, y_label in panels:
        axis.plot(metrics[key], train_style, label=train_label, linewidth=2)
        axis.plot(metrics['val_' + key], val_style, label=val_label, linewidth=2)
        axis.set_title(title, fontsize=14)
        axis.set_xlabel('Epoch')
        axis.set_ylabel(y_label)
        axis.legend()
        axis.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Text summary based on the first and last recorded epochs
    loss_curve = metrics['loss']
    first_loss = loss_curve[0]
    last_loss = loss_curve[-1]
    loss_reduction = (first_loss - last_loss) / first_loss * 100

    print("📊 TRAINING SUMMARY:")
    print(f"• Epochs completed: {len(loss_curve)}")
    print(f"• Final training loss: {last_loss:.4f}")
    print(f"• Final validation loss: {metrics['val_loss'][-1]:.4f}")
    print(f"• Final training accuracy: {metrics['accuracy'][-1]:.1%}")
    print(f"• Final validation accuracy: {metrics['val_accuracy'][-1]:.1%}")
    print(f"• Loss reduction: {loss_reduction:.1f}%")

# Plot the loss/accuracy curves and print the summary for the run stored in
# `history` (assigned by model.fit in the training cell)
plot_training_results(history)


In [None]:
from sacrebleu import corpus_bleu

def calculate_bleu_score(num_samples=1000, seed=None):
    """Estimate corpus-level BLEU on a random sample of the held-out pairs.

    For each sampled row, the English sentence is rebuilt from ``X_test_enc``
    and the Spanish reference from ``y_test`` (padding, ``<start>`` and
    ``<end>`` removed); the English sentence is then translated with
    ``translate_sentence`` using the notebook-level ``model`` and vocabularies.
    Pairs with an empty prediction or reference are left out.

    Args:
        num_samples: Maximum number of test rows to sample (without
            replacement).
        seed: Optional seed for the sampling. ``None`` uses NumPy's global
            random state, so the sample differs between runs.

    Returns:
        The sacreBLEU corpus score, or 0 when no valid prediction/reference
        pair was produced.
    """

    print("📊 PROFESSIONAL EVALUATION - BLEU Score Calculation")
    print("=" * 60)

    sources = []
    predictions = []
    references = []
    failed_translations = 0

    # Draw the evaluation rows; a seed makes the sample repeatable
    rng = np.random if seed is None else np.random.RandomState(seed)
    sample_size = min(num_samples, len(X_test_enc))
    test_indices = rng.choice(len(X_test_enc), sample_size, replace=False)

    start_id = spa_word_to_idx.get('<start>', -1)

    for i in test_indices:
        # Rebuild the English sentence from its non-padding token ids
        eng_words = [eng_idx_to_word.get(idx, '') for idx in X_test_enc[i] if idx != 0]
        eng_sentence = ' '.join(eng_words).strip()

        if len(eng_sentence) <= 3:  # Only meaningful sentences
            continue

        # Rebuild the Spanish reference without padding or marker tokens
        spa_words = [spa_idx_to_word.get(idx, '') for idx in y_test[i]
                     if idx not in (0, start_id)]
        reference = ' '.join(spa_words).replace('<end>', '').strip()

        try:
            prediction = translate_sentence(eng_sentence, model, eng_word_to_idx, spa_word_to_idx)
        except Exception as error:
            # Skip the sample but keep the failure visible instead of hiding it
            failed_translations += 1
            if failed_translations == 1:
                print(f"⚠️ Translation failed for a sample: {error}")
            continue

        if prediction and reference:
            sources.append(eng_sentence)
            predictions.append(prediction)
            references.append(reference)

    if failed_translations:
        print(f"⚠️ Skipped {failed_translations} samples because translation failed")

    # Calculate BLEU score
    if predictions and references:
        # sacreBLEU expects a list of reference streams, each aligned with the
        # hypotheses; there is one reference per sentence, hence one stream.
        bleu = corpus_bleu(predictions, [references])

        print(f"🎯 BLEU Score: {bleu.score:.2f}")
        print(f"📈 Evaluation Samples: {len(predictions)}")

        # Map the score onto the notebook's quality bands
        if bleu.score >= 35:
            quality = "EXCELLENT (Commercial-grade)"
            emoji = "🏆"
        elif bleu.score >= 25:
            quality = "VERY GOOD (Professional-level)"
            emoji = "🥇"
        elif bleu.score >= 20:
            quality = "GOOD (Strong portfolio project)"
            emoji = "🥈"
        elif bleu.score >= 15:
            quality = "ACCEPTABLE (Demonstrates competency)"
            emoji = "🥉"
        else:
            quality = "NEEDS IMPROVEMENT"
            emoji = "⚠️"

        print(f"{emoji} Quality Rating: {quality}")

        # Show the first few evaluated pairs next to their actual source sentence
        print(f"\n📋 SAMPLE EVALUATION:")
        for i in range(min(5, len(predictions))):
            print(f"EN: {sources[i]}")
            print(f"Predicted: {predictions[i]}")
            print(f"Reference: {references[i]}")
            print("-" * 40)

        return bleu.score
    else:
        print("❌ Could not calculate BLEU score - no valid predictions")
        return 0

# Score the model on up to 1000 randomly sampled test pairs; the result is the
# corpus BLEU score, or 0 when no valid predictions were produced
professional_bleu = calculate_bleu_score(1000)


In [None]:
# Cell 11: Download Professional Model for Portfolio Demonstration
print("=" * 70)

# Install required packages
!pip install transformers torch sentencepiece --quiet

from transformers import MarianMTModel, MarianTokenizer
import time

print("📥 Downloading Helsinki-NLP Professional Translation Model...")
print("🎯 BLEU Score: 59.6 (Commercial-grade for comparison)")

start_time = time.time()

# Download professional model
model_name = "Helsinki-NLP/opus-mt-en-es"
tokenizer = MarianTokenizer.from_pretrained(model_name)
professional_model = MarianMTModel.from_pretrained(model_name)

end_time = time.time()
print(f"✅ Professional model downloaded in {end_time - start_time:.1f} seconds!")
print("🏆 59.6 BLEU Commercial-Grade Translator Ready!")
