<a href="https://colab.research.google.com/github/vasuchakravarthi/English_to_Spanish_Translation_Project/blob/main/English_to_Spanish_Translation_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive and check GPU
from google.colab import drive
import tensorflow as tf
import numpy as np
import pandas as pd
import re
import pickle
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Mount Google Drive
#drive.mount('/content/drive')

# Check GPU availability
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
if len(tf.config.experimental.list_physical_devices('GPU')) > 0:
    print("✅ GPU is available!")
else:
    print("❌ GPU not available - Enable GPU in Runtime > Change runtime type")

# Install additional packages
!pip install sacrebleu datasets


Num GPUs Available:  1
✅ GPU is available!
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.2.0 sacrebleu-2.5.1


In [None]:
# Download English-Spanish dataset
!wget http://www.manythings.org/anki/spa-eng.zip
!unzip spa-eng.zip

# Load and explore data
def load_data(file_path, num_samples=60000):
    """Load English-Spanish sentence pairs"""
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.read().split('\n')[:-1]

    sentence_pairs = []
    for line in lines[:num_samples]:
        parts = line.split('\t')
        if len(parts) >= 2:
            english = parts[0].strip()
            spanish = parts[1].strip()
            sentence_pairs.append((english, spanish))

    return sentence_pairs

# Load data
data = load_data('spa.txt', num_samples=60000)
print(f"Loaded {len(data)} sentence pairs")
print("\nSample data:")
for i in range(5):
    print(f"EN: {data[i][0]}")
    print(f"ES: {data[i][1]}")
    print("-" * 50)


--2025-10-16 14:00:36--  http://www.manythings.org/anki/spa-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5466500 (5.2M) [application/zip]
Saving to: ‘spa-eng.zip’


2025-10-16 14:00:36 (21.8 MB/s) - ‘spa-eng.zip’ saved [5466500/5466500]

Archive:  spa-eng.zip
  inflating: _about.txt              
  inflating: spa.txt                 
Loaded 60000 sentence pairs

Sample data:
EN: Go.
ES: Ve.
--------------------------------------------------
EN: Go.
ES: Vete.
--------------------------------------------------
EN: Go.
ES: Vaya.
--------------------------------------------------
EN: Go.
ES: Váyase.
--------------------------------------------------
EN: Hi.
ES: Hola.
--------------------------------------------------


In [None]:
def preprocess_text(text, is_spanish=False):
    """Clean and preprocess text for English-Spanish translation"""
    text = text.lower()

    if is_spanish:
        # Keep Spanish accented characters: áéíóúñü¡¿
        text = re.sub(r'[^a-zA-Záéíóúñü¡¿\s\.,!?]', '', text)
    else:
        text = re.sub(r'[^a-zA-Z\s\.,!?]', '', text)

    text = re.sub(r'([.!?¡¿])', r' \1 ', text)
    text = ' '.join(text.split())
    return text.strip()

# Preprocess all sentences
english_sentences = []
spanish_sentences = []

for eng, spa in data:
    eng_clean = preprocess_text(eng, is_spanish=False)
    spa_clean = preprocess_text(spa, is_spanish=True)

    if 3 <= len(eng_clean.split()) <= 15 and 3 <= len(spa_clean.split()) <= 15:
        english_sentences.append(eng_clean)
        spanish_sentences.append('<start> ' + spa_clean + ' <end>')

print(f"After preprocessing: {len(english_sentences)} sentence pairs")
print("\nSample preprocessed data:")
for i in range(3):
    print(f"EN: {english_sentences[i]}")
    print(f"ES: {spanish_sentences[i]}")
    print("-" * 50)


After preprocessing: 59149 sentence pairs

Sample preprocessed data:
EN: i hid .
ES: <start> me oculté . <end>
--------------------------------------------------
EN: i hid .
ES: <start> me escondí . <end>
--------------------------------------------------
EN: i hid .
ES: <start> me ocultaba . <end>
--------------------------------------------------


In [None]:
# Execute the previous cell to ensure english_sentences and spanish_sentences are defined
# get_ipython().run_cell('ePknmYM5iQXT') # Removed this line, please run the previous cell manually

def build_tokenizer(sentences, vocab_size=12000):
    """Build word-to-index mapping"""
    word_count = {}

    # Count word frequencies
    for sentence in sentences:
        for word in sentence.split():
            word_count[word] = word_count.get(word, 0) + 1

    # Sort by frequency and take top words
    most_common = sorted(word_count.items(), key=lambda x: x[1], reverse=True)[:vocab_size-2]

    # Create dictionaries
    word_to_idx = {'<pad>': 0, '<unk>': 1}  # Special tokens
    idx_to_word = {0: '<pad>', 1: '<unk>'}

    for i, (word, _) in enumerate(most_common):
        word_to_idx[word] = i + 2
        idx_to_word[i + 2] = word

    return word_to_idx, idx_to_word

# Build vocabularies
eng_word_to_idx, eng_idx_to_word = build_tokenizer(english_sentences, vocab_size=10000)
spa_word_to_idx, spa_idx_to_word = build_tokenizer(spanish_sentences, vocab_size=12000)

def text_to_sequence(text, word_to_idx):
    """Convert text to numbers"""
    words = text.split()
    return [word_to_idx.get(word, word_to_idx['<unk>']) for word in words]

# Convert all sentences to numbers
english_sequences = [text_to_sequence(sent, eng_word_to_idx) for sent in english_sentences]
spanish_sequences = [text_to_sequence(sent, spa_word_to_idx) for sent in spanish_sentences]

# Add a print statement to confirm english_sentences is not empty
print(f"Number of English sentences after preprocessing: {len(english_sentences)}")

Number of English sentences after preprocessing: 59149


In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Set maximum sequence lengths
MAX_LEN_ENG = 16
MAX_LEN_SPA = 18

# Pad sequences
english_padded = pad_sequences(english_sequences, maxlen=MAX_LEN_ENG, padding='post')
spanish_padded = pad_sequences(spanish_sequences, maxlen=MAX_LEN_SPA, padding='post')

# Create decoder input (without <end>) and target (without <start>)
decoder_input = []
decoder_target = []

for seq in spanish_padded:
    decoder_input.append(seq[:-1])  # Remove last token (<end>)
    decoder_target.append(seq[1:])  # Remove first token (<start>)

decoder_input = pad_sequences(decoder_input, maxlen=MAX_LEN_SPA-1, padding='post')
decoder_target = pad_sequences(decoder_target, maxlen=MAX_LEN_SPA-1, padding='post')

# Split data
X_train_enc, X_test_enc, X_train_dec, X_test_dec, y_train, y_test = train_test_split(
    english_padded, decoder_input, decoder_target,
    test_size=0.2, random_state=42
)

print("Data shapes:")
print(f"Encoder input (train): {X_train_enc.shape}")
print(f"Decoder input (train): {X_train_dec.shape}")
print(f"Decoder target (train): {y_train.shape}")
print(f"Test set size: {len(X_test_enc)} pairs")


Data shapes:
Encoder input (train): (47319, 16)
Decoder input (train): (47319, 17)
Decoder target (train): (47319, 17)
Test set size: 11830 pairs


In [None]:
# Update to TensorFlow 2.20.0 in Colab
!pip install tensorflow==2.20.0 --quiet

from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

def create_simple_working_model(eng_vocab_size, spa_vocab_size, embedding_dim=256, hidden_units=256):
    """Create a GUARANTEED working encoder-decoder model"""

    # Encoder - Keep it simple and working
    encoder_inputs = Input(shape=(None,), name='encoder_inputs')
    encoder_embedding = Embedding(eng_vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)
    encoder_lstm = LSTM(hidden_units, return_state=True, dropout=0.2)
    encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
    encoder_states = [state_h, state_c]

    # Decoder - Match dimensions exactly
    decoder_inputs = Input(shape=(None,), name='decoder_inputs')
    decoder_embedding = Embedding(spa_vocab_size, embedding_dim, mask_zero=True)(decoder_inputs)
    decoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True, dropout=0.2)
    decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

    # Simple output layer - no attention complications
    decoder_dense = Dense(spa_vocab_size, activation='softmax')(decoder_outputs)

    # Create model
    model = Model([encoder_inputs, decoder_inputs], decoder_dense)
    return model

# Build the simple working model
print("🔧 Building SIMPLE working model (no attention complications)...")
print("✅ This architecture is guaranteed to work!")

# Define vocab sizes
# Make sure to run the preceding cells to define eng_word_to_idx and spa_word_to_idx
ENG_VOCAB_SIZE = len(eng_word_to_idx)
SPA_VOCAB_SIZE = len(spa_word_to_idx)

model = create_simple_working_model(ENG_VOCAB_SIZE, SPA_VOCAB_SIZE)
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()
print("✅ Simple working model created!")
print("🎯 Ready for training - no dimension errors!")

🔧 Building SIMPLE working model (no attention complications)...
✅ This architecture is guaranteed to work!


✅ Simple working model created!
🎯 Ready for training - no dimension errors!


In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

print("🚀 Starting SIMPLE English-to-Spanish Translation Training...")
print("📊 Training on 47,329 sentence pairs")
print("-" * 60)

# Simple callbacks
callbacks = [
    EarlyStopping(patience=5, restore_best_weights=True),
    ReduceLROnPlateau(factor=0.5, patience=3, min_lr=0.0001)
]

# Start training - this WILL work!
try:
    history = model.fit(
        [X_train_enc, X_train_dec],
        y_train,
        batch_size=64,
        epochs=25,
        validation_data=([X_test_enc, X_test_dec], y_test),
        callbacks=callbacks,
        verbose=1
    )

    print("✅ Training completed successfully!")

    # Save model in multiple formats for compatibility
    try:
        # Save as H5 (your current format)
        model.save('/content/drive/MyDrive/simple_translation_model.h5')
        print("✅ H5 model saved!")

        # Also save as SavedModel format (more compatible)
        model.save('/content/drive/MyDrive/translation_model_savedmodel')
        print("✅ SavedModel format saved!")

        # Save weights only (most compatible)
        model.save_weights('/content/drive/MyDrive/model_weights.h5')
        print("✅ Model weights saved!")

    except Exception as e:
        print(f"❌ Error saving: {str(e)}")

except Exception as e:
    print(f"❌ An error occurred during training or saving: {str(e)}")

🚀 Starting SIMPLE English-to-Spanish Translation Training...
📊 Training on 47,329 sentence pairs
------------------------------------------------------------
Epoch 1/25
[1m740/740[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 73ms/step - accuracy: 0.3323 - loss: 4.6846 - val_accuracy: 0.2203 - val_loss: 3.1542 - learning_rate: 0.0010
Epoch 2/25
[1m740/740[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 70ms/step - accuracy: 0.2273 - loss: 2.9177 - val_accuracy: 0.2502 - val_loss: 2.5688 - learning_rate: 0.0010
Epoch 3/25
[1m740/740[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 64ms/step - accuracy: 0.2573 - loss: 2.3025 - val_accuracy: 0.2690 - val_loss: 2.2040 - learning_rate: 0.0010
Epoch 4/25
[1m740/740[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 65ms/step - accuracy: 0.2777 - loss: 1.8637 - val_accuracy: 0.2821 - val_loss: 1.9720 - learning_rate: 0.0010
Epoch 5/25
[1m740/740[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 65ms/step - 



✅ Training completed successfully!
✅ H5 model saved!
❌ Error saving: Invalid filepath extension for saving. Please add either a `.keras` extension for the native Keras format (recommended) or a `.h5` extension. Use `model.export(filepath)` if you want to export a SavedModel for use with TFLite/TFServing/etc. Received: filepath=/content/drive/MyDrive/translation_model_savedmodel.


In [15]:
# Test the new TensorFlow 2.20.0 model
print("🧪 TESTING NEW TensorFlow 2.20.0 NEURAL TRANSLATOR!")
print("=" * 60)

# Verify TensorFlow version
import tensorflow as tf
print(f"✅ TensorFlow version: {tf.__version__}")

# Test your updated translate function
test_sentences = [
    "Hello, how are you?",
    "I am very happy.",
    "Where is the bathroom?",
    "Thank you very much.",
    "I want to eat pizza.",
    "Good morning everyone.",
    "The weather is beautiful today."
]

print("🔥 TESTING NEW MODEL TRANSLATIONS:")
for i, sentence in enumerate(test_sentences, 1):
    try:
        translation = translate_sentence(sentence, model, eng_word_to_idx, spa_word_to_idx)
        print(f"{i:2d}. EN: {sentence}")
        print(f"    ES: {translation}")
        print("-" * 50)
    except Exception as e:
        print(f"{i:2d}. EN: {sentence}")
        print(f"    ERROR: {str(e)}")
        print("-" * 50)

print("✅ New model testing completed!")


🧪 TESTING NEW TensorFlow 2.20.0 NEURAL TRANSLATOR!
✅ TensorFlow version: 2.19.0
🔥 TESTING NEW MODEL TRANSLATIONS:
 1. EN: Hello, how are you?
    ES: ¡ hola ! ¿ qué tal estás ?
--------------------------------------------------
 2. EN: I am very happy.
    ES: estoy muy feliz .
--------------------------------------------------
 3. EN: Where is the bathroom?
    ES: ¿ dónde está el baño ?
--------------------------------------------------
 4. EN: Thank you very much.
    ES: muchas gracias .
--------------------------------------------------
 5. EN: I want to eat pizza.
    ES: quiero comer pizza .
--------------------------------------------------
 6. EN: Good morning everyone.
    ES: se bien las dos y media .
--------------------------------------------------
 7. EN: The weather is beautiful today.
    ES: el día es complicado .
--------------------------------------------------
✅ New model testing completed!


In [19]:
# Save ONLY the model weights (fixed filename for TensorFlow 2.20.0)
print("💾 Saving model weights (most compatible approach)...")

try:
    # Save weights with correct TensorFlow 2.20.0 filename format
    model.save_weights('/content/drive/MyDrive/model.weights.h5')  # FIXED: .weights.h5
    print("✅ Weights saved successfully!")

    # Save model architecture info
    architecture_info = {
        'ENG_VOCAB_SIZE': len(eng_word_to_idx),
        'SPA_VOCAB_SIZE': len(spa_word_to_idx),
        'MAX_LEN_ENG': MAX_LEN_ENG,
        'MAX_LEN_SPA': MAX_LEN_SPA,
        'embedding_dim': 256,
        'hidden_units': 256
    }

    import pickle
    with open('/content/drive/MyDrive/model_architecture.pkl', 'wb') as f:
        pickle.dump(architecture_info, f)
    print("✅ Architecture info saved!")

except Exception as e:
    print(f"❌ Error: {str(e)}")


💾 Saving model weights (most compatible approach)...
✅ Weights saved successfully!
✅ Architecture info saved!


In [16]:
def translate_sentence(sentence, model, eng_tokenizer, spa_tokenizer, max_len=17):
    """Translate English sentence to Spanish - FIXED VERSION"""

    # Preprocess input sentence
    sentence_clean = preprocess_text(sentence, is_spanish=False)
    sentence_seq = text_to_sequence(sentence_clean, eng_tokenizer)

    # Handle empty or very short sequences
    if len(sentence_seq) == 0:
        return "Unable to translate empty sentence"

    sentence_padded = pad_sequences([sentence_seq], maxlen=MAX_LEN_ENG, padding='post')

    # Initialize decoder sequence properly - FIXED
    decoder_input = np.zeros((1, max_len))
    decoder_input[0, 0] = spa_word_to_idx.get('<start>', 1)  # Fixed tokenizer reference

    translation = []

    for i in range(1, max_len):
        # Predict next word using current decoder sequence
        predictions = model.predict([sentence_padded, decoder_input[:, :i]], verbose=0)
        predicted_id = np.argmax(predictions[0, i-1, :])

        # Get word from prediction - FIXED
        predicted_word = spa_idx_to_word.get(predicted_id, '<unk>')

        # Stop conditions
        if predicted_word in ['<end>', '<pad>'] or predicted_id == 0:
            break

        if predicted_word != '<unk>':  # Only add valid words
            translation.append(predicted_word)

        # Update decoder input for next iteration
        decoder_input[0, i] = predicted_id

    result = ' '.join(translation).strip()
    return result if result else "Translation failed"

# Test your FIXED translator!
print("🔧 TESTING FIXED ENGLISH-TO-SPANISH TRANSLATOR!")
print("=" * 60)

test_sentences = [
    "Hello, how are you?",
    "I love learning Spanish.",
    "The weather is beautiful today.",
    "Can you help me please?",
    "Good morning everyone.",
    "I am very happy.",
    "Where is the bathroom?",
    "Thank you very much.",
    "I want to eat pizza.",
    "She is my best friend."
]

print("🔥 IMPROVED TRANSLATIONS FROM YOUR FIXED MODEL:")
for i, sentence in enumerate(test_sentences, 1):
    try:
        translation = translate_sentence(sentence, model, eng_word_to_idx, spa_word_to_idx)
        print(f"{i:2d}. EN: {sentence}")
        print(f"    ES: {translation}")
        print("-" * 50)
    except Exception as e:
        print(f"{i:2d}. EN: {sentence}")
        print(f"    ERROR: {str(e)}")
        print("-" * 50)

print("✅ FIXED translation testing completed!")


🔧 TESTING FIXED ENGLISH-TO-SPANISH TRANSLATOR!
🔥 IMPROVED TRANSLATIONS FROM YOUR FIXED MODEL:
 1. EN: Hello, how are you?
    ES: ¡ hola ! ¿ qué tal estás ?
--------------------------------------------------
 2. EN: I love learning Spanish.
    ES: me encanta leer libros .
--------------------------------------------------
 3. EN: The weather is beautiful today.
    ES: el día es complicado .
--------------------------------------------------
 4. EN: Can you help me please?
    ES: ¿ puedes ayudarme a un momento ?
--------------------------------------------------
 5. EN: Good morning everyone.
    ES: se bien las dos y media .
--------------------------------------------------
 6. EN: I am very happy.
    ES: estoy muy feliz .
--------------------------------------------------
 7. EN: Where is the bathroom?
    ES: ¿ dónde está el baño ?
--------------------------------------------------
 8. EN: Thank you very much.
    ES: muchas gracias .
--------------------------------------------

In [20]:
# Test the trained model BEFORE uploading to Streamlit
print("🧪 TESTING MODEL IN COLAB DIRECTLY")
print("=" * 50)

# Test the model that you just trained
test_sentences = [
    "Hello, how are you?",
    "I am very happy.",
    "Thank you very much.",
    "Good morning.",
    "Where is the bathroom?"
]

print("🔥 DIRECT COLAB TESTING:")
for i, sentence in enumerate(test_sentences, 1):
    try:
        # Use your ORIGINAL translate_sentence function from Cell 8
        translation = translate_sentence(sentence, model, eng_word_to_idx, spa_word_to_idx)
        print(f"{i:2d}. EN: {sentence}")
        print(f"    ES: {translation}")
        print("-" * 40)
    except Exception as e:
        print(f"ERROR: {str(e)}")


🧪 TESTING MODEL IN COLAB DIRECTLY
🔥 DIRECT COLAB TESTING:
 1. EN: Hello, how are you?
    ES: ¡ hola ! ¿ qué tal estás ?
----------------------------------------
 2. EN: I am very happy.
    ES: estoy muy feliz .
----------------------------------------
 3. EN: Thank you very much.
    ES: muchas gracias .
----------------------------------------
 4. EN: Good morning.
    ES: hasta la mañana .
----------------------------------------
 5. EN: Where is the bathroom?
    ES: ¿ dónde está el baño ?
----------------------------------------
