In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Colab-only step: prompt for a file upload into the runtime
# (the recorded output shows english_french.csv being saved).
from google.colab import files

uploaded = files.upload()

Saving english_french.csv to english_french.csv


In [3]:
# Load the dataset
# Later cells read the 'English' and 'French' columns of this frame.
data = pd.read_csv(filepath_or_buffer='english_french.csv')

In [4]:
# Data Cleaning
def clean_text(text):
    """Lower-case a sentence and replace line breaks with spaces.

    Args:
        text: The raw cell value. Strings are cleaned directly. ``None`` and
            float NaN (what pandas yields for an empty CSV cell) produce an
            empty string. Any other value is converted with ``str`` first.

    Returns:
        str: The lower-cased text in which every newline and carriage-return
        character has been replaced by one space.
    """
    # Missing values have no `.lower()`; map them to an empty sentence instead
    # of crashing the whole `.apply` pass. `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    return str(text).lower().replace("\n", " ").replace("\r", " ")

# Run the shared cleaner over both language columns, overwriting them in place.
data['English'] = data['English'].map(clean_text)
data['French'] = data['French'].map(clean_text)

In [5]:
# Tokenization and Sequences
def preprocess_data(texts, num_words):
    """Fit a Keras ``Tokenizer`` on ``texts`` and encode them as id sequences.

    The tokenizer is created with ``filters=''`` (no characters are filtered
    out) and ``lower=True``.

    Args:
        texts: Iterable of sentences to fit on and to encode.
        num_words: Value forwarded to ``Tokenizer(num_words=...)``.

    Returns:
        tuple: ``(tokenizer, sequences)`` where ``tokenizer`` is the fitted
        ``Tokenizer`` and ``sequences`` is the result of
        ``tokenizer.texts_to_sequences(texts)``.
    """
    text_tokenizer = Tokenizer(num_words=num_words, filters='', lower=True)
    text_tokenizer.fit_on_texts(texts)
    return text_tokenizer, text_tokenizer.texts_to_sequences(texts)

In [6]:
# Hyperparameters
num_words = 10_000  # passed to the tokenizers and reused as the model's vocabulary size
max_len = 20        # target length handed to pad_sequences and the model inputs

In [7]:
# Fit one tokenizer per language and encode every sentence as word ids.
eng_tokenizer, eng_sequences = preprocess_data(data['English'], num_words)
fr_tokenizer, fr_sequences = preprocess_data(data['French'], num_words)

# Keep the word -> id lookups of both tokenizers at hand.
eng_word_index = eng_tokenizer.word_index
fr_word_index = fr_tokenizer.word_index

# Bring every sequence to max_len, padding at the end ('post').
eng_padded = pad_sequences(eng_sequences, padding='post', maxlen=max_len)
fr_padded = pad_sequences(fr_sequences, padding='post', maxlen=max_len)

In [8]:
# Train-Test Split
# Hold out 20% of the sentence pairs; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    eng_padded,
    fr_padded,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Encoder-Decoder Model
embedding_dim = 256  # size of each learned word vector
units = 256          # LSTM state width, shared by encoder and decoder

# Encoder: embed the source ids and keep only the final LSTM states.
encoder_inputs = Input(shape=(max_len,))  # (batch_size, max_len) integer ids
# `input_length` is deprecated in recent Keras releases and is not needed for
# an Embedding that feeds an LSTM, so it is no longer passed.
encoder_embedding = tf.keras.layers.Embedding(input_dim=num_words, output_dim=embedding_dim)(encoder_inputs)
# Without return_sequences the LSTM output is the last step only, i.e. 2D
# (batch_size, units); just the two state tensors are used below.
encoder_lstm, state_h, state_c = LSTM(units, return_state=True)(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder: starts from the encoder's final states and emits one distribution
# over the vocabulary per time step.
decoder_inputs = Input(shape=(max_len,))  # (batch_size, max_len) integer ids
decoder_embedding = tf.keras.layers.Embedding(input_dim=num_words, output_dim=embedding_dim)(decoder_inputs)
# return_sequences=True keeps every time step: (batch_size, max_len, units).
decoder_lstm, _, _ = LSTM(units, return_sequences=True, return_state=True)(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(num_words, activation='softmax')
decoder_outputs = decoder_dense(decoder_lstm)

# Compile Model
# Sparse loss: targets are integer word ids, not one-hot vectors.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [12]:
# Train Model
# Add a trailing axis of size 1 to the integer targets:
# (samples, max_len) -> (samples, max_len, 1).
# The ndim guard keeps this cell idempotent: the original rebinding
# `y = expand_dims(y)` stacked one more axis every time the cell was re-run.
if y_train.ndim == 2:
    y_train = np.expand_dims(y_train, axis=-1)
if y_test.ndim == 2:
    y_test = np.expand_dims(y_test, axis=-1)


In [13]:
# Both model inputs receive the English ids; the French ids are the targets.
# NOTE(review): a conventional seq2seq setup feeds the decoder the target
# sequence shifted by one step (teacher forcing) — confirm that passing the
# source sequence to the decoder input is intended.
history = model.fit(
    x=[X_train, X_train],
    y=y_train,
    batch_size=64,
    epochs=20,
    validation_data=([X_test, X_test], y_test),
)

Epoch 1/20
[1m2873/2873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 47ms/step - accuracy: 0.7209 - loss: 2.1383 - val_accuracy: 0.7625 - val_loss: 1.4856
Epoch 2/20
[1m2873/2873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 47ms/step - accuracy: 0.7698 - loss: 1.3918 - val_accuracy: 0.7818 - val_loss: 1.2380
Epoch 3/20
[1m2873/2873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m147s[0m 49ms/step - accuracy: 0.7883 - loss: 1.1502 - val_accuracy: 0.7924 - val_loss: 1.1120
Epoch 4/20
[1m2873/2873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 47ms/step - accuracy: 0.8003 - loss: 1.0007 - val_accuracy: 0.7986 - val_loss: 1.0384
Epoch 5/20
[1m2873/2873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m148s[0m 49ms/step - accuracy: 0.8116 - loss: 0.8909 - val_accuracy: 0.8028 - val_loss: 0.9936
Epoch 6/20
[1m2873/2873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 49ms/step - accuracy: 0.8189 - loss: 0.8196 - val_accuracy: 0.8060 - val_loss: 0.970

In [15]:
# Save the Model
# Persist the trained network to a single file in the working directory.
model.save(filepath='english_to_french_translation_model.h5')



In [16]:
# Translation Function
def translate_sentence(sentence, model, tokenizer_src, tokenizer_tgt):
    """Translate one sentence with the trained model and return it as text.

    The sentence is encoded with ``tokenizer_src``, padded at the end to the
    notebook-level ``max_len``, and passed to ``model.predict`` as both model
    inputs. The most likely id at each position is mapped back to a word via
    ``tokenizer_tgt.index_word``.

    Args:
        sentence: Source-language sentence (a single string).
        model: Model taking ``[encoder_input, decoder_input]`` and returning
            per-position scores over the target vocabulary.
        tokenizer_src: Fitted tokenizer for the source language.
        tokenizer_tgt: Fitted tokenizer for the target language.

    Returns:
        str: Predicted words joined by single spaces. Positions whose id has
        no entry in ``index_word`` (such as the padding id 0) are dropped, so
        the result carries no trailing or doubled spaces.
    """
    sequence = tokenizer_src.texts_to_sequences([sentence])
    sequence_padded = pad_sequences(sequence, maxlen=max_len, padding='post')
    pred_sequence = model.predict([sequence_padded, sequence_padded])
    # Only one sentence was submitted, so take row 0 of the argmax result.
    pred_indices = np.argmax(pred_sequence, axis=-1)[0]
    pred_words = (tokenizer_tgt.index_word.get(int(idx), '') for idx in pred_indices)
    # Skip empty lookups; previously they were joined too, producing the long
    # run of trailing spaces after the translated words.
    return ' '.join(word for word in pred_words if word)

# Example Translation
# Quick smoke test: push one English sentence through the trained model.
english_sentence = "How are you?"
translation = translate_sentence(
    sentence=english_sentence,
    model=model,
    tokenizer_src=eng_tokenizer,
    tokenizer_tgt=fr_tokenizer,
)
print("Translated Sentence:", translation)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 334ms/step
Translated Sentence: comment vas-tu ?                  
