# Import Required Libraries

In [None]:
import numpy as np
np.random.seed(42)

import pandas as pd
pd.set_option("display.float_format", lambda x: "%.4f" % x)
pd.set_option("display.max_colwidth", None)

import string
import re
import nltk
from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, RepeatVector, TimeDistributed, Dropout
from tensorflow.keras.models import Model, Sequential, load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

from nltk.translate.bleu_score import corpus_bleu

import warnings
warnings.filterwarnings('ignore')

# Load Data

In [None]:
# Read only the first 20,000 rows of the English/French sentence-pair CSV.
# NOTE(review): this is an absolute, machine-specific path — point it at your
# own copy of the dataset (ideally through a configurable data directory).
df = pd.read_csv('/mnt/d/Datasets/eng_-french.csv', nrows=20000)
# Shuffle with an explicit seed so re-running this cell on its own always
# yields the same row order; without it the order depends on how many random
# draws the kernel has already made from the global NumPy generator.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head()

In [None]:
# Show (rows, columns) to confirm how many sentence pairs were loaded.
df.shape

# Exploratory Data Analysis (EDA)

In [None]:
def word_freq(data, label, title):
    """Plot and return the 50 most frequent word tokens of a text column.

    Every entry of ``data[label]`` is lower-cased and tokenized with
    ``nltk.word_tokenize``. Tokens made up solely of ASCII punctuation are
    discarded, the remaining tokens are counted, and the most common ones are
    drawn as a horizontal bar chart on a new matplotlib figure.

    Args:
        data: DataFrame holding the text column.
        label: Name of the column whose strings are analysed.
        title: Title of the bar chart.

    Returns:
        A list of up to 50 ``(word, count)`` tuples, most frequent first.
    """
    punctuation_chars = set(string.punctuation)
    token_counts = Counter()

    for sentence in data[label].values:
        tokens = nltk.word_tokenize(sentence.lower())
        # Drop a token only when *every* character is punctuation. A substring
        # test against string.punctuation would let multi-character
        # punctuation tokens such as "..." or "''" through as words.
        token_counts.update(
            tok for tok in tokens
            if any(ch not in punctuation_chars for ch in tok)
        )

    freq_top = token_counts.most_common(50)

    words = [word for word, _ in freq_top]
    counts = [count for _, count in freq_top]

    plt.figure(figsize=(15, 25))
    plt.barh(words, counts)
    plt.title(title)
    plt.xlabel("Frequency")
    plt.ylabel("Words")

    return freq_top

In [None]:
# Top-50 word frequencies of the raw (uncleaned) English column.
en_freq_top = word_freq(
    data=df,
    label="English words/sentences",
    title="English Top 50 Words - Before Cleaning",
)

In [None]:
# Top-50 word frequencies of the raw (uncleaned) French column.
fr_freq_top = word_freq(
    data=df,
    label="French words/sentences",
    title="French Top 50 Words - Before Cleaning",
)

# Preprocess

In [None]:
def clean_text(text):
    """Normalize a sentence before tokenization.

    The text is lower-cased, digits and ASCII punctuation are removed, and
    every run of whitespace is collapsed into a single space.

    Args:
        text: Raw sentence string.

    Returns:
        The cleaned sentence, without leading or trailing whitespace.
    """
    text = text.lower()
    text = re.sub(r"\d", "", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    # str.split() with no argument splits on any Unicode whitespace, so this
    # covers the narrow no-break space (U+202F), the ordinary no-break space
    # (U+00A0) and tabs, and it also removes the double spaces left behind
    # when a digit or punctuation mark standing between two spaces is deleted.
    return " ".join(text.split())

In [None]:
# Normalize both language columns in place with clean_text.
df["English words/sentences"] = df["English words/sentences"].map(clean_text)
df["French words/sentences"] = df["French words/sentences"].map(clean_text)

In [None]:
# Work on a plain NumPy array (one row per sentence pair) from here on.
data_array = df.to_numpy()

In [None]:
# Number of rows that go to the training portion (90% of all pairs, rounded down).
train_size = int(len(data_array) * 0.9)

In [None]:
# First train_size rows are used for training, the remainder is held out for testing.
train_data, test_data = data_array[:train_size], data_array[train_size:]

In [None]:
# Column positions inside each data_array row.
# NOTE(review): assumes English is the first CSV column and French the second — confirm.
idx_en, idx_fr = 0, 1

In [None]:
# Build the English vocabulary from the training rows only, so that words
# occurring solely in the held-out test rows do not leak into the vocabulary.
en_tokenizer = Tokenizer()
en_tokenizer.fit_on_texts(train_data[:, idx_en])
# +1 because Keras word indices start at 1; index 0 is left for padding.
en_vocab_size = len(en_tokenizer.word_index) + 1
# Longest sentence (in whitespace-separated words) over all rows; computed on
# the whole array so that padding to this length never truncates a test sentence.
en_maxlen = max(len(sentence.split()) for sentence in data_array[:, idx_en])

print("English Vocabulary Size:", en_vocab_size)
print("English Max Length:", en_maxlen)

In [None]:
# Build the French vocabulary from the training rows only, so that words
# occurring solely in the held-out test rows do not leak into the vocabulary
# (and do not add output classes the model never sees during training).
fr_tokenizer = Tokenizer()
fr_tokenizer.fit_on_texts(train_data[:, idx_fr])
# +1 because Keras word indices start at 1; index 0 is left for padding.
fr_vocab_size = len(fr_tokenizer.word_index) + 1
# Longest sentence (in whitespace-separated words) over all rows; computed on
# the whole array so that padding to this length never truncates a test sentence.
fr_maxlen = max(len(sentence.split()) for sentence in data_array[:, idx_fr])

print("French Vocabulary Size:", fr_vocab_size)
print("French Max Length:", fr_maxlen)

In [None]:
# Encode the English training sentences as integer ids, padded at the end up to en_maxlen.
X_train = pad_sequences(
    en_tokenizer.texts_to_sequences(train_data[:, idx_en]),
    maxlen=en_maxlen,
    padding='post',
)

In [None]:
# Encode the French training sentences as integer ids, padded at the end up to fr_maxlen.
y_train = fr_tokenizer.texts_to_sequences(train_data[:, idx_fr])
y_train = pad_sequences(y_train, maxlen=fr_maxlen, padding='post')
# One-hot encode the whole (samples, fr_maxlen) id matrix in a single call.
# This yields the same (samples, fr_maxlen, fr_vocab_size) tensor as encoding
# sentence by sentence, but without first building a Python list of
# per-sentence arrays that np.array then has to copy into a second tensor.
# NOTE(review): to_categorical squeezes a trailing axis of size 1, so this
# assumes fr_maxlen > 1.
y_train = to_categorical(y_train, num_classes=fr_vocab_size)

In [None]:
# Encode the English test sentences as integer ids, padded at the end up to en_maxlen.
X_test = pad_sequences(
    en_tokenizer.texts_to_sequences(test_data[:, idx_en]),
    maxlen=en_maxlen,
    padding='post',
)

In [None]:
# Encode the French test sentences as integer ids, padded at the end up to fr_maxlen.
y_test = fr_tokenizer.texts_to_sequences(test_data[:, idx_fr])
y_test = pad_sequences(y_test, maxlen=fr_maxlen, padding='post')
# One-hot encode the whole (samples, fr_maxlen) id matrix in a single call.
# This yields the same (samples, fr_maxlen, fr_vocab_size) tensor as encoding
# sentence by sentence, but without first building a Python list of
# per-sentence arrays that np.array then has to copy into a second tensor.
# NOTE(review): to_categorical squeezes a trailing axis of size 1, so this
# assumes fr_maxlen > 1.
y_test = to_categorical(y_test, num_classes=fr_vocab_size)

# Model

In [None]:
# Encoder-decoder translation network, declared as a single layer list:
#   Embedding -> LSTM            : encode the English ids into one 256-d vector
#   RepeatVector(fr_maxlen)      : feed that vector to every output position
#   LSTM(return_sequences=True)  : produce one hidden state per French position
#   TimeDistributed(Dense) x2    : per-position softmax over the French vocabulary
model = Sequential([
    Embedding(en_vocab_size, 512, input_length=en_maxlen, mask_zero=True),
    LSTM(256),
    RepeatVector(fr_maxlen),
    LSTM(256, return_sequences=True),
    TimeDistributed(Dense(512, activation='relu')),
    Dropout(0.2),
    TimeDistributed(Dense(fr_vocab_size, activation='softmax')),
])

In [None]:
# Targets are one-hot vectors, hence categorical (not sparse) cross-entropy.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
)

In [None]:
# Stop training once val_loss has gone 5 epochs without improving.
early_stopping = EarlyStopping(monitor="val_loss", patience=5)
# Write the model to disk only when val_loss reaches a new minimum.
checkpoint = ModelCheckpoint(
    filepath='en_to_fr.h5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=0,
)

In [None]:
# Train for up to 100 epochs, holding out 10% of the training samples for
# validation; the callbacks stop early and checkpoint the best model.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=100,
    validation_split=0.1,
    callbacks=[early_stopping, checkpoint],
)

In [None]:
# One row per epoch, one column per metric recorded during training.
history_df = pd.DataFrame.from_dict(history.history)
history_df.head()

In [None]:
# Training vs. validation loss per epoch.
plt.plot(history_df["loss"], label="loss")
plt.plot(history_df["val_loss"], label="val_loss")
plt.title("Loss")
plt.legend()
plt.show()

In [None]:
# Replace the in-memory model with the checkpoint file written during training.
model = tf.keras.models.load_model("en_to_fr.h5")

# Translate and Evaluate

In [None]:
def translate(text):
    """Translate one English sentence into French with the trained model.

    The sentence is normalized with ``clean_text``, encoded with
    ``en_tokenizer``, padded to ``en_maxlen`` and passed through ``model``.
    The most probable French word id is taken at every output position and
    mapped back to a word through ``fr_tokenizer``.

    Args:
        text: English sentence as a string.

    Returns:
        The predicted French words joined by single spaces; an empty string
        if no output position maps to a vocabulary word.
    """
    # Apply the same normalization the training sentences went through.
    # The tokenizer's default filters leave apostrophes and digits in place,
    # so e.g. "i'm" would otherwise miss the vocabulary and be dropped silently.
    cleaned = clean_text(text)
    encoded = en_tokenizer.texts_to_sequences([cleaned])
    encoded = pad_sequences(encoded, maxlen=en_maxlen, padding='post')
    probabilities = model.predict(encoded, verbose=0)[0]
    # Most likely French word id at each output position.
    word_ids = probabilities.argmax(axis=-1)
    words = (fr_tokenizer.index_word.get(int(word_id)) for word_id in word_ids)
    # Ids with no vocabulary entry (such as the padding id 0) map to None and are skipped.
    return ' '.join(word for word in words if word is not None)

In [None]:
# Quick check of the translator on a short sentence.
translate('i snore')

In [None]:
# Translate every test sentence and score it against the reference French text.
raw, actual_results, predicted_results = [], [], []
bleu_scores = []

# Predict in chunks instead of issuing one predict() call per sentence. Only
# the argmax ids of each chunk are kept, so the probability tensor for the
# whole test set is never held in memory at once.
eval_batch_size = 256
for start in range(0, len(X_test), eval_batch_size):
    batch_probs = model.predict(X_test[start:start + eval_batch_size], verbose=0)
    batch_ids = batch_probs.argmax(axis=-1)

    for offset, integers in enumerate(batch_ids):
        i = start + offset
        target = [fr_tokenizer.index_word.get(int(integer)) for integer in integers]
        # Ids with no vocabulary entry (such as the padding id 0) map to None and are skipped.
        translated = ' '.join(t for t in target if t is not None)

        predicted_results.append(translated)
        actual_results.append(test_data[i, idx_fr])
        raw.append(test_data[i, idx_en])

        reference_tokens = test_data[i, idx_fr].split()
        hypothesis_tokens = translated.split()
        # corpus_bleu expects, for each hypothesis, a *list of references*,
        # each reference being a token list — hence the double brackets.
        # With a single level of nesting every reference word is treated as
        # its own reference "sentence" made of characters, and the unigram
        # score collapses to (almost) zero.
        bleu_scores.append(
            corpus_bleu([[reference_tokens]], [hypothesis_tokens], weights=(1.0, 0, 0, 0))
        )

# Results

In [None]:
# Side-by-side table: source sentence, reference, model output and its BLEU score.
result_df = pd.DataFrame({"English": raw}).assign(**{
    "French (Actual)": actual_results,
    "French (Predicted)": predicted_results,
    "BLEU": bleu_scores,
})

In [None]:
# Inspect five randomly chosen rows of the results table.
result_df.sample(5)