In [None]:
# this notebook translates text from english to french

In [None]:
# installing transformers

In [1]:
!pip install transformers datasets sacrebleu sentencepiece tensorflow --quiet

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jupyter-server 2.16.0 requires pywinpty>=2.0.1; os_name == "nt", which is not installed.

[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import numpy as np
import tensorflow as tf
from datasets import load_dataset

In [4]:
from transformers import (
    AutoTokenizer,
    TFAutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    create_optimizer
)




In [None]:
# using hugging face auto-download dataset

In [6]:
dataset = load_dataset("opus_books", "en-fr")

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/127085 [00:00<?, ? examples/s]

In [None]:
# splitting the dataset into training and testing dataset

In [7]:
train_data = dataset["train"].shuffle(seed=40).select(range(3000))
test_data  = dataset["train"].shuffle(seed=120).select(range(300))

In [9]:
train_data[0]

{'id': '3787',
 'translation': {'en': "The former was divided between admiration of the brilliancy which exercise had given to her complexion, and doubt as to the occasion's justifying her coming so far alone. The latter was thinking only of his breakfast.",
  'fr': 'Le premier, tout en admirant le teint d’Elizabeth avivé par la marche, se demandait s’il y avait réellement motif a ce qu’elle eut fait seule une si longue course ; le second ne pensait qu’a achever son déjeuner.'}}

In [10]:
test_data[0]

{'id': '26183',
 'translation': {'en': '"With gum on to the paper.',
  'fr': '– Avec de la gomme fondue sur le papier à lettres.'}}

In [11]:
en_texts = [item["translation"]["en"] for item in train_data]
fr_texts = [item["translation"]["fr"] for item in train_data]

In [12]:
# preprocessing the text in the dataset - removing all the single letters numbers and symbols and keeping only the space and words

In [15]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import re

In [16]:
def preprocess(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z?.!,¿']+", " ", text)
    return text.strip()

In [17]:
en_texts = [preprocess(t) for t in en_texts]

In [18]:
en_texts[0]

"the former was divided between admiration of the brilliancy which exercise had given to her complexion, and doubt as to the occasion's justifying her coming so far alone. the latter was thinking only of his breakfast."

In [19]:
fr_texts = [preprocess(t) for t in fr_texts]

In [20]:
fr_texts[0]

'le premier, tout en admirant le teint d elizabeth aviv par la marche, se demandait s il y avait r ellement motif a ce qu elle eut fait seule une si longue course le second ne pensait qu a achever son d jeuner.'

In [None]:
# tokanizing - converting text to numbers

In [None]:
# english tokenizer

In [21]:
tokenizer_en = keras.preprocessing.text.Tokenizer(filters='')
tokenizer_en.fit_on_texts(en_texts)
input_tensor = tokenizer_en.texts_to_sequences(en_texts)

In [None]:
# french tokenizer

In [22]:
tokenizer_fr = keras.preprocessing.text.Tokenizer(filters='')
tokenizer_fr.fit_on_texts(fr_texts)
target_tensor = tokenizer_fr.texts_to_sequences(fr_texts)

In [None]:
# padding sequences to make same length

In [23]:
input_tensor = keras.utils.pad_sequences(input_tensor, padding='post')
target_tensor = keras.utils.pad_sequences(target_tensor, padding='post')

In [24]:
input_tensor.shape

(3000, 178)

In [25]:
target_tensor.shape

(3000, 233)

In [None]:
# creating a training dataset

In [28]:
batch_size = 64
buffer_size = len(input_tensor)
steps_per_epoch = buffer_size//batch_size

dataset_tf = (
    tf.data.Dataset.from_tensor_slices((input_tensor, target_tensor))
    .shuffle(buffer_size)
    .batch(batch_size, drop_remainder=True)
)

In [None]:
# building seq2seq model using encoder-decoder architecture

In [31]:
embedding_dim = 256
units = 512
vocab_inp_size = len(tokenizer_en.word_index) + 1
vocab_tar_size = len(tokenizer_fr.word_index) + 1

In [34]:
print(type(vocab_inp_size), vocab_inp_size)
print(type(vocab_tar_size), vocab_tar_size)
print(type(embedding_dim), embedding_dim)
print(type(units), units)

<class 'int'> 13167
<class 'int'> 13780
<class 'int'> 256
<class 'int'> 512


In [32]:
# encoder

In [45]:
from tensorflow.keras import layers, Model
from tensorflow.keras.layers import Input

vocab_inp_size = 13167

encoder_inputs = Input(shape=(None,))
enc_emb = layers.Embedding(input_dim=vocab_inp_size, output_dim=embedding_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = layers.LSTM(units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]

encoder_model = Model(encoder_inputs, encoder_states)
encoder_model.summary()

In [None]:
# decoder

In [49]:
vocab_tar_size = 13780  

decoder_inputs = Input(shape=(None,))  # target sequences (e.g. French sentences)
dec_emb_layer = layers.Embedding(input_dim=vocab_tar_size, output_dim=embedding_dim, mask_zero=True)
dec_emb = dec_emb_layer(decoder_inputs)

decoder_lstm = layers.LSTM(units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)

decoder_dense = layers.Dense(vocab_tar_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Full model (encoder + decoder)
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [53]:
input_tensor = tokenizer_en.texts_to_sequences(en_texts)
target_tensor = tokenizer_fr.texts_to_sequences(fr_texts)

In [None]:
# padding both the sides

In [54]:
input_tensor = keras.utils.pad_sequences(input_tensor, padding='post')
target_tensor = keras.utils.pad_sequences(target_tensor, padding='post')

decoder_input_data = target_tensor[:, :-1]   
decoder_target_data = target_tensor[:, 1:]   

encoder_input_data = input_tensor

In [None]:
# training the model

In [55]:
history = model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=64,
    epochs=20,
    validation_split=0.2
)

Epoch 1/20
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m303s[0m 8s/step - accuracy: 0.7588 - loss: 8.2166 - val_accuracy: 0.9040 - val_loss: 7.5629
Epoch 2/20
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m297s[0m 8s/step - accuracy: 0.9082 - loss: 7.2654 - val_accuracy: 0.9022 - val_loss: 7.4213
Epoch 3/20
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m379s[0m 10s/step - accuracy: 0.9077 - loss: 7.0055 - val_accuracy: 0.9015 - val_loss: 7.3913
Epoch 4/20
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m303s[0m 8s/step - accuracy: 0.9072 - loss: 6.8406 - val_accuracy: 0.9050 - val_loss: 7.4030
Epoch 5/20
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m412s[0m 11s/step - accuracy: 0.9072 - loss: 6.6784 - val_accuracy: 0.9029 - val_loss: 7.4062
Epoch 6/20
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m213s[0m 6s/step - accuracy: 0.9086 - loss: 6.5162 - val_accuracy: 0.9041 - val_loss: 7.4060
Epoch 7/20
[1m38/38[0m [32m━━

In [None]:
# encoder for inference

In [57]:
encoder_model_inf = Model(encoder_inputs, encoder_states)

In [None]:
# decoder for inference

In [58]:
decoder_state_input_h = Input(shape=(units,))
decoder_state_input_c = Input(shape=(units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2 = dec_emb_layer(decoder_inputs)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(
    dec_emb2, initial_state=decoder_states_inputs
)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)

decoder_model_inf = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2
)

In [None]:
# creating reverse word mappings - converting numbers to words

In [59]:
reverse_input_index = {v: k for k, v in tokenizer_en.word_index.items()}
reverse_target_index = {v: k for k, v in tokenizer_fr.word_index.items()}

In [None]:
# building a fuction to translate

In [60]:
def translate_sequence(input_seq):
    states_value = encoder_model_inf.predict(input_seq)

    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer_fr.word_index.get('<sos>', 1)

    translation = ''
    for _ in range(50):  # limit to 50 words
        output_tokens, h, c = decoder_model_inf.predict([target_seq] + states_value)

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = reverse_target_index.get(sampled_token_index, '')

        if sampled_word == '<eos>' or sampled_word == '':
            break

        translation += ' ' + sampled_word
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return translation.strip()

In [None]:
# trying on some sentences

In [61]:
for i in range(3):
    input_seq = input_tensor[i:i+1]
    print("English:", en_texts[i])
    print("Translated to French:", translate_sequence(input_seq))

English: the former was divided between admiration of the brilliancy which exercise had given to her complexion, and doubt as to the occasion's justifying her coming so far alone. the latter was thinking only of his breakfast.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 696ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 597ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 