## Import

In [7]:
import re
import pandas as pd
import numpy as np
import joblib
import os


import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, Dense, TimeDistributed
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint



from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

## Preprocessing

### Read file

In [8]:
def read_file(filename):

  """Reads the contents of a file.



  Args:

    filename: The path to the file.



  Returns:

    The contents of the file as a string.

  """

  try:

    with open(filename, 'r', encoding='UTF-8') as f:

      contents = f.read()

    return contents

  except FileNotFoundError:

    print(f"File not found: {filename}")

    return None


In [9]:
file_contents = read_file('/kaggle/input/fra-eng/fra.txt')


### Remove unnecessary text

In [10]:
# remove any sentence starts with "CC-BY"

new_text = re.sub(r'CC-BY.+', '', file_contents)

len(new_text.split('\n'))

232737

In [11]:
# remove any comma in our data to properly create a csv file

# Relax.  Relaxe, Max !,

# Relax.  Cool, Raoul !,

new_text = re.sub(r'[,\.!?»«"]', '', new_text)

len(new_text.split('\n'))

232737

In [12]:
# separate the source and target sentence with comma to convert it to csv file

new_text = re.sub(r'\t', ',', new_text.lower())

len(new_text.split('\n'))

232737

### Save file

In [13]:
os.makedirs("artifacts", exist_ok=True)

In [14]:
with open('artifacts/fra_new.csv', 'w') as f:

  f.write(new_text)

### Load the csv file

In [15]:
df = pd.read_csv('artifacts/fra_new.csv', names=['target', 'source', '0'])

df.drop('0', axis=1, inplace=True)

df.head()

Unnamed: 0,target,source
0,go,va
1,go,marche
2,go,en route
3,go,bouge
4,hi,salut


### Source language preprocessing

#### Tokenization

In [16]:
# Sample data

source_sentences = df['source']

# Create a Tokenizer instance

tokenizer_src = Tokenizer(num_words=30000, oov_token='<OOV>')



# Fit the tokenizer on the texts

tokenizer_src.fit_on_texts(source_sentences)



# Convert texts to sequences (lists of integers)

sequences_src = tokenizer_src.texts_to_sequences(source_sentences)

In [17]:
# Print word index (vocabulary)

word2idx_src = tokenizer_src.word_index

word2idx_src["<PAD>"] = 0

idx2word_src = {v: k for k, v in word2idx_src.items()}

# print(word2idx_src)

#### Padding

In [18]:
# some configurations

MAX_SEQUENCE_LENGTH_SRC = max([len(seq) for seq in sequences_src])

In [19]:
MAX_SEQUENCE_LENGTH_SRC

58

In [20]:
# Pad source sequences

padded_source_sequence = pad_sequences(sequences_src, maxlen=MAX_SEQUENCE_LENGTH_SRC, padding='post', value=0)

In [21]:
padded_source_sequence

array([[  98,    0,    0, ...,    0,    0,    0],
       [ 811,    0,    0, ...,    0,    0,    0],
       [  20,  619,    0, ...,    0,    0,    0],
       ...,
       [  44,  157,   34, ...,    0,    0,    0],
       [  13,   15,   90, ...,    0,    0,    0],
       [ 762,   23,  914, ...,   75,    8, 7446]], dtype=int32)

## Target language preprocessing

### Add Special Tokens `<start>` and `<end>`

> Example:



* Target input: [`<start>`, comment, ça, va, ?]



* Target output: [comment, ça, va, ?, `<end>`]

In [33]:
target_sentences = df['target']

target_sentences_st_end = target_sentences.apply(lambda x: '<START> ' + x + ' <END>')

input_target = target_sentences.apply(lambda x: '<START> ' + x )

output_target = target_sentences.apply(lambda x:   x + ' <END>')

In [34]:
input_target

0                                                <START> go
1                                                <START> go
2                                                <START> go
3                                                <START> go
4                                                <START> hi
                                ...                        
232731    <START> death is something that we're often di...
232732    <START> since there are usually multiple websi...
232733    <START> if someone who doesn't know your backg...
232734    <START> it may be impossible to get a complete...
232735    <START> i went drinking with one of my boyfrie...
Name: target, Length: 232736, dtype: object

In [35]:
output_target

0                                                  go <END>
1                                                  go <END>
2                                                  go <END>
3                                                  go <END>
4                                                  hi <END>
                                ...                        
232731    death is something that we're often discourage...
232732    since there are usually multiple websites on a...
232733    if someone who doesn't know your background sa...
232734    it may be impossible to get a completely error...
232735    i went drinking with one of my boyfriend's fri...
Name: target, Length: 232736, dtype: object

In [36]:
target_sentences_st_end

0                                          <START> go <END>
1                                          <START> go <END>
2                                          <START> go <END>
3                                          <START> go <END>
4                                          <START> hi <END>
                                ...                        
232731    <START> death is something that we're often di...
232732    <START> since there are usually multiple websi...
232733    <START> if someone who doesn't know your backg...
232734    <START> it may be impossible to get a complete...
232735    <START> i went drinking with one of my boyfrie...
Name: target, Length: 232736, dtype: object

### Tokenization


In [44]:
# Create a Tokenizer instance

tokenizer_trg = Tokenizer(num_words=30000, oov_token='<OOV>', filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n', lower=False)



# Fit the tokenizer on the texts

tokenizer_trg.fit_on_texts(target_sentences_st_end)



# Convert texts to sequences (lists of integers)

input_target_sequence = tokenizer_trg.texts_to_sequences(input_target)

output_target_sequence = tokenizer_trg.texts_to_sequences(output_target)


In [46]:
# input_target_sequence

In [48]:
# output_target_sequence

In [49]:
word2idx_trg = tokenizer_trg.word_index

word2idx_trg["<PAD>"] = 0

idx2word_trg = {v: k for k, v in word2idx_trg.items()}

# print(word2idx_trg)

In [51]:
# word2idx_trg

### Padding

In [61]:
# some configurations

MAX_SEQUENCE_LENGTH_IN_TRG = max([len(seq) for seq in input_target_sequence])

In [62]:
MAX_SEQUENCE_LENGTH_IN_TRG

56

In [63]:
input_target_padded = pad_sequences(input_target_sequence, maxlen=MAX_SEQUENCE_LENGTH_IN_TRG , padding='post', value=0)
output_target_padded = pad_sequences(output_target_sequence, maxlen=MAX_SEQUENCE_LENGTH_IN_TRG , padding='post', value=0)


In [64]:
input_target_padded

array([[  2,  45,   0, ...,   0,   0,   0],
       [  2,  45,   0, ...,   0,   0,   0],
       [  2,  45,   0, ...,   0,   0,   0],
       ...,
       [  2,  70, 291, ...,   0,   0,   0],
       [  2,  14, 184, ...,   0,   0,   0],
       [  2,   4, 158, ...,  35,  55, 149]], dtype=int32)

In [65]:
output_target_padded

array([[ 45,   3,   0, ...,   0,   0,   0],
       [ 45,   3,   0, ...,   0,   0,   0],
       [ 45,   3,   0, ...,   0,   0,   0],
       ...,
       [ 70, 291,  86, ...,   0,   0,   0],
       [ 14, 184,  29, ...,   0,   0,   0],
       [  4, 158, 796, ...,  55, 149,   3]], dtype=int32)

## Save word2idx_src & idx2word_trg

In [66]:
# joblib.dump(word2idx_src, 'artifacts/word2idx_src.pkl')

# joblib.dump(idx2word_trg, 'artifacts/idx2word_trg.pkl')

## Model

In [67]:
# Config

EMBED_SIZE = 256   # Dimension of the word embeddings

NUM_ENCODER_TOKENS = len(word2idx_src)  # Vocabulary size for the source language

NUM_DECODER_TOKENS = len(word2idx_trg)  # Vocabulary size for the target language


In [68]:
NUM_ENCODER_TOKENS, NUM_DECODER_TOKENS

(35615, 16791)

In [73]:
# Encoder (French input)
encoder_input = Input(shape=(MAX_SEQUENCE_LENGTH_SRC,))
encoder_embedding = Embedding(NUM_ENCODER_TOKENS, EMBED_SIZE)(encoder_input)
encoder_lstm, state_h, state_c = LSTM(256, return_state=True)(encoder_embedding)

In [75]:
# Decoder (English output)
decoder_input = Input(shape=(MAX_SEQUENCE_LENGTH_IN_TRG ,))
decoder_embedding = Embedding(NUM_DECODER_TOKENS, EMBED_SIZE)(decoder_input)
decoder_lstm = LSTM(256, return_sequences=True)(decoder_embedding, initial_state=[state_h, state_c])

# TimeDistributed Dense layer for English sequence output
decoder_output = TimeDistributed(Dense(NUM_DECODER_TOKENS, activation='softmax'))(decoder_lstm)

In [76]:
# Define the model
model = Model([encoder_input, decoder_input], decoder_output)

In [77]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [None]:
# model = joblib.load("/kaggle/working/best_model.keras")

In [79]:
early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True, verbose=1)

model_checkpoint = ModelCheckpoint('artifacts/best_model.keras', monitor='val_loss', save_best_only=True, verbose=1)

model.fit([padded_source_sequence, input_target_padded], output_target_padded,

          batch_size=64,

          epochs=30,

          validation_split=0.2,

          callbacks=[early_stopping, model_checkpoint])


Epoch 1/30
[1m2910/2910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 118ms/step - accuracy: 0.9174 - loss: 0.5206
Epoch 1: val_loss improved from inf to 0.86318, saving model to artifacts/best_model.keras
[1m2910/2910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m392s[0m 135ms/step - accuracy: 0.9174 - loss: 0.5206 - val_accuracy: 0.8646 - val_loss: 0.8632
Epoch 2/30
[1m2910/2910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 118ms/step - accuracy: 0.9415 - loss: 0.3249
Epoch 2: val_loss improved from 0.86318 to 0.74736, saving model to artifacts/best_model.keras
[1m2910/2910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m391s[0m 134ms/step - accuracy: 0.9415 - loss: 0.3248 - val_accuracy: 0.8798 - val_loss: 0.7474
Epoch 3/30
[1m2910/2910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 118ms/step - accuracy: 0.9565 - loss: 0.2202
Epoch 3: val_loss improved from 0.74736 to 0.69767, saving model to artifacts/best_model.keras
[1m2910/2910[0m [32m━━━━━━━━━━

<keras.src.callbacks.history.History at 0x79471433cf40>

In [80]:
joblib.dump(tokenizer_src, 'artifacts/tokenizer_src.pkl')
joblib.dump(tokenizer_trg, 'artifacts/tokenizer_trg.pkl')

['artifacts/tokenizer_trg.pkl']

In [86]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

def predict_translation(model, input_sequence, tokenizer_src, tokenizer_trg, max_length_src, max_length_trg):
    # Tokenize and pad the input sequence
    input_seq = tokenizer_src.texts_to_sequences([input_sequence])
    input_seq = pad_sequences(input_seq, maxlen=max_length_src, padding='post')
    
    # Initialize target sequence with the start token
    target_seq = np.zeros((1, max_length_trg))
    target_seq[0, 0] = tokenizer_trg.word_index['<START>']
    
    # Generate the translation
    output_sequence = []
    for i in range(max_length_trg - 1):
        output_tokens = model.predict([input_seq, target_seq])
        sampled_token_index = np.argmax(output_tokens[0, i, :])
        sampled_token = tokenizer_trg.index_word.get(sampled_token_index, '<UKN>')
        
        if sampled_token == '<END>':
            break
        
        output_sequence.append(sampled_token)
        
        # Update target sequence for next token prediction
        target_seq[0, i+1] = sampled_token_index
    
    return ' '.join(output_sequence)

# Example usage
input_text = "Quel est ton nom" # French input
translated_text = predict_translation(model, input_text, tokenizer_src, tokenizer_trg, MAX_SEQUENCE_LENGTH_SRC, MAX_SEQUENCE_LENGTH_IN_TRG)
print(f"Input (French): {input_text}")
print(f"Translated (English): {translated_text}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
Input (French): Quel est ton nom
Translated (English): what's your name
