<a href="https://colab.research.google.com/github/wubeshetA/ML/blob/main/school-assignments/language_transalation/French_English_translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.models import Model

In [22]:


file_path = 'french_eng.csv'

try:
    data = pd.read_csv(file_path)
except pd.errors.ParserError as e:
    print(f"Error reading CSV: {e}")

    chunksize = 1000
    for chunk in pd.read_csv(file_path, chunksize=chunksize, error_bad_lines=False):
        print(chunk.head())

In [25]:
data = pd.read_csv(file_path)

data = data.drop(columns=['Unnamed: 0'])

data.head()

Unnamed: 0,FRENCH,ENGLISH
0,Salut!,Hi.
1,Cours !,Run!
2,Courez !,Run!
3,Qui ?,Who?
4,Ça alors !,Wow!


In [26]:
# Separate input (English) and target (Ewe) texts
input_texts = data['ENGLISH'].astype(str).tolist()
target_texts = data['FRENCH'].astype(str).tolist()

# Tokenizer setup
input_tokenizer = tf.keras.preprocessing.text.Tokenizer()
target_tokenizer = tf.keras.preprocessing.text.Tokenizer()

input_tokenizer.fit_on_texts(input_texts)
target_tokenizer.fit_on_texts(target_texts)

input_sequences = input_tokenizer.texts_to_sequences(input_texts)
target_sequences = target_tokenizer.texts_to_sequences(target_texts)

max_input_len = max(len(seq) for seq in input_sequences)
max_target_len = max(len(seq) for seq in target_sequences)

input_vocab_size = len(input_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1

# Pad sequences
input_sequences = tf.keras.preprocessing.sequence.pad_sequences(input_sequences, maxlen=max_input_len, padding='post')
target_sequences = tf.keras.preprocessing.sequence.pad_sequences(target_sequences, maxlen=max_target_len, padding='post')

# Define the model
def build_model(input_vocab_size, target_vocab_size, embedding_dim=256, units=512):
    encoder_inputs = Input(shape=(None,))
    enc_emb = Embedding(input_vocab_size, embedding_dim)(encoder_inputs)
    encoder_lstm = LSTM(units, return_state=True)
    encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
    encoder_states = [state_h, state_c]

    decoder_inputs = Input(shape=(None,))
    dec_emb_layer = Embedding(target_vocab_size, embedding_dim)
    dec_emb = dec_emb_layer(decoder_inputs)
    decoder_lstm = LSTM(units, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
    decoder_dense = Dense(target_vocab_size, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)

    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    return model

model = build_model(input_vocab_size, target_vocab_size)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.summary()



Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_9 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 input_10 (InputLayer)       [(None, None)]               0         []                            
                                                                                                  
 embedding_8 (Embedding)     (None, None, 256)            3059200   ['input_9[0][0]']             
                                                                                                  
 embedding_9 (Embedding)     (None, None, 256)            6008832   ['input_10[0][0]']            
                                                                                            

In [None]:
# Training
batch_size = 128
epochs = 30

target_sequences_input = target_sequences[:, :-1]
target_sequences_output = target_sequences[:, 1:]
target_sequences_output = np.expand_dims(target_sequences_output, -1)

history = model.fit([input_sequences, target_sequences_input], target_sequences_output,
                    batch_size=batch_size, epochs=epochs, validation_split=0.2)

# Save model
model.save('french_eng_model.h5')

# Function to translate new sentences
def translate(sentence, model, input_tokenizer, target_tokenizer, max_input_len, max_target_len):
    sequence = input_tokenizer.texts_to_sequences([sentence])
    sequence = tf.keras.preprocessing.sequence.pad_sequences(sequence, maxlen=max_input_len, padding='post')
    prediction = model.predict([sequence, sequence])
    target_sequence = [np.argmax(word) for word in prediction[0]]
    target_text = target_tokenizer.sequences_to_texts([target_sequence])[0]
    return target_text



Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30

## Example

In [35]:
# Example usage
sentence1 = "I am fine"
translation = translate(sentence1, model, input_tokenizer, target_tokenizer, max_input_len, max_target_len)
print(f'Translation: {translation}')


Translation: vais bien


In [36]:
sentence2 = "French is a romantic language"
translation = translate(sentence2, model, input_tokenizer, target_tokenizer, max_input_len, max_target_len)
print(f'Translation: {translation}')

Translation: français français est est


In [41]:
sentence2 = "You are right!"
translation = translate(sentence2, model, input_tokenizer, target_tokenizer, max_input_len, max_target_len)
print(f'Translation: {translation}')

Translation: quoi bonne faits
