<a href="https://colab.research.google.com/github/var-github/ACM_ML/blob/main/Language_translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [36]:
# Download the English-French translation dataset from Kaggle via kagglehub.
# `path` holds the location returned by dataset_download; a later cell builds
# the CSV file path from it.
import kagglehub
path = kagglehub.dataset_download('devicharith/language-translation-englishfrench')
print('Data source import complete')

Data source import complete


In [37]:
!pip install --upgrade tensorflow



In [38]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer, tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential,load_model
from tensorflow.keras.layers import LSTM,Dense,Embedding, RepeatVector, TimeDistributed
from tensorflow.keras.callbacks import ModelCheckpoint
import pandas as pd
from string import punctuation
from google.colab import files

In [39]:
# Point `path` at the CSV inside the downloaded dataset directory. The suffix
# check keeps this cell idempotent: re-running it must not append the file
# name a second time.
csv_name = "/eng_-french.csv"
if not path.endswith(csv_name):
    path = path + csv_name
dataset = pd.read_csv(path)
# Keep only the first 70,000 rows.
dataset = dataset[:70000]
# Shuffle the rows. The fixed seed makes the shuffle, and therefore the
# position-based train/test split done later, reproducible across restarts.
dataset = dataset.sample(frac=1, random_state=42)
# Dataset has 2 parts: "English words/sentences" and "French words/sentences".

In [40]:
# Translation table deleting ASCII punctuation, digits and BOTH French
# guillemets. Built once here instead of looping over characters on every call.
_CHARS_TO_STRIP = str.maketrans("", "", punctuation + "0123456789«»")


def clean_string(s):
    """Normalize a sentence before tokenization.

    Lower-cases the text, deletes every ASCII punctuation character, digit and
    French guillemet (« and »), then trims leading/trailing whitespace.
    Characters are deleted rather than replaced by a space, so "c'est"
    becomes "cest".

    Args:
        s: The raw sentence as a string.

    Returns:
        The cleaned string.
    """
    return s.lower().translate(_CHARS_TO_STRIP).strip()

# Normalize both language columns with clean_string, writing the result back
# into the same columns.
for column in ("English words/sentences", "French words/sentences"):
    dataset[column] = dataset[column].apply(clean_string)

In [41]:
def create_tokenizer(lines):
    """Build a Keras ``Tokenizer`` and fit it on ``lines``.

    Args:
        lines: The texts handed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted ``Tokenizer``, constructed with default arguments.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

# Fit one tokenizer per language on the cleaned sentences.
src_tokenizer = create_tokenizer(dataset["English words/sentences"])
tar_tokenizer = create_tokenizer(dataset["French words/sentences"])

# Persist both tokenizers as JSON so they can be reloaded from disk later.
for json_path, fitted in (
    ('src_tokenizer.json', src_tokenizer),
    ('tar_tokenizer.json', tar_tokenizer),
):
    with open(json_path, 'w') as f:
        f.write(fitted.to_json())

In [42]:
# Reload both tokenizers from the JSON files on disk.
with open('src_tokenizer.json') as src_file:
    src_tokenizer = tokenizer_from_json(src_file.read())
with open('tar_tokenizer.json') as tar_file:
    tar_tokenizer = tokenizer_from_json(tar_file.read())

# Vocabulary sizes are one larger than the number of known words
# (presumably to leave room for the padding id 0 -- confirm).
src_vocab_size = 1 + len(src_tokenizer.word_index)
tar_vocab_size = 1 + len(tar_tokenizer.word_index)

In [None]:
def max_len(lines):
    """Return the largest whitespace-separated word count among ``lines``.

    Args:
        lines: Iterable of strings.

    Returns:
        The word count of the longest line, or 0 when ``lines`` is empty
        (instead of letting ``max`` raise ``ValueError`` on empty input).
    """
    return max((len(line.split()) for line in lines), default=0)

In [None]:
# Longest sentence (in words) for each language; used below as the padded
# sequence lengths.
src_length = max_len(dataset["English words/sentences"])
tar_length = max_len(dataset["French words/sentences"])

In [None]:
# Hold out the trailing ~15% of the rows for testing; the split is purely
# positional, so it depends on the current row order of `dataset`.
test_propotion = 0.15
n_rows = len(dataset)
train_size = n_rows - int(n_rows * test_propotion)
train_set, test_set = dataset.iloc[:train_size], dataset.iloc[train_size:]

In [None]:
def _to_padded_ids(tokenizer, texts, length):
    """Convert texts to integer sequences and post-pad them to ``length``."""
    return pad_sequences(tokenizer.texts_to_sequences(texts), padding='post', maxlen=length)


# Model inputs: English sentences as padded id sequences.
x_train = _to_padded_ids(src_tokenizer, train_set["English words/sentences"], src_length)
x_test = _to_padded_ids(src_tokenizer, test_set["English words/sentences"], src_length)

# Model targets: French sentences as padded id sequences.
y_train = _to_padded_ids(tar_tokenizer, train_set["French words/sentences"], tar_length)
y_test = _to_padded_ids(tar_tokenizer, test_set["French words/sentences"], tar_length)

In [None]:
def encode_output(sequences, vocab_size):
    """One-hot encode a batch of integer sequences.

    Args:
        sequences: 2-D array of token ids, indexed as
            (n_sequences, sequence_length). Its ``.shape`` is read, so it must
            be an array rather than a plain list.
        vocab_size: Number of classes on the one-hot axis.

    Returns:
        Array of shape (n_sequences, sequence_length, vocab_size).
    """
    # One vectorized call replaces the per-sequence Python loop and the extra
    # list -> np.array copy, which roughly doubled peak memory for this
    # (already very large) tensor.
    y = to_categorical(sequences, num_classes=vocab_size)
    # Keep the explicit reshape: to_categorical may squeeze a trailing axis of
    # length 1, and callers rely on a 3-D result.
    return y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)

In [None]:
# One-hot encode the French targets. The ndim guards make this cell safe to
# re-run: the names are rebound to their own encoded version, so a second run
# would otherwise try to one-hot encode an already 3-D array.
if y_train.ndim == 2:
    y_train = encode_output(y_train, tar_vocab_size)
if y_test.ndim == 2:
    y_test = encode_output(y_test, tar_vocab_size)

In [None]:
def create_model(src_vocab, tar_vocab, tar_length, n_units):
    """Assemble the encoder-decoder translation network.

    Layers, in order: an ``Embedding`` over the source vocabulary (with
    ``mask_zero=True``), an ``LSTM`` that reduces the input to a single vector,
    a ``RepeatVector`` that repeats that vector ``tar_length`` times, a second
    ``LSTM`` returning the full sequence, and a time-distributed softmax
    ``Dense`` over the target vocabulary.

    Args:
        src_vocab: Input dimension of the embedding layer.
        tar_vocab: Number of output classes per time step.
        tar_length: Number of output time steps.
        n_units: Width of the embedding and of both LSTM layers.

    Returns:
        The uncompiled ``Sequential`` model.
    """
    return Sequential([
        Embedding(src_vocab, n_units, mask_zero=True),
        LSTM(n_units),
        RepeatVector(tar_length),
        LSTM(n_units, return_sequences=True),
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    ])

# Build the network with 128 units per layer, compile it for one-hot targets,
# fix the input shape, and write the still-untrained model to disk.
model = create_model(
    src_vocab=src_vocab_size,
    tar_vocab=tar_vocab_size,
    tar_length=tar_length,
    n_units=128,
)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=["accuracy"])
model.build(input_shape=(None, src_length))
model.save('language_transform.keras')

In [43]:
# Load the model from 'language_transform.keras'; that file holds whatever was
# last written by model.save (the untrained model, or the trained one after
# the training cell has run).
model = load_model('language_transform.keras')

In [None]:
# Write a checkpoint at the end of every epoch; the file name carries the
# zero-padded epoch number.
checkpoint = ModelCheckpoint('model{epoch:03d}.keras', save_freq='epoch')
model.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=2,
    callbacks=[checkpoint],
    verbose=1,
)
# Overwrite the saved model with the trained weights and offer it for
# download through the Colab file helper.
model.save('language_transform.keras')
files.download('language_transform.keras')

Epoch 1/2
[1m930/930[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1851s[0m 2s/step - Accuracy: 0.8434 - loss: 0.6726
Epoch 2/2
[1m930/930[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1848s[0m 2s/step - Accuracy: 0.8494 - loss: 0.6321


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [44]:
def word_for_id(integer, tokenizer):
    """Map a token id back to its word.

    Args:
        integer: The token id to look up.
        tokenizer: Object exposing ``word_index`` (word -> id) and, for a
            fitted Keras ``Tokenizer``, the reverse map ``index_word``.

    Returns:
        The word whose id equals ``integer``, or ``None`` if there is none.
    """
    # Prefer the reverse map: a direct lookup instead of scanning the whole
    # vocabulary for every decoded token.
    index_word = getattr(tokenizer, "index_word", None)
    if index_word is not None:
        return index_word.get(integer)
    # Fallback for tokenizer-like objects that only carry word_index.
    return next(
        (word for word, index in tokenizer.word_index.items() if index == integer),
        None,
    )

In [45]:
def predict_seq(model, tokenizer, source):
    """Translate one encoded source sentence into target-language text.

    Args:
        model: Model whose ``predict`` returns, for a batch of one, a sequence
            of per-time-step score vectors.
        tokenizer: Target-language tokenizer used to map ids back to words.
        source: 1-D array of source token ids (a single padded sentence).

    Returns:
        The decoded words joined by single spaces. Decoding stops at the first
        predicted id that has no word (e.g. the 0 used for padding).
    """
    # predict expects a batch axis, so wrap the single sentence as (1, seq_len).
    batch = source.reshape((1, source.shape[0]))
    # verbose=0: this is called once per sentence, and the default progress bar
    # would otherwise be printed between every line of results.
    prediction = model.predict(batch, verbose=0)
    # Most likely token id at each output time step.
    token_ids = [int(np.argmax(step_scores)) for step_scores in prediction[0]]
    words = []
    for token_id in token_ids:
        word = word_for_id(token_id, tokenizer)
        if word is None:
            break
        words.append(word)
    # Return the decoded sentence.
    return ' '.join(words)

In [49]:
print('\n\n### Results ###')
limit = 20
# Read the padded input length back from the first layer of the model config,
# so this cell works even when only the saved model was loaded.
config = model.get_config()
max_no_of_words = int(config["layers"][0]["config"]["batch_shape"][1])

# NOTE(review): rows are sampled from the full dataset (which also contains
# the rows used for training), and this rebinds `test_set`.
test_set = dataset.sample(n=limit)
true_src = src_tokenizer.texts_to_sequences(test_set["English words/sentences"])
source = pad_sequences(true_src, padding='post', maxlen=max_no_of_words)

# One line per sample: English input, reference French, model output.
for i, encoded in enumerate(source):
    row = test_set.iloc[i]
    english = row["English words/sentences"]
    true_target = row["French words/sentences"]
    translation = predict_seq(model, tar_tokenizer, encoded)
    print(f'{english:30} {true_target:50} {translation}')



### Results ###
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
im at home                     je suis dans la maison                             je suis chez moi maison
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
you have a good lawyer         vous avez une bonne avocate                        tu avez un serai serai
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
thats a relief                 cest un soulagement                                cest un poche
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
i went too                     jy suis allé aussi                                 je suis taider
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
i hid behind the tree          je me cachai derrière larbre                       jai déteste à à
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
i forgot my pencil             jai oublié mon cr