<a href="https://colab.research.google.com/github/var-github/ACM_ML/blob/main/Language_translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import kagglehub

# Download the English-French sentence-pair dataset from Kaggle.
# `path` receives the location returned by kagglehub; a later cell appends the
# CSV file name to it, so it is treated as a directory.
dataset_handle = 'devicharith/language-translation-englishfrench'
path = kagglehub.dataset_download(dataset_handle)
print('Data source import complete')

Data source import complete


In [None]:
!pip install --upgrade tensorflow



In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential,load_model
from tensorflow.keras.layers import LSTM,Dense,Embedding,RepeatVector,TimeDistributed
import pandas as pd
from string import punctuation

In [None]:
# Point `path` at the CSV inside the downloaded dataset directory and load the
# first 10,000 rows.
# The suffix check keeps this cell idempotent: without it, re-running the cell
# would append the file name to `path` a second time and read_csv would fail.
csv_name = "eng_-french.csv"
if not path.endswith(csv_name):
    path = path + "/" + csv_name
dataset = pd.read_csv(path, nrows=10000)
# The dataset has two text columns: "English words/sentences" and
# "French words/sentences".

In [None]:
def clean_string(s):
    """Normalise one sentence before tokenisation.

    The text is lower-cased, then every ASCII punctuation character, every
    digit and the opening guillemet («) is deleted, and finally leading and
    trailing whitespace is trimmed.

    Args:
        s: The sentence to clean (a ``str``).

    Returns:
        The cleaned sentence.
    """
    # NOTE(review): the closing guillemet (») is not in the removal set -
    # confirm whether that is intended.
    unwanted = str.maketrans("", "", punctuation + "0123456789«")
    return s.lower().translate(unwanted).strip()

# Clean both text columns element-wise with clean_string, overwriting the
# original column contents.
for column_name in ("English words/sentences", "French words/sentences"):
    dataset[column_name] = dataset[column_name].apply(clean_string)

In [None]:
# Positional hold-out split: the trailing `test_propotion` share of the rows
# becomes the test set and everything before it the training set.
# The rows are taken in file order; nothing is shuffled here.
test_propotion = 0.15
train_size = len(dataset) - int(test_propotion * len(dataset))
train_set = dataset.iloc[:train_size]
test_set = dataset.iloc[train_size:]

In [None]:
def create_tokenizer(lines):
    """Create a Keras ``Tokenizer`` and fit it on the given texts.

    Args:
        lines: Collection of text strings passed to ``fit_on_texts``.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

In [None]:
def max_len(lines):
    """Return the largest whitespace-separated word count among ``lines``.

    Args:
        lines: Iterable of strings.

    Returns:
        int: Number of words in the longest line, or 0 when ``lines`` is
        empty (instead of letting ``max()`` raise ``ValueError``).
    """
    return max((len(line.split()) for line in lines), default=0)

In [None]:
def _to_padded_ids(tokenizer, lines, length):
    """Convert text lines to integer id sequences, post-padded to ``length``."""
    id_sequences = tokenizer.texts_to_sequences(lines)
    return pad_sequences(id_sequences, padding='post', maxlen=length)


# Vocabularies and maximum lengths are computed over the full dataset
# (training and test rows together).
src_tokenizer = create_tokenizer(dataset["English words/sentences"])
tar_tokenizer = create_tokenizer(dataset["French words/sentences"])

tar_length = max_len(dataset["French words/sentences"])
src_length = max_len(dataset["English words/sentences"])

# English sentences are the model inputs.
x_train = _to_padded_ids(src_tokenizer, train_set["English words/sentences"], src_length)
x_test = _to_padded_ids(src_tokenizer, test_set["English words/sentences"], src_length)

# French sentences are the targets.
y_train = _to_padded_ids(tar_tokenizer, train_set["French words/sentences"], tar_length)
y_test = _to_padded_ids(tar_tokenizer, test_set["French words/sentences"], tar_length)

In [None]:
def encode_output(sequences, vocab_size):
    """One-hot encode a 2-D array of integer token ids.

    Args:
        sequences: Array of token ids with shape (n_samples, seq_len); it must
            expose ``.shape``.
        vocab_size: Number of classes, i.e. the size of the one-hot axis.

    Returns:
        Array of shape (n_samples, seq_len, vocab_size).
    """
    one_hot = to_categorical(sequences, num_classes=vocab_size)
    # The explicit reshape pins the (samples, steps, vocab) layout regardless
    # of how to_categorical treats a trailing axis of length 1.
    return one_hot.reshape(sequences.shape[0], sequences.shape[1], vocab_size)

In [None]:
# Vocabulary sizes are the number of known words plus one extra index.
src_vocab_size = 1 + len(src_tokenizer.word_index)
tar_vocab_size = 1 + len(tar_tokenizer.word_index)

# Replace the integer-id targets with their one-hot encoding, as required by
# the categorical cross-entropy loss used for training.
y_train, y_test = (
    encode_output(target_ids, tar_vocab_size) for target_ids in (y_train, y_test)
)

In [None]:
def create_model(src_vocab, tar_vocab, src_length, tar_length, n_units):
    """Assemble the (uncompiled) LSTM encoder-decoder translation model.

    Layer stack: Embedding (index 0 masked) -> LSTM encoder -> RepeatVector
    -> LSTM decoder returning every time step -> per-step softmax over the
    target vocabulary.

    Args:
        src_vocab: Size of the source vocabulary (Embedding input dimension).
        tar_vocab: Size of the target vocabulary (softmax width).
        src_length: Number of time steps in each input sequence.
        tar_length: Number of time steps in each output sequence.
        n_units: Embedding dimension and number of units in both LSTM layers.

    Returns:
        A built, uncompiled ``Sequential`` model.
    """
    model = Sequential()
    # `input_length` is deprecated on Embedding in Keras 3 (it is ignored and
    # only emits a warning), so the input length is fixed via build() below.
    model.add(Embedding(src_vocab, n_units, mask_zero=True))
    model.add(LSTM(n_units))
    # Repeat the encoder output once per target time step.
    model.add(RepeatVector(tar_length))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    # Fix the input shape so the returned model is already built.
    model.build(input_shape=(None, src_length))
    return model

# Instantiate the network with n_units=256, attach the optimizer and loss,
# then build it for inputs of shape (batch, src_length).
model = create_model(
    src_vocab=src_vocab_size,
    tar_vocab=tar_vocab_size,
    src_length=src_length,
    tar_length=tar_length,
    n_units=256,
)
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.build(input_shape=(None, src_length))



In [None]:
# Train on the training split for 20 epochs in batches of 64, then write the
# trained model to disk.
model.fit(x=x_train, y=y_train, batch_size=64, epochs=20, verbose=1)
model.save('language_transform.keras')

Epoch 1/20
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 327ms/step - loss: 4.5213
Epoch 2/20
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 322ms/step - loss: 2.1964
Epoch 3/20
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 324ms/step - loss: 2.0451
Epoch 4/20
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 318ms/step - loss: 1.9441
Epoch 5/20
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 324ms/step - loss: 1.8728
Epoch 6/20
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 321ms/step - loss: 1.8190
Epoch 7/20
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 317ms/step - loss: 1.7559
Epoch 8/20
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 322ms/step - loss: 1.6841
Epoch 9/20
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 316ms/step - loss: 1.6029
Epoch 10/20
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[