In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/english-france-dictionary/_about.txt
/kaggle/input/english-france-dictionary/fra.txt


## Import LIB

In [3]:
import numpy as np
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, GRU

## Load & prepossessing Data

In [4]:
input_texts=[]
target_texts=[]
datapath='/kaggle/input/english-france-dictionary/fra.txt'

with open (datapath,"r",encoding="Utf_8")as f:
    lines =f.read().split("\n")

In [5]:
for line in lines[:min(100000, len(lines) - 1)]:
    try:
        input_text, target_text, _ = line.split('\t')
        target_text = "start" + target_text + "end"

        input_texts.append(input_text)
        target_texts.append(target_text)

    except ValueError:
        print(f"error in {line}")
        continue


In [6]:
input_tokenizer = Tokenizer(filters="", lower=True, oov_token="<OOV>")
target_tokenizer = Tokenizer(filters="", lower=True, oov_token="<OOV>")

#fit on text
input_tokenizer.fit_on_texts(input_texts)
target_tokenizer.fit_on_texts(target_texts)

# conert text to sequense
input_sequences = input_tokenizer.texts_to_sequences(input_texts)
target_sequences = target_tokenizer.texts_to_sequences(target_texts)

#get vocablary sizes
num_encoder_tokens = len(input_tokenizer.word_index) +1      # for padding
num_decoder_tokens = len(target_tokenizer.word_index) +1

In [7]:
print ( 'I Vocabulary Size (Input):',num_encoder_tokens)
print ( 'I Vocabulary Size (Target):',num_decoder_tokens)

I Vocabulary Size (Input): 14601
I Vocabulary Size (Target): 28669


In [8]:
#input_tokenizer

In [9]:
#input_tokenizer.word_index


In [10]:
#input_sequences

In [11]:
# max length
max_encoder_seq_length = max(len(seq) for seq in input_sequences)
max_decoder_seq_length = max(len(seq) for seq in target_sequences)

#paddeing
encoder_input_data = pad_sequences(input_sequences, maxlen = max_encoder_seq_length, padding="post")
decoder_input_data = pad_sequences(target_sequences, maxlen = max_decoder_seq_length, padding="post")

decoder_target_data = np.zeros_like(decoder_input_data)
decoder_target_data[:,:-1] = decoder_input_data[:,1:]
decoder_target_data[:,-1] = 0

print ("max Sequense Length (input):" ,max_encoder_seq_length)
print ("max Sequense Length (Target):" ,max_decoder_seq_length)

max Sequense Length (input): 8
max Sequense Length (Target): 14


In [12]:
encoder_input_data

array([[ 117,    0,    0, ...,    0,    0,    0],
       [ 117,    0,    0, ...,    0,    0,    0],
       [ 117,    0,    0, ...,    0,    0,    0],
       ...,
       [  10,    3,  541, ...,  136,  155,    0],
       [  10,    3,  541, ..., 9769, 5076,    0],
       [  10,    3,  541, ..., 9769, 5076,    0]], dtype=int32)

## Model

In [13]:
latent_dim = 128 # num of neron
embedding_dim = 100 

#encoder
encoder_inputs = Input(shape=(max_encoder_seq_length,))
encoder_embeding = Embedding(num_encoder_tokens, embedding_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embeding)
encoder_states= [state_h, state_c]

#Decoder
Decoder_inputs = Input(shape=(max_decoder_seq_length,))
Decoder_embeding = Embedding(num_decoder_tokens, embedding_dim, mask_zero=True)(Decoder_inputs)
Decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
Decoder_outputs, _, _ = Decoder_lstm(Decoder_embeding, initial_state = encoder_states)

decoder_dense = Dense(num_decoder_tokens, activation = "softmax")
Decoder_outputs = decoder_dense(Decoder_outputs)

model = Model([encoder_inputs, Decoder_inputs],Decoder_outputs)
model.compile(optimizer="adam", loss = "sparse_categorical_crossentropy")

In [14]:
model.summary()

In [15]:
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)

model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=16,
          epochs=50,
          validation_split=0.2,
          callbacks=[early_stopping])


Epoch 1/50
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 18ms/step - loss: 5.5155 - val_loss: 4.3294
Epoch 2/50
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 19ms/step - loss: 3.2516 - val_loss: 3.7567
Epoch 3/50
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 19ms/step - loss: 2.3160 - val_loss: 3.5488
Epoch 4/50
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 19ms/step - loss: 1.7100 - val_loss: 3.4867
Epoch 5/50
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 19ms/step - loss: 1.3039 - val_loss: 3.4827
Epoch 6/50
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 19ms/step - loss: 1.0206 - val_loss: 3.5484
Epoch 7/50
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 19ms/step - loss: 0.8255 - val_loss: 3.5747
Epoch 8/50
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 19ms/step - loss: 0.6802 - val_loss: 3.6435
Epoch 9/

<keras.src.callbacks.history.History at 0x7afcc2b3d7b0>

In [None]:
model.summary()