In [25]:
with open('ukr.txt',encoding="utf8") as f:
    lines = f.read().split("\n")[:-1]

# text_pairs = []
ua_sentences = []
eng_sentences = []

for line in lines:
    eng, ua, _ = line.split("\t")
#     ua = "[start] " + ua + " [end]"
    ua_sentences.append(ua)
    eng_sentences.append(eng)
#     text_pairs.append((eng, ua))

In [26]:
import keras
keras.__version__

'2.15.0'

In [27]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, Dropout, LSTM, Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

In [28]:
for sample_i in range(5):
    print('English sample {}:  {}'.format(sample_i + 1, eng_sentences[sample_i]))
    print('Ukrainian sample {}:  {}\n'.format(sample_i + 1, ua_sentences[sample_i]))

English sample 1:  Go.
Ukrainian sample 1:  Йди.

English sample 2:  Hi.
Ukrainian sample 2:  Вітаю!

English sample 3:  Hi.
Ukrainian sample 3:  Привіт.

English sample 4:  Hi.
Ukrainian sample 4:  Привіт!

English sample 5:  Run!
Ukrainian sample 5:  Біжіть!



In [29]:
Ua_tokenizer = Tokenizer()
Ua_tokenizer.fit_on_texts(ua_sentences)
ua_seq=Ua_tokenizer.texts_to_sequences(ua_sentences)
ua_vocab_size = len(Ua_tokenizer.word_index)

Eng_tokenizer = Tokenizer()
Eng_tokenizer.fit_on_texts(eng_sentences)
eng_seq=Eng_tokenizer.texts_to_sequences(eng_sentences)
eng_vocab_size = len(Eng_tokenizer.word_index)

ua_seq = pad_sequences(ua_seq, padding='post',maxlen=41)
eng_seq = pad_sequences(eng_seq, padding='post',maxlen=41 )

print(eng_seq.shape)
print(ua_seq.shape)

(158705, 41)
(158705, 41)


In [30]:
ua_seq

array([[  511,     0,     0, ...,     0,     0,     0],
       [ 3303,     0,     0, ...,     0,     0,     0],
       [ 1075,     0,     0, ...,     0,     0,     0],
       ...,
       [  124, 10622,  7912, ...,     0,     0,     0],
       [ 2636, 17899,     4, ...,     0,     0,     0],
       [    1,   515,     4, ...,     0,     0,     0]], dtype=int32)

In [31]:
eng_seq

array([[  41,    0,    0, ...,    0,    0,    0],
       [1331,    0,    0, ...,    0,    0,    0],
       [1331,    0,    0, ...,    0,    0,    0],
       ...,
       [  87, 5270, 2316, ...,    0,    0,    0],
       [   5, 6159, 1714, ...,    0,    0,    0],
       [  59,  235,    8, ...,  438, 1078,   14]], dtype=int32)

In [32]:
eng_seq.shape[-2]

158705

In [33]:
ua_seq.shape[-2]

158705

In [104]:
def logits_to_text(logits, tokenizer):

    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'
    x=' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])
    return x
    # return ''.join(word for word in x.split() if word != '<PAD>')

In [17]:
model = Sequential()
model.add(Embedding(eng_vocab_size, 256,  input_shape=eng_seq.shape[1:]))
model.add(GRU(256, return_sequences=True))
model.add(TimeDistributed(Dense(1024, activation='relu')))
model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(ua_vocab_size, activation='softmax'))

model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(0.005),
                  metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 41, 256)           2706688   
                                                                 
 gru (GRU)                   (None, 41, 256)           394752    
                                                                 
 time_distributed (TimeDist  (None, 41, 1024)          263168    
 ributed)                                                        
                                                                 
 dense_1 (Dense)             (None, 41, 1024)          1049600   
                                                                 
 dropout (Dropout)           (None, 41, 1024)          0         
                                                                 
 dense_2 (Dense)             (None, 41, 31345)         32128625  
                                                        

In [20]:
model.fit(eng_seq, ua_seq, batch_size=64, epochs=20, validation_split=0.2)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7cce929ba0e0>

In [107]:
import random

random_index = random.sample(range(0, 158705), 15)

pred=model.predict(eng_seq[random_index])

for i in range(15):

  print("\nOriginal text:")
  print(eng_sentences[random_index[i]])

  print("\nPrediction:")
  print(logits_to_text(pred[i], Ua_tokenizer))


  print("\nCorrect Translation:")
  print(ua_sentences[random_index[i]])



Original text:
I feed my dog twice a day.

Prediction:
я запалив руки <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

Correct Translation:
Я годую свого собаку двічі на день.

Original text:
Where's the station?

Prediction:
де знаходиться <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

Correct Translation:
Де знаходиться вокзал?

Original text:
Tom might be frustrated.

Prediction:
том може бути <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

Co