In [1]:
import numpy as np
import re
import random
# Importing  translations
data_path = "/content/drive/MyDrive/Colab Notebooks/rus.txt"

# Defining lines as a list of each line
with open(data_path, 'r', encoding='utf-8') as f:
  lines = f.read().split('\n')

In [2]:
# Building empty lists to hold sentences
input_docs = []
target_docs = []
# Building empty vocabulary sets
input_tokens = set()
target_tokens = set()


In [3]:
# Adjust the number of lines so that
# preprocessing doesn't take too long
for line in lines[:5000]:
  # Input and target sentences are separated by tabs
  input_doc, target_doc = line.split('\t')[:2]
  # Appending each input sentence to input_docs
  input_docs.append(input_doc)

  target_doc = " ".join(re.findall(r"[\w']+|[^\s\w]", target_doc))
  # Redefine target_doc below
  # and append it to target_docs:
  target_doc = '<START> ' + target_doc + ' <END>'
  target_docs.append(target_doc)

  # Now we split up each sentence into words
  # and add each unique word to our vocabulary set
  for token in re.findall(r"[\w']+|[^\s\w]", input_doc):
    # print(token)
    if token not in input_tokens:
      input_tokens.add(token)
  for token in target_doc.split():
    # print(token)
    if token not in target_tokens:
      target_tokens.add(token)

In [4]:
input_tokens = sorted(list(input_tokens))
target_tokens = sorted(list(target_tokens))

# Create num_encoder_tokens and num_decoder_tokens:
num_encoder_tokens = len(input_tokens)
num_decoder_tokens = len(target_tokens)

max_encoder_seq_length = max([len(re.findall(r"[\w']+|[^\s\w]", input_doc)) for input_doc in input_docs])
max_decoder_seq_length = max([len(re.findall(r"[\w']+|[^\s\w]", target_doc)) for target_doc in target_docs])


In [5]:
print('Number of samples:', len(input_docs))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

Number of samples: 5000
Number of unique input tokens: 1228
Number of unique output tokens: 3069
Max sequence length for inputs: 6
Max sequence length for outputs: 15


In [6]:
input_features_dict = dict(
    [(token, i) for i, token in enumerate(input_tokens)])
target_features_dict = dict(
    [(token, i) for i, token in enumerate(target_tokens)])

reverse_input_features_dict = dict(
    (i, token) for token, i in input_features_dict.items())
reverse_target_features_dict = dict(
    (i, token) for token, i in target_features_dict.items())


In [7]:
encoder_input_data = np.zeros(
    (len(input_docs), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32')
print("\nHere's the first item in the encoder input matrix:\n", encoder_input_data[0])
decoder_input_data = np.zeros(
    (len(input_docs), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(input_docs), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')

#The number of columns matches the number of unique input tokens(1978)
#The number of rows matches the maximum sequence length for input sentences(6)
# num_columns = encoder_input_data.shape[2]
# print(num_columns)



Here's the first item in the encoder input matrix:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [8]:
for line, (input_doc, target_doc) in enumerate(zip(input_docs, target_docs)):

  for timestep, token in enumerate(re.findall(r"[\w']+|[^\s\w]", input_doc)):
    print("Encoder input timestep & token:", timestep, token)
    print(input_features_dict[token])
    # Assign 1. for the current line, timestep, & word
    # in encoder_input_data:
    encoder_input_data[line, timestep, input_features_dict[token]] = 1.

  for timestep, token in enumerate(target_doc.split()):
    print("Decoder input timestep & token:", timestep, token)
    decoder_input_data[line, timestep, target_features_dict[token]] = 1.
    if timestep > 0:
      print("Decoder target timestep:", timestep)

      decoder_target_data[line, timestep - 1, target_features_dict[token]] = 1.

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
5
Decoder input timestep & token: 0 <START>
Decoder input timestep & token: 1 Я
Decoder target timestep: 1
Decoder input timestep & token: 2 буду
Decoder target timestep: 2
Decoder input timestep & token: 3 хорошо
Decoder target timestep: 3
Decoder input timestep & token: 4 себя
Decoder target timestep: 4
Decoder input timestep & token: 5 вести
Decoder target timestep: 5
Decoder input timestep & token: 6 .
Decoder target timestep: 6
Decoder input timestep & token: 7 <END>
Decoder target timestep: 7
Encoder input timestep & token: 0 I'll
148
Encoder input timestep & token: 1 buy
415
Encoder input timestep & token: 2 it
715
Encoder input timestep & token: 3 .
5
Decoder input timestep & token: 0 <START>
Decoder input timestep & token: 1 Я
Decoder target timestep: 1
Decoder input timestep & token: 2 куплю
Decoder target timestep: 2
Decoder input timestep & token: 3 это
Decoder target timestep: 3
Decoder input timestep & token

In [9]:
from tensorflow import keras
from keras.layers import Input, LSTM, Dense
from keras.models import Model

In [16]:
latent_dim = 2048
# Choose a batch size
# and a larger number of epochs: it should be more than 100
batch_size = 32
epochs = 30

# Encoder training setup
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_hidden, state_cell = encoder_lstm(encoder_inputs)
encoder_states = [state_hidden, state_cell]

In [17]:
# Decoder training setup:
decoder_inputs = Input(shape=(None, num_decoder_tokens))
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, decoder_state_hidden, decoder_state_cell = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [18]:
training_model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [19]:
training_model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

In [20]:
training_model.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size = batch_size, epochs = epochs, validation_split = 0.2)

Epoch 1/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 157ms/step - accuracy: 0.0776 - loss: 1.8337 - val_accuracy: 0.1079 - val_loss: 1.4839
Epoch 2/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 145ms/step - accuracy: 0.0832 - loss: 1.2143 - val_accuracy: 0.1103 - val_loss: 1.4261
Epoch 3/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 151ms/step - accuracy: 0.0911 - loss: 1.1779 - val_accuracy: 0.1303 - val_loss: 1.3539
Epoch 4/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 143ms/step - accuracy: 0.0936 - loss: 1.1497 - val_accuracy: 0.1347 - val_loss: 1.3308
Epoch 5/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 145ms/step - accuracy: 0.0946 - loss: 1.1070 - val_accuracy: 0.0990 - val_loss: 1.4178
Epoch 6/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 143ms/step - accuracy: 0.0945 - loss: 1.0958 - val_accuracy: 0.0923 - val_loss: 1.4026
Epoch 7/30

<keras.src.callbacks.history.History at 0x7e3a101816d0>

In [21]:
training_model.save('training_model.h5')



In [23]:
from keras.models import load_model

In [24]:
training_model = load_model('training_model.h5')



In [25]:
encoder_inputs = training_model.input[0]
encoder_outputs, state_h_enc, state_c_enc = training_model.layers[2].output
encoder_states = [state_h_enc, state_c_enc]

In [26]:
encoder_model = Model(encoder_inputs, encoder_states)

In [27]:
latent_dim = 2048

decoder_state_input_hidden = Input(shape=(latent_dim,))
decoder_state_input_cell = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_hidden, decoder_state_input_cell]
decoder_outputs, state_hidden, state_cell = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_hidden, state_cell]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)


In [28]:
def decode_sequence(test_input):
  # Encode the input as state vectors.
  states_value = encoder_model.predict(test_input)

  # Generate empty target sequence of length 1.
  target_seq = np.zeros((1, 1, num_decoder_tokens))
  # Populate the first token of target sequence with the start token.
  target_seq[0, 0, target_features_dict['<START>']] = 1.

  # Sampling loop for a batch of sequences
  # (to simplify, here we assume a batch of size 1).
  decoded_sentence = ''

  stop_condition = False
  while not stop_condition:
    # Run the decoder model to get possible
    # output tokens (with probabilities) & states
    output_tokens, hidden_state, cell_state = decoder_model.predict(
      [target_seq] + states_value)

    # Choose token with highest probability
    sampled_token_index = np.argmax(output_tokens[0, -1, :])
    sampled_token = reverse_target_features_dict[sampled_token_index]
    decoded_sentence += " " + sampled_token

    # Exit condition: either hit max length
    # or find stop token.
    if (sampled_token == '<END>' or len(decoded_sentence) > max_decoder_seq_length):
      stop_condition = True

    # Update the target sequence (of length 1).
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, sampled_token_index] = 1.

    # Update states
    states_value = [hidden_state, cell_state]

  return decoded_sentence


In [31]:
import random
for _ in range(10):
  seq_index = random.choice(range(len(encoder_input_data)))
  test_input = encoder_input_data[seq_index: seq_index + 1]
  decoded_sentence = decode_sequence(test_input)
  print('-')
  print('Input sentence:', input_docs[seq_index])
  print('Decoded sentence:', decoded_sentence)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
-
Input sentence: Fold it.
Decoded sentence:  Забудьте её . <END>
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
-
Input sentence: I'm not shy.
Decoded sentence:  Я была машину <END>
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step


In [36]:
def translate(text):
  tokens = re.findall(r"[\w']+|[^\s\w]", text)
  input_seq = np.zeros((1, max_encoder_seq_length, num_encoder_tokens), dtype='float32')
  for timestep, token in enumerate(tokens):
    if token in input_features_dict:
      input_seq[0, timestep, input_features_dict[token]] = 1.
  print(text)
  print(decode_sequence(input_seq))

In [38]:
translate("tom knows what we need")
translate("how is it going")
translate("how are you")
translate("it was nice seeing you")
translate("till next time")
translate("talk to you later")
translate("i gotta get going")
translate("i believe we haven t met before")
translate("go")

tom knows what we need
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
 Как на ! <END>
how is it going
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
 Как здесь ! <END>
how are you
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m