In [None]:
from google.colab import drive
drive.mount('gDrive')

Mounted at gDrive


In [None]:
%cd /content/gDrive/MyDrive/ColabFiles/mldlproject/

/content/gDrive/MyDrive/ColabFiles/mldlproject


In [None]:
%ls

corpus.en_ru.1m.en  news-commentary-v12.ru-en.en
corpus.en_ru.1m.ru  news-commentary-v12.ru-en.ru


# Main parameters for training GRU

In [None]:
import csv
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


vocab_size = 10000
DATA_SET_LENGTH_LIMIT = 100000
embedding_dim = 200 # maybe 200 is the best
data_set = ['corpus.en_ru.1m.', 'news-commentary-v12.ru-en.']
DATA_SET = data_set[0]
max_length = 40
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"
batch_size = 256
rnn_units = 256

# Loading dataset

In [None]:
with open(DATA_SET+'en') as f:
  eng_sentences = f.readlines()[:DATA_SET_LENGTH_LIMIT]

with open(DATA_SET+'ru') as f:
  ru_sentences = f.readlines()[:DATA_SET_LENGTH_LIMIT]

# Tokenizing sentences

In [None]:
english_tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
english_tokenizer.fit_on_texts(texts=eng_sentences)
english_word_index = english_tokenizer.word_index
print(len(english_word_index.items()))


english_sequences = english_tokenizer.texts_to_sequences(eng_sentences)
english_sequences = pad_sequences(english_sequences, padding=padding_type,truncating=trunc_type, maxlen=max_length)


69127


In [None]:
russian_tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
russian_tokenizer.fit_on_texts(ru_sentences)
russian_word_index = russian_tokenizer.word_index
print(len(russian_word_index.items()))


russian_sequences = russian_tokenizer.texts_to_sequences(ru_sentences)
russian_sequences = pad_sequences(russian_sequences, padding=padding_type,truncating=trunc_type, maxlen=max_length)

163575


# Getting twitter embeddings

In [None]:
#!wget http://nlp.stanford.edu/data/glove.6B.zip ./glove.6b.zip

#!unzip ./glove.6B.zip

--2021-02-21 16:52:46--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2021-02-21 16:52:47--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-02-21 16:52:47--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2021-0

In [None]:
# load the whole embedding into memory
embeddings_index = dict()
f = open(f'./glove.6B.200d.txt')

for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs

f.close()
print('Loaded %s word vectors.' % len(embeddings_index))


# create a weight matrix for words in training docs
vocab_size = len(english_word_index)

embedding_matrix = np.zeros((vocab_size+1, embedding_dim))
print(len(english_tokenizer.word_index.items()))

for word, i in english_tokenizer.word_index.items():
    if i > vocab_size:
      continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:

        embedding_matrix[i] = embedding_vector
       # print(i)

Loaded 400000 word vectors.
69127


# Create basic GRU model

In [None]:
def create_model(vocab_size_en, vocab_size_ru, embedding_dim, rnn_units):
  tf.keras.backend.clear_session()
  model = tf.keras.Sequential([
      tf.keras.layers.Embedding(vocab_size_en+1, embedding_dim, input_length=max_length,weights=[embedding_matrix],trainable=False),

      tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(rnn_units,return_sequences=True,dropout=0.13)),
      tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(rnn_units,return_sequences=True,dropout=0.1)),
      tf.keras.layers.GlobalMaxPool1D(),
      # tf.keras.layers.Conv1D(1,5),
      # tf.keras.layers.MaxPool1D(2),
      # tf.keras.layers.Flatten(),
      tf.keras.layers.Dropout(0.1),
      # tf.keras.layers.Dense(64, activation='relu'),
      
      #tf.keras.layers.Conv1D(5,3,activation='relu'),
    # tf.keras.layers.Dropout(0.05),
      tf.keras.layers.Dense(vocab_size_ru, activation='softmax')
  ])

  return model

model = create_model(
    # Be sure the vocabulary size matches the `StringLookup` layers.
    vocab_size_en=len(english_word_index),
    vocab_size_ru=len(russian_word_index),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=loss, metrics=['acc'])


# Training GRU model

In [None]:
Epochs = 10
for epoch in range(Epochs):
  epoch_en_seqs = english_sequences.copy()
  targets = np.ones(shape=(english_sequences.shape[0]))
  for index in range(english_sequences.shape[0]):
    import random
    ind = random.randint(1,max_length)
    train_seq, target_seq = epoch_en_seqs[index,:].copy(), russian_sequences[index]
    train_seq[train_seq > ind] =0
    epoch_en_seqs[index] = train_seq
    targets[index] = target_seq[-1]
  targets = targets.reshape((-1, 1))
  print(epoch_en_seqs.shape, targets.shape)
  model.fit(x=epoch_en_seqs,y=targets, validation_split=0.2,epochs=1)
  



# try on your words

In [91]:
def translate(model, english_tokenizer, russian_tokenizer, sentence):
  if isinstance(sentence, str):
    sentence = [sentence]
    test_english_sequences = english_tokenizer.texts_to_sequences(sentence)
    test_english_sequences = pad_sequences(test_english_sequences, padding=padding_type,truncating=trunc_type, maxlen=max_length)
    answer = []
    for seq in test_english_sequences:
      q = []
      for ind in range(1, max_length):
        tseq = seq.copy()
        tseq[tseq > ind] = 0
        ans = model.predict(tseq)
        q.append(np.argmax(ans))
      answer.append(q)
      
    result= russian_tokenizer.sequences_to_texts(answer)
    return result
translate(
    model,
    english_tokenizer,
    russian_tokenizer,
    sentence='hi i like football'
)


['привет', '<OOV>', 'я', '<OOV>', 'люблю', 'футбол', '<OOV>', '<OOV>', '<OOV>', '<OOV>', '<OOV>', '<OOV>', '<OOV>', '<OOV>', '<OOV>', '<OOV>', '<OOV>', '<OOV>', '<OOV>', '<OOV>', '<OOV>', '<OOV>', '<OOV>', '<OOV>', '<OOV>', '<OOV>', '<OOV>', '<OOV>', '<OOV>', '<OOV>', '<OOV>', '<OOV>', '<OOV>', '<OOV>', '<OOV>', '<OOV>', '<OOV>', '<OOV>', '<OOV>']
