In [None]:
# Parallel sentence lists read from a tab-separated file (one pair per line).
# NOTE: `input` shadows the Python builtin of the same name; the name is kept
# because later cells read it.
# NOTE(review): the original labels here (input = English, output = Russian)
# contradict the usage below, where eng_vocab is built from `output`.
# Presumably column 0 is English and column 1 is Russian; confirm against the file.
input = []   # second tab-separated column of each line
output = []  # first tab-separated column of each line

with open('./english-russian.txt', encoding='utf8') as f:
    for line in f:
        # Drop the line terminator first so a two-column file does not leave
        # a trailing newline glued to the second sentence.
        sentences = line.rstrip('\n').split('\t')[:2]
        if len(sentences) < 2:
            # Blank or malformed line (no tab): skip instead of raising IndexError.
            continue
        input.append(sentences[1])
        output.append(sentences[0])

In [None]:
import string

# Delete every character listed in string.punctuation from each sentence.
# A translation table mapping those characters to None does the same job as
# filtering character by character.
_punct_table = str.maketrans('', '', string.punctuation)
input = [sentence.translate(_punct_table) for sentence in input]
output = [sentence.translate(_punct_table) for sentence in output]

# TODO: add SOS / EOS tokens to the sentences

In [None]:
from collections import Counter
from itertools import chain
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

class Vocab:
    """Vocabulary statistics and padded integer encoding for a list of sentences.

    Attributes set by the constructor:
        li:          the list of sentences passed in.
        tokenizer:   a Keras ``Tokenizer`` created with its default settings.
        listOfVocab: flat list of whitespace-separated tokens over all sentences.
        word2count:  ``Counter`` of the tokens in ``listOfVocab``.
        word2index:  the tokenizer's ``word_index`` after fitting on ``li``.
        max_length:  largest whitespace-token count of any sentence (0 if ``li`` is empty).
        encoded:     ``li`` converted to index sequences, post-padded to ``max_length``.

    NOTE(review): ``listOfVocab`` / ``word2count`` come from ``str.split`` and keep
    the original casing, while ``word2index`` comes from the tokenizer. Per the
    Keras documentation the default Tokenizer lower-cases text, so the two key
    sets may differ; confirm before looking up raw tokens in ``word2index``.
    """

    def __init__(self, li):
        # li: list of sentences (plain strings)
        self.li = li
        self.tokenizer = Tokenizer()
        # Order matters: word2count reads listOfVocab, and get_encoded needs the
        # tokenizer fitted by get_word2index as well as max_length.
        self.listOfVocab = self.get_vocab()
        self.word2count = self.get_word2count()
        self.word2index = self.get_word2index()
        self.max_length = self.get_max_length()
        self.encoded = self.get_encoded()

    def get_vocab(self):
        """Return every whitespace-separated token of every sentence as one flat list."""
        return list(chain.from_iterable(sentence.split() for sentence in self.li))

    def get_word2count(self):
        """Return a Counter mapping each token in ``listOfVocab`` to its frequency."""
        return Counter(self.listOfVocab)

    def get_word2index(self):
        """Fit the tokenizer on the sentences and return its word -> index mapping."""
        self.tokenizer.fit_on_texts(self.li)
        return self.tokenizer.word_index

    def get_max_length(self):
        """Return the largest whitespace-token count of any sentence, or 0 for an empty corpus."""
        # default=0 keeps an empty sentence list from raising ValueError in max().
        return max((len(sentence.split()) for sentence in self.li), default=0)

    def get_encoded(self):
        """Return the sentences as index sequences, post-padded to ``max_length``."""
        sequence = self.tokenizer.texts_to_sequences(self.li)
        return pad_sequences(sequence, maxlen=self.max_length, padding='post')

In [None]:
# One Vocab per corpus: `output` feeds eng_vocab, `input` feeds rus_vocab.
eng_vocab, rus_vocab = Vocab(output), Vocab(input)

In [None]:
from sklearn.model_selection import train_test_split

# batch_generator encodes X with eng_vocab and y with rus_vocab. Those were built
# from `output` and `input` respectively, so X must be `output` and y must be
# `input`; otherwise every sentence is looked up in the other language's vocabulary.
X, y = output, input
# Fixed random_state so the split is reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
import torch
import torch.nn as nn

class Encoder(nn.Module):
    """Embedding + LSTM encoder.

    Input to ``forward`` is a batch of token indices shaped (batch_size, maxlength).

    Args:
        input_size: number of rows in the embedding table (vocabulary size).
        hidden_size: LSTM hidden state size.
        embedding_size: dimensionality of each embedding vector.
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability between LSTM layers. Only applied when
            ``num_layers > 1``, because a single-layer LSTM has no
            between-layer connection to drop.
    """

    def __init__(self, input_size, hidden_size, embedding_size, num_layers, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(input_size, embedding_size)
        # dropout must be passed by keyword: the fourth positional argument of
        # nn.LSTM is `bias`, so a positional 0.5 never enabled dropout.
        # batch_first=True matches the (batch_size, maxlength) input layout.
        self.rnn = nn.LSTM(
            embedding_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0,
        )
        self.hidden = hidden_size

    def forward(self, x):
        """Embed ``x`` and run it through the LSTM.

        Returns:
            ``(output, (h_n, c_n))`` exactly as produced by ``nn.LSTM``:
            ``output`` is (batch, seq, hidden_size); ``h_n`` and ``c_n`` are
            (num_layers, batch, hidden_size).
        """
        embedded = self.embedding(x)
        output, (h_n, c_n) = self.rnn(embedded)
        return output, (h_n, c_n)

class Decoder(nn.Module):
    """Embedding + LSTM decoder.

    Args:
        input_size: number of rows in the embedding table (vocabulary size).
        hidden_size: LSTM hidden state size.
        embedding_size: dimensionality of each embedding vector. Defaults to
            ``hidden_size`` when omitted so the two-argument constructor call
            keeps working.
        num_layers: number of stacked LSTM layers (default 1).
        dropout: dropout probability between LSTM layers. Only applied when
            ``num_layers > 1``.
    """

    def __init__(self, input_size, hidden_size, embedding_size=None, num_layers=1, dropout=0.5):
        super().__init__()
        # embedding_size and num_layers used to be read as undefined names;
        # they are now explicit, optional parameters.
        if embedding_size is None:
            embedding_size = hidden_size
        self.embedding = nn.Embedding(input_size, embedding_size)
        # dropout must be passed by keyword: the fourth positional argument of
        # nn.LSTM is `bias`, not dropout.
        self.rnn = nn.LSTM(
            embedding_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0,
        )
        self.hidden = hidden_size

    def forward(self, x, hidden=None):
        """Embed ``x`` (batch, seq) and run it through the LSTM.

        Args:
            x: batch of token indices shaped (batch, seq).
            hidden: optional ``(h_0, c_0)`` initial state; ``None`` lets the
                LSTM start from zeros.

        Returns:
            ``(output, (h_n, c_n))`` as produced by ``nn.LSTM``.
        """
        embedded = self.embedding(x)
        output, (h_n, c_n) = self.rnn(embedded, hidden)
        return output, (h_n, c_n)

class Seq2SeqModel(nn.Module):
    """Holds an encoder, a decoder and a linear layer mapping hidden_size to num_class."""

    def __init__(self, encoder, decoder, hidden_size, num_class):
        super().__init__()
        # Sub-modules are registered in this order: encoder, decoder, fc.
        self.encoder = encoder
        self.decoder = decoder
        projection = nn.Linear(in_features=hidden_size, out_features=num_class)
        self.fc = projection

In [None]:
# Width of the one-hot target axis. Keras Tokenizer indices start at 1 (0 is the
# padding value), so the axis needs len(word2index) + 1 slots. word2count is keyed
# by case-sensitive whitespace tokens, so its size is not the index range.
n_decoder_tokens = len(rus_vocab.word2index) + 1

def batch_generator(X, y, batch_size, source_vocab=None, target_vocab=None, n_target_tokens=None):
    """Yield teacher-forcing batches forever, cycling over the data.

    Args:
        X: sequence of source sentences (strings).
        y: sequence of target sentences (strings), aligned with ``X``.
        batch_size: number of sentence pairs per batch. Trailing pairs that do
            not fill a whole batch are dropped.
        source_vocab: vocabulary used to encode ``X``; defaults to the
            notebook-level ``eng_vocab``.
        target_vocab: vocabulary used to encode ``y``; defaults to the
            notebook-level ``rus_vocab``.
        n_target_tokens: width of the one-hot target axis; defaults to the
            notebook-level ``n_decoder_tokens``.

    Yields:
        ``([encoder_input, decoder_input], target)`` where
        ``encoder_input`` is (batch_size, source_vocab.max_length),
        ``decoder_input`` is (batch_size, target_vocab.max_length) and holds
        every target token except the last, and ``target`` is
        (batch_size, target_vocab.max_length, n_target_tokens), the one-hot
        encoding of every target token except the first, shifted left by one.

    Raises:
        ValueError: if ``X`` and ``y`` differ in length, or if ``batch_size``
            is not positive or exceeds the number of sentences (the loop would
            otherwise spin forever without yielding).
    """
    import numpy as np  # local import: the visible cells never import numpy

    # Resolve the notebook-level defaults only when the caller did not supply them.
    if source_vocab is None:
        source_vocab = eng_vocab
    if target_vocab is None:
        target_vocab = rus_vocab
    if n_target_tokens is None:
        n_target_tokens = n_decoder_tokens

    if len(X) != len(y):
        raise ValueError("X and y must contain the same number of sentences")
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    n_batches = len(X) // batch_size
    if n_batches == 0:
        raise ValueError("batch_size is larger than the number of sentences")

    src_len = source_vocab.max_length
    tgt_len = target_vocab.max_length

    while True:
        for i in range(n_batches):
            start = i * batch_size
            stop = start + batch_size

            # np.zeros takes the shape as a single tuple.
            encoder_input = np.zeros((batch_size, src_len))
            decoder_input = np.zeros((batch_size, tgt_len))
            target = np.zeros((batch_size, tgt_len, n_target_tokens))

            # Encode with each vocabulary's own tokenizer so the lookup uses the
            # same normalisation that produced word2index, instead of indexing
            # word2index with raw whitespace tokens.
            src_seqs = source_vocab.tokenizer.texts_to_sequences(X[start:stop])
            tgt_seqs = target_vocab.tokenizer.texts_to_sequences(y[start:stop])

            for j, (src_ids, tgt_ids) in enumerate(zip(src_seqs, tgt_seqs)):
                # Clip to the array width in case the tokenizer splits a
                # sentence into more tokens than max_length accounts for.
                src_ids = src_ids[:src_len]
                tgt_ids = tgt_ids[:tgt_len]

                encoder_input[j, :len(src_ids)] = src_ids

                # Decoder input: every target token except the last.
                dec_ids = tgt_ids[:-1]
                decoder_input[j, :len(dec_ids)] = dec_ids

                # Target: every target token except the first, one step ahead
                # of the decoder input.
                for k, token_id in enumerate(tgt_ids[1:]):
                    target[j, k, token_id] = 1

            yield ([encoder_input, decoder_input], target)