<a href="https://colab.research.google.com/github/toshNaik/English-German/blob/master/Hallo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Dropout, Flatten, LSTM, Input, Embedding, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras import metrics

In [2]:
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, enc_units):
    super(Encoder, self).__init__()
    self.enc_units = enc_units
    self.input_layer = Input(shape=(None, ))
    self.embedding = Embedding(vocab_size, embedding_dim)
    self.lstm = LSTM(self.enc_units, return_state = True)
  
  def call(self, X):
    # X = self.input_layer(X)
    X = self.embedding(X)
    output, state_h, state_c = self.lstm(X)
    states = [state_h, state_c]
    return output, states

In [3]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units):
    super(Decoder, self).__init__()
    self.dec_units = dec_units
    self.input_layer = Input(shape=(None, ))
    self.embedding = Embedding(vocab_size, embedding_dim)
    self.lstm = LSTM(self.dec_units, return_sequences=True, return_state = True)
    self.fc = TimeDistributed(Dense(vocab_size, activation='softmax'))

  def call(self, X, enc_state):
    # X = self.input_layer(X)
    X = self.embedding(X)
    output, state_h, state_c = self.lstm(X, initial_state = enc_state)
    output = self.fc(output)
    states = [state_h, state_c]
    return output, states

In [4]:
class Seq2seq(tf.keras.Model):
  def __init__(self, encoder, decoder):
    super(Seq2seq, self).__init__()
    
    assert encoder.enc_units == decoder.dec_units, "encoder units and decoder units must be same"
    
    self.encoder = encoder
    self.decoder = decoder

  def call(self, X, training=False):
    X_source = X[0]
    X_target = X[1]
    _, encoder_states = self.encoder(X_source)
    decoder_output, _ = self.decoder(X_target, encoder_states)
    return decoder_output
    
  
  def predict(self, X, tokenizer):
    _, init_states = self.encoder(X)
    decoder_init = tokenizer.word_index['bos'] # token of <BOS> will be first input
    decoder_init = np.array([[decoder_init]]) # Input has to be 2d array

    stop = False
    deu_sentence = ''
    
    while not stop:
      output, next_states = self.decoder(decoder_init, init_states)
      new_id = np.argmax(output[0, -1, :])
      new_word = tokenizer.index_word[new_id]
      
      if (new_word == 'eos' or len(deu_sentence.split()) == 77):
        stop = True
      else:
        deu_sentence += new_word + ' '

      dec_word = np.array([[new_id]])
      init_states = next_states
    return deu_sentence

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [6]:
!unzip drive/My\ Drive/english-german/deu-eng.zip

Archive:  drive/My Drive/english-german/deu-eng.zip
  inflating: deu.txt                 
  inflating: _about.txt              


In [7]:
file1 = open('deu.txt', 'r')
dataset = []
for line in file1.readlines():
  dataset.append(line)
file1.close()

eng = []
deu = []
for line in dataset:
  english, deutsche, _ = re.split('\t', line)
  english = re.sub(r'[^\w\s]', '', english)   #remove punctuation
  deutsche = re.sub(r'[^\w\s]', '', deutsche)
  english = re.sub(r'\d', '', english)        #remove digits
  deutsche = re.sub(r'\d', '', deutsche)
  eng.append(english.lower())
  deu.append(deutsche.lower()+' <EOS>')

eng_data = pd.Series(eng)
deu_data = pd.Series(deu)
deu_input_data = deu_data.copy().apply(lambda x: '<BOS>' + x)

def tokenize(data):
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(data)
  return tokenizer

english_tokenizer = tokenize(eng_data.values)
english_vocab_size = len(english_tokenizer.word_index) + 1
print(f'English vocab size {english_vocab_size}')

german_tokenizer = tokenize(deu_input_data.values)
german_vocab_size = len(german_tokenizer.word_index) + 1
print(f'German vocab size {german_vocab_size}')

eng_tokens = english_tokenizer.texts_to_sequences(eng_data.values)
deu_tokens_output = german_tokenizer.texts_to_sequences(deu_data.values)
deu_tokens_input = german_tokenizer.texts_to_sequences(deu_input_data.values)
english_padded = pad_sequences(eng_tokens, padding='post')
print(f'English padded: {english_padded.shape}')
german_padded_input = pad_sequences(deu_tokens_input, padding='post')
print(f'German input padded: {german_padded_input.shape}')
german_padded_output = pad_sequences(deu_tokens_output, padding='post', maxlen = german_padded_input.shape[1])
print(f'German output padded: {german_padded_output.shape}')

german_padded_output = german_padded_output.reshape(german_padded_output.shape[0], german_padded_output.shape[1], 1)
X_train, X_test, y_train, y_test = train_test_split(np.concatenate((english_padded, german_padded_input), axis =1), german_padded_output, test_size = 7000, shuffle=True, random_state=57)

english_inputs = X_train[:, :101]
print(english_inputs.shape)
german_inputs = X_train[:, 101:]
print(german_inputs.shape)
print(y_train.shape)

english_inputs_val = X_test[:, :101]
print(english_inputs_val.shape)
german_inputs_val = X_test[:, 101:]
print(german_inputs_val.shape)
print(y_test.shape)

English vocab size 16187
German vocab size 34990
English padded: (217032, 101)
German input padded: (217032, 77)
German output padded: (217032, 77)
(210032, 101)
(210032, 77)
(210032, 77, 1)
(7000, 101)
(7000, 77)
(7000, 77, 1)


In [9]:
encoder = Encoder(english_vocab_size, 100, 128)
decoder = Decoder(german_vocab_size, 100, 128)
model = Seq2seq(encoder, decoder)
x1 = np.zeros((1, 101))
x2 = np.zeros((1, 77))
y = np.zeros((1, 77, 1))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=[metrics.sparse_categorical_accuracy])
model.train_on_batch([x1, x2], y)
model.load_weights('/content/drive/My Drive/english-german/0.125.hdf5')

In [92]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=[metrics.sparse_categorical_accuracy])

In [11]:
model.fit(x = [english_inputs, german_inputs], y = y_train,
          batch_size=128, epochs=1,
          callbacks = [ModelCheckpoint(path, monitor='loss', mode='min', save_best_only=True)])



<tensorflow.python.keras.callbacks.History at 0x7f7b6a7c3470>

In [24]:
sentence = 'How are you'.lower()
eng_pred = pad_sequences(english_tokenizer.texts_to_sequences([sentence]), maxlen=101, padding='post')
predict(eng_pred)

'ich sie du du du du du '

In [21]:
def predict(X):
  _, init_states = encoder(X)
  decoder_init = german_tokenizer.word_index['bos'] # token of <BOS> will be first input
  decoder_init = np.array([[decoder_init]]) # Input has to be 2d array

  stop = False
  deu_sentence = ''
  
  while not stop:
    output, next_states = decoder(decoder_init, init_states)
    new_id = np.argmax(output[0, -1, :])
    new_word = german_tokenizer.index_word[new_id]
    
    if (new_word == 'eos' or len(deu_sentence.split()) == 77):
      stop = True
    else:
      deu_sentence += new_word + ' '

    dec_word = np.array([[new_id]])
    init_states = next_states
  return deu_sentence

In [10]:
path = '/content/drive/My Drive/english-german/{loss:.3f}.hdf5'

In [12]:
model.fit(x = [english_inputs, german_inputs], y = y_train,
          batch_size=128, epochs=1,
          callbacks = [ModelCheckpoint(path, monitor='loss', mode='min', save_best_only=True)])



<tensorflow.python.keras.callbacks.History at 0x7f7b6a3a1e48>

In [13]:
model.fit(x = [english_inputs, german_inputs], y = y_train,
          batch_size=128, epochs=1,
          callbacks = [ModelCheckpoint(path, monitor='loss', mode='min', save_best_only=True)])



<tensorflow.python.keras.callbacks.History at 0x7f7b6a3479b0>

In [105]:
model.fit(x = [english_inputs, german_inputs], y = y_train,
          batch_size=128, epochs=1,
          callbacks = [ModelCheckpoint(path, monitor='loss', mode='min', save_best_only=True)])



<tensorflow.python.keras.callbacks.History at 0x7f6a78fe8e48>

In [106]:
model.fit(x = [english_inputs, german_inputs], y = y_train,
          batch_size=128, epochs=1,
          callbacks = [ModelCheckpoint(path, monitor='loss', mode='min', save_best_only=True)])



<tensorflow.python.keras.callbacks.History at 0x7f6a79006860>

In [107]:
model.fit(x = [english_inputs, german_inputs], y = y_train,
          batch_size=128, epochs=1,
          callbacks = [ModelCheckpoint(path, monitor='loss', mode='min', save_best_only=True)])



<tensorflow.python.keras.callbacks.History at 0x7f6a78fab080>

In [108]:
model.fit(x = [english_inputs, german_inputs], y = y_train,
          batch_size=128, epochs=1,
          callbacks = [ModelCheckpoint(path, monitor='loss', mode='min', save_best_only=True)])



<tensorflow.python.keras.callbacks.History at 0x7f6a79028128>

In [109]:
model.fit(x = [english_inputs, german_inputs], y = y_train,
          batch_size=128, epochs=1,
          callbacks = [ModelCheckpoint(path, monitor='loss', mode='min', save_best_only=True)])



<tensorflow.python.keras.callbacks.History at 0x7f6a78fab588>

In [110]:
model.fit(x = [english_inputs, german_inputs], y = y_train,
          batch_size=128, epochs=1,
          callbacks = [ModelCheckpoint(path, monitor='loss', mode='min', save_best_only=True)])



<tensorflow.python.keras.callbacks.History at 0x7f6a7904f5c0>

In [111]:
model.fit(x = [english_inputs, german_inputs], y = y_train,
          batch_size=128, epochs=1,
          callbacks = [ModelCheckpoint(path, monitor='loss', mode='min', save_best_only=True)])



<tensorflow.python.keras.callbacks.History at 0x7f6a7d922d68>

In [112]:
model.fit(x = [english_inputs, german_inputs], y = y_train,
          batch_size=128, epochs=1,
          callbacks = [ModelCheckpoint(path, monitor='loss', mode='min', save_best_only=True)])



<tensorflow.python.keras.callbacks.History at 0x7f6a78ffbe10>

In [113]:
model.fit(x = [english_inputs, german_inputs], y = y_train,
          batch_size=128, epochs=1,
          callbacks = [ModelCheckpoint(path, monitor='loss', mode='min', save_best_only=True)])



<tensorflow.python.keras.callbacks.History at 0x7f6a79039780>

In [114]:
model.fit(x = [english_inputs, german_inputs], y = y_train,
          batch_size=128, epochs=1,
          callbacks = [ModelCheckpoint(path, monitor='loss', mode='min', save_best_only=True)])



<tensorflow.python.keras.callbacks.History at 0x7f6a78fd70f0>

In [115]:
model.fit(x = [english_inputs, german_inputs], y = y_train,
          batch_size=128, epochs=1,
          callbacks = [ModelCheckpoint(path, monitor='loss', mode='min', save_best_only=True)])



<tensorflow.python.keras.callbacks.History at 0x7f6a78f79940>

In [116]:
model.fit(x = [english_inputs, german_inputs], y = y_train,
          batch_size=128, epochs=1,
          callbacks = [ModelCheckpoint(path, monitor='loss', mode='min', save_best_only=True)])



<tensorflow.python.keras.callbacks.History at 0x7f6a79062198>

In [117]:
model.fit(x = [english_inputs, german_inputs], y = y_train,
          batch_size=128, epochs=1,
          callbacks = [ModelCheckpoint(path, monitor='loss', mode='min', save_best_only=True)])



<tensorflow.python.keras.callbacks.History at 0x7f6a7900f128>

In [118]:
model.fit(x = [english_inputs, german_inputs], y = y_train,
          batch_size=128, epochs=1,
          callbacks = [ModelCheckpoint(path, monitor='loss', mode='min', save_best_only=True)])



<tensorflow.python.keras.callbacks.History at 0x7f6a78fa5240>

In [119]:
model.fit(x = [english_inputs, german_inputs], y = y_train,
          batch_size=128, epochs=1,
          callbacks = [ModelCheckpoint(path, monitor='loss', mode='min', save_best_only=True)])



<tensorflow.python.keras.callbacks.History at 0x7f6a7904fa58>

In [120]:
model.fit(x = [english_inputs, german_inputs], y = y_train,
          batch_size=128, epochs=1,
          callbacks = [ModelCheckpoint(path, monitor='loss', mode='min', save_best_only=True)])



<tensorflow.python.keras.callbacks.History at 0x7f6a7c0b1e80>

In [121]:
model.fit(x = [english_inputs, german_inputs], y = y_train,
          batch_size=128, epochs=1,
          callbacks = [ModelCheckpoint(path, monitor='loss', mode='min', save_best_only=True)])



<tensorflow.python.keras.callbacks.History at 0x7f6a7a8a1748>

In [122]:
model.fit(x = [english_inputs, german_inputs], y = y_train,
          batch_size=128, epochs=1,
          callbacks = [ModelCheckpoint(path, monitor='loss', mode='min', save_best_only=True)])



<tensorflow.python.keras.callbacks.History at 0x7f6a78f6d160>

In [123]:
model.fit(x = [english_inputs, german_inputs], y = y_train,
          batch_size=128, epochs=1,
          callbacks = [ModelCheckpoint(path, monitor='loss', mode='min', save_best_only=True)])



<tensorflow.python.keras.callbacks.History at 0x7f6a78f26b00>

In [124]:
model.fit(x = [english_inputs, german_inputs], y = y_train,
          batch_size=128, epochs=1,
          callbacks = [ModelCheckpoint(path, monitor='loss', mode='min', save_best_only=True)])



<tensorflow.python.keras.callbacks.History at 0x7f6a79003080>

In [125]:
model.fit(x = [english_inputs, german_inputs], y = y_train,
          batch_size=128, epochs=1,
          callbacks = [ModelCheckpoint(path, monitor='loss', mode='min', save_best_only=True)])



<tensorflow.python.keras.callbacks.History at 0x7f6a78f79630>

In [126]:
model.fit(x = [english_inputs, german_inputs], y = y_train,
          batch_size=128, epochs=1,
          callbacks = [ModelCheckpoint(path, monitor='loss', mode='min', save_best_only=True)])



<tensorflow.python.keras.callbacks.History at 0x7f6a78f6dc88>

In [None]:
model.fit(x = [english_inputs, german_inputs], y = y_train,
          batch_size=128, epochs=1,
          callbacks = [ModelCheckpoint(path, monitor='loss', mode='min', save_best_only=True)])



In [None]:
model.fit(x = [english_inputs, german_inputs], y = y_train,
          batch_size=128, epochs=1,
          callbacks = [ModelCheckpoint(path, monitor='loss', mode='min', save_best_only=True)])

In [None]:
model.fit(x = [english_inputs, german_inputs], y = y_train,
          batch_size=128, epochs=1,
          callbacks = [ModelCheckpoint(path, monitor='loss', mode='min', save_best_only=True)])

In [None]:
model.fit(x = [english_inputs, german_inputs], y = y_train,
          batch_size=128, epochs=1,
          callbacks = [ModelCheckpoint(path, monitor='loss', mode='min', save_best_only=True)])

In [None]:
model.fit(x = [english_inputs, german_inputs], y = y_train,
          batch_size=128, epochs=1,
          callbacks = [ModelCheckpoint(path, monitor='loss', mode='min', save_best_only=True)])

In [None]:
model.fit(x = [english_inputs, german_inputs], y = y_train,
          batch_size=128, epochs=1,
          callbacks = [ModelCheckpoint(path, monitor='loss', mode='min', save_best_only=True)])

In [None]:
model.fit(x = [english_inputs, german_inputs], y = y_train,
          batch_size=128, epochs=1,
          callbacks = [ModelCheckpoint(path, monitor='loss', mode='min', save_best_only=True)])

In [None]:
model.save('/content/drive/My Drive/english-german/diff_model.hdf5')

In [None]:
model.save('/content/drive/My Drive/english-german/diff_model.hdf5')