<a href="https://colab.research.google.com/github/toshNaik/English-German/blob/master/Nein.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive so the dataset archive and the saved
# models can be read from it. As the recorded output shows, this step asks
# for an authorization code interactively.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
!unzip drive/My\ Drive/english-german/deu-eng.zip

Archive:  drive/My Drive/english-german/deu-eng.zip
  inflating: deu.txt                 
  inflating: _about.txt              


In [36]:
import re

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Dropout, Flatten, LSTM, Input, Embedding, TimeDistributed
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [37]:
# Read the raw corpus, one entry per line of deu.txt.
# The encoding is stated explicitly because the German text contains
# non-ASCII characters, and the context manager closes the file even if
# reading fails part-way.
with open('deu.txt', 'r', encoding='utf-8') as file1:
  dataset = file1.readlines()

# Matches everything that should be dropped from a sentence: any character
# that is neither a word character nor whitespace, plus digits (digits are
# word characters, so they have to be listed separately).
_NOISE = re.compile(r'[^\w\s]|\d')

def clean_text(text):
  """Return `text` lower-cased with punctuation and digits removed."""
  return _NOISE.sub('', text).lower()

eng = []
deu = []
for line in dataset:
  fields = line.rstrip('\n').split('\t')
  if len(fields) < 2:
    continue  # blank or malformed line: there is no sentence pair to extract
  # Only the first two tab-separated columns are used; any further column
  # (the original code discarded a third one) is ignored.
  english, deutsche = fields[:2]
  eng.append(clean_text(english))
  deu.append(clean_text(deutsche) + ' <EOS>')   # decoder target ends with <EOS>

eng_data = pd.Series(eng)
deu_data = pd.Series(deu)
# Decoder input is the target prefixed with <BOS>. The explicit space keeps
# the marker a separate token instead of depending on the tokenizer stripping
# the angle brackets.
deu_input_data = deu_data.apply(lambda sentence: '<BOS> ' + sentence)

def tokenize(data):
  """Fit a new `Tokenizer` on `data` and return the fitted tokenizer.

  Note that this returns the tokenizer object itself, not token sequences.
  """
  fitted = Tokenizer()
  fitted.fit_on_texts(data)
  return fitted

# One tokenizer per language. The German one is fitted on the <BOS>-prefixed
# sentences, so its vocabulary covers both the decoder inputs and the targets.
english_tokenizer = tokenize(eng_data.values)
german_tokenizer = tokenize(deu_input_data.values)

# Vocabulary sizes are the number of known words plus one
# (presumably to account for the reserved index 0 -- confirm against the Tokenizer docs).
english_vocab_size = 1 + len(english_tokenizer.word_index)
german_vocab_size = 1 + len(german_tokenizer.word_index)
print(f'English vocab size {english_vocab_size}')
print(f'German vocab size {german_vocab_size}')

# Text -> integer id sequences.
eng_tokens = english_tokenizer.texts_to_sequences(eng_data.values)
deu_tokens_input = german_tokenizer.texts_to_sequences(deu_input_data.values)
deu_tokens_output = german_tokenizer.texts_to_sequences(deu_data.values)

# Post-pad every sequence; the German targets are padded to the same width as
# the German inputs so both decoder arrays share one shape.
english_padded = pad_sequences(eng_tokens, padding='post')
german_padded_input = pad_sequences(deu_tokens_input, padding='post')
german_padded_output = pad_sequences(
    deu_tokens_output,
    padding='post',
    maxlen=german_padded_input.shape[1],
)

print(f'English padded: {english_padded.shape}')
print(f'German input padded: {german_padded_input.shape}')
print(f'German output padded: {german_padded_output.shape}')

# Add a trailing axis of length 1 to the targets: (samples, steps) -> (samples, steps, 1).
german_padded_output = german_padded_output.reshape(german_padded_output.shape[0], german_padded_output.shape[1], 1)

# Encoder and decoder inputs are concatenated column-wise so that a single
# train_test_split call shuffles them together with the targets.
# The column where the English part ends is read from the padded English
# matrix; a literal such as 101 is only right for one particular corpus and
# would silently mis-slice any other.
english_width = english_padded.shape[1]
X_train, X_test, y_train, y_test = train_test_split(
    np.concatenate((english_padded, german_padded_input), axis=1),
    german_padded_output,
    test_size=7000, shuffle=True, random_state=57)

# Undo the concatenation for the training set ...
english_inputs = X_train[:, :english_width]
print(english_inputs.shape)
german_inputs = X_train[:, english_width:]
print(german_inputs.shape)
print(y_train.shape)

# ... and for the held-out validation set.
english_inputs_val = X_test[:, :english_width]
print(english_inputs_val.shape)
german_inputs_val = X_test[:, english_width:]
print(german_inputs_val.shape)
print(y_test.shape)

English vocab size 16187
German vocab size 34990
English padded: (217032, 101)
German input padded: (217032, 77)
German output padded: (217032, 77)
(210032, 101)
(210032, 77)
(210032, 77, 1)
(7000, 101)
(7000, 77)
(7000, 77, 1)


In [28]:
# Encoder: embed the English token ids and run them through an LSTM. The
# per-step output is discarded; only the final hidden and cell states are
# kept and handed to the decoder.
# NOTE(review): the embedding and TimeDistributed layers get no explicit name
# here, while the model-loading cell looks layers up as 'embedding_2',
# 'embedding_3' and 'time_distributed_1' -- presumably auto-generated names;
# confirm before adding, removing or reordering layers in this cell.
enc_input = Input(shape=(None,))
enc_emb = Embedding(english_vocab_size, 64)(enc_input)
enc_lstm_layer = LSTM(64, return_state = True, name='encoder')
_, state_h, state_c = enc_lstm_layer(enc_emb)
enc_state = [state_h, state_c]

# Decoder: embed the German input tokens and run an LSTM that starts from the
# encoder states and returns its output at every step. The layer objects
# (dec_emb_layer, dec_lstm_layer, dec_softmax) are kept in variables because
# the inference decoder reuses them.
dec_input = Input(shape=(None,))
dec_emb_layer = Embedding(german_vocab_size, 64)
dec_emb = dec_emb_layer(dec_input)
dec_lstm_layer = LSTM(64, return_sequences=True, return_state = True, name='decoder')
dec_out, _, _ = dec_lstm_layer(dec_emb, initial_state = enc_state)
# Softmax over the German vocabulary, applied to each decoder step.
dec_softmax = TimeDistributed(Dense(german_vocab_size, activation='softmax'))
dec_out = dec_softmax(dec_out)
# Training model: (English ids, German input ids) -> per-step word probabilities.
model = Model([enc_input, dec_input], dec_out)

In [36]:
from tensorflow.keras import metrics

# The targets are integer word ids, so the sparse loss and the sparse accuracy
# metric are used.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=[metrics.sparse_categorical_accuracy],
)

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Write model.h5 only when the validation loss improves on the best value so far.
best_checkpoint = ModelCheckpoint('model.h5', monitor='val_loss', mode='min', save_best_only=True)

train_inputs = [english_inputs, german_inputs]
val_inputs = [english_inputs_val, german_inputs_val]

model.fit(x=train_inputs, y=y_train,
          batch_size=128, epochs=5,
          validation_data=(val_inputs, y_test),
          callbacks=[best_checkpoint])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

In [29]:
# Encoder inference model: English token ids -> final encoder states [h, c].
enc_model = Model(enc_input, enc_state)

# Decoder inference model: reuses the trained decoder layers, but takes the
# LSTM states as explicit inputs and returns the updated states, so that
# decoding can be driven one token at a time.
# The state width is read from the decoder LSTM instead of repeating the
# literal 64, so it cannot drift from the layer definition.
state_size = dec_lstm_layer.units
state_h_in = Input(shape=(state_size,))
state_c_in = Input(shape=(state_size,))
dec_state_in = [state_h_in, state_c_in]

dec_emb_pred = dec_emb_layer(dec_input)
dec_out_pred, state_h_out, state_c_out = dec_lstm_layer(dec_emb_pred, initial_state=dec_state_in)
dec_out_pred = dec_softmax(dec_out_pred)

dec_model = Model([dec_input] + dec_state_in, [dec_out_pred, state_h_out, state_c_out])

In [40]:
def predict_german(sentence, max_input_len=101, max_output_len=77):
  """Greedily translate an English sentence to German with the inference models.

  The sentence is tokenized with `english_tokenizer`, post-padded to
  `max_input_len` and passed to `enc_model`. `dec_model` is then fed one token
  per call, starting from the begin-of-sentence token and carrying the LSTM
  states forward; at every step the most probable word is chosen.

  Args:
    sentence: English text; it is handed to the tokenizer exactly as given.
    max_input_len: Width the encoder input is padded to. Defaults to 101, the
      value that used to be hard-coded here.
    max_output_len: Maximum number of German words to generate. Defaults to
      77, the value that used to be hard-coded here.

  Returns:
    The generated words, each followed by one space (so a non-empty result
    ends with a trailing space), or '' when nothing was generated.
  """
  eng_pred = pad_sequences(english_tokenizer.texts_to_sequences([sentence]),
                           maxlen=max_input_len, padding='post')
  states = list(enc_model.predict(eng_pred))

  # Look the start token up rather than assuming it has id 1; fall back to the
  # previous hard-coded id when the tokenizer has no 'bos' entry.
  bos_id = german_tokenizer.word_index.get('bos', 1)
  dec_word = np.array([[bos_id]])

  words = []
  while len(words) < max_output_len:
    output = dec_model.predict([dec_word] + states)
    token_probs, states = output[0], list(output[1:])

    new_id = int(np.argmax(token_probs[0, -1, :]))
    # index_word has no entry for some ids (notably 0, the value the padded
    # targets are filled with). Indexing directly raised KeyError; an unknown
    # id is now treated like the end-of-sentence marker.
    new_word = german_tokenizer.index_word.get(new_id)
    if new_word is None or new_word == 'eos':
      break

    words.append(new_word)
    dec_word = np.array([[new_id]])   # feed the chosen word back in

  return ''.join(word + ' ' for word in words)

In [48]:
# Try the translator on a few sentences. The text is lower-cased before
# prediction because the training sentences were lower-cased; the label is
# left-aligned to 20 characters so the columns line up.
for english_sentence in ('How are you', 'What is your name', 'Are you hungry',
                         'My name is Ashutosh', 'It is a dog'):
  german_sentence = predict_german(english_sentence.lower())
  print(f"English: {english_sentence:<20}||| German : {german_sentence}")

English: How are you         ||| German : wie bist du 
English: What is your name   ||| German : wie heißt du 
English: Are you hungry      ||| German : hast du hunger 
English: My name is Ashutosh ||| German : mein name ist 
English: It is a dog         ||| German : es ist eine katze 


In [32]:
# Save the trained model to Drive.
# `model` is the network built and fitted in the cells above. The name
# `loaded` is only assigned by the model-loading cell further down, so
# referring to it here fails with NameError when the notebook is run from
# top to bottom.
model.save('/content/drive/My Drive/english-german/latest5.hdf5')

In [39]:
# Load a previously saved model from Drive and rebuild the encoder/decoder
# inference models from its layers.
loaded = load_model('/content/drive/My Drive/english-german/latest6.4.hdf5')

# Get layers from loaded model.
# NOTE(review): 'embedding_2', 'embedding_3' and 'time_distributed_1' look like
# auto-generated layer names; confirm they match the layers stored in the file
# being loaded.
enc_input = loaded.input[0]
enc_emb = loaded.get_layer(name = 'embedding_2').output
_, state_h, state_c = loaded.get_layer(name = 'encoder').output
enc_state = [state_h, state_c]

dec_input = loaded.input[1]
# Keep dec_emb_layer bound to the layer object, as in the cell that builds the
# inference models from the freshly trained network (which calls it), and
# give the layer's output tensor its own name.
dec_emb_layer = loaded.get_layer(name='embedding_3')
dec_emb_pred = dec_emb_layer.output
dec_lstm_layer = loaded.get_layer(name='decoder')
dec_softmax = loaded.get_layer(name='time_distributed_1')

# Encoder inference model: English token ids -> final encoder states [h, c].
enc_model = Model(enc_input, enc_state)

# Decoder inference model: the LSTM states become explicit inputs and outputs
# so decoding can be driven one token at a time. The state width is read from
# the loaded decoder LSTM rather than hard-coded.
state_size = dec_lstm_layer.units
state_h_in = Input(shape=(state_size,))
state_c_in = Input(shape=(state_size,))
dec_state_in = [state_h_in, state_c_in]

dec_out_pred, state_h_out, state_c_out = dec_lstm_layer(dec_emb_pred, initial_state = dec_state_in)

dec_out_pred = dec_softmax(dec_out_pred)
dec_model = Model([dec_input] + dec_state_in, [dec_out_pred, state_h_out, state_c_out])