In [None]:

import string
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import collections
import helper
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional, Dropout, LSTM
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

In [None]:
def _read_sentences(path, column):
    """Load a one-sentence-per-line UTF-8 text file into a single-column DataFrame.

    The previous code called ``pd.read_csv(path, "utf-8", ...)``. The second
    positional argument of ``read_csv`` is ``sep`` (not ``encoding``), so the
    files were only kept as one column because the text "utf-8" never occurs in
    a sentence, and recent pandas versions reject the positional form outright.
    Reading the lines directly keeps one row per sentence even when a sentence
    contains commas or quotes.

    :param path: Path of the text file to read.
    :param column: Name of the single column in the returned frame.
    :return: DataFrame with one row per non-blank line of the file.
    """
    with open(path, encoding="utf-8") as handle:
        sentences = [raw.rstrip("\r\n") for raw in handle]
    # Blank lines carry no sentence, so they are dropped.
    return pd.DataFrame({column: [text for text in sentences if text.strip()]})


eng_df = _read_sentences("/transltn_english.txt", "English")
# NOTE(review): this file name is spelled "translt_" while the other three use
# "transltn_" - confirm which spelling exists on disk.
fre_df = _read_sentences("/translt_french.txt", "French")
ger_df = _read_sentences("/transltn_german.txt", "German")
span_df = _read_sentences("/transltn_spanish.txt", "Spanish")

  """Entry point for launching an IPython kernel.


FileNotFoundError: ignored

In [None]:
# Translation table for str.translate that deletes every ASCII punctuation
# character (each code point maps to None).
translate_table = str.maketrans("", "", string.punctuation)

# Per-language accumulators: cleaned sentences and their language labels.
data_eng, lang_eng = [], []
data_ger, lang_ger = [], []
data_fre, lang_fre = [], []
data_span, lang_span = [], []

In [None]:
# Rebuild the English corpus from scratch so that re-running this cell does not
# append every sentence a second time.
data_eng = []
lang_eng = []
preview_shown = False
for sentence in eng_df['English']:
    # Skip cells that are not text (for example NaN) as well as empty strings;
    # calling len() on a non-string cell would raise.
    if not isinstance(sentence, str) or len(sentence) == 0:
        continue
    lowered = sentence.lower()
    without_digits = re.sub(r"\d+", "", lowered)
    if not preview_shown:
        # Sanity check: show the first sentence before and after digit removal.
        print(lowered)
        print(without_digits)
        preview_shown = True
    # Strip punctuation last, using the shared translate_table.
    data_eng.append(without_digits.translate(translate_table))
    lang_eng.append("English")



In [None]:
# Rebuild the French corpus from scratch so that re-running this cell does not
# append every sentence a second time.
data_fre = []
lang_fre = []
preview_shown = False
for sentence in fre_df['French']:
    # Skip cells that are not text (for example NaN) as well as empty strings;
    # calling len() on a non-string cell would raise.
    if not isinstance(sentence, str) or len(sentence) == 0:
        continue
    lowered = sentence.lower()
    without_digits = re.sub(r"\d+", "", lowered)
    if not preview_shown:
        # Sanity check: show the first sentence before and after digit removal.
        print(lowered)
        print(without_digits)
        preview_shown = True
    # Strip punctuation last, using the shared translate_table.
    data_fre.append(without_digits.translate(translate_table))
    lang_fre.append("French")

In [None]:
# Rebuild the German corpus from scratch so that re-running this cell does not
# append every sentence a second time.
data_ger = []
lang_ger = []
preview_shown = False
for sentence in ger_df['German']:
    # Skip cells that are not text (for example NaN) as well as empty strings;
    # calling len() on a non-string cell would raise.
    if not isinstance(sentence, str) or len(sentence) == 0:
        continue
    lowered = sentence.lower()
    without_digits = re.sub(r"\d+", "", lowered)
    if not preview_shown:
        # Sanity check: show the first sentence before and after digit removal.
        print(lowered)
        print(without_digits)
        preview_shown = True
    # Strip punctuation last, using the shared translate_table.
    data_ger.append(without_digits.translate(translate_table))
    lang_ger.append("German")

In [None]:
# Rebuild the Spanish corpus from scratch so that re-running this cell does not
# append every sentence a second time.
data_span = []
lang_span = []
preview_shown = False
for sentence in span_df['Spanish']:
    # Skip cells that are not text (for example NaN) as well as empty strings;
    # calling len() on a non-string cell would raise.
    if not isinstance(sentence, str) or len(sentence) == 0:
        continue
    lowered = sentence.lower()
    without_digits = re.sub(r"\d+", "", lowered)
    if not preview_shown:
        # Sanity check: show the first sentence before and after digit removal.
        print(lowered)
        print(without_digits)
        preview_shown = True
    # Strip punctuation last, using the shared translate_table.
    data_span.append(without_digits.translate(translate_table))
    lang_span.append("Spanish")

In [None]:
# Stack the four cleaned corpora into one labelled frame, in the order
# English, German, French, Spanish.
corpora = [
    (data_eng, lang_eng),
    (data_ger, lang_ger),
    (data_fre, lang_fre),
    (data_span, lang_span),
]
df = pd.DataFrame({
    "Text": [text for sentences, _labels in corpora for text in sentences],
    "language": [label for _sentences, labels in corpora for label in labels],
})

In [None]:
# Flatten each corpus into a list of whitespace-separated words once, then
# count word frequencies from that list.
english_words = [word for sentence in data_eng for word in sentence.split()]
french_words = [word for sentence in data_fre for word in sentence.split()]
english_words_counter = collections.Counter(english_words)
french_words_counter = collections.Counter(french_words)

# English summary: total words, distinct words, ten most frequent words.
print('{} English words.'.format(len(english_words)))
print('{} unique English words.'.format(len(english_words_counter)))
print('10 Most common words in the English dataset:')
english_top_words = list(zip(*english_words_counter.most_common(10)))[0]
print('"' + '" "'.join(english_top_words) + '"')
print()
# French summary, same layout.
print('{} French words.'.format(len(french_words)))
print('{} unique French words.'.format(len(french_words_counter)))
print('10 Most common words in the French dataset:')
french_top_words = list(zip(*french_words_counter.most_common(10)))[0]
print('"' + '" "'.join(french_top_words) + '"')

In [None]:
def tokenize(x):
    """
    Fit a new Keras Tokenizer on x and encode x with it.
    :param x: List of sentences/strings to be tokenized
    :return: Tuple of (x encoded as lists of word ids, the fitted tokenizer)
    """
    fitted_tokenizer = Tokenizer()
    fitted_tokenizer.fit_on_texts(x)
    encoded = fitted_tokenizer.texts_to_sequences(x)
    return encoded, fitted_tokenizer


In [None]:
def pad(x, length=None):
    """
    Bring every sequence in x to a common length, padding at the end.
    :param x: List of sequences.
    :param length: Target length, forwarded to pad_sequences as maxlen.
                   If None, the length of the longest sequence in x is used.
    :return: Padded numpy array of sequences
    """
    padded = pad_sequences(sequences=x, padding='post', maxlen=length)
    return padded


In [None]:
def preprocess(x, y):
    """
    Tokenize and pad a pair of parallel sentence lists.
    :param x: Feature List of sentences
    :param y: Label List of sentences
    :return: Tuple of (padded x ids, padded y ids with a trailing axis of size 1,
             x tokenizer, y tokenizer)
    """
    x_ids, x_tk = tokenize(x)
    y_ids, y_tk = tokenize(y)

    # Each side is padded to its own longest sequence.
    x_padded = pad(x_ids)
    y_padded = pad(y_ids)

    # Keras's sparse_categorical_crossentropy function requires the labels to
    # be in 3 dimensions, so a trailing axis of size 1 is appended.
    y_padded = y_padded.reshape(y_padded.shape + (1,))

    return x_padded, y_padded, x_tk, y_tk

# Tokenize and pad the English (features) / French (labels) sentence pair.
(preproc_english_sentences,
 preproc_french_sentences,
 english_tokenizer,
 french_tokenizer) = preprocess(data_eng, data_fre)

# Axis 1 of each padded array is the sequence length; the vocabulary size is
# the number of entries in the tokenizer's word index.
max_english_sequence_length = preproc_english_sentences.shape[1]
max_french_sequence_length = preproc_french_sentences.shape[1]
english_vocab_size = len(english_tokenizer.word_index)
french_vocab_size = len(french_tokenizer.word_index)

print('Data Preprocessed')
for label, value in (
    ("Max English sentence length:", max_english_sequence_length),
    ("Max French sentence length:", max_french_sequence_length),
    ("English vocabulary size:", english_vocab_size),
    ("French vocabulary size:", french_vocab_size),
):
    print(label, value)

In [None]:
def logits_to_text(logits, tokenizer):
    """
    Decode per-position scores into a space-separated string of words.
    :param logits: 2-D scores; the argmax along axis 1 selects one word id per row
    :param tokenizer: Keras Tokenizer fit on the labels (its word_index is inverted)
    :return: String that represents the text of the logits
    """
    vocabulary = tokenizer.word_index
    # Invert word -> id into id -> word; id 0 is rendered as the padding marker.
    id_to_word = dict(zip(vocabulary.values(), vocabulary.keys()))
    id_to_word[0] = '<PAD>'

    best_ids = np.argmax(logits, axis=1)
    decoded = [id_to_word[word_id] for word_id in best_ids]
    return ' '.join(decoded)


In [None]:
def embed_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """
    Build and compile (but do not train) an embedding + GRU sequence model.
    :param input_shape: Tuple of input shape; index 1 is used as the input length
    :param output_sequence_length: Length of output sequence (accepted but not
                                   used by this function)
    :param english_vocab_size: Size of the embedding's input vocabulary
    :param french_vocab_size: Number of output classes of the final softmax layer
    :return: Keras model built, but not trained
    """
    # Hyperparameters
    learning_rate = 0.01

    # Embedding -> GRU (one output per time step) -> per-step dense head.
    layers = [
        Embedding(english_vocab_size, 256, input_length=input_shape[1], input_shape=input_shape[1:]),
        GRU(256, return_sequences=True),
        TimeDistributed(Dense(1024, activation='relu')),
        Dropout(0.5),
        TimeDistributed(Dense(french_vocab_size, activation='softmax')),
    ]
    model = Sequential(layers)

    # Labels are integer word ids, hence the sparse categorical loss.
    model.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(learning_rate),
        metrics=['accuracy'],
    )
    return model

In [None]:
# Re-pad the English ids to the French sequence length so that input and
# target have the same number of time steps.
tmp_x = pad(preproc_english_sentences, preproc_french_sentences.shape[1])
tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2]))
# Keep only as many source rows as there are target rows. This replaces the
# hard-coded 4190, which was only right for one particular dataset size.
tmp_x = tmp_x[:len(preproc_french_sentences)]
print(len(tmp_x))
print(len(preproc_french_sentences))

In [None]:
# Build the English -> French model. Both vocabulary sizes get +1 because the
# word ids run up to len(word_index) and id 0 is the padding slot.
embed_rnn_model = embed_model(
    input_shape=tmp_x.shape,
    output_sequence_length=preproc_french_sentences.shape[1],
    english_vocab_size=len(english_tokenizer.word_index) + 1,
    french_vocab_size=len(french_tokenizer.word_index) + 1,
)

embed_rnn_model.summary()



In [None]:
# Train the English -> French model; validation_split=0.2 holds out 20% of the
# rows for validation.
embed_rnn_model.fit(tmp_x, preproc_french_sentences, batch_size=1024, epochs=20, validation_split=0.2)

# Run inference once on the first sentence and print it under its label.
# Previously the same prediction was computed and printed twice, the first
# time without any label.
first_prediction = embed_rnn_model.predict(tmp_x[:1])[0]
print("Prediction:")
print(logits_to_text(first_prediction, french_tokenizer))

print("\nCorrect Translation:")
print(data_fre[:1])

print("\nOriginal text:")
print(data_eng[:1])

In [None]:
# Save the trained English -> French model to 'translation.h5' in the current
# working directory.
embed_rnn_model.save('translation.h5')
# IPython shell escape: list the working directory to confirm the file exists.
!ls


In [None]:
# Export the English -> French model in two parts: the architecture as JSON
# text and the weights as a separate HDF5 file.
shape_m1 = embed_rnn_model.to_json() # model architecture serialised by to_json()
with open('embed_rnn_model.json','w') as myFile:
  myFile.write(shape_m1)

# Weights only; the architecture lives in the JSON file written above.
embed_rnn_model.save_weights('embed_rnn_modelweights.h5')
# IPython shell escape: list the working directory to confirm both files exist.
!ls

In [None]:
# Word-frequency summary for the German and Spanish corpora. The labels and
# the top-10 lists previously referred to the English and French data.
german_words_counter = collections.Counter([word for sentence in data_ger for word in sentence.split()])
spanish_words_counter = collections.Counter([word for sentence in data_span for word in sentence.split()])

# The sum of the counts equals the total number of words in the corpus.
print('{} German words.'.format(sum(german_words_counter.values())))
print('{} unique German words.'.format(len(german_words_counter)))
print('10 Most common words in the German dataset:')
print('"' + '" "'.join(word for word, _count in german_words_counter.most_common(10)) + '"')
print()
print('{} Spanish words.'.format(sum(spanish_words_counter.values())))
print('{} unique Spanish words.'.format(len(spanish_words_counter)))
print('10 Most common words in the Spanish dataset:')
print('"' + '" "'.join(word for word, _count in spanish_words_counter.most_common(10)) + '"')

In [None]:
# Tokenize and pad the English (features) / German (labels) sentence pair.
# Note that this rebinds preproc_english_sentences and english_tokenizer.
preproc_english_sentences, preproc_german_sentences, english_tokenizer, german_tokenizer =\
    preprocess(data_eng, data_ger)

max_english_sequence_length = preproc_english_sentences.shape[1]
max_german_sequence_length = preproc_german_sentences.shape[1]
english_vocab_size = len(english_tokenizer.word_index)
# Use the German tokenizer here; the French one was used by mistake before.
german_vocab_size = len(german_tokenizer.word_index)

print('Data Preprocessed')
print("Max English sentence length:", max_english_sequence_length)
print("Max German sentence length:", max_german_sequence_length)
print("English vocabulary size:", english_vocab_size)
# Report the German size; french_vocab_size was printed by mistake before.
print("german vocabulary size:", german_vocab_size)

In [None]:
# Tokenize and pad the English (features) / Spanish (labels) sentence pair.
# Note that this rebinds preproc_english_sentences and english_tokenizer.
preproc_english_sentences, preproc_spanish_sentences, english_tokenizer, spanish_tokenizer =\
    preprocess(data_eng, data_span)

max_english_sequence_length = preproc_english_sentences.shape[1]
# Read the length from the Spanish array; the German one was used by mistake.
max_spanish_sequence_length = preproc_spanish_sentences.shape[1]
english_vocab_size = len(english_tokenizer.word_index)
# Use the Spanish tokenizer here; the French one was used by mistake before.
spanish_vocab_size = len(spanish_tokenizer.word_index)

print('Data Preprocessed')
print("Max English sentence length:", max_english_sequence_length)
print("Max spanish sentence length:", max_spanish_sequence_length)
print("English vocabulary size:", english_vocab_size)
print("spanish vocabulary size:", spanish_vocab_size)



In [None]:
# English input re-padded to the German target length (input for the
# English -> German model).
german_steps = preproc_german_sentences.shape[1]
tmp_y = pad(preproc_english_sentences, german_steps).reshape((-1, german_steps))
print(len(tmp_y))
print(len(preproc_german_sentences))

# English input re-padded to the Spanish target length (input for the
# English -> Spanish model).
spanish_steps = preproc_spanish_sentences.shape[1]
tmp_z = pad(preproc_english_sentences, spanish_steps).reshape((-1, spanish_steps))
print(len(tmp_z))
print(len(preproc_spanish_sentences))


In [None]:
# Build the English -> Spanish model from tmp_z, the English input padded to
# the Spanish sequence length. tmp_y (padded to the German length) was used
# here before, which gives the wrong input length whenever the German and
# Spanish maximum lengths differ.
embed_rnn_model_span = embed_model(
    tmp_z.shape,
    preproc_spanish_sentences.shape[1],
    len(english_tokenizer.word_index)+1,
    len(spanish_tokenizer.word_index)+1)

embed_rnn_model_span.summary()

In [None]:

# Train the English -> Spanish model on tmp_z / preproc_spanish_sentences;
# validation_split=0.2 holds out 20% of the rows for validation.
embed_rnn_model_span.fit(tmp_z, preproc_spanish_sentences, batch_size=1024, epochs=20, validation_split=0.2)
# Predictions for this model are printed in the following cell.


In [None]:


# Predict on tmp_z, the English input padded for the Spanish model. tmp_x was
# used before, but it is padded to the French sequence length. The prediction
# is computed once and printed under its label.
first_prediction_span = embed_rnn_model_span.predict(tmp_z[:1])[0]
print("Prediction:")
print(logits_to_text(first_prediction_span, spanish_tokenizer))

print("\nCorrect Translation:")
print(data_span[:1])

print("\nOriginal text:")
print(data_eng[:1])

In [None]:
# Build the English -> German model. Both vocabulary sizes get +1 because the
# word ids run up to len(word_index) and id 0 is the padding slot.
embed_rnn_model_ger = embed_model(
    input_shape=tmp_y.shape,
    output_sequence_length=preproc_german_sentences.shape[1],
    english_vocab_size=len(english_tokenizer.word_index) + 1,
    french_vocab_size=len(german_tokenizer.word_index) + 1,
)

embed_rnn_model_ger.summary()
# Train on the German-padded English input; 20% of the rows are held out for
# validation.
embed_rnn_model_ger.fit(
    tmp_y,
    preproc_german_sentences,
    batch_size=1024,
    epochs=20,
    validation_split=0.2,
)

In [None]:
# Predict on tmp_y, the English input padded for the German model. tmp_x was
# used before, but it is padded to the French sequence length. The prediction
# is computed once and printed under its label.
first_prediction_ger = embed_rnn_model_ger.predict(tmp_y[:1])[0]
print("Prediction:")
print(logits_to_text(first_prediction_ger, german_tokenizer))

print("\nCorrect Translation:")
print(data_ger[:1])

print("\nOriginal text:")
print(data_eng[:1])


In [None]:
# Save both trained models to single HDF5 files in the working directory.
embed_rnn_model_ger.save('german.h5')
embed_rnn_model_span.save('spanish.h5')

# German model: architecture as JSON text, weights as a separate HDF5 file.
shape_m2 =embed_rnn_model_ger.to_json() # model architecture serialised by to_json()
with open('embed_rnn_model_ger.json','w') as myFile:
  myFile.write(shape_m2)

embed_rnn_model_ger.save_weights('german_weights.h5') 

# Spanish model: same two-file export.
shape_m3 = embed_rnn_model_span.to_json() # model architecture serialised by to_json()
with open('embed_rnn_model_span.json','w') as myFile:
  myFile.write(shape_m3)

embed_rnn_model_span.save_weights('spanish_weights.h5') 

In [None]:
from tensorflow.keras.models import model_from_json

In [None]:
# Evaluate the reloaded English -> German model on the same arrays it was
# fitted on (this notebook has no separate held-out test set).
X = tmp_y
Y = preproc_german_sentences

# Rebuild the architecture from its JSON description; the context manager
# closes the file even if reading fails.
# NOTE(review): the model was built with `keras` but is reloaded through
# `tensorflow.keras.models.model_from_json` - confirm the two are compatible
# in the target environment.
with open('embed_rnn_model_ger.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("german.h5")
print("Loaded model from disk")

# The output layer is a softmax over word ids and the labels are integer ids,
# so the model must be compiled with the sparse categorical loss used by
# embed_model. 'binary_crossentropy' gave a meaningless loss and accuracy.
loaded_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
score = loaded_model.evaluate(X, Y, verbose=0)
print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))

In [None]:
# Display the first five rows of the German-padded English input as a quick
# visual check of the padding.
tmp_y[:5]

In [None]:
# Evaluate the reloaded English -> Spanish model on the same arrays it was
# fitted on (this notebook has no separate held-out test set).
X = tmp_z
Y = preproc_spanish_sentences

# Rebuild the architecture from its JSON description; the context manager
# closes the file even if reading fails.
# NOTE(review): the model was built with `keras` but is reloaded through
# `tensorflow.keras.models.model_from_json` - confirm the two are compatible
# in the target environment.
with open('embed_rnn_model_span.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("spanish.h5")
print("Loaded model from disk")

# The output layer is a softmax over word ids and the labels are integer ids,
# so the model must be compiled with the sparse categorical loss used by
# embed_model. 'binary_crossentropy' gave a meaningless loss and accuracy.
loaded_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
score = loaded_model.evaluate(X, Y, verbose=0)
print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))

In [None]:
# Locate the first occurrence of the word `c` in the German corpus and record
# its 1-based sentence number (L) and 1-based position inside that sentence (K).
# The word counter used to run on across sentences, so K was a corpus-wide
# count even though the next cell uses it as a position within one sentence.
c = 'katze'
# Start from "not found" so that a stale K/L from an earlier run is never reused.
K = None
L = None
for l, j in enumerate(data_ger, start=1):
    # Same token clean-up as before: drop lone '.' tokens and strip dots from the rest.
    words = [word.replace('.', '') for word in j.split() if word != '.']
    for k, i in enumerate(words, start=1):
        if i == c:
            K, L = k, l
            break
    if K is not None:
        # Only the first occurrence is reported, so stop scanning.
        break

if K is None:
    print("'{}' was not found in the German corpus".format(c))
else:
    print(c)
    print("word number=", K)
    print("sentence number=", L)

In [None]:
# Look up the English word at the same position (sentence L, word K, both
# 1-based) as the German match found above. L and K are only read here; the
# cell used to count them down in place, so running it twice gave a different
# result.
print(L, K)
b = None
if L is None or K is None:
    print("No match position available")
elif not 1 <= L <= len(data_eng):
    print("Sentence number {} is outside the English corpus".format(L))
else:
    # Same token clean-up as the search cell: drop lone '.' tokens and strip dots.
    words1 = [v.replace('.', '') for v in data_eng[L - 1].split() if v != '.']
    print("words1:", words1)
    if 1 <= K <= len(words1):
        b = words1[K - 1]
        print(b)
    else:
        print("Word number {} is outside sentence {} ({} words)".format(K, L, len(words1)))



