In [None]:
import string
import re
import numpy
from numpy import array, argmax, random, take
import pandas as pd
import tensorflow
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, RepeatVector
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras import optimizers
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.max_colwidth', 200)

In [None]:
def read_text(filename):
    """Read a whole UTF-8 text file and return its contents as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The complete text of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # The context manager closes the handle even when read() raises,
    # which the manual open()/close() pair did not guarantee.
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()

In [None]:
# split a text into sentences
def to_lines(text):
    """Split raw text into rows: one row per line, each row split on tabs.

    Leading and trailing whitespace of the whole text is stripped first.
    Returns a list of lists of strings.
    """
    return [line.split('\t') for line in text.strip().split('\n')]

In [None]:
# Read tur.txt, split it into lines and tab-separated fields, then wrap the rows in an array
data = read_text("tur.txt")
tur_eng = to_lines(data)
# NOTE(review): array() only produces a regular 2-D array if every line has the
# same number of tab-separated fields — confirm this holds for tur.txt
tur_eng = array(tur_eng)

In [None]:
# Full slice over both axes: every row and column is selected, so the contents are unchanged
tur_eng = tur_eng[:,:]

In [None]:
# Inspect the container type of the loaded data
type(tur_eng)

In [None]:
# Keep every field of each row except the last one
tr_eng = [row[:-1] for row in tur_eng]

In [None]:
# Rebuild the array from the trimmed rows
tur_eng = numpy.array(tr_eng)

In [None]:
# Show the dimensions of the rebuilt array
tur_eng.shape

In [None]:
# Build the punctuation-deletion table once instead of once per sentence.
punct_table = str.maketrans('', '', string.punctuation)

# str.lower() is locale-independent: it turns 'I' into 'i' and 'İ' into
# 'i' + U+0307 (combining dot above). Neither is correct Turkish casing, and the
# stray combining mark makes e.g. "İyi" and "iyi" different vocabulary entries.
# Map the dotless/dotted capitals explicitly before lowercasing column 1
# (the Turkish side of each pair).
turkish_lower_table = str.maketrans({'I': 'ı', 'İ': 'i'})

# Column 0 (English): lowercase, then strip ASCII punctuation.
tur_eng[:,0] = [s.lower().translate(punct_table) for s in tur_eng[:,0]]
# Column 1 (Turkish): Turkish-aware lowercase, then strip ASCII punctuation.
tur_eng[:,1] = [
    s.translate(turkish_lower_table).lower().translate(punct_table)
    for s in tur_eng[:,1]
]

In [None]:
# Display the array contents
tur_eng

In [None]:
# Sentence lengths, measured in whitespace-separated tokens, for each column
eng_l = [len(sentence.split()) for sentence in tur_eng[:,0]]
tur_l = [len(sentence.split()) for sentence in tur_eng[:,1]]

In [None]:
# Collect both length lists in one frame, one column per list
length_df = pd.DataFrame({'eng':eng_l, 'tur':tur_l})

In [None]:
# Histogram of each column's sentence lengths, 5 bins per plot
length_df.hist(bins = 5)
plt.show()

In [None]:
# function to build a tokenizer
def tokenization(lines):
    """Create a Tokenizer, fit it on the given texts and return it.

    Args:
        lines: The texts handed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted Tokenizer instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

In [None]:
# prepare English tokenizer (fitted on column 0)
eng_tokenizer = tokenization(tur_eng[:, 0])
# one more than the number of word_index entries, leaving room for index 0,
# which the padding step in encode_sequences uses
eng_vocab_size = len(eng_tokenizer.word_index) + 1

# sequence length handed to encode_sequences for English text
eng_length = 10
print('English Vocabulary Size: %d' % eng_vocab_size)

In [None]:
# prepare Turkish tokenizer (fitted on column 1)
tur_tokenizer = tokenization(tur_eng[:, 1])
# one more than the number of word_index entries, leaving room for index 0,
# which the padding step in encode_sequences uses
tur_vocab_size = len(tur_tokenizer.word_index) + 1

# sequence length handed to encode_sequences for Turkish text
tur_length = 10
print('Turkish Vocabulary Size: %d' % tur_vocab_size)

In [None]:
def encode_sequences(tokenizer, length, lines):
    """Integer-encode texts and pad them to a common length.

    Args:
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        length: Value passed to ``pad_sequences`` as ``maxlen``.
        lines: The texts to encode.

    Returns:
        The result of ``pad_sequences`` with zeros appended after each
        sequence (``padding='post'``).
    """
    encoded = tokenizer.texts_to_sequences(lines)
    return pad_sequences(encoded, maxlen=length, padding='post')

In [None]:
# Hold out 20% of the sentence pairs; the fixed random_state makes the split repeatable
train, test = train_test_split(tur_eng, test_size=0.2, random_state = 12)

In [None]:
# prepare training data: column 1 (Turkish) becomes the input X,
# column 0 (English) becomes the target Y
trainX = encode_sequences(tur_tokenizer, tur_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])

In [None]:
# prepare held-out test data from the `test` split, encoded the same way as the training data
testX = encode_sequences(tur_tokenizer, tur_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])

In [None]:
# build NMT model
def build_model(in_vocab, out_vocab, in_timesteps, out_timesteps, units):
    """Assemble the (uncompiled) Sequential translation network.

    Args:
        in_vocab: Input dimension of the Embedding layer.
        out_vocab: Width of the final softmax Dense layer.
        in_timesteps: ``input_length`` of the Embedding layer.
        out_timesteps: Repeat count of the RepeatVector layer.
        units: Embedding output size and size of both LSTM layers.

    Returns:
        The Sequential model; compiling is left to the caller.
    """
    # Input side: embedding (index 0 masked) followed by an LSTM that
    # returns only its final output.
    encoder_layers = [
        Embedding(in_vocab, units, input_length=in_timesteps, mask_zero=True),
        LSTM(units),
    ]
    # Output side: repeat that vector once per output step, run a
    # sequence-returning LSTM and apply a softmax over out_vocab per step.
    decoder_layers = [
        RepeatVector(out_timesteps),
        LSTM(units, return_sequences=True),
        Dense(out_vocab, activation='softmax'),
    ]
    return Sequential(encoder_layers + decoder_layers)

In [None]:
# Turkish -> English model; 128 is the `units` argument of build_model
model = build_model(tur_vocab_size, eng_vocab_size, tur_length, eng_length, 128)
# `lr` is a deprecated alias in Keras optimizers; `learning_rate` is the
# supported argument name
rms = optimizers.RMSprop(learning_rate=0.001)
# Targets are integer word ids (not one-hot), hence the sparse loss
model.compile(optimizer=rms, loss='sparse_categorical_crossentropy')

In [None]:
# Save the model to `filename` only when the validation loss improves
filename = 'model.h1.9_jl_20'
checkpoint = ModelCheckpoint(
    filename,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

# Give the integer targets a trailing axis of size 1: (samples, timesteps, 1)
trainY_3d = trainY.reshape(trainY.shape[0], trainY.shape[1], 1)

# 10% of the training data is set aside by Keras for validation
history = model.fit(
    trainX,
    trainY_3d,
    batch_size=1024,
    epochs=20,
    validation_split=0.1,
    callbacks=[checkpoint],
    verbose=1,
)

In [None]:
#model = load_model('model.h1.9_jl_20')

In [None]:
# Encode a single Turkish sample sentence with the Turkish tokenizer
vector = encode_sequences(tur_tokenizer, tur_length, ["bu durum beni mutlu etti"])

In [None]:
# Sequential.predict_classes() has been removed from Keras. Taking the argmax
# over the last (softmax) axis of predict() yields the same class ids.
preds = argmax(model.predict(vector), axis=-1)

In [None]:
#preds.shape

In [None]:
#testY.shape

In [None]:
def get_word(n, tokenizer):
    """Return the word whose index in the tokenizer's vocabulary is ``n``.

    Args:
        n: Integer word index (a Python int or a NumPy integer).
        tokenizer: Object exposing ``word_index`` (word -> index) and,
            optionally, ``index_word`` (index -> word).

    Returns:
        The matching word, or ``None`` if no word has index ``n``.
    """
    # Fast path: use the reverse mapping when the tokenizer provides one.
    # That is a single dict lookup instead of a scan of the whole vocabulary
    # for every predicted token.
    index_word = getattr(tokenizer, 'index_word', None)
    if index_word is not None:
        return index_word.get(n)

    # Fallback for tokenizer-like objects that only carry word_index.
    for word, index in tokenizer.word_index.items():
        if index == n:
            return word
    return None


In [None]:
# Turn each predicted id sequence back into a sentence.
preds_text = []
for seq in preds:
    # Look every id up exactly once (previously each position triggered two
    # get_word calls: one for itself and one for its predecessor).
    words = [get_word(idx, eng_tokenizer) for idx in seq]

    kept = []
    for pos, word in enumerate(words):
        repeats_previous = pos > 0 and word == words[pos - 1]
        # Ids with no vocabulary entry and immediate repeats of the previous
        # word are replaced by an empty string.
        kept.append('' if word is None or repeats_previous else word)

    preds_text.append(' '.join(kept))

In [None]:
# Show the decoded sentence(s)
preds_text

In [None]:
# NOTE(review): the original passed `str_pred`, which is not defined in any
# visible cell and raises NameError on a fresh kernel. `vector` (the encoded
# sample sentence) is assumed to be the intended input — confirm.
new_pred = model.predict(vector)

In [None]:
# Sequential.predict_classes() has been removed from Keras. Taking the argmax
# over the last (softmax) axis of predict() yields the same class ids.
# The former reshape of testX to its own (rows, cols) shape was a no-op.
preds = argmax(model.predict(testX), axis=-1)

In [None]:
# Turn each predicted id sequence for the test set back into a sentence.
preds_text = []
for seq in preds:
    # Look every id up exactly once (previously each position triggered two
    # get_word calls: one for itself and one for its predecessor).
    words = [get_word(idx, eng_tokenizer) for idx in seq]

    kept = []
    for pos, word in enumerate(words):
        repeats_previous = pos > 0 and word == words[pos - 1]
        # Ids with no vocabulary entry and immediate repeats of the previous
        # word are replaced by an empty string.
        kept.append('' if word is None or repeats_previous else word)

    preds_text.append(' '.join(kept))

In [None]:
# Show the decoded output for the first predicted sequence
preds_text[0]