In [0]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model,Sequential
from keras.layers import Input,Dense,Bidirectional,GRU,LSTM,Embedding,Dropout,Flatten
from keras.layers.convolutional import Convolution2D,MaxPooling2D,ZeroPadding2D
from keras.optimizers import RMSprop
from keras.callbacks import EarlyStopping,ModelCheckpoint, TensorBoard  
from keras import backend as K  
import numpy as np
import pandas as pd

Using TensorFlow backend.


**DATA PREPROCESSING**

In [0]:
df = pd.read_csv('train.csv')
x=df['x']
y = df['y']

print("sample input  :",x[35])
print("sample output :", y[35])

sample input  : Me.  This endless ...blonde babble. I'm like, boring myself.
sample output : Thank God!  If I had to hear one more story about your coiffure...


In [0]:
mark_start = 'ssss '
mark_end = ' eeee'
def mark_captions(captions_list):
    captions_marked = [mark_start + caption + mark_end for caption in captions_list]
    return captions_marked

y = mark_captions(y)
print("input : ",x[35])
print("output : ",y[35])

input :  Me.  This endless ...blonde babble. I'm like, boring myself.
output :  ssss Thank God!  If I had to hear one more story about your coiffure... eeee


In [0]:
def reverseSen(Sentence):
  words = Sentence.split(" ") 
  newWords = [words[i] for i in range(len(words)-1,-1,-1)] 
  newSentence = " ".join(newWords) 
  return newSentence
input_x = [reverseSen(each) for each in x]
#input_x = np.array(input_x)
output_y = y
#output_y = np.array(output_y)
print("reversing input : ",input_x[35])
print("input shape : ",len(input_x))
print("output : ",output_y[35])
print("output shape : ",len(output_y))

reversing input :  myself. boring like, I'm babble. ...blonde endless This  Me.
input shape :  221282
output :  ssss Thank God!  If I had to hear one more story about your coiffure... eeee
output shape :  221282


In [0]:
data_text = input_x+output_y
num_words = 10000
tokenizer = Tokenizer(num_words = num_words)
tokenizer.fit_on_texts(data_text)


x_tokens = tokenizer.texts_to_sequences(input_x)
y_tokens = tokenizer.texts_to_sequences(output_y)


print("tokenized input : " , x_tokens[35])
print("tokenized output : ", y_tokens[35])

tokenized input :  [308, 1796, 38, 20, 2294, 7117, 17, 14]
tokenized output :  [2, 235, 196, 43, 4, 99, 6, 241, 56, 113, 389, 39, 23, 1]


In [0]:
lens = [len(each) for each in x_tokens+y_tokens]
lens = np.array(lens)

max_tokens = np.mean(lens) + 2*np.std(lens)
max_tokens = int(max_tokens)
print(max_tokens)

33


In [0]:
pad = 'pre'
x_pad = pad_sequences(x_tokens,maxlen = max_tokens,padding = pad,truncating = pad)
pad = 'post'
y_pad = pad_sequences(y_tokens,maxlen = max_tokens,padding = pad,truncating = pad)

encoder_in_data = x_pad
decoder_in_data = y_pad
decoder_output_data = y_pad[:,1:]

decoder_out_data = [np.insert(each,len(each),0) for each in decoder_output_data]


print("padded encoder input : ",encoder_in_data[35])
print("padded decoder input : ",decoder_in_data[35])
print("padded decoder output : ",decoder_out_data[35])

padded encoder input :  [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0  308 1796   38
   20 2294 7117   17   14]
padded decoder input :  [  2 235 196  43   4  99   6 241  56 113 389  39  23   1   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
padded decoder output :  [235 196  43   4  99   6 241  56 113 389  39  23   1   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]


In [0]:
for i in range(0,len(decoder_in_data)):
  for j in range(0,len(decoder_in_data[i])):
    if( decoder_in_data[i][j] == 1):
      decoder_in_data[i][j] = 0
decoder_in_data[35]

array([  2, 235, 196,  43,   4,  99,   6, 241,  56, 113, 389,  39,  23,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0], dtype=int32)

In [0]:
encoder_in_data = np.array(encoder_in_data)
decoder_in_data = np.array(decoder_in_data)
decoder_out_data = np.array(decoder_out_data)
print("encoder input shape",encoder_in_data.shape)
print("decoder input shape",decoder_in_data.shape)
print("decoder output shape",decoder_out_data.shape)

encoder input shape (221282, 33)
decoder input shape (221282, 33)
decoder output shape (221282, 33)


In [0]:
x_data = \
{
    'encoder_input': encoder_in_data,
    'decoder_input': decoder_in_data
}

y_data = \
{
    'decoder_output': decoder_out_data
}

**ENCODER DECODER MODEL**

In [0]:
encoder_input = Input(shape=(None, ), name='encoder_input')
embedding_size = 128
encoder_embedding = Embedding(input_dim=num_words,
                              output_dim=embedding_size,
                              name='encoder_embedding')
state_size = 512
encoder_gru1 = GRU(state_size, name='encoder_gru1',
                   return_sequences=True)
encoder_gru2 = GRU(state_size, name='encoder_gru2',
                   return_sequences=True)
encoder_gru3 = GRU(state_size, name='encoder_gru3',
                   return_sequences=False)

In [0]:
def connect_encoder():
    # Start the neural network with its input-layer.
    net = encoder_input
    
    # Connect the embedding-layer.
    net = encoder_embedding(net)

    # Connect all the GRU-layers.
    net = encoder_gru1(net)
    net = encoder_gru2(net)
    net = encoder_gru3(net)

    # This is the output of the encoder.
    encoder_output = net
    
    return encoder_output
encoder_output = connect_encoder()


In [0]:
decoder_initial_state = Input(shape=(state_size,),
                              name='decoder_initial_state')
decoder_input = Input(shape=(None, ), name='decoder_input')
decoder_embedding = Embedding(input_dim=num_words,
                              output_dim=embedding_size,
                              name='decoder_embedding')
decoder_gru1 = GRU(state_size, name='decoder_gru1',
                   return_sequences=True)
decoder_gru2 = GRU(state_size, name='decoder_gru2',
                   return_sequences=True)
decoder_gru3 = GRU(state_size, name='decoder_gru3',
                   return_sequences=True)
decoder_dense = Dense(num_words,
                      activation='linear',
                      name='decoder_output')

In [0]:

def connect_decoder(initial_state):
    # Start the decoder-network with its input-layer.
    net = decoder_input

    # Connect the embedding-layer.
    net = decoder_embedding(net)
    
    # Connect all the GRU-layers.
    net = decoder_gru1(net, initial_state=initial_state)
    net = decoder_gru2(net, initial_state=initial_state)
    net = decoder_gru3(net, initial_state=initial_state)

    # Connect the final dense layer that converts to
    # one-hot encoded arrays.
    decoder_output = decoder_dense(net)
    
    return decoder_output
decoder_output = connect_decoder(initial_state=encoder_output)

In [0]:
model_train = Model(inputs=[encoder_input, decoder_input],
                    outputs=[decoder_output])
model_encoder = Model(inputs=[encoder_input],
                      outputs=[encoder_output])
decoder_output = connect_decoder(initial_state=decoder_initial_state)

model_decoder = Model(inputs=[decoder_input, decoder_initial_state],
                      outputs=[decoder_output])

In [0]:
import tensorflow as tf
def sparse_cross_entropy(y_true, y_pred):  
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_true,logits=y_pred)
    loss_mean = tf.reduce_mean(loss)
    return loss_mean

optimizer = RMSprop(lr=1e-3)
decoder_target = tf.placeholder(dtype='int32', shape=(None, None))

In [0]:
model_train.compile(optimizer=optimizer,
                    loss=sparse_cross_entropy,
                    target_tensors=[decoder_target])


W0821 10:36:41.382435 139885559637888 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.



In [0]:
model_train.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
encoder_embedding (Embedding)   (None, None, 128)    1280000     encoder_input[0][0]              
__________________________________________________________________________________________________
encoder_gru1 (GRU)              (None, None, 512)    984576      encoder_embedding[0][0]          
__________________________________________________________________________________________________
decoder_input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
encoder_gr

In [0]:
model_train.fit(x=x_data,
                y=y_data,
                batch_size=512,
                epochs=15)

Epoch 1/15
 11776/221282 [>.............................] - ETA: 8:31 - loss: 1.2981

In [0]:
model_encoder.save('e.h5')
model_decoder.save('d.h5')

files.download('e.h5')
files.download('d.h5')

  '. They will not be included '
  '. They will not be included '
  '. They will not be included '


NameError: ignored

In [0]:
def reply(test):
    test = [test]
    test_tokens = tokenizer.texts_to_sequences(test)
    pad = 'pre'
    test_pad = pad_sequences(test_tokens,maxlen = max_tokens,padding = pad,truncating = pad)
    initial_state = model_encoder.predict(test_pad)
    shape = (1, max_tokens)
    decoder_input_data = np.zeros(shape=shape, dtype=np.int)
    token_int = 2
    token_end = 1
    output = []
    count_tokens =0    
    while token_int != token_end and count_tokens<max_tokens:
        decoder_input_data[0,count_tokens] = token_int
        test_data = {'decoder_initial_state': initial_state,'decoder_input': decoder_input_data}
        decoder_output = model_decoder.predict(test_data)
        token_onehot = decoder_output[0, count_tokens, :]
        token_int = np.argmax(token_onehot)
        output.append(token_int)
        count_tokens+=1
    for i in range(0,len(output)-1):
        for word,index in tokenizer.word_index.items():
            if(index == output[i]) :
                print(word,end = ' ')
    print()

i=0
while(True):
    t = input('human: ')
    reply(t)


human: Hi life is good
good night 
human: good morning
good night 
human: what going on
i don't know 
human: i love you
i love you 
human: m going for the movie tomorrow
what 
human: yeah its cool right
what do you mean 
human: are you human
yes 
human: your name
i don't know 
human: what you dont know
i don't know 
human: its gonna be alright
i don't know what to do 
human: you must know 
no 
human: you should know
i don't know 
human: i like sanchita mittal
you know i don't know what you mean 
human: fuck off
what 
human: i said fuck off asshole
you don't have to do it 
human: i will do it right nwo
i don't think so 
human: i am telling you
what 
human: yeah be prepared
yeah 
human: mmm your attitude
i don't know what to say 
human: fuck off bitch
what 
human: colour you like most
yeah 
human: i like blue
and you have to tell me 
human: its okay sharing secret
i don't know 
human: tell me about you
i don't know 


KeyboardInterrupt: ignored