In [None]:
!pip install tensorflow==1.12.0

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Locations and column schemas of the two tab-separated corpus files.
# The files carry no header row, so column names are supplied explicitly.
DATASET_FILE_NAME_CONVERSATIONS = (
    "../input/movie-dialog-corpus/movie_conversations.tsv")
DATASET_COLUMNS_CONVERSATIONS = [
    "character1_id", "character2_id", "movie_id", "conversation_list"]
DATASET_FILE_NAME_LINES = "../input/movie-dialog-corpus/movie_lines.tsv"
DATASET_COLUMNS_LINES = [
    "line_id", "character_id", "movie_id", "charcter_name", "text"]
DATASET_ENCODING = "ISO-8859-1"
DATASET_SEPARATOR = "\t"

# Parser options common to both reads.
_common_read_options = {
    "sep": DATASET_SEPARATOR,
    "encoding": DATASET_ENCODING,
    "engine": "python",
}

# One row per conversation; `conversation_list` holds the line ids as text.
conversations_df = pd.read_csv(
    DATASET_FILE_NAME_CONVERSATIONS,
    names=DATASET_COLUMNS_CONVERSATIONS,
    **_common_read_options)

# One row per utterance, indexed by `line_id` (first column) so that lines
# can be looked up by id. Malformed rows are dropped without a warning.
lines_df = pd.read_csv(
    DATASET_FILE_NAME_LINES,
    names=DATASET_COLUMNS_LINES,
    index_col=0,
    error_bad_lines=False,
    warn_bad_lines=False,
    **_common_read_options)

In [None]:
# Preview the first rows of the conversations table.
conversations_df.head()

In [None]:
# Preview the first rows of the utterance table (indexed by line id).
lines_df.head()

In [None]:
import re

# Ordered (pattern, replacement) rewrite rules applied by clean_text.
# Order matters: specific contractions ("won't", "can't") must be handled
# before the generic "n't" rule, and punctuation is stripped last so the
# apostrophe-based rules can still match.
_CLEAN_TEXT_RULES = (
    (r"i'm", "i am"),
    (r"he's", "he is"),
    (r"she's", "she is"),
    (r"it's", "it is"),
    (r"that's", "that is"),
    (r"what's", "what is"),
    (r"where's", "where is"),
    (r"how's", "how is"),
    (r"\'ll", " will"),
    (r"\'ve", " have"),
    (r"\'re", " are"),
    (r"\'d", " would"),
    (r"won't", "will not"),
    (r"can't", "cannot"),
    (r"n't", " not"),
    (r"n'", "ng"),
    (r"'bout", "about"),
    (r"'til", "until"),
    (r"[-()\"'#/@;:<>{}`+=~|.!?,]", ""),
)


def clean_text(text):
    """Normalise an utterance for tokenisation.

    The text is lower-cased, common English contractions are expanded
    (e.g. "i'm" -> "i am", "won't" -> "will not"), a trailing "n'" is
    rewritten to "ng" (e.g. "goin'" -> "going"), and finally a fixed set of
    punctuation characters is removed.

    Args:
        text: The raw utterance as a ``str``.

    Returns:
        The cleaned, lower-case string.
    """
    text = text.lower()
    for pattern, replacement in _CLEAN_TEXT_RULES:
        text = re.sub(pattern, replacement, text)
    return text

In [None]:
# Normalise every utterance in place; str() coerces non-string cells first.
# NOTE(review): a missing cell would become the literal text "nan" through
# str() — confirm that this is acceptable for the training data.
lines_df["text"] = lines_df["text"].map(lambda raw: clean_text(str(raw)))

In [None]:
# Length limit used in two places in this notebook: as a character-count
# filter on utterances when building the training pairs, and as `maxlen`
# (number of tokens) when the tokenised sequences are padded.
SEQUENCE_LENGTH = 30

# Parallel lists filled by the pairing loop: enc_input[i] is an utterance
# and dec_input[i] is the utterance that follows it in the conversation.
enc_input = []
dec_input = []

def convert_str_to_list(s):
    """Split a textual id list such as "['L1' 'L2' 'L3']" into its ids.

    The bracket/quote characters are stripped from the ends of the string
    and the remainder is split on the "' '" separator between ids.

    Args:
        s: String representation of a list of quoted ids.

    Returns:
        A list of id strings.
    """
    return s.strip("['").strip("']").split("' '")

# Turn every pair of consecutive lines in a conversation into one
# (prompt, reply) training example, wrapped in <s> ... </s> markers.
for raw_line_ids in conversations_df.conversation_list:
    line_ids = convert_str_to_list(raw_line_ids)
    for x_id, y_id in zip(line_ids, line_ids[1:]):
        try:
            x = lines_df.at[x_id, "text"]
            y = lines_df.at[y_id, "text"]
        except KeyError:
            # One of the ids has no row in lines_df; skip this pair.
            continue
        # Keep the pair only when both utterances are short enough.
        if len(x) <= SEQUENCE_LENGTH and len(y) <= SEQUENCE_LENGTH:
            enc_input.append("<s> {} </s>".format(x))
            dec_input.append("<s> {} </s>".format(y))

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# filters="" disables the tokenizer's own character filtering, so the
# "<s>" and "</s>" markers survive as tokens.
tokenizer = Tokenizer(filters="")
# Build one shared vocabulary from both the prompts and the replies.
tokenize_texts = [*enc_input, *dec_input]
tokenizer.fit_on_texts(tokenize_texts)

In [None]:
import pickle

# Persist the fitted tokenizer so the same vocabulary can be reloaded later.
with open("seq2seq_tokenizer.pkl", "wb") as tokenizer_file:
    pickle.dump(
        tokenizer,
        tokenizer_file,
        protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Convert prompts (e) and replies (d) to integer id sequences and pad them
# at the end to a common length of SEQUENCE_LENGTH.
e, d = (
    pad_sequences(
        tokenizer.texts_to_sequences(texts),
        padding="post",
        maxlen=SEQUENCE_LENGTH)
    for texts in (enc_input, dec_input))

In [None]:
# First 80% of the examples (in original order) are used for training,
# the remaining 20% are held out.
n_split = int(len(enc_input) * 0.8)
e_train, e_test = e[:n_split], e[n_split:]
d_train, d_test = d[:n_split], d[n_split:]

In [None]:
# Target = decoder input shifted left by one step, with a zero column
# appended so the shape matches d_train: position t is trained to predict
# the token at position t + 1.
train_target = np.concatenate(
    (d_train[:, 1:], np.zeros((len(d_train), 1), dtype=np.int32)),
    axis=1)

In [None]:
from tensorflow.keras.layers import Input, Embedding, LSTM

EMBEDDING_DIM = 256
HIDDEN_DIM = 256

# One more than the number of known words, leaving room for index 0,
# which the padding step uses as filler.
vocab_size = len(tokenizer.word_index) + 1

# Encoder: token ids -> embeddings -> LSTM. Only the LSTM's returned
# states are kept; its output is not used.
encoder_inputs = Input(shape=(SEQUENCE_LENGTH,))
encoder_embedded = Embedding(
    input_dim=vocab_size,
    output_dim=EMBEDDING_DIM,
    mask_zero=True,
)(encoder_inputs)
encoder_lstm_results = LSTM(
    units=HIDDEN_DIM, return_state=True)(encoder_embedded)
encoder_states = list(encoder_lstm_results[1:])

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense

# Decoder (training graph): token ids -> embeddings -> LSTM seeded with the
# encoder states -> per-step softmax over the vocabulary. The layer objects
# are kept in variables so they can be reused for the inference model.
decoder_inputs = Input(shape=(SEQUENCE_LENGTH,))
decoder_embedding_layer = Embedding(
    input_dim=vocab_size, output_dim=EMBEDDING_DIM)
decoder_embedded = decoder_embedding_layer(decoder_inputs)
decoder_lstm_layer = LSTM(
    units=HIDDEN_DIM, return_state=True, return_sequences=True)
decoder_lstm_results = decoder_lstm_layer(
    decoder_embedded, initial_state=encoder_states)
decoder_dense_layer = Dense(units=vocab_size, activation="softmax")
decoder_outputs = decoder_dense_layer(decoder_lstm_results[0])

In [None]:
# `decoder_outputs` must end up as the softmax projection over the
# vocabulary: the model built from it is trained with
# sparse_categorical_crossentropy against token ids, so the raw LSTM output
# (HIDDEN_DIM wide) is not a valid prediction tensor. The LSTM output is
# therefore kept under its own name and passed through the dense layer.
decoder_hidden_sequence, *_ = decoder_lstm_layer(
                        decoder_embedded,
                        initial_state=encoder_states)
decoder_outputs = decoder_dense_layer(decoder_hidden_sequence)

In [None]:
# Training model: (encoder tokens, decoder tokens) -> per-step predictions.
model = Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=decoder_outputs)
# The sparse loss lets the targets stay as integer token ids.
model.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"])

In [None]:
from tensorflow.keras.utils import plot_model

# Write a diagram of the model's layer graph to seq2seq_model.png.
plot_model(model, to_file="seq2seq_model.png")

In [None]:
# Train on the arrays prepared above: e_train / d_train are the padded
# encoder and decoder inputs, train_target is d_train shifted by one step.
# The target gets a trailing axis of size 1 so that each time step carries
# a single integer label for the sparse categorical loss.
model.fit(
        [e_train, d_train],
        np.expand_dims(train_target, -1),
        batch_size=128,
        epochs=15,
        verbose=2,
        validation_split=0.2)

In [None]:
# Persist the trained model to an HDF5 file in the working directory.
model.save("seq2seq.h5")
print("model saved as seq2seq.h5")

In [None]:
# Inference-time encoder: maps a padded input sequence to the states
# returned by the encoder LSTM.
encoder_model = Model(inputs=encoder_inputs, outputs=encoder_states)

# Inference-time decoder: consumes one token plus the previous LSTM states
# and returns the next-token distribution plus the updated states. It
# reuses the trained embedding, LSTM and dense layers.
# NOTE: decoder_inputs, decoder_embedded and decoder_outputs are rebound
# here and no longer refer to the training-graph tensors of the same names.
decoder_inputs = Input(shape=(1,))
decoder_embedded = decoder_embedding_layer(decoder_inputs)
decoder_states_inputs = [Input(shape=(HIDDEN_DIM,)) for _ in range(2)]
decoder_step_results = decoder_lstm_layer(
    decoder_embedded, initial_state=decoder_states_inputs)
decoder_lstm = decoder_step_results[0]
decoder_states = list(decoder_step_results[1:])
decoder_outputs = decoder_dense_layer(decoder_lstm)
decoder_model = Model(
    inputs=[decoder_inputs] + decoder_states_inputs,
    outputs=[decoder_outputs] + decoder_states)

In [None]:
# Forward and reverse vocabulary lookups.
word2index = tokenizer.word_index
index2word = {index: word for word, index in word2index.items()}
# Start / end markers, each kept as a one-element list of token ids.
bos = [word2index["<s>"]]
eos = [word2index["</s>"]]

# Hard cap on the number of decoding steps per reply.
MAX_OUTPUT_LENGTH = 100

def decode_sequence(input_seq):
    """Generate a reply to ``input_seq`` with greedy decoding.

    The input is cleaned, wrapped in ``<s> ... </s>``, tokenised and padded
    to ``SEQUENCE_LENGTH`` exactly like the training data, then encoded into
    the initial decoder states. Starting from the ``<s>`` token, the decoder
    is run one step at a time, always picking the most probable next token,
    until ``</s>`` is produced or ``MAX_OUTPUT_LENGTH`` steps have run.

    Args:
        input_seq: The raw prompt text.

    Returns:
        The generated reply as a single space-separated string, without the
        ``<s>`` / ``</s>`` markers.
    """
    formatted_input_seq = "<s> {} </s>".format(clean_text(input_seq))
    tokenized_input_seq = pad_sequences(
        tokenizer.texts_to_sequences([formatted_input_seq]),
        padding="post",
        maxlen=SEQUENCE_LENGTH)

    states = encoder_model.predict(tokenized_input_seq)

    bos_index, eos_index = bos[0], eos[0]
    # Shape (1, 1): a batch of one sequence holding one token.
    target = np.array([[bos_index]])
    # A fresh list per call; the module-level `bos` list is never mutated.
    output_indices = []

    for _ in range(MAX_OUTPUT_LENGTH):
        tokens, *states = decoder_model.predict([target] + states)
        next_index = int(np.argmax(tokens[0, -1, :]))
        if next_index == eos_index:
            break
        output_indices.append(next_index)
        target = np.array([[next_index]])

    # Skip the start marker and any index without a vocabulary entry
    # (index 0 is the padding value and has no word).
    return " ".join(
        index2word[index]
        for index in output_indices
        if index != bos_index and index in index2word)

In [None]:
# Smoke test: generate and print a reply for one example prompt.
input_seq = "How are you?"
print(decode_sequence(input_seq))