In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd

In [2]:
import spacy
nlp = spacy.load('en_core_web_md')

In [3]:
# load lines dictionary 
lines = open('data/chatbot/movie_lines.txt', encoding='utf-8', errors='ignore').read().split('\n')

# load conversations
convs = open('data/chatbot/movie_conversations.txt', encoding='utf-8', errors='ignore').read().split('\n')

print('\n'.join(lines[:3]))
print()
print('\n'.join(convs[:3]))

w2idx = {}
idx2w = {}
w2idx['<UNK>'], idx2w[0] = 0, '<UNK>'
w2idx['<BEG>'], idx2w[1] = 1, '<BEG>'
w2idx['<END>'], idx2w[2] = 2, '<END>'

INDEX = 3

L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!
L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!
L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.

u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L200', 'L201', 'L202', 'L203']


## Text Preprocessing

here I define some methods to 1.extract data from input files, 2.seperate encode and decode data, 3.prepare and tokenized data for training

In [4]:
def get_lines(lines, line_num):
    line_dict = [None] * line_num 
    for line in lines:
        line = line.split(' +++$+++ ')
        if len(line) < 3:
            continue
        idx = line[0].split('L')[1]
        line_dict[int(idx)] = line[-1]
    
    return line_dict
    
def get_convs(convers):
    result = []
    for i in range(len(convers)):
        conv = convers[i].split("'")[1:-1]
        conv = [ (int)(a.split('L')[1]) for a in conv if len(a)>3 ]
        result.append(conv)
        
    return result

def remove_unknown_words(lines):
    dat = []
    n = len(lines)
    
    for i in range(n):
        if lines[i] is None or len(lines[i])<1:
            dat.append(None)
            continue
            
        s = lines[i]
        s = ' '.join([w.lower() if w in nlp.vocab else '<UNK>' for w in s.split() ])
        #s = ' '.join([w for w in s.split() if w in nlp.vocab])
        s = '<BEG> '+s+' <END>'
        
        dat.append(s)
    
    return dat

def cut_and_tokenize(lines, index, length=None,):
    dat = []
    n = len(lines)
    str_length = []
    
    for i in range(n):
        if lines[i] is None or '<UNK>' in lines[i] or len(lines[i])<1:
            dat.append(None)
            continue
            
        s = lines[i].split()
        str_length.append(len(s))
        
        for i in range(len(s)):
            if s[i] in w2idx:
                s[i] = w2idx[s[i]]
            else:
                w2idx[s[i]] = index
                idx2w[index] = s[i]
                s[i] = index
                index += 1
                
        if length is not None:
#            s = s + ['<PAD>']*(length-len(s))
            s = s[:length]
        
        dat.append(s)
    
    print("max length : ", max(str_length))
    print("avg length : ", sum(str_length)/len(str_length))
    return dat, index

def sep_enc_dec(conv_set, corpus):
    enc, dec = [], []
    for conv in conv_set:
        for i in range(len(conv)-1):
            if corpus[conv[i]] is None or corpus[conv[i+1]] is None:
                continue
            enc.append(conv[i])
            dec.append(conv[++i])
            
    return enc, dec

### Details of preprocessing sentences
1. I build a dictionary to tokenize data. 
2. To simplify training, I remove all sentences that contain words that are not listed in nlp.
3. Add 'BEG' and 'END' tag to each sentence
4. Remove any encode-decode pair if any of them contain 'UNK'

In [5]:
max_len = 75
parsed_lines = get_lines(lines, 670000)
parsed_lines = remove_unknown_words(parsed_lines)
corpus, INDEX = cut_and_tokenize(parsed_lines, INDEX, max_len)

max length :  75
avg length :  8.405328552883724


In [6]:
conv_set = get_convs(convs)
np.random.shuffle(conv_set)
enc_set, dec_set = sep_enc_dec(conv_set, corpus)
print(len(enc_set))

11462


In [7]:
w2idx['<PAD>'] = INDEX
idx2w[INDEX] = '<PAD>'
INDEX += 1
print("vocab size : ", INDEX)

vocab size :  15994


In [8]:
del nlp
del parsed_lines
del lines
del convs
del conv_set

### Batch data
pad every sentence that is shorter than max_length, batch encode and decode data

In [9]:
class BatchGenerator:
    def __init__(self, enc, dec, corpus, max_len, batch_size):
        assert len(enc) == len(dec)
        
        self.batch_num = len(enc)//batch_size
        n = self.batch_num*batch_size
        print(n)
        
        self.xs = [np.zeros(n, dtype=np.int16) for _ in range(max_len)] # encoder inputs
        self.ys = [np.zeros(n, dtype=np.int16) for _ in range(max_len)] # decoder inputs
        self.gs = [np.zeros(n, dtype=np.int16) for _ in range(max_len)] # decoder outputs
        self.ws = [np.zeros(n, dtype=np.float16) for _ in range(max_len)] # decoder weight for loss caculation
        
        self.max_len = max_len
        self.batch_size = batch_size
        pad = w2idx['<PAD>']
        
        for b in range(self.batch_num):
            for i in range(b*batch_size, (b+1)*batch_size):
                enc_corpus = corpus[ enc[i] ]
                dec_corpus = corpus[ dec[i] ]
                for j in range(len(enc_corpus)-2):
                    self.xs[j][i] = enc_corpus[j+1]
                for j in range(j+1, max_len):
                    self.xs[j][i] = pad
                
                for j in range(len(dec_corpus)-1):
                    self.ys[j][i] = dec_corpus[j]
                    self.gs[j][i] = dec_corpus[j+1]
                    self.ws[j][i] = 1.0

                for j in range(j+1, max_len): # don't forget padding and let loss weight zero
                    self.ys[j][i] = pad
                    self.gs[j][i] = pad
                    self.ws[j][i] = 0.0
    
    def get(self, batch_id):
        x = [self.xs[i][batch_id*self.batch_size:(batch_id+1)*self.batch_size] for i in range(self.max_len)]
        y = [self.ys[i][batch_id*self.batch_size:(batch_id+1)*self.batch_size] for i in range(self.max_len)]
        g = [self.gs[i][batch_id*self.batch_size:(batch_id+1)*self.batch_size] for i in range(self.max_len)]
        w = [self.ws[i][batch_id*self.batch_size:(batch_id+1)*self.batch_size] for i in range(self.max_len)]
        
        return x, y, g, w

In [10]:
batch_size = 4
bath_generator = BatchGenerator(enc_set, dec_set, corpus, max_len, batch_size)

11460


In [11]:
del enc_set, dec_set, corpus

In [12]:
class Seq2Seq:
    def __init__(self, enc_max_len, dec_max_len, vocab_size):
        self.enc_max_len = enc_max_len
        self.dec_max_len = dec_max_len
        
        with tf.variable_scope('seq2seq_intput/output'):
            self.enc_inputs = [tf.placeholder(tf.int16, [None]) for i in range(enc_max_len)] # time mojor feed
            self.dec_inputs = [tf.placeholder(tf.int16, [None]) for i in range(dec_max_len)]
            self.groundtruths = [tf.placeholder(tf.int16, [None]) for i in range(enc_max_len)]
            self.weights = [tf.placeholder(tf.float16, [None]) for i in range(dec_max_len)]
            
        with tf.variable_scope('seq2seq_rnn'): # training by teacher forcing
            self.out_cell = tf.contrib.rnn.LSTMCell(256)
            self.outputs, _ = tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(self.enc_inputs, self.dec_inputs, 
                                                                                    self.out_cell, 
                                                                                    vocab_size, vocab_size, 200)
        with tf.variable_scope('seq2seq_rnn', reuse=True): # predict by feeding previous
            self.pred_cell = tf.contrib.rnn.LSTMCell(256, reuse=True) # reuse cell for train and test
            self.predictions, _ = tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(self.enc_inputs, self.dec_inputs, 
                                                                                        self.pred_cell, 
                                                                                        vocab_size, vocab_size, 200, 
                                                                                        feed_previous=True)
        
        with tf.variable_scope('loss'):
            # caculate weighted loss
            self.loss = tf.reduce_mean(tf.contrib.legacy_seq2seq.sequence_loss_by_example(self.outputs, 
                                                                                          self.groundtruths, 
                                                                                          self.weights))
            self.optimizer = tf.train.AdamOptimizer(0.002).minimize(self.loss)
        
        config = tf.ConfigProto()
        self.sess = tf.Session(config=config)
        self.saver = tf.train.Saver()
        self.sess.run(tf.global_variables_initializer())
    
    def train(self, x, y, g, w):
        fd = {}
        for i in range(self.enc_max_len):
            fd[self.enc_inputs[i]] = x[i] # show how to feed a list
        
        for i in range(self.dec_max_len):
            fd[self.dec_inputs[i]] = y[i]
            fd[self.groundtruths[i]] = g[i]
            fd[self.weights[i]] = w[i]
        
        loss, _ = self.sess.run([self.loss, self.optimizer], fd)
        
        return loss

    def output(self, x, y):
        fd = {}
        for i in range(self.enc_max_len):
            fd[self.enc_inputs[i]] = x[i]
        
        for i in range(self.dec_max_len):
            fd[self.dec_inputs[i]] = y[i]
        
        out = self.sess.run(self.outputs, fd)
        
        return out
    
    def predict(self, x, dec_beg):
        fd = {}
        for i in range(self.enc_max_len):
            fd[self.enc_inputs[i]] = x[i]
        
        for i in range(self.dec_max_len): # when feed previous, the fist token should be '<BEG>', and others are useless
            if i==0:
                fd[self.dec_inputs[i]] = np.ones(y[i].shape, dtype=np.int32)*dec_beg
            else:
                fd[self.dec_inputs[i]] = np.zeros(y[i].shape, dtype=np.int32)
        
        pd = self.sess.run(self.predictions, fd)
        
        return pd
    
    def save(self, e):
        self.saver.save(self.sess, 'model/seq2seq/seq2seq_%d.ckpt'%(e+1))
    
    def restore(self, e):
        self.saver.restore(self.sess, 'model/seq2seq/seq2seq_%d.ckpt'%(e))

In [None]:
tf.reset_default_graph()
model = Seq2Seq(max_len, max_len, INDEX)
EPOCHS = 40
batch_num = bath_generator.batch_num
rec_loss = []
for e in range(EPOCHS):
    train_loss = 0
    
    for b in range(batch_num):
        x, y, g, w = bath_generator.get(b)
        batch_loss = model.train(x, y, g, w)
        train_loss += batch_loss
        print(b, end=" ")
    
    train_loss /= batch_num
    rec_loss.append(train_loss)
    print("\nepoch %d loss: %f" % (e, train_loss))
    
    model.save(e)
    
np.save('./model/seq2seq/rec_loss.npy', rec_loss)

I haven't successfully train a chatbot because my kernal always crush halfway through training. I guess there's something I did wrong that make my model taking up too much resource or become unstable.