# ChatBot

## Data収集
日本語の学習用データを用意する。
* [開発データ・評価データ - 対話破綻検出チャレンジ2](https://sites.google.com/site/dialoguebreakdowndetection2/downloads)

In [2]:
import sys
import os
import json
import pickle

import numpy as np
import MeCab

ImportError: /home/tsu-nera/anaconda3/lib/python3.6/site-packages/zmq/backend/cython/../../../../.././libstdc++.so.6: version `GLIBCXX_3.4.20' not found (required by /usr/lib/libmecab.so.2)

In [None]:
def load_json(filepath):
    fj = open(filepath,'r')
    json_data = json.load(fj)
    fj.close()
    return json_data

In [None]:
def output(data, mode):
    for i in range(len(data['turns'])):
        if mode == "U" and data['turns'][i]['speaker'] == mode:
            print(data['turns'][i]['utterance'])
        elif mode == "S" and data['turns'][i]['speaker'] == mode and i != 0:
            print(data['turns'][i]['utterance'])
        else:
            continue

In [None]:
def output_with_pair(data):
    for i in range(len(data['turns'])):
        print(data['turns'][i]["speaker"] + ":" + data['turns'][i]['utterance'])

In [None]:
data = load_json("DBDC2_dev/IRS/1471400435.log.json")

In [None]:
output(data, "U")

In [None]:
output(data, "S")

In [None]:
output_with_pair(data)

### Mecab で 文章を分解する
* [Ubuntu 16.04.1 LTSにPython 3(Anaconda)とMeCabをインストールする : 二日坊主な私](http://blueskydb.blog.jp/archives/67055421.html)

In [None]:
m = MeCab.Tagger("-Owakati")
def get_text(data):
    user = []
    system = []
    for i in range(len(data['turns'])):
        if data['turns'][i]['speaker'] == "U":
            user.append(m.parse(data['turns'][i]['utterance']))
        elif data['turns'][i]['speaker'] == "S" and i != 0:
            system.append(m.parse(data['turns'][i]['utterance']))
        else:
            continue
    return user, system

### データをすべて読み込み

In [None]:
source_text = []
target_text = []

import glob
for path in glob.glob('*/*/*.log.json'):
    data = load_json(path)
    user, system = get_text(data)
    source_text.extend(user)
    target_text.extend(system)

In [None]:
len(source_text), len(target_text)

### Data収集その2

In [2]:
source_text2 = []
target_text2 = []
m = MeCab.Tagger("-Owakati")

with open("sequence.txt",'r') as f:
    for row in f:
        if row.startswith("input:"):
            data = row[7:]
            data = m.parse(data)
            source_text2.append(data)
        else:
            data = row[8:]
            data = m.parse(data)
            target_text2.append(data)

In [67]:
source_text2 = source_text2[:10000]
target_text2 = target_text2[:10000]

## Preprocess

In [68]:
def text_to_ids(source_text, target_text, source_vocab_to_int, target_vocab_to_int):
    eos = target_vocab_to_int['<EOS>']
    source_id_text = [[source_vocab_to_int[word] for word in sequence.split()] 
                      for sequence in source_text]
    target_id_text = [[target_vocab_to_int[word] for word in sequence.split()] + [eos] 
                      for sequence in target_text]
    return source_id_text, target_id_text

In [69]:
CODES = {'<PAD>': 0, '<EOS>': 1, '<UNK>': 2, '<GO>': 3 }

In [70]:
import copy
def create_lookup_tables(text):
    vocab = set()
    for line in text:
        for word in line.split(" "):
                vocab.add(word)
    
    vocab_to_int = copy.copy(CODES)

    for v_i, v in enumerate(vocab, len(CODES)):
        vocab_to_int[v] = v_i

    int_to_vocab = {v_i: v for v, v_i in vocab_to_int.items()}

    return vocab_to_int, int_to_vocab

In [71]:
source_vocab_to_int, source_int_to_vocab = create_lookup_tables(source_text2)
target_vocab_to_int, target_int_to_vocab = create_lookup_tables(target_text2)

source_int_text, target_int_text = text_to_ids(source_text2, target_text2, 
                                               source_vocab_to_int, target_vocab_to_int)

In [72]:
len(source_vocab_to_int), len(target_vocab_to_int)

(6524, 5792)

In [8]:
# Save Data
with open('preprocess.p', 'wb') as out_file:
    pickle.dump((
        (source_text2, target_text2),
        (source_vocab_to_int, target_vocab_to_int),
        (source_int_to_vocab, target_int_to_vocab)), out_file)

In [9]:
def load_preprocess():
    with open('preprocess.p', mode='rb') as in_file:
        return pickle.load(in_file)

## Build the Neural Network

In [10]:
import tensorflow as tf

In [11]:
def model_inputs():
    inputs = tf.placeholder(tf.int32, [None, None], name="input")
    targets = tf.placeholder(tf.int32, [None, None], name="target")
    learning_rate = tf.placeholder(tf.float32, name="learning_rate")
    keep_prob = tf.placeholder(tf.float32, name="keep_prob")
    return inputs, targets, learning_rate, keep_prob

In [12]:
def process_decoding_input(target_data, target_vocab_to_int, batch_size):
    go = target_vocab_to_int['<GO>']
    ending = tf.strided_slice(target_data, [0, 0], [batch_size, -1], [1, 1])
    dec_input = tf.concat([tf.fill([batch_size, 1], go), ending], 1)
    return dec_input

In [13]:
def encoding_layer(rnn_inputs, rnn_size, num_layers, keep_prob):
    enc_cell = tf.contrib.rnn.MultiRNNCell(
        [tf.contrib.rnn.BasicLSTMCell(rnn_size)] * num_layers)
    enc_cell = tf.contrib.rnn.DropoutWrapper(enc_cell, output_keep_prob=keep_prob)
    _, enc_state = tf.nn.dynamic_rnn(enc_cell, rnn_inputs, dtype=tf.float32)
    return enc_state

In [14]:
def decoding_layer_train(encoder_state, dec_cell, dec_embed_input, sequence_length,
                         decoding_scope,
                         output_fn, keep_prob):
    train_decoder_fn = tf.contrib.seq2seq.simple_decoder_fn_train(encoder_state)
    train_pred, _, _ = tf.contrib.seq2seq.dynamic_rnn_decoder(
        dec_cell, train_decoder_fn, dec_embed_input, sequence_length, scope=decoding_scope)

    train_logits =  output_fn(train_pred)
    train_logits = tf.nn.dropout(train_logits, keep_prob)

    return train_logits

In [15]:
def decoding_layer_infer(encoder_state, dec_cell, dec_embeddings, start_of_sequence_id,
                         end_of_sequence_id,
                         maximum_length, vocab_size, decoding_scope, output_fn, keep_prob):

    infer_decoder_fn = tf.contrib.seq2seq.simple_decoder_fn_inference(
        output_fn, encoder_state, dec_embeddings, start_of_sequence_id, end_of_sequence_id, 
        maximum_length - 1, vocab_size)
    inference_logits, _, _ = tf.contrib.seq2seq.dynamic_rnn_decoder(dec_cell,
                                                                    infer_decoder_fn,
                                                                    scope=decoding_scope)
    inference_logits = tf.nn.dropout(inference_logits, keep_prob)
    return inference_logits

In [16]:
def decoding_layer(dec_embed_input, dec_embeddings, encoder_state, vocab_size, 
                   sequence_length, rnn_size,
                   num_layers, target_vocab_to_int, keep_prob):
    cell = tf.contrib.rnn.BasicLSTMCell(rnn_size)
    cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=keep_prob)
    dec_cell = tf.contrib.rnn.MultiRNNCell([cell] * num_layers)
    
    with tf.variable_scope('decoding') as decoding_scope:
        output_fn = lambda x: tf.contrib.layers.fully_connected(x, vocab_size, 
                                                                tf.nn.softmax, scope=decoding_scope)
        
        train_logits = decoding_layer_train(encoder_state, dec_cell, 
                                            dec_embed_input, sequence_length,
                                            decoding_scope, output_fn, keep_prob)
        
    with tf.variable_scope('decoding', reuse=True) as decoding_scope:
        infer_logits = decoding_layer_infer(encoder_state, dec_cell, dec_embeddings,
                                            source_vocab_to_int['<GO>'], 
                                            source_vocab_to_int['<EOS>'],
                                            sequence_length, vocab_size, decoding_scope,
                                            output_fn, keep_prob)

    return train_logits, infer_logits

In [17]:
def seq2seq_model(input_data, target_data, keep_prob, batch_size, sequence_length,
                  source_vocab_size, target_vocab_size,
                  enc_embedding_size, dec_embedding_size, rnn_size, num_layers, 
                  target_vocab_to_int):
    
    enc_embed_input = tf.contrib.layers.embed_sequence(input_data, source_vocab_size,
                                                       enc_embedding_size)
    enc_state = encoding_layer(enc_embed_input, rnn_size, num_layers, keep_prob=keep_prob)
    target_data = process_decoding_input(target_data, target_vocab_to_int, batch_size)
    dec_embed = tf.Variable(tf.random_uniform([target_vocab_size, dec_embedding_size]))
    dec_embed_input = tf.nn.embedding_lookup(dec_embed, target_data)

    dec_layer = decoding_layer(dec_embed_input, dec_embed, enc_state, target_vocab_size, 
                               sequence_length,
                               rnn_size, num_layers, target_vocab_to_int, keep_prob)
    
    return dec_layer

## Neural Network Training

In [73]:
# Number of Epochs
epochs = 1
# Batch Size
batch_size = 100
# RNN Size
rnn_size = 64
# Number of Layers
num_layers = 2
# Embedding Size
encoding_embedding_size = 128
decoding_embedding_size = 128
# Learning Rate
learning_rate = 0.01
# Dropout Keep Probability
keep_probability = 0.5

In [74]:
save_path = 'checkpoints/dev'

max_source_sentence_length = max([len(sentence) for sentence in source_int_text])

train_graph = tf.Graph()
with train_graph.as_default():
    input_data, targets, lr, keep_prob = model_inputs()
    sequence_length = tf.placeholder_with_default(max_source_sentence_length, 
                                                  None, name='sequence_length')
    input_shape = tf.shape(input_data)
    
    train_logits, inference_logits = seq2seq_model(
        tf.reverse(input_data, [-1]), targets, keep_prob, batch_size, sequence_length,
        len(source_vocab_to_int), len(target_vocab_to_int),
        encoding_embedding_size, decoding_embedding_size, rnn_size, num_layers, 
        target_vocab_to_int)

    tf.identity(inference_logits, 'logits')
    with tf.name_scope("optimization"):
        # Loss function
        cost = tf.contrib.seq2seq.sequence_loss(
            train_logits,
            targets,
            tf.ones([input_shape[0], sequence_length]))

        # Optimizer
        optimizer = tf.train.AdamOptimizer(lr)

        # Gradient Clipping
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) 
                            for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)

In [75]:
import time

def get_accuracy(target, logits):
    max_seq = max(target.shape[1], logits.shape[1])
    if max_seq - target.shape[1]:
        target = np.pad(
            target,
            [(0,0),(0,max_seq - target.shape[1])],
            'constant')
    if max_seq - logits.shape[1]:
        logits = np.pad(
            logits,
            [(0,0),(0,max_seq - logits.shape[1]), (0,0)],
            'constant')

    return np.mean(np.equal(target, np.argmax(logits, 2)))

train_source = source_int_text[batch_size:]
train_target = target_int_text[batch_size:]

def pad_sentence_batch(sentence_batch):
    max_sentence = max([len(sentence) for sentence in sentence_batch])
    return [sentence + [CODES['<PAD>']] * (max_sentence - len(sentence)) for sentence in sentence_batch]

valid_source = pad_sentence_batch(source_int_text[:batch_size])
valid_target = pad_sentence_batch(target_int_text[:batch_size])

def batch_data(source, target, batch_size):
    for batch_i in range(0, len(source)//batch_size):
        start_i = batch_i * batch_size
        source_batch = source[start_i:start_i + batch_size]
        target_batch = target[start_i:start_i + batch_size]
        yield np.array(pad_sentence_batch(source_batch)), np.array(pad_sentence_batch(target_batch))

with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())

    for epoch_i in range(epochs):
        for batch_i, (source_batch, target_batch) in enumerate(
                batch_data(train_source, train_target, batch_size)):
            start_time = time.time()
            
            _, loss = sess.run(
                [train_op, cost],
                {input_data: source_batch,
                 targets: target_batch,
                 lr: learning_rate,
                 sequence_length: target_batch.shape[1],
                 keep_prob: keep_probability})
            
            batch_train_logits = sess.run(
                inference_logits,
                {input_data: source_batch, keep_prob: 1.0})
            batch_valid_logits = sess.run(
                inference_logits,
                {input_data: valid_source, keep_prob: 1.0})
                
            train_acc = get_accuracy(target_batch, batch_train_logits)
            valid_acc = get_accuracy(np.array(valid_target), batch_valid_logits)
            end_time = time.time()
            print('Epoch {:>3} Batch {:>4}/{} - Train Accuracy: {:>6.3f}, Validation Accuracy: {:>6.3f}, Loss: {:>6.3f}'
                  .format(epoch_i, batch_i, len(source_int_text) // batch_size, train_acc, valid_acc, loss))

    # Save Model
    saver = tf.train.Saver()
    saver.save(sess, save_path)
    print('Model Trained and Saved')

Epoch   0 Batch    0/100 - Train Accuracy:  0.939, Validation Accuracy:  0.930, Loss:  8.664
Epoch   0 Batch    1/100 - Train Accuracy:  0.930, Validation Accuracy:  0.930, Loss:  8.664
Epoch   0 Batch    2/100 - Train Accuracy:  0.939, Validation Accuracy:  0.930, Loss:  8.664
Epoch   0 Batch    3/100 - Train Accuracy:  0.939, Validation Accuracy:  0.930, Loss:  8.664
Epoch   0 Batch    4/100 - Train Accuracy:  0.937, Validation Accuracy:  0.930, Loss:  8.664
Epoch   0 Batch    5/100 - Train Accuracy:  0.942, Validation Accuracy:  0.930, Loss:  8.663
Epoch   0 Batch    6/100 - Train Accuracy:  0.946, Validation Accuracy:  0.930, Loss:  8.662
Epoch   0 Batch    7/100 - Train Accuracy:  0.941, Validation Accuracy:  0.930, Loss:  8.658
Epoch   0 Batch    8/100 - Train Accuracy:  0.938, Validation Accuracy:  0.930, Loss:  8.647
Epoch   0 Batch    9/100 - Train Accuracy:  0.948, Validation Accuracy:  0.930, Loss:  8.624
Epoch   0 Batch   10/100 - Train Accuracy:  0.939, Validation Accuracy

In [76]:
import pickle
def save_params(params):
    with open('params.p', 'wb') as out_file:
        pickle.dump(params, out_file)

def load_params():
    with open('params.p', mode='rb') as in_file:
        return pickle.load(in_file)

In [77]:
save_params(save_path)

## Predict

In [78]:
import tensorflow as tf
import numpy as np
import helper

load_path = helper.load_params()

In [79]:
def sentence_to_seq(sentence, vocab_to_int):
    m = MeCab.Tagger("-Owakati")
    unk = vocab_to_int['<UNK>']
    sentence = m.parse(sentence)
    return [vocab_to_int.get(w, unk) for w in sentence.split()]

In [80]:
translate_sentence = 'おススメの映画はありますか？'

translate_sentence = sentence_to_seq(translate_sentence, source_vocab_to_int)

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load saved model
    loader = tf.train.import_meta_graph(load_path + '.meta')
    loader.restore(sess, load_path)

    input_data = loaded_graph.get_tensor_by_name('input:0')
    logits = loaded_graph.get_tensor_by_name('logits:0')
    keep_prob = loaded_graph.get_tensor_by_name('keep_prob:0')

    translate_logits = sess.run(logits, {input_data: [translate_sentence], keep_prob: 1.0})[0]
    
print('Input')
print('  Word Ids:      {}'.format([i for i in translate_sentence]))
print('  User Words: {}'.format([source_int_to_vocab[i] for i in translate_sentence]))

print('\nPrediction')
print('  Word Ids:      {}'.format([i for i in np.argmax(translate_logits, 1)]))
print('  System Words: {}'.format([target_int_to_vocab[i] for i in np.argmax(translate_logits, 1)]))

Input
  Word Ids:      [5782, 2, 5620, 4508, 828, 380, 2397, 2843, 4637]
  User Words: ['お', '<UNK>', 'の', '映画', 'は', 'あり', 'ます', 'か', '？']

Prediction
  Word Ids:      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
  System Words: ['<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>

In [81]:
def user_input(sentence):
    sentence = sentence_to_seq(sentence, source_vocab_to_int)
    print(sentence)

    loaded_graph = tf.Graph()
    with tf.Session(graph=loaded_graph) as sess:
        # Load saved model
        loader = tf.train.import_meta_graph(load_path + '.meta')
        loader.restore(sess, load_path)

        input_data = loaded_graph.get_tensor_by_name('input:0')
        logits = loaded_graph.get_tensor_by_name('logits:0')
        keep_prob = loaded_graph.get_tensor_by_name('keep_prob:0')

        translate_logits = sess.run(logits, {input_data: [sentence], keep_prob: 1.0})[0]
        
    return ''.join([target_int_to_vocab[i] for i in np.argmax(translate_logits, 1)])

In [82]:
user_input("夏といえば海だね")

[5950, 4834, 1165, 4985, 5714, 436, 39]


'<PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD>'

In [None]:
user_input("君は泳げるのかい？")

In [None]:
user_input("海といえば、なんですか？")

In [None]:
user_input("山はどうだい？")

In [None]:
user_input("こんにちは")

In [None]:
user_input("元気ですか？暑くなって来ましたね？")

In [None]:
user_input("バカ")

In [None]:
user_input("君はバカ？")

In [None]:
user_input("今はどこに住んでいるのかな？")

In [None]:
user_input("オリンピックが東京で開催されるの知ってる？")

In [None]:
user_input("海はいいです")