In [1]:
import tensorflow as tf
import numpy as np
import re
import time
import config
from seq2seq_model import Chatbot
from tqdm import tqdm

  from ._conv import register_converters as _register_converters


### Data 前処理

In [2]:
lines = open('movie_lines.txt', encoding = 'utf-8', errors = 'ignore').read().split('\n')
conversations = open('movie_conversations.txt', encoding = 'utf-8', errors = 'ignore').read().split('\n')

In [3]:
idline2 = {}
for line in lines :
    _line = line.split(' +++$+++ ')
    if len(_line) == 5 :
        #  idline2 は辞書型なので　dline2[_line[0]　をkey として、そのkey に対応する　_line[4]　をvalue　として代入している
        idline2[_line[0]] = _line[4]

In [4]:
#　conversations の 最後の対話部分を抜き出している

conversations_ids = []
for conversation in conversations[:-1]:
     # この処理よく分からない。
    _conversation = conversation.split(' +++$+++ ')[-1][1:-1].replace("'", "").replace(" ", "")
    
    conversations_ids.append(_conversation.split(","))

In [5]:
# conversation_ids を question と answer に分ける作業
# conversation は 　['L200', 'L201', 'L202', 'L203']　　こんな感じ
#  idline2[conversation[i]] は　dict dline2 の辞書のkey　を指定しているので value  例えば　"Well, there's someone I think might be --",　が リストに格納される

questions = []
answers = []

for conversation in conversations_ids :
    for i in range(len(conversation) -1) :
        questions.append(idline2[conversation[i]])
        answers.append(idline2[conversation[i+1]])

In [6]:
# 正規表現をかます

def clean_text(text):
    text = text.lower()
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)
    return text

cleand_questions = []
for question in questions:
    cleand_questions.append(clean_text(question))
    
cleand_answers = []
for answer in answers:
    cleand_answers.append(clean_text(answer))

In [7]:
# cleand_questions と cleand_answers  を最大 len(25) 以内の長さに収めている

short_questions = []
short_answers = []
i = 0
for question in cleand_questions:
    if 2 <= len(question.split()) <= 25:
        short_questions.append(question)
        short_answers.append(cleand_answers[i])
    i += 1
cleaned_questions = []
cleaned_answers = []
i = 0
for answer in short_answers:
    if 2 <= len(answer.split()) <= 25:
        cleaned_answers.append(answer)
        cleaned_questions.append(short_questions[i])
    i += 1

In [8]:
# word2_count   cleaned_questions    cleand_answers の 全ての　vocab の出現頻度を数えている

word2count = {}
for question in cleaned_questions:
    for word in question.split():
        if word not in word2count:
            word2count[word] = 1
        else:
            word2count[word] += 1

for ansswer in cleaned_answers:
    for word in answer.split():
        if word not in word2count:
            word2count[word] = 1
        else:
            word2count[word] += 1

In [9]:
threshold_questions = 15
questionswords2int = {}
word_number = 0
for word, count in word2count.items():
    if count >= threshold_questions:
        questionswords2int[word] = word_number
        word_number += 1
threshold_answers = 15
answerswords2int = {}
word_number = 0
for word, count in word2count.items():
    if count >= threshold_answers:
        answerswords2int[word] = word_number
        word_number += 1

In [10]:
# 出現頻度　15回以上のword を new_vocab に追加する

threshold = 15
new_vocab = []

for key in word2count.keys() :
    if word2count[key] >= threshold :
        new_vocab.append(key)  

In [11]:
# list 同士の足し算

new_vocab = ['<PAD>', '<GO>', '<UNK>', '<EOS>'] + new_vocab

In [12]:
word_to_id = {word:i for i, word in enumerate(new_vocab)}

In [13]:
type(word_to_id)

dict

In [14]:
id_to_word = {i:word for i, word in enumerate(new_vocab)}

### Data Encode処理

In [15]:
encoded_data = []

def data_encode(data, word_to_id) :

    for i in range(len(data)) :

        encoded_line = []
        words = cleaned_questions[i].split()     # list の word 毎に　区切って入れる。

        for word in words :

            if word not in word_to_id.keys() :
                encoded_line.append(word_to_id['<UNK>'])

            else :
                encoded_line.append(word_to_id[word])


        encoded_data.append(encoded_line)
    
    
    return np.array(encoded_data)

In [16]:
encoded_questions = data_encode(cleaned_questions, word_to_id)
encoded_answers = data_encode(cleaned_answers, word_to_id)

In [17]:
#  [word_to_id['<EOS>']]   と書く事で　int を list に出来る  　そして　これで　target データの末尾に　<EOS> を追加出来た。
encoded_answers = [sequence + [word_to_id['<EOS>']] for sequence in encoded_answers]

### Chatbot モデル作成

In [18]:
model = Chatbot(config.LEANING_RATE, 
                config.BATCH_SIZE, 
                config.ENCODING_EMBED_SIZE, 
                config.DECODING_EMBED_SIZE, 
                config.RNN_SIZE, 
                config.NUM_LAYERS,
                len(new_vocab), 
                word_to_id, 
                config.CLIP_RATE)

In [19]:
session = tf.Session()
session.run(tf.global_variables_initializer())
# saver 最後に学習したモデルの保存を行う。　　max_to_keep　保持する最新のチェックポイントの最大数。　デフォルトは５である。
saver = tf.train.Saver(max_to_keep=10)

### model 学習 

In [20]:
def apply_pad_que(batch_of_sequences, word_to_id) :
    max_sequence_length = max([len(sequence) for sequence in batch_of_sequences])
    return [sequence +  [word_to_id['<PAD>']] * (max_sequence_length - len(sequence)) for sequence in batch_of_sequences]

def apply_pad_ans(batch_of_sequences, word_to_id) :
    max_sequence_length = max([len(sequence) for sequence in batch_of_sequences])
    return [sequence + [word_to_id['<PAD>']] * (max_sequence_length - len(sequence)) for sequence in batch_of_sequences]

In [21]:
word_to_id['<PAD>']

0

In [22]:
print(len(encoded_questions)// 64)
print(len(encoded_answers)//64)

2423
4847


In [24]:
def get_accuracy(labels, predicts):
    return np.mean(np.equal(labels, predicts))

In [25]:
#  len(X_batch[0] , len(y_batch[0]   はpaddingで固定されているので同じ値が入る。その値を持つ６４個の要素を一つのリスト内に入れている


for i in range(1) :
    epoch_accuracy = []
    epoch_loss = []
    for ii in tqdm(range(len(encoded_questions[:64]) // config.BATCH_SIZE)) :
        
        starting_id = ii * config.BATCH_SIZE
        
        batch_of_questions = encoded_questions[starting_id : starting_id + config.BATCH_SIZE]
        # padding処理した　encoded_questions
        X_batch = apply_pad_que(batch_of_questions, word_to_id)
        batch_of_answers = encoded_answers[starting_id : starting_id + config.BATCH_SIZE]
        # padding処理した　encoded_answers
        y_batch = apply_pad_ans(batch_of_answers, word_to_id)
        
        
        feed_dict = {model.inputs : X_batch,
                               model.targets : y_batch,
                               model.keep_probs : config.KEEP_PROBS,
                                model.encoder_seq_len : [len(X_batch[0])] * config.BATCH_SIZE,
                                model.decoder_seq_len : [len(y_batch[0])] * config.BATCH_SIZE
                                }
        
        # Chatbot class で定義した　loss, opt, predictions　を定義している
        cost, _, preds = session.run([model.loss, model.opt, model.predictions], feed_dict=feed_dict)
        print(np.array(y_batch).shape)
        print(np.array(preds).shape)
        
        if len(np.array(y_batch[0])) == len(np.array(preds[0])) :
            epoch_accuracy.append(get_accuracy(np.array(y_batch), np.array(preds)))
            
        elif len(np.array(y_batch[0])) <= len(np.array(preds[0])) :
            y_batch_ = np.empty((64, len(np.array(preds[0]))))
            for i, sequence in enumerate(np.array(y_batch)) :
                max_sequence_length = len(preds[0])
                pad_width = (0, max_sequence_length -len(sequence))
                y_batch_[i] = np.pad(y_batch[i], pad_width, 'constant', constant_values=0)
                

            print('y_batch_', y_batch_.shape)
            epoch_accuracy.append(get_accuracy(np.array(y_batch_), np.array(preds)))
#             max_sequence_length = len(np.array(preds[0]))
#             y_batch = [sequence +  [word_to_id['<PAD>']] * (max_sequence_length - len(sequence) for sequence in np.array(y_batch))]

#             for i, sequence in enumerate(np.array(y_batch)) :
#                 max_sequence_length = len(preds[0])
#                 y_batch[i] = sequence + [word_to_id['<PAD>']] * max_sequence_length - len(sequence)
                                                            
#             epoch_accuracy.append(tf.metrics.accuracy(np.array(y_batch), np.array(preds)))
                                                            
       
        elif len(np.array(preds[0])) <= len(np.array(y_batch[0])) :
            preds_ = np.empty((64,len(np.array(y_batch[0] ))))
            for i, sequence in enumerate(np.array(preds)) :
                max_sequence_length = len(y_batch[0])
                pad_width = (0, max_sequence_length -len(sequence))
                preds_[i] = np.pad(preds[i], pad_width, 'constant', constant_values=0)
                
            print('preds_', preds_.shape)
            epoch_accuracy.append(get_accuracy(np.array(y_batch), np.array(preds_)))
                 
#             max_sequence_length = len(np.array(y_batch[0]))
#             preds = [sequence + [word_to_id['<PAD>']] * (max_sequence_length - len(sequence) for sequence in np.array(preds))]
            
#             for i, sequence in enumerate(np.array(preds)) :
#                 max_sequence_length = len(y_batch[0])
#                 preds[i] = sequence + [word_to_id['<PAD>']] * max_sequence_length - len(sequence)            
        
        
            
    epoch_loss.append(cost)
        
    print('EPOCH: {}/{}'.format(i, config.EPOCHS),'Epoch accuracy: {}'.format(np.mean(epoch_accuracy)),\
          'Epoch loss: {}'.format(np.mean(epoch_loss)))
    
    
    saver.save(session, "checkpoint/chatbot_{}.ckpt".format(i))
    

100%|██████████| 1/1 [00:04<00:00,  4.06s/it]

(64, 26)
(64, 26)
EPOCH: 0/2 Epoch accuracy: 0.001201923076923077 Epoch loss: 8.258495330810547





In [26]:
def convert_string_to_int(question, word_to_id) :
    question = clean_text(question)
    return [word_to_id.get(word, word_to_id['<UNK>']) for word in question.split()]

In [29]:
#　Chatbot 対話部分
from seq2seq_model import Chatbot

while (True) :
    
    question = input("your word: ")
    if question == 'Goodbye' :
        break
        
    question = convert_string_to_int(question, word_to_id)
    question = question +[word_to_id['<PAD>']] * (25 - len(question))
    fake_batch = np.zeros((config.BATCH_SIZE, 25))
    fake_batch[0] = question
    
    prediceted_answer = session.run(inference_output, feed_dict={
                                                                            inputs: fake_batch, 
                                                                            keep_prob : 0.5
    })[0]
   
    answer = ''
    # 第二引数は 配列の指定
    for i in np.argmax(predicted_answer, 1) :
        
        if id_to_word[i] == 'i' :
            token = 'I'
        elif id_to_word[i] == '<EOS>' :
            token = '.'
        elif id_to_word[i] == '<UNK>' :
            token = 'unk'
        else :
            token = ' ' + id_to_word[i]
        answer += token
        if token == '.' :
            break
            
    print('Chatbot:' + answer )

your word: Hi


NameError: name 'inference_output' is not defined