In [None]:
import sys
import re
import pickle
import numpy as np
import random
import os

In [None]:
_PAD = "_PAD"
_GO = "_GO"
_EOS = "_EOS"
_UNK = "_UNK"
_START_VOCAB = [_PAD, _GO, _EOS, _UNK]

PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3

_WORD_SPLIT = re.compile("([.,!?\"':;)(])")
_DIGIT_RE = re.compile(R"\d")

In [None]:
def basic_tokenizer(sentence):
    """ Split sentence into list of tokens """
    words = []
    for space_separated_item in sentence.strip().split():
        words.extend(_WORD_SPLIT.split(space_separated_item))
    return [w for w in words if w] # if w removes the ""

def get_vocab(tokenized, max_vocab_size):
    """
    Get vocab_list, vocab_dict and rev_vocab_dict given the
    tokenized sentences.
    """
    # Replace word count
    vocab = {}
    for sentence in tokenized:
        for word in sentence:
            if word in vocab:
                vocab[word] += 1
            else:
                vocab[word] = 1
    vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
    if len(vocab_list) > max_vocab_size:
        vocab_list = vocab_list[:max_vocab_size]

    # Get vocab dict (word -> token) and rev dict (token -> word)
    vocab_dict = dict([(x,y) for (y,x) in enumerate(vocab_list)])
    rev_vocab_dict = {v: k for k, v in vocab_dict.items()}

    return vocab_list, vocab_dict, rev_vocab_dict

def sentence_to_token_ids(sentence, vocab_dict, target_lang,
    normalize_digits=True):
    """
    Convert a single sentence of words to token ids. If it is the target
    language, we will append an EOS token to the end.
    """
    if not normalize_digits:
        # replace words not in vocab_dict with UNK_ID
        tokens = [vocab_dict.get(w, UNK_ID) for w in sentence]
    else:
        tokens = [vocab_dict.get(_DIGIT_RE.sub(b"0", w), UNK_ID)
            for w in sentence]

    # Append EOS token if target langauge sentence

    return tokens


def data_to_token_ids(tokenized, vocab_dict, normalize_digits=True):
    """
    Convert tokens into ids used vocab_dict and normalize all digits
    to 0.
    """
    data_as_tokens = []
    #max_len = max(len(sentence) for sentence in tokenized) + 1 # +1 for EOS
    #max_len=max_seq_len+1
    for sentence in tokenized:
        token_ids = sentence_to_token_ids(sentence, vocab_dict, normalize_digits)
        data_as_tokens.extend(token_ids)
    return data_as_tokens

def process_data(datafile, max_vocab_size):
    """
    Read the sentences from our datafiles.
    """
    with open(datafile, 'rb') as f:
        sentences = pickle.load(f)

    # Split into tokens
    tokenized = []
    for i in range(len(sentences)):
        tokenized.append(basic_tokenizer(sentences[i]))

    # Get vocab information
    vocab_list, vocab_dict, rev_vocab_dict = get_vocab(tokenized,
        max_vocab_size)

    # Convert data to token ids
    data_as_tokens= data_to_token_ids(tokenized, vocab_dict, normalize_digits=True)
    
    data_mixed=[]
    for word in data_as_tokens:
        if word==vocab_dict["the"] or word==vocab_dict["a"]:
            data_mixed.append(vocab_dict["the"] if random.random()<0.5 else vocab_dict["a"])
        else:
            data_mixed.append(word)
            
    pickle.dump((data_as_tokens, vocab_dict,  rev_vocab_dict), open('preprocess.p', 'wb'))
    
    return data_as_tokens, data_mixed,vocab_dict, rev_vocab_dict

In [None]:
def compare_a_the(x_text,y_text):
    theid=vocab_to_int["the"]
    aid=vocab_to_int["a"]
    total=0
    mix=0
    for i,word in enumerate(x_text):
        if word==theid or word==aid:
            total+=1
            if x_text[i]!=y_text[i]:
                mix+=1
    return mix,total
        

In [None]:
int_label,int_text, vocab_to_int, int_to_vocab = \
        process_data('test.p', max_vocab_size=8000)

In [None]:
mix,total=compare_a_the(int_label,int_text)
mix/total

In [None]:
[int_to_vocab[i] for i in int_text[:10]]

In [None]:
from distutils.version import LooseVersion
import warnings
import tensorflow as tf

# Check TensorFlow Version
assert LooseVersion(tf.__version__) >= LooseVersion('1.0'), 'Please use TensorFlow version 1.0 or newer'
print('TensorFlow Version: {}'.format(tf.__version__))

# Check for a GPU
if not tf.test.gpu_device_name():
    warnings.warn('No GPU found. Please use a GPU to train your neural network.')
else:
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))

In [None]:
def get_inputs():
    """
    Create TF Placeholders for input, targets, and learning rate.
    :return: Tuple (input, targets, learning rate)
    """
    # TODO: Implement Function
    inputs=tf.placeholder(tf.int32,[None,None],name='input')
    targets=tf.placeholder(tf.int32,[None,None],name='targets')
    learning_rate=tf.placeholder(tf.float32,name='learning_rate')
    return inputs, targets, learning_rate

In [None]:
def get_init_cell(batch_size, rnn_size):
    """
    Create an RNN Cell and initialize it.
    :param batch_size: Size of batches
    :param rnn_size: Size of RNNs
    :return: Tuple (cell, initialize state)
    """
    # TODO: Implement Function
    lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size)
    cell = tf.contrib.rnn.MultiRNNCell([lstm])
    return cell

In [None]:
def get_embed(input_data, vocab_size, embed_dim):
    """
    Create embedding for <input_data>.
    :param input_data: TF placeholder for text input.
    :param vocab_size: Number of words in vocabulary.
    :param embed_dim: Number of embedding dimensions
    :return: Embedded input.
    """
    # TODO: Implement Function
    embeddings = tf.Variable(tf.random_normal([vocab_size, embed_dim], stddev=0.1), name='embeddings')
    embed = tf.nn.embedding_lookup(embeddings, input_data, name='embed')
    return embed

In [None]:
def build_rnn(cell, inputs):
    """
    Create a RNN using a RNN Cell
    :param cell: RNN Cell
    :param inputs: Input text data
    :return: Tuple (Outputs, Final State)
    """
    # TODO: Implement Function
    outputs, final_state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
    final_state = tf.identity(final_state, name='final_state')
    return (outputs, final_state)

In [None]:
def build_nn(cell, rnn_size, input_data, vocab_size, embed_dim):
    """
    Build part of the neural network
    :param cell: RNN cell
    :param rnn_size: Size of rnns
    :param input_data: Input data
    :param vocab_size: Vocabulary size
    :param embed_dim: Number of embedding dimensions
    :return: Tuple (Logits, FinalState)
    """
    # TODO: Implement Function
    embed = get_embed(input_data, vocab_size, embed_dim)
    outputs, final_state = build_rnn(cell, embed)
    logits = tf.contrib.layers.fully_connected(outputs,vocab_size,activation_fn=None)
    return (logits, final_state)

In [None]:
def get_batches(int_text,label_text, batch_size, seq_length):
    """
    Return batches of input and target
    :param int_text: Text with the words replaced by their ids
    :param batch_size: The size of batch
    :param seq_length: The length of sequence
    :return: Batches as a Numpy array
    """
    # TODO: Implement Function
    
    num_batches = len(int_text) // (batch_size * seq_length)
    xtext = int_text[:num_batches * batch_size * seq_length]
    ytext = label_text[:num_batches * batch_size * seq_length]
    
        
    xreshape=np.reshape(xtext,[batch_size,-1])
    yreshape=np.reshape(ytext,[batch_size,-1])
    batches = []
    
    for i in range(0,xreshape.shape[1],seq_length):
        xx=xreshape[:,i:i+seq_length]
        yy=yreshape[:,i:i+seq_length]
        batches.append([xx,yy])
        
    return np.asarray(batches)

In [None]:
a=get_batches(int_text,int_label, 3, 5)

In [None]:
a.shape

In [None]:
# Number of Epochs
num_epochs = 100
# Batch Size
batch_size = 4
# RNN Size
rnn_size = 100
# Embedding Dimension Size
embed_dim = 100
# Sequence Length
seq_length = 10
# Learning Rate
learning_rate = 0.001
# Show stats for every n number of batches
show_every_n_batches = 2

"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
save_dir = './save'

pickle.dump((seq_length, save_dir), open('params.p', 'wb'))

In [None]:
from tensorflow.contrib import seq2seq

train_graph = tf.Graph()
with train_graph.as_default():
    vocab_size = len(int_to_vocab)
    input_text, targets, lr = get_inputs()
    input_data_shape = tf.shape(input_text)
    cell= get_init_cell(input_data_shape[0], rnn_size)
    logits, final_state = build_nn(cell, rnn_size, input_text, vocab_size, embed_dim)

    # Probabilities for generating words
    probs = tf.nn.softmax(logits, name='probs')

    # Loss function
    cost = seq2seq.sequence_loss(
        logits,
        targets,
        tf.ones([input_data_shape[0], input_data_shape[1]]))

    # Optimizer
    optimizer = tf.train.AdamOptimizer(lr)

    # Gradient Clipping
    gradients = optimizer.compute_gradients(cost)
    capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
    train_op = optimizer.apply_gradients(capped_gradients)

In [None]:
split_index=int(0.8*len(int_text))
train_source = int_text[:split_index]
valid_source = int_text[split_index:]

train_label= int_label[:split_index]
valid_label= int_label[split_index:]

print(len(valid_label))
train_batches = get_batches(train_source,train_label, batch_size, seq_length)
valid_batches=get_batches(valid_source,valid_label, 5, seq_length)
valid_x=valid_batches[0][0]
valid_y=valid_batches[0][1]
#print(valid_x)
#print(valid_y)

with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())

    for epoch_i in range(num_epochs):

        for batch_i, (x, y) in enumerate(train_batches):
            feed = {
                input_text: x,
                targets: y,
                lr: learning_rate}
            train_loss, state, _ = sess.run([cost, final_state, train_op], feed)

            # Show every <show_every_n_batches> batches
            if (epoch_i * len(train_batches) + batch_i) % show_every_n_batches == 0:
                feed_valid={
                input_text: valid_x,
                targets: valid_y}
                valid_loss,= sess.run([cost], feed_valid)
                print('Epoch {:>3} Batch {:>4}/{}   train_loss = {:.3f}  valid_loss = {:.3f}'.format(
                    epoch_i,
                    batch_i,
                    len(train_batches),
                    train_loss,
                    valid_loss))
                
    input_sentence = ["I","am","the","man","!","I","like","the","big","man"]

    input_sentence = [vocab_to_int.get(word, vocab_to_int[_UNK]) for word in input_sentence]
    batch_shell = np.zeros((batch_size, seq_length))
    batch_shell[0] = input_sentence
    chatbot_logits = sess.run(logits, {input_text: batch_shell})[0]
    print('Input')
    print('  Word Ids:      {}'.format([i for i in input_sentence]))
    print('  Input Words: {}'.format([int_to_vocab[i] for i in input_sentence]))
    print('\nPrediction')
    print('  Word Ids:      {}'.format([i for i in np.argmax(chatbot_logits, 1)]))
    print('  Chatbot Answer Words: {}'.format([int_to_vocab[i] for i in np.argmax(chatbot_logits, 1)]))

    # Save Model
    saver = tf.train.Saver()
    saver.save(sess, save_dir)
    print('Model Trained and Saved')

In [1]:
import tensorflow as tf
import numpy as np
import pickle

int_text, vocab_to_int, int_to_vocab= pickle.load(open('preprocess.p', mode='rb'))
seq_length, load_dir = pickle.load(open('params.p', mode='rb'))

In [5]:
_PAD = "_PAD"
_GO = "_GO"
_EOS = "_EOS"
_UNK = "_UNK"
_START_VOCAB = [_PAD, _GO, _EOS, _UNK]

PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3

In [6]:
def get_tensors(loaded_graph):
    """
    Get input, initial state, final state, and probabilities tensor from <loaded_graph>
    :param loaded_graph: TensorFlow graph loaded from file
    :return: Tuple (InputTensor, InitialStateTensor, FinalStateTensor, ProbsTensor)
    """
    # TODO: Implement Function
    return loaded_graph.get_tensor_by_name("input:0"), \
           loaded_graph.get_tensor_by_name("logits:0"),\
           loaded_graph.get_tensor_by_name("probs:0")

In [14]:
loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load saved model
    loader = tf.train.import_meta_graph(load_dir + '.meta')
    loader.restore(sess, load_dir)

    # Get Tensors from loaded model
    input_text, logits, probs = get_tensors(loaded_graph)

    input_sentence = ["I","am","the","man","!","I","like","the","big","man"]
    print(seq_length)
    input_sentence = [vocab_to_int.get(word, vocab_to_int[_UNK]) for word in input_sentence]  
    batch_shell = np.zeros((7, seq_length))
    batch_shell[0] = input_sentence
    chatbot_logits = sess.run(logits, {input_text: batch_shell})[0]
    print(chatbot_logits.shape)
    print('Input')
    print('  Word Ids:      {}'.format([i for i in input_sentence]))
    print('  Input Words: {}'.format([int_to_vocab[i] for i in input_sentence]))
    print('\nPrediction')
    print('  Word Ids:      {}'.format([i for i in np.argmax(chatbot_logits, 1)]))
    print('  Chatbot Answer Words: {}'.format([int_to_vocab[i] for i in np.argmax(chatbot_logits, 1)]))

INFO:tensorflow:Restoring parameters from ./save
10
(2, 7, 100)
Input
  Word Ids:      [4, 16, 14, 3, 3, 4, 36, 14, 3, 3]
  Input Words: ['I', 'am', 'the', '_UNK', '_UNK', 'I', 'like', 'the', '_UNK', '_UNK']

Prediction
  Word Ids:      [array([0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 1, 0, 1, 1, 0, 1], dtype=int64), array([0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 0, 1, 1, 0, 1], dtype=int64)]


TypeError: unhashable type: 'numpy.ndarray'