## Using LSTM cells in a recurrent neural network, this notebook generates chatbot profiles for each primary South Park character

In [10]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
import pickle
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

#from subprocess import check_output
#print(check_output(["ls", "../input"]).decode("utf8"))

# Any results you write to the current directory are saved as output.

In [11]:
# Read the dialogue CSV; later cells use its "Line", "Character" and
# "Episode" columns.
df = pd.read_csv(filepath_or_buffer="../input/All-seasons.csv")

In [12]:
lines = df["Line"]
characters = df["Character"]
episodes = df["Episode"]

# Prefix every spoken line with its speaker: "(<character>) <line>".
charlines = "(" + characters + ") " + lines

# Build the corpus in a single pass. Repeated `text += line` re-copies the
# growing string on every iteration; str.join produces the same result
# without the quadratic cost.
text = "".join(charlines)

In [54]:
# Punctuation -> placeholder token. Each symbol is swapped for a
# space-delimited token so that the whitespace split below treats it as a
# word of its own. '+', the apostrophe and the single '-' are intentionally
# left untokenised.
token_dict = {
    '!': '||EXCLAIMATIONMARK||',
    '?': '||QUESTIONMARK||',
    '--': '||DOUBLEDASH||',
    '"': '||DOUBLEQUOTE||',
    ',': '||COMMA||',
    '.': '||PERIOD||',
    ';': '||SEMICOLON||',
    '\n': '||NEWLINE||',
    '(': '||OPENPAREN||',
    ')': '||CLOSEPAREN||',
    '&': '||AMPERSAND||',
    ':': '||COLON||',
}

for symbol in token_dict:
    text = text.replace(symbol, ' ' + token_dict[symbol] + ' ')

# NOTE(review): `text` is rebound from a str to a list of lower-cased words
# here, so this cell cannot be re-run without first rebuilding `text`.
text = text.lower().split()

In [60]:
vocab = set(text)
# Enumerate the vocabulary in sorted order: iterating a set of strings
# directly yields an order that can change between interpreter runs, which
# would assign different ids to the same words on every run.
vocab_to_int = {word: idx for idx, word in enumerate(sorted(vocab))}
int_to_vocab = {idx: word for word, idx in vocab_to_int.items()}
# The corpus as a sequence of word ids.
int_text = [vocab_to_int[word] for word in text]

### Save preprocessed data to file

In [62]:
# Use a context manager so the file is flushed and closed even if pickling
# fails part-way through.
with open('preprocess.p', 'wb') as preprocess_file:
    pickle.dump((int_text, vocab_to_int, int_to_vocab, token_dict), preprocess_file)

## Checkpoint

In [63]:
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf

# Reload the preprocessed corpus written by the cell above. The context
# manager closes the handle that the bare open(...) call used to leak.
# NOTE: pickle.load executes arbitrary code from the file; only load
# 'preprocess.p' files produced by this notebook.
with open('preprocess.p', mode='rb') as preprocess_file:
    int_text, vocab_to_int, int_to_vocab, token_dict = pickle.load(preprocess_file)

In [67]:
def get_inputs():
    """
    Create the TF placeholders the training graph is fed through.

    The input and target placeholders are int32 with shape (None, None);
    the learning rate is a float32 placeholder with no declared shape.
    They are named "input", "targets" and "learning_rate".
    :return: Tuple (input, targets, learning rate)
    """
    id_matrix_shape = (None, None)
    input_ph = tf.placeholder(tf.int32, shape=id_matrix_shape, name="input")
    targets_ph = tf.placeholder(tf.int32, shape=id_matrix_shape, name="targets")
    learning_rate_ph = tf.placeholder(tf.float32, name="learning_rate")
    return input_ph, targets_ph, learning_rate_ph

In [69]:
# Module-level knobs read by get_init_cell at call time.
lstm_layers = 2   # number of stacked LSTM layers
keep_prob = .8    # dropout keep probability applied to each layer's output


def get_init_cell(batch_size, rnn_size):
    """
    Create a stacked LSTM cell and its zero initial state.

    Reads the module-level `lstm_layers` and `keep_prob` when called.
    :param batch_size: Size of batches
    :param rnn_size: Size of RNNs (units per LSTM layer)
    :return: Tuple (cell, initialize state); the state tensor is named
        'initial_state'
    """

    def make_layer():
        # NOTE(review): activation=tf.sigmoid replaces the cell's default
        # activation; kept as in the original, confirm it is intended.
        lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size, activation=tf.sigmoid)
        return tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)

    # Build a separate cell object per layer. `[lstm] * lstm_layers` would put
    # the very same object in every slot, so all layers would share one cell
    # (and newer TF 1.x releases reject that reuse with an error).
    rnn = tf.contrib.rnn.MultiRNNCell([make_layer() for _ in range(lstm_layers)])
    state = tf.identity(rnn.zero_state(batch_size, tf.float32), name='initial_state')

    return (rnn, state)

In [110]:
def get_embed(input_data, vocab_size, embed_dim):
    """
    Look up embeddings for <input_data> in a new embedding matrix.

    The matrix is a (vocab_size, embed_dim) float32 variable named
    "embedding", initialised with tf.random_uniform between -1 and 1.
    :param input_data: TF placeholder for text input.
    :param vocab_size: Number of words in vocabulary.
    :param embed_dim: Number of embedding dimensions
    :return: Embedded input.
    """
    initial_values = tf.random_uniform((vocab_size, embed_dim), -1, 1, dtype=tf.float32)
    embedding_matrix = tf.Variable(initial_values, name="embedding")
    return tf.nn.embedding_lookup(embedding_matrix, input_data, name="embedding_lookup")

In [105]:
def build_rnn(cell, inputs):
    """
    Unroll <cell> over <inputs> with tf.nn.dynamic_rnn.

    The resulting state is wrapped in tf.identity under the name
    "final_state".
    :param cell: RNN Cell
    :param inputs: Input text data
    :return: Tuple (Outputs, Final State)
    """
    # NOTE(review): no initial_state is passed to dynamic_rnn here, so the
    # 'initial_state' tensor created in get_init_cell is not wired into this
    # call -- confirm whether state carry-over between batches is intended.
    rnn_outputs, last_state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
    final_state = tf.identity(last_state, name="final_state")
    return rnn_outputs, final_state

In [106]:
def build_nn(cell, rnn_size, input_data, vocab_size, embed_dim=256):
    """
    Build part of the neural network: embedding -> RNN -> linear projection.
    :param cell: RNN cell
    :param rnn_size: Size of rnns (accepted for interface compatibility; not
        used inside this function)
    :param input_data: Input data
    :param vocab_size: Vocabulary size
    :param embed_dim: Width of the word embedding. Defaults to 256, the value
        that was previously hard-coded.
    :return: Tuple (Logits, FinalState)
    """
    embedded = get_embed(input_data, vocab_size, embed_dim)
    rnn_outputs, final_state = build_rnn(cell=cell, inputs=embedded)

    # Linear layer (no activation) producing one score per vocabulary entry.
    logits = tf.contrib.layers.fully_connected(
        inputs=rnn_outputs,
        num_outputs=vocab_size,
        activation_fn=None,
        weights_initializer=tf.truncated_normal_initializer(stddev=0.1),
        biases_initializer=tf.zeros_initializer())
    return logits, final_state

In [107]:
def get_batches(int_text, batch_size, seq_length):
    """
    Return batches of input and target.

    The text is truncated to a whole number of batches. Targets are the
    inputs shifted one position to the left; the final target is the word
    following the truncated text or, when the text ends exactly on a batch
    boundary, the first word of the text.
    :param int_text: Text with the words replaced by their ids
    :param batch_size: The size of batch
    :param seq_length: The length of sequence
    :return: Batches as a Numpy array of shape
        (n_batches, 2, batch_size, seq_length); index 0 on the second axis
        holds inputs, index 1 holds targets
    :raises ValueError: if the sizes are not positive or the text is too
        short to fill a single batch
    """
    words_per_batch = batch_size * seq_length
    if words_per_batch <= 0:
        raise ValueError("batch_size and seq_length must both be positive")

    n_batches = len(int_text) // words_per_batch
    if n_batches == 0:
        raise ValueError(
            "int_text has {} words but one batch needs {}".format(
                len(int_text), words_per_batch))

    n_words = n_batches * words_per_batch
    inputs = np.array(int_text[:n_words])

    # Shift by one for the targets. Slicing int_text[1:n_words + 1] comes up
    # one element short when len(int_text) == n_words, which made the reshape
    # below fail; wrap around to the first word in that case instead.
    targets = np.roll(inputs, -1)
    targets[-1] = int_text[n_words] if n_words < len(int_text) else int_text[0]

    # One row per batch slot, then cut the rows column-wise into n_batches
    # windows of seq_length ids each.
    input_batches = np.split(inputs.reshape(batch_size, -1), n_batches, 1)
    target_batches = np.split(targets.reshape(batch_size, -1), n_batches, 1)

    return np.array(list(zip(input_batches, target_batches)))

In [111]:
# --- Training hyperparameters -------------------------------------------
num_epochs = 50            # Number of Epochs
batch_size = 64            # Batch Size
rnn_size = 500             # RNN Size
seq_length = 100           # Sequence Length
learning_rate = 0.02       # Learning Rate
show_every_n_batches = 2   # Show stats for every n number of batches

# --- Cell construction knobs --------------------------------------------
# These rebind the module-level values that get_init_cell reads when it is
# called, so they replace any values assigned in earlier cells.
lstm_layers = 1            # LSTM_layers
keep_prob = 1

"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
save_dir = './save'

## Build the Graph

In [112]:
from tensorflow.contrib import seq2seq

train_graph = tf.Graph()
with train_graph.as_default():
    vocab_size = len(int_to_vocab)
    input_text, targets, lr = get_inputs()
    # Dynamic shape of the fed batch; its first entry sizes the initial state
    # and both entries size the loss weights below.
    input_data_shape = tf.shape(input_text)
    cell, initial_state = get_init_cell(input_data_shape[0], rnn_size)
    logits, final_state = build_nn(cell, rnn_size, input_text, vocab_size)

    # Probabilities for generating words
    probs = tf.nn.softmax(logits, name='probs')

    # Loss function: every position is weighted equally (all-ones weights).
    cost = seq2seq.sequence_loss(
        logits,
        targets,
        tf.ones([input_data_shape[0], input_data_shape[1]]))

    # Optimizer
    optimizer = tf.train.AdamOptimizer(lr)

    # Gradient Clipping: clamp every gradient element to [-1, 1].
    # compute_gradients can return None for a variable that does not
    # contribute to `cost`; such pairs are skipped because clip_by_value
    # cannot be applied to None.
    gradients = optimizer.compute_gradients(cost)
    capped_gradients = [
        (tf.clip_by_value(grad, -1., 1.), var)
        for grad, var in gradients
        if grad is not None]
    train_op = optimizer.apply_gradients(capped_gradients)

## Train the Graph

In [113]:
batches = get_batches(int_text, batch_size, seq_length)
# Losses sampled every `show_every_n_batches` steps across all epochs.
total_loss = []
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    batches_per_epoch = len(batches)

    for epoch_i in range(num_epochs):
        # Start each epoch from the state evaluated on the first input batch.
        state = sess.run(initial_state, {input_text: batches[0][0]})
        epoch_collection = []

        for batch_i, (x, y) in enumerate(batches):
            # The state returned by this step is fed as `initial_state` on
            # the next one.
            train_loss, state, _ = sess.run(
                [cost, final_state, train_op],
                {input_text: x,
                 targets: y,
                 initial_state: state,
                 lr: learning_rate})

            # Show every <show_every_n_batches> batches
            global_step = epoch_i * batches_per_epoch + batch_i
            if global_step % show_every_n_batches != 0:
                continue
            epoch_collection.append(train_loss)
            print('Epoch {:>3} Batch {:>4}/{}   train_loss = {:.3f}'.format(
                epoch_i, batch_i, batches_per_epoch, train_loss))

        total_loss.extend(epoch_collection)

    # Save Model
    saver = tf.train.Saver()
    saver.save(sess, save_dir)
    print('Model Trained and Saved')

Epoch   0 Batch    0/1747   train_loss = 5.197
Epoch   0 Batch    2/1747   train_loss = 4.196
Epoch   0 Batch    4/1747   train_loss = 5.533
Epoch   0 Batch    6/1747   train_loss = 4.324
Epoch   0 Batch    8/1747   train_loss = 3.082
Epoch   0 Batch   10/1747   train_loss = 2.953
Epoch   0 Batch   12/1747   train_loss = 2.943
Epoch   0 Batch   14/1747   train_loss = 2.727
Epoch   0 Batch   16/1747   train_loss = 2.564
Epoch   0 Batch   18/1747   train_loss = 2.388
Epoch   0 Batch   20/1747   train_loss = 2.269
Epoch   0 Batch   22/1747   train_loss = 2.172
Epoch   0 Batch   24/1747   train_loss = 2.112
Epoch   0 Batch   26/1747   train_loss = 1.978


KeyboardInterrupt: 

In [None]:
from matplotlib import pyplot as plt

# x is the index of each recorded loss value, y the loss itself.
x = list(range(len(total_loss)))
y = list(total_loss)

# Explicit figure/axes with labels so the plot can be read on its own.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_xlabel('logged step (one point per show_every_n_batches batches)')
ax.set_ylabel('train_loss')
ax.set_title('Training loss')
plt.show()

In [None]:
# Save parameters for checkpoint
# `helper` is only imported in a later cell, so import it here to keep this
# cell runnable when the notebook is executed top to bottom.
import helper

helper.save_params((seq_length, save_dir))

# Checkpoint

In [None]:
import tensorflow as tf
import numpy as np
import helper
import problem_unittests as tests

# Restore what generation needs: the vocabulary lookups and punctuation
# tokens (the first element returned by load_preprocess is discarded), plus
# the sequence length and the checkpoint location.
(_, vocab_to_int, int_to_vocab, token_dict) = helper.load_preprocess()
(seq_length, load_dir) = helper.load_params()

In [None]:
def get_tensors(loaded_graph):
    """
    Fetch the tensors needed for generation from <loaded_graph> by name.

    Looks up "input:0", "initial_state:0", "final_state:0" and "probs:0",
    in that order.
    :param loaded_graph: TensorFlow graph loaded from file
    :return: Tuple (InputTensor, InitialStateTensor, FinalStateTensor, ProbsTensor)
    """
    tensor_names = ("input:0", "initial_state:0", "final_state:0", "probs:0")
    return tuple(loaded_graph.get_tensor_by_name(tensor_name)
                 for tensor_name in tensor_names)

In [None]:
def pick_word(probabilities, int_to_vocab):
    """
    Pick the next word in the generated text.

    Returns the most probable word when random.random() > 0.4, otherwise the
    runner-up: the most probable word whose probability is strictly below
    the maximum. If no such word exists (single entry, or all probabilities
    equal) the most probable word is always returned.
    :param probabilities: Probabilites of the next word (1-D sequence indexed
        by word id)
    :param int_to_vocab: Dictionary of word ids as the keys and words as the values
    :return: String of the predicted word
    :raises ValueError: if <probabilities> is empty
    """
    # Imported locally: no other cell of the notebook imports `random`.
    import random

    n_words = len(probabilities)
    if n_words == 0:
        raise ValueError("probabilities must not be empty")

    # First index holding the maximum probability.
    best_i = max(range(n_words), key=lambda i: probabilities[i])
    best_p = probabilities[best_i]

    # Runner-up search. Starting the search at index 0 (as before) returned
    # the best word again whenever index 0 itself was the maximum, so track
    # "nothing found yet" explicitly.
    second_i = None
    for i in range(n_words):
        p = probabilities[i]
        if p < best_p and (second_i is None or p > probabilities[second_i]):
            second_i = i
    if second_i is None:
        second_i = best_i

    chance = random.random()
    return int_to_vocab[best_i] if chance > .4 else int_to_vocab[second_i]

In [None]:
gen_length = 300
# Speaker to prime the generator with. The training text is formatted as
# "(<character>) <line>" and lower-cased, so the prime is a character name.
# NOTE(review): assumes 'cartman' occurs in the Character column -- TODO
# confirm against the dataset. The cell raises a clear error otherwise.
prime_word = 'cartman'

"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load saved model
    loader = tf.train.import_meta_graph(load_dir + '.meta')
    loader.restore(sess, load_dir)

    # Get Tensors from loaded model
    input_text, initial_state, final_state, probs = get_tensors(loaded_graph)

    # Sentences generation setup.
    # Prime with the tokens of "(<prime_word>)" exactly as preprocessing
    # emits them. `prime_word + ':'` could never be a vocabulary entry,
    # because ':' is always split off into its own token.
    gen_sentences = (
        [token_dict['('].lower()]
        + prime_word.lower().split()
        + [token_dict[')'].lower()])
    unknown_words = [word for word in gen_sentences if word not in vocab_to_int]
    if unknown_words:
        raise ValueError(
            'prime words missing from the vocabulary: {}'.format(unknown_words))
    prev_state = sess.run(initial_state, {input_text: np.array([[1]])})

    # Generate sentences
    for n in range(gen_length):
        # Dynamic Input: the last <seq_length> words as a batch of one
        dyn_input = [[vocab_to_int[word] for word in gen_sentences[-seq_length:]]]
        dyn_seq_length = len(dyn_input[0])

        # Get Prediction
        probabilities, prev_state = sess.run(
            [probs, final_state],
            {input_text: dyn_input, initial_state: prev_state})

        # `probabilities` is (batch, time, vocab) with a batch of one, so take
        # batch element 0 first and then the last time step. Indexing with
        # [dyn_seq_length-1] alone selected along the batch axis.
        pred_word = pick_word(probabilities[0][dyn_seq_length-1], int_to_vocab)

        gen_sentences.append(pred_word)

    # Remove tokens. The leading space lets the very first token match the
    # ' <token>' pattern below as well.
    tv_script = ' ' + ' '.join(gen_sentences)
    for key, token in token_dict.items():
        tv_script = tv_script.replace(' ' + token.lower(), key)
    tv_script = tv_script.replace('\n ', '\n')
    tv_script = tv_script.replace('( ', '(')

    print(tv_script)