In [1]:
# Load the data
import helper

data_dir = './data/simpsons/moes_tavern_lines.txt'
text = helper.load_data(data_dir)

# Drop the first 81 characters of the loaded text before any further processing
# (presumably a notice/header at the top of the file — TODO confirm what is skipped).
text = text[81:]

In [2]:
# Explore the data
# (start, stop) slice bounds of the raw text lines printed at the end of this cell.
view_sentence_range = (0, 10)

import numpy as np

print('Dataset Stats')
# Whitespace-separated tokens only, so punctuation stays attached to words
# (hence "roughly").
print('Roughly the number of unique words: {}'.format(len({word: None for word in text.split()})))
# Scenes are the chunks of text separated by a blank line.
scenes = text.split('\n\n')
print('Number of scenes: {}'.format(len(scenes)))
# Counts newline characters inside each scene, i.e. one less than the number
# of lines a scene is split into below.
sentence_count_scene = [scene.count('\n') for scene in scenes]
print('Average number of sentences in each scene: {}'.format(np.average(sentence_count_scene)))

# Flatten every scene into its individual lines.
sentences = [sentence for scene in scenes for sentence in scene.split('\n')]
print('Number of lines: {}'.format(len(sentences)))
word_count_sentence = [len(sentence.split()) for sentence in sentences]
print('Average number of words in each line: {}'.format(np.average(word_count_sentence)))

print()
print('The sentences {} to {}:'.format(*view_sentence_range))
# Slices the raw text lines (blank separator lines included), not `sentences`.
print('\n'.join(text.split('\n')[view_sentence_range[0]:view_sentence_range[1]]))

Dataset Stats
Roughly the number of unique words: 11492
Number of scenes: 262
Average number of sentences in each scene: 15.248091603053435
Number of lines: 4257
Average number of words in each line: 11.50434578341555

The sentences 0 to 10:
Moe_Szyslak: (INTO PHONE) Moe's Tavern. Where the elite meet to drink.
Bart_Simpson: Eh, yeah, hello, is Mike there? Last name, Rotch.
Moe_Szyslak: (INTO PHONE) Hold on, I'll check. (TO BARFLIES) Mike Rotch. Mike Rotch. Hey, has anybody seen Mike Rotch, lately?
Moe_Szyslak: (INTO PHONE) Listen you little puke. One of these days I'm gonna catch you, and I'm gonna carve my name on your back with an ice pick.
Moe_Szyslak: What's the matter Homer? You're not your normal effervescent self.
Homer_Simpson: I got my problems, Moe. Give me another one.
Moe_Szyslak: Homer, hey, you should not drink to forget your problems.
Barney_Gumble: Yeah, you should only drink to enhance your social skills.




In [3]:
# Ad-hoc inspection: show the container type returned by text.split('\n\n').
type(scenes)

list

In [4]:
# Preprocessing functions
# Lookup tables
import numpy as np
import problem_unittests as tests
from collections import Counter

def create_lookup_tables(text):
    """Create the word <-> id lookup tables for a tokenized text.

    Ids are assigned deterministically: most frequent word first, ties broken
    alphabetically. Iterating a plain ``set`` of strings (the previous
    approach) yields a different order on every interpreter run, so the ids
    could not be reproduced after a kernel restart.

    :param text: Iterable of word tokens (e.g. the script split into words)
    :return: Tuple ``(vocab_to_int, int_to_vocab)`` of mutually inverse dicts
    """
    word_counts = Counter(text)
    # Sort key: descending frequency, then the word itself for a stable order.
    ordered_vocab = sorted(word_counts, key=lambda word: (-word_counts[word], word))
    vocab_to_int = {word: idx for idx, word in enumerate(ordered_vocab)}
    int_to_vocab = dict(enumerate(ordered_vocab))
    return (vocab_to_int, int_to_vocab)

Instructions for updating:
Use the retry module or similar alternatives.


In [5]:
# Punctuation symbols and the token strings that stand in for them
def token_lookup():
    """Build the mapping from punctuation symbols to their token strings.

    :return: Dict whose keys are the symbols and whose values are the
        ``||name||`` tokens, in the order listed below
    """
    symbol_token_pairs = (
        ('.', '||period||'),
        (',', '||comma||'),
        ('"', '||quotationmark||'),
        (';', '||semicolon||'),
        ('?', '||questionmark||'),
        ('!', '||exclamationmark||'),
        ('(', '||leftparentheses||'),
        (')', '||rightparentheses||'),
        ('--', '||dash||'),
        ('\n', '||return||'),
    )
    return dict(symbol_token_pairs)



In [6]:
# Preprocess all the data and save it, using the token table and lookup-table
# builder defined above.
helper.preprocess_and_save_data(data_dir, token_lookup, create_lookup_tables)

In [7]:
# Checkpoint: re-import the dependencies and reload the preprocessed data so
# the notebook can be resumed from this cell.
import helper
import numpy as np
import problem_unittests as tests

int_text, vocab_to_int, int_to_vocab, token_dict = helper.load_preprocess()

In [8]:
# Build the neural network
from distutils.version import LooseVersion
import warnings
import tensorflow as tf

# Check the TensorFlow version
assert LooseVersion(tf.__version__) >= LooseVersion('1.0'), 'Please use TensorFlow version 1.0 or newer'
print('TensorFlow Version: {}'.format(tf.__version__))

# Check for a GPU (only warns when none is found; execution continues)
if not tf.test.gpu_device_name():
    warnings.warn('No GPU found. Please use a GPU to train your neural network.')
else:
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))

TensorFlow Version: 1.7.0


  if sys.path[0] == '':


In [9]:
def get_inputs():
    """Create the placeholders for the input ids, target ids and learning rate.

    The input placeholder is named 'input' so it can later be fetched from a
    restored graph as "input:0".

    :return: Tuple (input, targets, learning rate) of placeholders
    """
    # Both id placeholders are rank-2 int32 with unspecified dimensions.
    word_ids = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input')
    target_ids = tf.placeholder(dtype=tf.int32, shape=[None, None], name='Targets')
    rate = tf.placeholder(dtype=tf.float32, name='LearningRate')
    return word_ids, target_ids, rate



In [10]:
def get_init_cell(batch_size, rnn_size, n_layers=2):
    """Create a stacked LSTM cell and its named zero state.

    :param batch_size: Batch size used to build the zero state
    :param rnn_size: Size passed to each BasicLSTMCell
    :param n_layers: Number of LSTM cells stacked in the MultiRNNCell
    :return: Tuple (cell, initial_state); the state is wrapped in an identity
        op named 'initial_state'
    """
    lstm_layers = []
    for _ in range(n_layers):
        lstm_layers.append(tf.contrib.rnn.BasicLSTMCell(rnn_size))
    stacked_cell = tf.contrib.rnn.MultiRNNCell(lstm_layers)

    zero_state = stacked_cell.zero_state(batch_size, tf.float32)
    return (stacked_cell, tf.identity(zero_state, name='initial_state'))


In [11]:
# Word embedding
def get_embed(input_data, vocab_size, embed_dim):
    """Look up embeddings for the given word ids.

    :param input_data: Tensor of word ids
    :param vocab_size: Number of rows in the embedding matrix
    :param embed_dim: Number of columns in the embedding matrix
    :return: Result of the embedding lookup for input_data
    """
    # Embedding matrix is a variable initialised uniformly in [-1, 1).
    init_values = tf.random_uniform((vocab_size, embed_dim), minval=-1, maxval=1)
    embedding_matrix = tf.Variable(init_values)
    return tf.nn.embedding_lookup(embedding_matrix, input_data)



In [12]:
def build_rnn(cell, inputs):
    """Run the cell over the inputs with tf.nn.dynamic_rnn.

    :param cell: RNN cell to unroll
    :param inputs: Input tensor fed to the RNN
    :return: Tuple (outputs, final_state); the state is wrapped in an identity
        op named "final_state"
    """
    rnn_outputs, last_state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
    named_state = tf.identity(last_state, name="final_state")
    return (rnn_outputs, named_state)

In [13]:
def build_nn(cell, rnn_size, input_data, vocab_size, embed_dim):
    """Assemble embedding -> RNN -> linear projection onto the vocabulary.

    :param cell: RNN cell passed to build_rnn
    :param rnn_size: Not used in the body; kept in the signature for callers
    :param input_data: Tensor of word ids
    :param vocab_size: Vocabulary size (embedding rows and number of logits)
    :param embed_dim: Embedding dimension
    :return: Tuple (logits, final_state)
    """
    embedded = get_embed(input_data, vocab_size, embed_dim)
    rnn_outputs, final_state = build_rnn(cell, embedded)
    # activation_fn=None keeps the layer linear so it emits raw logits.
    logits = tf.contrib.layers.fully_connected(
        inputs=rnn_outputs,
        num_outputs=vocab_size,
        activation_fn=None)
    return logits, final_state


In [14]:
def get_batches(int_text, batch_size, seq_length):
    """Split the word ids into batches of input and target sequences.

    The usable text is cut into ``batch_size`` contiguous rows; batch ``i``
    holds the ``i``-th ``seq_length``-long slice of every row, so row ``j`` of
    consecutive batches continues the same stretch of text. Targets are the
    inputs shifted one word ahead.

    When the text has no spare word after the last full batch (its length is
    an exact multiple of ``batch_size * seq_length``), the final target wraps
    around to the first input word so that every target row keeps
    ``seq_length`` elements.

    :param int_text: Sequence of word ids
    :param batch_size: Number of sequences per batch
    :param seq_length: Number of words per sequence
    :return: Array of shape (num_batches, 2, batch_size, seq_length) where
        index 0 of the second axis is the inputs and index 1 the targets;
        an empty array with that layout when the text is shorter than a batch
    """
    words_per_batch = batch_size * seq_length
    num_batches = len(int_text) // words_per_batch
    if num_batches == 0:
        return np.empty((0, 2, batch_size, seq_length), dtype=np.int64)

    usable = num_batches * words_per_batch
    inputs = np.array(int_text[:usable])
    targets = np.array(int_text[1:usable + 1])
    if len(targets) < usable:
        # No word left after the last input: wrap to the start of the text.
        targets = np.append(targets, inputs[0])

    def _to_batches(flat):
        # (batch_size, num_batches, seq_length) -> (num_batches, batch_size, seq_length)
        return flat.reshape(batch_size, num_batches, seq_length).transpose(1, 0, 2)

    return np.stack([_to_batches(inputs), _to_batches(targets)], axis=1)


In [15]:
# Training hyperparameters
# Number of passes over the full set of batches
num_epochs = 100
# Sequences per batch
batch_size = 128
# Size passed to each LSTM cell
rnn_size = 256
# Embedding dimension
embed_dim = 200
# Words per training sequence
seq_length = 30
# Learning rate fed to the optimizer
learning_rate = 0.01
# Equal to the number of batches per epoch (first axis of the get_batches
# result), so the training loop prints once per epoch.
show_every_n_batches = get_batches(int_text, batch_size, seq_length).shape[0]

# Path prefix the trained model is saved under
save_dir = './save'

In [27]:

from tensorflow.contrib import seq2seq

# Build the training graph in its own tf.Graph.
train_graph = tf.Graph()
with train_graph.as_default():
    vocab_size = len(int_to_vocab)
    input_text, targets, lr = get_inputs()
    # Dynamic shape of the fed input; its first dimension is used as the batch
    # size so the zero state follows whatever batch is fed.
    input_data_shape = tf.shape(input_text)
    cell, initial_state = get_init_cell(input_data_shape[0], rnn_size)
    logits, final_state = build_nn(cell, rnn_size, input_text, vocab_size, embed_dim)

    # Probabilities for generating words; named so the op can be fetched as
    # "probs:0" from a restored graph.
    probs = tf.nn.softmax(logits, name='probs')

    # Loss function; the weights are all ones over the first two input dimensions.
    cost = seq2seq.sequence_loss(
        logits,
        targets,
        tf.ones([input_data_shape[0], input_data_shape[1]]))

    # Optimizer
    optimizer = tf.train.AdamOptimizer(lr)

    # Gradient clipping: every gradient is clipped element-wise to [-1, 1];
    # variables without a gradient are skipped.
    gradients = optimizer.compute_gradients(cost)
    capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
    train_op = optimizer.apply_gradients(capped_gradients)

In [28]:

# Train the network on the preprocessed data and save the result.
batches = get_batches(int_text, batch_size, seq_length)

with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())

    for epoch_i in range(num_epochs):
        # Re-evaluate the initial state at the start of every epoch; the first
        # batch's inputs are fed because the state's batch size is derived
        # from the shape of input_text.
        state = sess.run(initial_state, {input_text: batches[0][0]})

        for batch_i, (x, y) in enumerate(batches):
            # The final state of the previous batch is fed as the initial
            # state of this one.
            feed = {
                input_text: x,
                targets: y,
                initial_state: state,
                lr: learning_rate}
            train_loss, state, _ = sess.run([cost, final_state, train_op], feed)

            # Show progress every show_every_n_batches global steps
            if (epoch_i * len(batches) + batch_i) % show_every_n_batches == 0:
                print('Epoch {:>3} Batch {:>4}/{}   train_loss = {:.3f}'.format(
                    epoch_i,
                    batch_i,
                    len(batches),
                    train_loss))

    # Save Model
    saver = tf.train.Saver()
    saver.save(sess, save_dir)
    print('Model Trained and Saved')

Epoch   0 Batch    0/17   train_loss = 8.822
Epoch   1 Batch    0/17   train_loss = 6.602
Epoch   2 Batch    0/17   train_loss = 6.182
Epoch   3 Batch    0/17   train_loss = 6.010
Epoch   4 Batch    0/17   train_loss = 5.895
Epoch   5 Batch    0/17   train_loss = 5.750
Epoch   6 Batch    0/17   train_loss = 5.660
Epoch   7 Batch    0/17   train_loss = 5.464
Epoch   8 Batch    0/17   train_loss = 5.290
Epoch   9 Batch    0/17   train_loss = 5.137
Epoch  10 Batch    0/17   train_loss = 5.080
Epoch  11 Batch    0/17   train_loss = 4.908
Epoch  12 Batch    0/17   train_loss = 4.804
Epoch  13 Batch    0/17   train_loss = 4.672
Epoch  14 Batch    0/17   train_loss = 4.497
Epoch  15 Batch    0/17   train_loss = 4.345
Epoch  16 Batch    0/17   train_loss = 4.194
Epoch  17 Batch    0/17   train_loss = 4.081
Epoch  18 Batch    0/17   train_loss = 3.914
Epoch  19 Batch    0/17   train_loss = 3.818
Epoch  20 Batch    0/17   train_loss = 3.699
Epoch  21 Batch    0/17   train_loss = 3.560
Epoch  22 

In [22]:
# Persist the sequence length and save path so the generation cells can
# reload them with helper.load_params().
helper.save_params((seq_length, save_dir))

In [23]:
# Checkpoint: re-import the dependencies and reload the lookup tables and
# saved parameters so generation can start from this cell.
import tensorflow as tf
import numpy as np
import helper
import problem_unittests as tests

# The first returned item (the encoded text) is not needed for generation.
_, vocab_to_int, int_to_vocab, token_dict = helper.load_preprocess()
seq_length, load_dir = helper.load_params()

In [10]:
def get_tensors(loaded_graph):
    """Fetch the input, initial state, final state and probs tensors by name.

    :param loaded_graph: Graph object exposing get_tensor_by_name
    :return: Tuple (input, initial_state, final_state, probs) of the tensors
        named "input:0", "initial_state:0", "final_state:0" and "probs:0"
    """
    tensor_names = ("input:0", "initial_state:0", "final_state:0", "probs:0")
    return tuple(loaded_graph.get_tensor_by_name(name) for name in tensor_names)


In [9]:
def pick_word(probabilities, int_to_vocab, top_n=3):
    """Pick the next word uniformly at random among the most probable ones.

    The caller's array is left untouched: the previous implementation zeroed
    the selected entries of ``probabilities`` in place.

    :param probabilities: Probabilities of the next word, indexed by word id
        (flattened before use, matching np.argmax's default behaviour)
    :param int_to_vocab: Dict mapping word ids to words
    :param top_n: Number of most probable candidates to choose from; clamped
        to the range [1, number of probabilities]
    :return: The chosen word
    :raises ValueError: If ``probabilities`` is empty
    """
    # np.array copies, so nothing below can modify the caller's data.
    probs = np.array(probabilities, dtype=float).ravel()
    if probs.size == 0:
        raise ValueError('probabilities must not be empty')

    n_candidates = max(1, min(top_n, probs.size))
    # Stable sort on the negated values: descending probability, ties resolved
    # by the lowest id (the order repeated argmax calls would produce).
    top_ids = np.argsort(-probs, kind='mergesort')[:n_candidates]
    chosen_id = top_ids[np.random.randint(0, n_candidates)]
    return int_to_vocab[int(chosen_id)]


In [26]:
# Generate the script
# Number of words to generate after the prime word
gen_length = 200
# Speaker the script starts with; 'prime_word:' must be a vocabulary entry
prime_word = 'moe_szyslak'

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load the saved model
    loader = tf.train.import_meta_graph(load_dir + '.meta')
    loader.restore(sess, load_dir)

    # Get the tensors from the loaded model
    input_text, initial_state, final_state, probs = get_tensors(loaded_graph)

    # Sentence-generation setup; the dummy 1x1 input only fixes the batch
    # size of the initial state.
    gen_sentences = [prime_word + ':']
    prev_state = sess.run(initial_state, {input_text: np.array([[1]])})

    # Generate sentences
    for n in range(gen_length):
        # Dynamic input: a single batch row holding the last seq_length words
        dyn_input = [[vocab_to_int[word] for word in gen_sentences[-seq_length:]]]
        dyn_seq_length = len(dyn_input[0])

        # Get the prediction
        probabilities, prev_state = sess.run(
            [probs, final_state],
            {input_text: dyn_input, initial_state: prev_state})

        # probs is (batch, sequence, vocab) and the batch holds one row, so
        # select row 0 first and then the last time step. Indexing the batch
        # axis with the time step is out of bounds once the sequence is
        # longer than one word.
        pred_word = pick_word(probabilities[0][dyn_seq_length - 1], int_to_vocab)

        gen_sentences.append(pred_word)

    # Remove the tokens: put the original punctuation back in
    tv_script = ' '.join(gen_sentences)
    for key, token in token_dict.items():
        tv_script = tv_script.replace(' ' + token.lower(), key)
    tv_script = tv_script.replace('\n ', '\n')
    tv_script = tv_script.replace('( ', '(')

    print(tv_script)

moe_szyslak: homer, is them peanuts is made in the night there that you've need any money, it was." the bartender for him. you know you're too guys ain't so a" bow! a bachelorette sister and i sound of the pool radishes like how to give this treasure. let this makin' got close to go down and i don't the twenty any of all. i can't use you to bluff my way through. okay. uh.. so a little dank! i can't. to you a date with homer!(getting chuckle) then you take a drink, homer? a hundred bucks. if i knew we pulled a man in sight could a snake-handler

i had a duff, huh. that's so moe. i could hit that crawlin' here to least we like? i'm actually state could to be right here in business. i mean, maybe he's tired. can you all have to beat to go with the hammer-- let me feel anyway? even you need! take it. but you've got a guy
