In [1]:
import pandas as pd 
import numpy as np 
import re
import sys

from nltk import word_tokenize
from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.models import model_from_json
from keras.layers import Input, Activation, Dense, Dropout
from keras.layers import LSTM, Bidirectional

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
quotables = pd.read_csv('author-quote.txt', delimiter='\t', header=None)
quotables = quotables.rename(columns={0:'author', 1:'quote'})

In [3]:
quotables.sample(20)

Unnamed: 0,author,quote
27408,Paul Gascoigne,I don't really do pranks any more. I have a la...
4887,Brion James,"You know, stand-up comedy is where I pretty mu..."
17754,Jim Capaldi,"Traffic was very, very free. It was great."
30507,Saint Teresa of Avila,"God has been very good to me, for I never dwel..."
34103,Tryon Edwards,Science has sometimes been said to be opposed ...
5005,Bruce Babbitt,"I look back on it, yeah, I'm in a much worse f..."
27880,Peter Gallagher,"I feel like I won the Lotto, as far as that sh..."
7237,Coco Chanel,"Since everything is in our heads, we had bette..."
33523,Thornton Wilder,"Those who are silent, self-effacing and attent..."
19070,John Locke,"As people are walking all the time, in the sam..."


In [4]:
len(list(quotables.author.unique()))

2297

In [5]:
quotables.loc[quotables.author == 'Anne Frank']

Unnamed: 0,author,quote
2346,Anne Frank,Whoever is happy will make others happy too.
2347,Anne Frank,"Despite everything, I believe that people are ..."
2348,Anne Frank,Think of all the beauty still left around you ...
2349,Anne Frank,"The best remedy for those who are afraid, lone..."
2350,Anne Frank,Everyone has inside of him a piece of good new...
2351,Anne Frank,Parents can only give good advice or put them ...
2352,Anne Frank,"Laziness may appear attractive, but work gives..."
2353,Anne Frank,I don't think of all the misery but of the bea...
2354,Anne Frank,How wonderful it is that nobody need wait a si...
2355,Anne Frank,In spite of everything I still believe that pe...


In [6]:
quotables['len_quotes'] = quotables.quote.map(lambda s: len(s))

In [7]:
quotes = list(quotables.quote + '\n')

In [8]:
removed_char = ['#', '$', '%', '(', ')', '=', ';' ,':',  '*', '+', '£' , '—','’']  
quotes_cleaned = []

for quote in quotes: 
    # remove unused character
    for s_char in removed_char:
        quote = quote.replace(s_char, ' ')
    
    # remove white space
    pattern = re.compile(r'\s{2,}')
    quote = re.sub(pattern, ' ', quote)

    quotes_cleaned.append(quote)

text = ' '.join(quotes_cleaned)
chars = sorted(list(set(text)))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

In [9]:
maxlen = 15
step = 6
sentences = []
next_chars = []

for quote in quotes_cleaned:
    for i in range(0, len(quote) - maxlen, step):
        sentences.append(quote[i: i + maxlen])
        next_chars.append(quote[i + maxlen])
    sentences.append(quote[-maxlen:])
    next_chars.append(quote[-1])
print('nb sequences:', len(sentences))

nb sequences: 753142


In [10]:
sentences

['If you live to ',
 ' live to be a h',
 'to be a hundred',
 'a hundred, I wa',
 'red, I want to ',
 ' want to live t',
 'to live to be a',
 'e to be a hundr',
 'e a hundred min',
 'ndred minus one',
 'minus one day s',
 'one day so I ne',
 'y so I never ha',
 ' never have to ',
 ' have to live w',
 'to live without',
 'e without you.\n',
 "Promise me you'",
 "e me you'll alw",
 "ou'll always re",
 'always remember',
 " remember You'r",
 "ber You're brav",
 "u're braver tha",
 'raver than you ',
 'than you believ',
 'ou believe, and',
 'ieve, and stron',
 'and stronger th',
 'ronger than you',
 ' than you seem,',
 'you seem, and s',
 'em, and smarter',
 'd smarter than ',
 'ter than you th',
 'han you think.\n',
 'Did you ever st',
 'u ever stop to ',
 ' stop to think,',
 'to think, and f',
 'nk, and forget ',
 'd forget to sta',
 'et to start aga',
 'o start again?\n',
 'Organizing is w',
 'zing is what yo',
 's what you do b',
 ' you do before ',
 'o before you do',
 're you do somet

In [11]:
print('Vectorization...')
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

Vectorization...


In [12]:
x.shape, y.shape, len(chars)

((753142, 15, 73), (753142, 73), 73)

In [18]:
## Model 
print('Build model...')
input_sequences = Input((maxlen, len(chars)) , name="input_sequences")
lstm = Bidirectional(LSTM(256, return_sequences= True, input_shape=(maxlen, len(chars))), name = 'bidirectional')(input_sequences)
lstm = Dropout(0.1, name = 'dropout_bidirectional_lstm')(lstm)
lstm = LSTM(64, input_shape=(maxlen, len(chars)), name = 'lstm')(lstm)
lstm = Dropout(0.1,  name = 'drop_out_lstm')(lstm)

dense = Dense(15 * len(chars), name = 'first_dense')(lstm)
dense = Dropout(0.1,  name = 'drop_out_first_dense')(dense)
dense = Dense(5 * len(chars), name = 'second_dense')(dense)
dense = Dropout(0.1,  name = 'drop_out_second_dense')(dense)
dense = Dense(len(chars), name = 'last_dense')(dense)

next_char = Activation('softmax', name = 'activation')(dense)

model = Model([input_sequences], next_char)
model.compile(optimizer='adam', loss='categorical_crossentropy')


Build model...


In [19]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_sequences (InputLayer) (None, 15, 73)            0         
_________________________________________________________________
bidirectional (Bidirectional (None, 15, 512)           675840    
_________________________________________________________________
dropout_bidirectional_lstm ( (None, 15, 512)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 64)                147712    
_________________________________________________________________
drop_out_lstm (Dropout)      (None, 64)                0         
_________________________________________________________________
first_dense (Dense)          (None, 1095)              71175     
_________________________________________________________________
drop_out_first_dense (Dropou (None, 1095)              0         
__________

In [20]:
model.fit([x], y,
         batch_size=128,
          epochs= 15
         )

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x15c3891d0>

In [61]:
model.fit([x], y,
         batch_size=2048,
          epochs= 2
         )

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x162988f60>

In [88]:
model.fit([x], y,
         batch_size=1024,
          epochs= 2
         )

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x15c389208>

In [21]:
two_first_words = [bigram for bigram in [' '.join(word_tokenize(quote)[:2]) for quote in quotes] if len(bigram) <= maxlen]

def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)


In [22]:
def generate_quote(sentence = None, diversity = 0.8):
    
    if not sentence: ## if input is null then sample two first word from dataset
        random_index = np.random.randint(0, len(two_first_words))
        sentence = two_first_words[random_index]
        
    if len(sentence) > maxlen:
        sentence = sentence[-maxlen:]
    elif len(sentence) < maxlen:
        sentence = ' '*(maxlen - len(sentence)) + sentence
        
    generated = ''
    generated += sentence
    sys.stdout.write(generated)
    
    next_char = 'Empty'
    total_word = 0 
    
    max_word = 15
    
    while ((next_char not in ['\n', '.']) & (total_word <= 500)):
    
        x_pred = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(sentence):
            x_pred[0, t, char_indices[char]] = 1.

        preds = model.predict(x_pred, verbose=0)[0]
        next_index = sample(preds, diversity)
        next_char = indices_char[next_index]

        if next_char == ' ':
           total_word += 1
        generated += next_char
        sentence = sentence[1:] + next_char

        sys.stdout.write(next_char)
        sys.stdout.flush()
    print()

In [63]:
generate_quote()

        I couldn't walk in the church.


In [64]:
generate_quote()

         You don't have to take on my family.


In [65]:
generate_quote()

      The legal songs on the same credit the long time, and you love a man without love, understanding.


In [67]:
generate_quote()

       Learn to know what the issue is my universe, how you do what they instead of beow.


In [68]:
generate_quote()

       My motto with the down and amazing state in the classic and face it because I was a DeficiI I would learn all the unity.


In [71]:
generate_quote()

       There is nothing without the past.


In [85]:
generate_quote()

     After much things that I had a good way to be able to place the material things in the form of our tatence will go to see their show in it.


In [89]:
generate_quote()

       Once you will love you so introversions who'll say there is important thing that are something good into something to be simpling my feet I could be a none is the real of a time to stand for video from an energy people.


In [90]:
generate_quote()

   A successful sadness of the game of our controversy for how hear, is wrong in the time, and I got a short men.


In [91]:
generate_quote()

   Happiness is so such a house why can hasten before, what you take every time with my encouraged by Phose.


In [124]:
generate_quote()

         If you are too lack friends and his things that are well in a books are not the volution of its heart that is a movie the banda used to always be a wanting to be long is confidentally good to one would be much more believe I do your poor.


In [125]:
# serialize model to JSON
model_json = model.to_json()
with open("model_char_2.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model_char_2.h5")
print("Saved model to disk")

Saved model to disk
