In [94]:
import pandas as pd 
import numpy as np 
import re
import sys

from nltk import word_tokenize
from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.models import model_from_json
from keras.layers import Input, Activation, Dense, Dropout
from keras.layers import LSTM, Bidirectional

In [116]:
quotables = pd.read_csv('author-quote.txt', delimiter='\t', header=None)
quotables = quotables.rename(columns={0:'author', 1:'quote'})

In [117]:
quotables

Unnamed: 0,author,quote
0,A. A. Milne,"If you live to be a hundred, I want to live to..."
1,A. A. Milne,Promise me you'll always remember: You're brav...
2,A. A. Milne,"Did you ever stop to think, and forget to star..."
3,A. A. Milne,Organizing is what you do before you do someth...
4,A. A. Milne,"Weeds are flowers too, once you get to know them."
5,A. A. Milne,You can't stay in your corner of the forest wa...
6,A. A. Milne,The third-rate mind is only happy when it is t...
7,A. A. Milne,Bores can be divided into two classes; those w...
8,A. A. Milne,"What I say is that, if a fellow really likes p..."
9,A. A. Milne,My spelling is Wobbly. It's good spelling but ...


In [162]:
list(quotables.author.unique()[-200:])

['T. D. Jakes',
 'T. Harv Eker',
 'T. S. Eliot',
 'Taissa Farmiga',
 'Tamara Ecclestone',
 'Tamara Feldman',
 'Tammy Baldwin',
 'Tammy Faye Bakker',
 'Tamsin Egerton',
 'Tarja Halonen',
 'Taylor Caldwell',
 'Taylor Dane',
 'Taylor Hackford',
 'Tecumseh',
 'Ted Baillieu',
 'Ted Danson',
 'Tennessee Williams',
 'Teri Garr',
 'Terrence J',
 'Terry Eagleton',
 'Terry Farrell',
 'Thayer David',
 'The Edge',
 'The Notorious B.I.G.',
 'Theaster Gates',
 'Theodor Adorno',
 'Theodor W. Adorno',
 'Theodore Dalrymple',
 'Theodore Roosevelt',
 'Thomas A. Edison',
 'Thomas Aquinas',
 'Thomas Campbell',
 'Thomas Campion',
 'Thomas Carlyle',
 'Thomas Chandler Haliburton',
 'Thomas Eakins',
 'Thomas Fuller',
 'Thomas Huxley',
 'Thomas Jefferson',
 'Thomas Merton',
 'Thomas P. Campbell',
 'Thomas Paine',
 'Thomas R. Insel',
 'Thomas Sowell',
 'Thornton Wilder',
 'Tim Cahill',
 'Tim Daly',
 'Tim Federle',
 'Tim Gane',
 'Tim Jackson',
 'Timothy Dalton',
 'Timothy F. Cahill',
 'Tisha Campbell-Martin',
 'T

In [4]:
len(list(quotables.author.unique()))

2297

In [5]:
quotables.loc[quotables.author == 'Anne Frank']

Unnamed: 0,author,quote
2346,Anne Frank,Whoever is happy will make others happy too.
2347,Anne Frank,"Despite everything, I believe that people are ..."
2348,Anne Frank,Think of all the beauty still left around you ...
2349,Anne Frank,"The best remedy for those who are afraid, lone..."
2350,Anne Frank,Everyone has inside of him a piece of good new...
2351,Anne Frank,Parents can only give good advice or put them ...
2352,Anne Frank,"Laziness may appear attractive, but work gives..."
2353,Anne Frank,I don't think of all the misery but of the bea...
2354,Anne Frank,How wonderful it is that nobody need wait a si...
2355,Anne Frank,In spite of everything I still believe that pe...


In [6]:
quotables['len_quotes'] = quotables.quote.map(lambda s: len(s))

In [7]:
quotes = list(quotables.quote + '\n')

In [8]:
removed_char = ['#', '$', '%', '(', ')', '=', ';' ,':',  '*', '+', '£' , '—','’']  
quotes_cleaned = []

for quote in quotes: 
    # remove unused character
    for s_char in removed_char:
        quote = quote.replace(s_char, ' ')
    
    # remove white space
    pattern = re.compile(r'\s{2,}')
    quote = re.sub(pattern, ' ', quote)

    quotes_cleaned.append(quote)

text = ' '.join(quotes_cleaned)
chars = sorted(list(set(text)))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

In [9]:
maxlen = 15
step = 6
sentences = []
next_chars = []

for quote in quotes_cleaned:
    for i in range(0, len(quote) - maxlen, step):
        sentences.append(quote[i: i + maxlen])
        next_chars.append(quote[i + maxlen])
    sentences.append(quote[-maxlen:])
    next_chars.append(quote[-1])
print('nb sequences:', len(sentences))

nb sequences: 753142


In [10]:
sentences

['If you live to ',
 ' live to be a h',
 'to be a hundred',
 'a hundred, I wa',
 'red, I want to ',
 ' want to live t',
 'to live to be a',
 'e to be a hundr',
 'e a hundred min',
 'ndred minus one',
 'minus one day s',
 'one day so I ne',
 'y so I never ha',
 ' never have to ',
 ' have to live w',
 'to live without',
 'e without you.\n',
 "Promise me you'",
 "e me you'll alw",
 "ou'll always re",
 'always remember',
 " remember You'r",
 "ber You're brav",
 "u're braver tha",
 'raver than you ',
 'than you believ',
 'ou believe, and',
 'ieve, and stron',
 'and stronger th',
 'ronger than you',
 ' than you seem,',
 'you seem, and s',
 'em, and smarter',
 'd smarter than ',
 'ter than you th',
 'han you think.\n',
 'Did you ever st',
 'u ever stop to ',
 ' stop to think,',
 'to think, and f',
 'nk, and forget ',
 'd forget to sta',
 'et to start aga',
 'o start again?\n',
 'Organizing is w',
 'zing is what yo',
 's what you do b',
 ' you do before ',
 'o before you do',
 're you do somet

In [11]:
print('Vectorization...')
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

Vectorization...


In [12]:
x.shape, y.shape, len(chars)

((753142, 15, 73), (753142, 73), 73)

In [13]:
## Model 
print('Build model...')
input_sequences = Input((maxlen, len(chars)) , name="input_sequences")
lstm = Bidirectional(LSTM(256, return_sequences= True, input_shape=(maxlen, len(chars))), name = 'bidirectional')(input_sequences)
lstm = Dropout(0.1, name = 'dropout_bidirectional_lstm')(lstm)
lstm = LSTM(64, input_shape=(maxlen, len(chars)), name = 'lstm')(lstm)
lstm = Dropout(0.1,  name = 'drop_out_lstm')(lstm)

dense = Dense(15 * len(chars), name = 'first_dense')(lstm)
dense = Dropout(0.1,  name = 'drop_out_first_dense')(dense)
dense = Dense(5 * len(chars), name = 'second_dense')(dense)
dense = Dropout(0.1,  name = 'drop_out_second_dense')(dense)
dense = Dense(len(chars), name = 'last_dense')(dense)

next_char = Activation('softmax', name = 'activation')(dense)

model = Model([input_sequences], next_char)
model.compile(optimizer='adam', loss='categorical_crossentropy',
              metrics=['accuracy'])


Build model...


In [14]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_sequences (InputLayer) (None, 15, 73)            0         
_________________________________________________________________
bidirectional (Bidirectional (None, 15, 512)           675840    
_________________________________________________________________
dropout_bidirectional_lstm ( (None, 15, 512)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 64)                147712    
_________________________________________________________________
drop_out_lstm (Dropout)      (None, 64)                0         
_________________________________________________________________
first_dense (Dense)          (None, 1095)              71175     
_________________________________________________________________
drop_out_first_dense (Dropou (None, 1095)              0         
__________

In [15]:
model.fit([x], y,
         batch_size=128,
          epochs= 3
         )

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x11cfc5f60>

In [16]:
model.fit([x], y,
         batch_size=128,
          epochs= 3
         )

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x113a27dd8>

In [20]:
model.fit([x], y,
         batch_size=256,
          epochs= 4
         )

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x1592c2c18>

In [24]:
model.fit([x], y,
         batch_size=1024,
          epochs= 2
         )

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x11fc5c4a8>

In [104]:
two_first_words = [bigram for bigram in [' '.join(word_tokenize(quote)[:2]) for quote in quotes] if len(bigram) <= maxlen]

def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)


In [106]:
len(two_first_words)

35166

In [109]:
np.random.randint(0, len(two_first_words))

775

In [145]:
def generate_quote(sentence = None, diversity = 0.8):
    
    if not sentence: ## if input is null then sample two first word from dataset
        random_index = np.random.randint(0, len(two_first_words))
        sentence = two_first_words[random_index]
        
    if len(sentence) > maxlen:
        sentence = sentence[-maxlen:]
    elif len(sentence) < maxlen:
        sentence = ' '*(maxlen - len(sentence)) + sentence
        
    generated = ''
    generated += sentence
    sys.stdout.write(generated)
    
    next_char = 'Empty'
    total_word = 0 
    
    max_word = 15
    
    while ((next_char not in ['\n', '.']) & (total_word <= 500)):
    
        x_pred = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(sentence):
            x_pred[0, t, char_indices[char]] = 1.

        preds = model.predict(x_pred, verbose=0)[0]
        next_index = sample(preds, diversity)
        next_char = indices_char[next_index]

        if next_char == ' ':
           total_word += 1
        generated += next_char
        sentence = sentence[1:] + next_char

        sys.stdout.write(next_char)
        sys.stdout.flush()
    print()

In [126]:
generate_quote()

    Stop acting you don't exist the other whole refrection's rules.


In [151]:
generate_quote()

           I 'm like I.


In [153]:
generate_quote()

    Politics is the best through one of the band.


In [154]:
generate_quote()

      I believe you.


In [159]:
generate_quote()

         I find a lot of power.


In [160]:
generate_quote()

    Laughter is the cases of successful than that writing - certain ancess way to be on place.


In [180]:
generate_quote()

         If you moving with the world's problems, which gets that he can say and the way that is not happy.


In [181]:
# serialize model to JSON
model_json = model.to_json()
with open("model_char.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model_char.h5")
print("Saved model to disk")

Saved model to disk
