In [3]:
import json
import logging
import os
import sys

import numpy
import numpy as np
import pandas as pd
import scrapy
from scrapy.crawler import CrawlerProcess
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import keras.utils as ku
from keras.callbacks import ModelCheckpoint
from keras.layers import Embedding, LSTM, Dense, Dropout, Dense, LSTM
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils

# Text Generator

The aim of this notebook is to build a text generator with an LSTM network. To do this, we scrape famous quotes from http://quotes.toscrape.com/ and train an LSTM to generate quote-like text.

# Scraping

In [4]:
class JsonWriterPipeline(object):
    """Item pipeline that dumps every scraped item as one JSON line.

    The output file ``quoteresult.jl`` is opened in write mode (so it is
    truncated) when the spider opens, and closed when the spider closes.
    """

    def open_spider(self, spider):
        """Open the JSON-lines output file; ``spider`` is not used."""
        self.file = open('quoteresult.jl', 'w')

    def close_spider(self, spider):
        """Close the output file opened in ``open_spider``."""
        self.file.close()

    def process_item(self, item, spider):
        """Append ``item`` to the file as a single JSON line.

        Returns the item unchanged so that it keeps flowing through any
        later pipeline stage or feed export.
        """
        serialized = json.dumps(dict(item))
        self.file.write(serialized + "\n")
        return item

In [5]:
class QuotesSpider(scrapy.Spider):
    """Spider that scrapes pages 1 to 10 of quotes.toscrape.com.

    Every ``div.quote`` element on a page becomes one item with the keys
    ``text``, ``author`` and ``tags``. Items are sent both to
    ``JsonWriterPipeline`` and to a JSON feed export at ``quoteresult.json``.
    """
    name = "quotes"
    # Pages 1 through 10 of the site, generated instead of listed by hand.
    start_urls = [
        'http://quotes.toscrape.com/page/{}/'.format(page) for page in range(1, 11)
    ]
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 1},
        # FEEDS supersedes the FEED_FORMAT / FEED_URI pair, which Scrapy has
        # deprecated (they trigger a ScrapyDeprecationWarning on Scrapy >= 2.1).
        'FEEDS': {'quoteresult.json': {'format': 'json'}},
    }

    def parse(self, response):
        """Yield one dict per quote block found in ``response``.

        ``text`` and ``author`` are the first matching text node (``None``
        when nothing matches); ``tags`` is the list of all tag labels.
        """
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').get(),
                'author': quote.css('span small::text').get(),
                'tags': quote.css('div.tags a.tag::text').getall(),
            }

In [6]:
# Scrapy's local-file feed storage appends to an existing file by default, so
# running the crawl a second time would leave quoteresult.json holding two
# concatenated JSON arrays that pd.read_json cannot parse. Start from a clean
# file on every run.
if os.path.exists('quoteresult.json'):
    os.remove('quoteresult.json')

process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

process.crawl(QuotesSpider)
# Blocks until the crawl has finished.
process.start()

2020-10-05 20:42:42 [scrapy.utils.log] INFO: Scrapy 2.3.0 started (bot: scrapybot)
2020-10-05 20:42:42 [scrapy.utils.log] INFO: Versions: lxml 4.5.2.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.6.10 |Anaconda, Inc.| (default, Jan  7 2020, 15:01:53) - [GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)], pyOpenSSL 19.1.0 (OpenSSL 1.1.1h  22 Sep 2020), cryptography 3.1.1, Platform Darwin-19.3.0-x86_64-i386-64bit
2020-10-05 20:42:42 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2020-10-05 20:42:42 [scrapy.crawler] INFO: Overridden settings:
{'LOG_LEVEL': 30,
 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
  exporter = cls(crawler)



# Load data

**Load scraped data**

In [7]:
# Load the JSON feed written by the QuotesSpider crawl into a DataFrame.
dfjson = pd.read_json('quoteresult.json')

**Clean**

In [8]:
# Drop the typographic opening and closing quotation marks from each quote.
dfjson['text'] = (
    dfjson['text']
    .str.replace('“', '')
    .str.replace('”', '')
)

**Transform the data frame to a list**

In [9]:
# Plain Python list of the cleaned quote strings, used as the training corpus.
all_text = dfjson['text'].values.tolist()
# Number of quotes in the corpus.
len(all_text)

100

# LSTM

## Transform corpus into sequences

**Tokenization**

* Remove punctuation
* Convert the texts to lowercase and split them on spaces into sequences of words
* The sequences are split into lists of tokens
* The lists of tokens are indexed (vectorized)

In [58]:
# Word-level tokenizer shared by the training and generation code: it
# lowercases the text, removes the characters listed in `filters` and splits
# on single spaces. No vocabulary cap (num_words) and no OOV token are set.
t = Tokenizer(
    num_words=None,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=' ',
    char_level=False,
    oov_token=None,
    document_count=0,
)

def get_sequence_of_tokens(corpus):
    """Turn a corpus into n-gram prefix sequences of word indices.

    Fits the module-level tokenizer ``t`` on ``corpus`` (mutating it), then
    encodes every text and emits each prefix of length 2, 3, ..., len(text).
    Texts that encode to fewer than two tokens contribute nothing.

    Args:
        corpus: Iterable of strings; it is traversed twice.

    Returns:
        A tuple ``(input_sequences, total_words)`` where ``input_sequences``
        is a list of lists of word indices and ``total_words`` is the size of
        the fitted vocabulary plus one.
    """
    t.fit_on_texts(corpus)
    vocabulary_size = 1 + len(t.word_index)

    ngram_prefixes = []
    for sentence in corpus:
        encoded = t.texts_to_sequences([sentence])[0]
        # Prefixes ending at positions 2..len(encoded): each one provides
        # "context tokens + next token" for training.
        ngram_prefixes.extend(encoded[:end] for end in range(2, len(encoded) + 1))

    return ngram_prefixes, vocabulary_size
# Fit the tokenizer on all quotes and build the n-gram training sequences.
input_sequences, total_words = get_sequence_of_tokens(all_text)
# Peek at the first ten sequences.
input_sequences[:10]

[[2, 69],
 [2, 69, 26],
 [2, 69, 26, 41],
 [2, 69, 26, 41, 21],
 [2, 69, 26, 41, 21, 274],
 [2, 69, 26, 41, 21, 274, 8],
 [2, 69, 26, 41, 21, 274, 8, 4],
 [2, 69, 26, 41, 21, 274, 8, 4, 5],
 [2, 69, 26, 41, 21, 274, 8, 4, 5, 275],
 [2, 69, 26, 41, 21, 274, 8, 4, 5, 275, 7]]

We pad the sequences at the front so that every sequence has the same length as the longest one.

In [15]:
def generate_padded_sequences(input_sequences):
    """Pad the n-gram sequences and split them into inputs and one-hot labels.

    All sequences are pre-padded to the length of the longest one. The last
    column of the padded matrix is the word to predict; the columns before it
    are the model input. Labels are one-hot encoded over ``total_words``
    classes, a module-level name that must be defined before calling.

    Args:
        input_sequences: Non-empty list of lists of word indices.

    Returns:
        ``(predictors, label, max_sequence_len)``: the 2-D input array, the
        one-hot label array and the padded sequence length (label included).
    """
    longest = max(map(len, input_sequences))
    padded = np.array(pad_sequences(input_sequences, maxlen=longest, padding='pre'))

    context_tokens = padded[:, :-1]
    next_token = padded[:, -1]
    one_hot_targets = ku.to_categorical(next_token, num_classes=total_words)

    return context_tokens, one_hot_targets, longest

# Build the training inputs and one-hot labels; max_sequence_len is reused for the model and for generation.
predictors, label, max_sequence_len = generate_padded_sequences(input_sequences)

In [16]:
# (number of n-gram samples, max_sequence_len - 1): the label column has been split off.
predictors.shape

(2259, 197)

## Modelling

Create an LSTM model:
* An input Embedding layer that maps each word index to a 10-dimensional vector.
* A single hidden LSTM layer with 100 memory units.
* A Dropout layer with a rate of 0.1 (10%).
* An output layer: a Dense layer with the softmax activation function, which outputs a probability between 0 and 1 for each word of the vocabulary.
* Predicting the next word is a single-label classification problem with n classes (n = vocabulary size), so we minimise the log loss (categorical cross-entropy) and use the Adam optimization algorithm for speed.

In [18]:
def create_model(max_sequence_len, total_words, embedding_dim=10, lstm_units=100, dropout_rate=0.1):
    """Build and compile the next-word prediction network.

    Architecture: Embedding -> LSTM -> Dropout -> Dense(softmax).

    Args:
        max_sequence_len: Length of the padded sequences including the label
            token; the network input is therefore one token shorter.
        total_words: Vocabulary size, used as both the embedding input
            dimension and the number of output classes.
        embedding_dim: Size of each word embedding vector.
        lstm_units: Number of units in the LSTM layer.
        dropout_rate: Fraction of LSTM outputs dropped during training.

    Returns:
        The compiled ``Sequential`` model (categorical cross-entropy loss,
        Adam optimizer).
    """
    model = Sequential()

    # Input embedding: the label column is not part of the input, hence the -1.
    model.add(Embedding(total_words, embedding_dim, input_length=max_sequence_len - 1))

    # Hidden recurrent layer followed by dropout for regularisation.
    model.add(LSTM(lstm_units))
    model.add(Dropout(dropout_rate))

    # One probability per vocabulary entry.
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')

    return model

## Training

In [19]:
%%time
model = create_model(max_sequence_len, total_words)
model.summary()

model.fit(predictors, label, epochs=100, verbose=5)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 197, 10)           6840      
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               44400     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 684)               69084     
Total params: 120,324
Trainable params: 120,324
Non-trainable params: 0
_________________________________________________________________


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "



Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.callbacks.History at 0x13acc7828>

## Generation

**Create a function that takes a seed text as input and predicts the next words**

In [20]:
def generate_text(seed_text, next_words, model, max_seq_len):
    """Extend ``seed_text`` by ``next_words`` words predicted by ``model``.

    At each step the text generated so far is encoded with the module-level
    tokenizer ``t``, pre-padded to ``max_seq_len - 1`` tokens, and the most
    probable next word is appended.

    Args:
        seed_text: Starting text. Trailing whitespace is dropped so the
            result does not contain a double space after the seed.
        next_words: Number of words to generate.
        model: Trained model whose ``predict`` returns one probability per
            vocabulary index for each input row.
        max_seq_len: Padded sequence length used in training (label included).

    Returns:
        The seed followed by the generated words, in title case.
    """
    # Invert the vocabulary once instead of scanning word_index per generated word.
    index_to_word = {index: word for word, index in t.word_index.items()}

    seed_text = seed_text.rstrip()

    for _ in range(next_words):
        token_list = t.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_seq_len-1, padding='pre')

        # argmax over the class probabilities replaces Sequential.predict_classes,
        # which newer Keras releases no longer provide. The result is a plain int.
        probabilities = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])

        # An index with no vocabulary entry yields an empty word, as before.
        output_word = index_to_word.get(predicted, '')

        seed_text = seed_text + " " + output_word

    return seed_text.title()

**Generate some sequences**

In [21]:
# Seed with a three-word phrase and let the model append 20 words.
print(generate_text("life is short", 20, model, max_sequence_len))

Life Is Short Not Friend Is A Book With You Have A House Of Value Be Know How Strong It I Have Not


In [24]:
# Two-word seed, 20 generated words.
print(generate_text("The World ", 20, model, max_sequence_len))

The World  That Having An Open Mind Of Course Is A Garage Can Make You Be Could Call Together But The World


In [25]:
# Single-word seed, 20 generated words.
print(generate_text("Happiness ", 20, model, max_sequence_len))

Happiness  Believe Lies You Have Not Make The Cup Of Success Rather Only Not More For The Beholder The More That


In [26]:
# Single-word seed with a shorter continuation of 15 words.
print(generate_text("Success ", 15, model, max_sequence_len))

Success  Not Thing Whom You A Sleepy Conscience This Is The Ideal Life Is The World


The sequences generated by the LSTM contain words that are often used in quotes. However, they do not read like realistic quotes because they are not grammatical.

A few thoughts on improvements: more training data, more training epochs, more layers, more memory units per layer, and predicting fewer words for a given seed.
