Code for generating sentences, starting with a language model. This differs in that it works on words -- with embeddings -- rather than on individual characters.


In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

class WordLanguageModelVectorizer(BaseEstimator, TransformerMixin):
    '''
    Base language model uses a CharacterEncoder to create character ordinals
    and then applies a transformation in order to create vectors.
    '''
    def __init__(self, context_length=64):
        '''
        Parameters
        ----------
        context_length : int
            This number of words will be used as a context to predict future words.
        '''
        self.context_length = context_length
        self.sequencer = CountVectorizer()
    
    def fit(self, strings):
        '''
        Fit the word vocabulary to target strings.
        
        Parameters
        ----------
        strings : iterable
            An iterable of source strings.
        '''
        # forgive passing a single string
        if type(strings) is str:
            strings = [strings]
        self.sequencer.fit(strings)
        self.sequencer.inverse_vocabulary_ = {sequence: word for word, sequence in self.sequencer.vocabulary_.items()}
        self.unique_words = len(self.sequencer.inverse_vocabulary_)
        return self
    
    def transform(self, strings):
        '''
        Transform strings into a (X, Y) pairing.
        
        Parameters
        ----------
        strings : iterable
            An iterable of source strings.
       
       Returns
        -------
        (np.ndarray, np.ndarray)
            A tuple (X, Y) three dimensional [sample_index, character_index] context X with a word sequence number
            to be embedded, and a two dimensional [sample_index, one_hot] target Y.
        '''
        # forgive passing a single string
        if type(strings) is str:
            strings = [strings]
        # start off by turning all the text into a series of integers
        word_sequence_numbers = []
        for string in strings:
            as_words = self.sequencer.build_analyzer()(string)
            word_sequence_numbers += list(map(self.sequencer.vocabulary_.get, as_words))
        # pad to the minimum context length
        if len(word_sequence_numbers) <= self.context_length:
            word_sequence_numbers = [0] * (1 + self.context_length - len(word_sequence_numbers)) + word_sequence_numbers
            
        # make this number of overlappinq sequences
        # ex with context 2: The quick brown fox likes chickens
        # The quick -> brown
        # quick brown -> fox
        number_of_contexts = len(word_sequence_numbers) - self.context_length
        # sequence numbers for context words
        x = np.zeros((number_of_contexts, self.context_length), dtype=np.int32)
        # one hot encodings for target words
        y = np.zeros((number_of_contexts, self.unique_words), dtype=np.bool)
        for i in range(number_of_contexts):
            context = np.array(word_sequence_numbers[i:i+self.context_length])
            x[i] = context
            target = word_sequence_numbers[i+self.context_length]
            y[i, target] = True
        return x, y
    
    def inverse_transform(self, X):
        '''
        Given a matrix of one hot encodings, reverse the transformation and return a matrix of characters.
        '''
        ordinals = X.argmax(-1)
        decoder = np.vectorize(self.sequencer.inverse_vocabulary_.get)
        # allow for single words or lists of words
        decoded = np.array([decoder(ordinals)])
        return ' '.join(decoded.flatten())

In [2]:
vectorizer = WordLanguageModelVectorizer()
with open('the_adventures_of_tom_sawyer.txt', encoding='utf8') as books:
    X, Y = vectorizer.fit_transform(books.readlines())

Let's take a look at the encoding as vectors.

In [3]:
X[0], Y[0], Y[0].argmax(-1)

(array([6770, 5133, 3068, 2111, 4580, 6770,  233, 4580, 6910, 5712, 1334,
         971, 4095, 7078, 5684, 1219, 6801, 2111, 3609, 2656, 6770, 7221,
        4580,  369,  374,  477, 4472, 1487,  341, 7598,  299, 4472, 5498,
        7487, 7729, 4126, 1470, 3616, 2884, 3616,  533, 4623, 5303, 7221,
        3616, 7124, 6770, 6746, 4580, 6770, 5133, 3068, 3890, 3465, 7598,
        6801, 2111, 4623, 4605,  477, 7675, 3068, 4443, 6885], dtype=int32),
 array([False, False, False, ..., False, False, False], dtype=bool),
 6770)

And -- inverse transformation, turning one-hots back into words!

In [4]:
vectorizer.inverse_transform(Y[0:100])

'the adventures of tom sawyer complete author mark twain samuel clemens release date august 20 2006 ebook 74 last updated june 2017 language english character set encoding utf start of this project gutenberg ebook tom sawyer produced by david widger the adventures of tom sawyer by mark twain samuel langhorne clemens contents chapter tom aunt polly decides upon her duty tom practices music the challenge private entrance chapter ii strong temptations strategic movements the innocents beguiled chapter iii tom as general triumph and reward dismal felicity commission and omission chapter iv mental acrobatics attending sunday school the superintendent showing off'

Now we can build up a recurrent neural network to learn a language model.

In [5]:
from keras.models import Sequential
from keras.layers import LSTM, CuDNNLSTM, Dropout, Dense, Reshape, BatchNormalization, Embedding

class EmbeddedRecurrentLanguageModel(BaseEstimator):
    '''
    Create a language model with a neural network and normalized character encoding.
    '''

    def __init__(self, vectorizer, hidden_layers=256, gpu_optimized=False):
        '''
        Parameters
        ----------
        vectorizer : transformer
            Object to transform input strings into numerical encodings.
        hidden_layers : int
            Size of the model's hidden layer, controls complexity.
        gpu_optimized : bool
            If True, use special code in keras to boost performance.
        '''
        self.hidden_layers = hidden_layers
        self.gpu_optimized = gpu_optimized
        self.vectorizer = vectorizer

    def fit(self, strings, epochs=256, batch_size=256):
        '''
        Create and fit a model to the passed in strings.
        
        Parameters
        ----------
        strings : iterable
            An iterable source of string text.
        '''
        if self.gpu_optimized:
            RNN = CuDNNLSTM
        else:
            RNN = LSTM
        X, Y = self.vectorizer.fit_transform(strings)
        self.X = X
        self.Y = Y
        self.model = model = Sequential()
        # begin by embedding character positions
        model.add(Embedding(self.vectorizer.unique_words, self.hidden_layers, input_shape=(X.shape[1],)))
        # and then work on the embeddings recurrently
        model.add(RNN(self.hidden_layers, return_sequences=True))
        model.add(BatchNormalization())
        model.add(RNN(self.hidden_layers))
        model.add(BatchNormalization())
        model.add(Dense(self.hidden_layers, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dense(self.hidden_layers, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dense(Y.shape[1], activation='softmax'))
        model.compile(loss='categorical_crossentropy', optimizer='adam')
        model.fit(X, Y, epochs=epochs, batch_size=batch_size)

Using TensorFlow backend.
  return f(*args, **kwds)


Run this model with cities. I'm using a GPU - and very much recommend you do so! You can set the optimization to False if you need to use a CPU.

City names aren't sentences, so we need to use a relatively short context length.

In [6]:
vectorizer = WordLanguageModelVectorizer()
model = EmbeddedRecurrentLanguageModel(vectorizer, gpu_optimized=True)
with open('the_adventures_of_tom_sawyer.txt', encoding='utf8') as books:
    model.fit(books.readlines())

Epoch 1/256
Epoch 2/256
Epoch 3/256
Epoch 4/256
Epoch 5/256
Epoch 6/256
Epoch 7/256
Epoch 8/256
Epoch 9/256
Epoch 10/256
Epoch 11/256
Epoch 12/256
Epoch 13/256
Epoch 14/256
Epoch 15/256
Epoch 16/256
Epoch 17/256
Epoch 18/256
Epoch 19/256
Epoch 20/256
Epoch 21/256
Epoch 22/256
Epoch 23/256
Epoch 24/256
Epoch 25/256
Epoch 26/256
Epoch 27/256
Epoch 28/256
Epoch 29/256
Epoch 30/256
Epoch 31/256
Epoch 32/256
Epoch 33/256
Epoch 34/256
Epoch 35/256
Epoch 36/256
Epoch 37/256
Epoch 38/256
Epoch 39/256
Epoch 40/256
Epoch 41/256
Epoch 42/256
Epoch 43/256
Epoch 44/256
Epoch 45/256
Epoch 46/256
Epoch 47/256
Epoch 48/256
Epoch 49/256
Epoch 50/256
Epoch 51/256
Epoch 52/256
Epoch 53/256
Epoch 54/256
Epoch 55/256
Epoch 56/256
Epoch 57/256
Epoch 58/256
Epoch 59/256
Epoch 60/256
Epoch 61/256
Epoch 62/256
Epoch 63/256
Epoch 64/256
Epoch 65/256
Epoch 66/256
Epoch 67/256
Epoch 68/256
Epoch 69/256
Epoch 70/256
Epoch 71/256
Epoch 72/256
Epoch 73/256
Epoch 74/256
Epoch 75/256
Epoch 76/256
Epoch 77/256
Epoch 78

Epoch 93/256
Epoch 94/256
Epoch 95/256
Epoch 96/256
Epoch 97/256
Epoch 98/256
Epoch 99/256
Epoch 100/256
Epoch 101/256
Epoch 102/256
Epoch 103/256
Epoch 104/256
Epoch 105/256
Epoch 106/256
Epoch 107/256
Epoch 108/256
Epoch 109/256
Epoch 110/256
Epoch 111/256
Epoch 112/256
Epoch 113/256
Epoch 114/256
Epoch 115/256
Epoch 116/256
Epoch 117/256
Epoch 118/256
Epoch 119/256
Epoch 120/256
Epoch 121/256
Epoch 122/256
Epoch 123/256
Epoch 124/256
Epoch 125/256
Epoch 126/256
Epoch 127/256
Epoch 128/256
Epoch 129/256
Epoch 130/256
Epoch 131/256
Epoch 132/256
Epoch 133/256
Epoch 134/256
Epoch 135/256
Epoch 136/256
Epoch 137/256
Epoch 138/256
Epoch 139/256
Epoch 140/256
Epoch 141/256
Epoch 142/256
Epoch 143/256
Epoch 144/256
Epoch 145/256
Epoch 146/256
Epoch 147/256
Epoch 148/256
Epoch 149/256
Epoch 150/256
Epoch 151/256
Epoch 152/256
Epoch 153/256
Epoch 154/256
Epoch 155/256
Epoch 156/256
Epoch 157/256
Epoch 158/256
Epoch 159/256
Epoch 160/256
Epoch 161/256
Epoch 162/256
Epoch 163/256
Epoch 164/256

Epoch 182/256
Epoch 183/256
Epoch 184/256
Epoch 185/256
Epoch 186/256
Epoch 187/256
Epoch 188/256
Epoch 189/256
Epoch 190/256
Epoch 191/256
Epoch 192/256
Epoch 193/256
Epoch 194/256
Epoch 195/256
Epoch 196/256
Epoch 197/256
Epoch 198/256
Epoch 199/256
Epoch 200/256
Epoch 201/256
Epoch 202/256
Epoch 203/256
Epoch 204/256
Epoch 205/256
Epoch 206/256
Epoch 207/256
Epoch 208/256
Epoch 209/256
Epoch 210/256
Epoch 211/256
Epoch 212/256
Epoch 213/256
Epoch 214/256
Epoch 215/256
Epoch 216/256
Epoch 217/256
Epoch 218/256
Epoch 219/256
Epoch 220/256
Epoch 221/256
Epoch 222/256
Epoch 223/256
Epoch 224/256
Epoch 225/256
Epoch 226/256
Epoch 227/256
Epoch 228/256
Epoch 229/256
Epoch 230/256
Epoch 231/256
Epoch 232/256
Epoch 233/256
Epoch 234/256
Epoch 235/256
Epoch 236/256
Epoch 237/256
Epoch 238/256
Epoch 239/256
Epoch 240/256
Epoch 241/256
Epoch 242/256
Epoch 243/256
Epoch 244/256
Epoch 245/256
Epoch 246/256
Epoch 247/256
Epoch 248/256
Epoch 249/256
Epoch 250/256
Epoch 251/256
Epoch 252/256
Epoch 

In [7]:
model.vectorizer = WordLanguageModelVectorizer()
with open('the_adventures_of_tom_sawyer.txt', encoding='utf8') as books:
    X, Y = model.vectorizer.fit_transform(books.readlines())

Now for generation. Any words you like -- which will be padded as needed.

In [8]:
class SentenceLanguageModelGenerator():
    '''
    Given a language model, generate new name strings given a seed of your own design.
    '''
    
    def __init__(self, language_model):
        '''
        Parameters
        ----------
        language_model
            A trained language model used to generate predictions.
        '''
        self.language_model = language_model
        
    def generate(self, seed, max_length=64):
        '''
        Parameters
        ----------
        seed : str
            A string to bootstrap generation.
        max_length: int
            A guard value to prevent looping forever.
        '''
       
        # build up the result buffer here, adding on to our passed seed
        result = seed
        for i in range(0, max_length):
            # working on the right most context
            X, _ = self.language_model.vectorizer.transform([seed])
            context = np.array([X[-1]])
            # only need the very first sample, then keep iterating
            try:
                prediction = self.language_model.model.predict(context)[0]
                next_word = self.language_model.vectorizer.inverse_transform(prediction)
            except IndexError:
                # when we hit a null character, it is time to exit
                break
            # keep expanding the seed with each word
            seed += ' ' + next_word
        return seed

In a sense, this is a kind of a machine learning made up autocomplete, we'll start with a few words and see what it tacks on!

In [9]:
SentenceLanguageModelGenerator(model).generate('I would like to understand all of the ways')

'I would like to understand all of the ways country summer mighty right grave in in our front secret of for injun ever joe have harper been always away built he or another stop kite tom strings had stick presence to with work one of sort them of men men were were gone in and the late afternoon tavern after and breakfast they although presently the the book welshman almost and went they'