Code for generating names, starting with the language model.

In [1]:
from sklearn.base import BaseEstimator, TransformerMixin
import html
import numpy as np

class CharacterEncoder(BaseEstimator, TransformerMixin):
    '''
    Transform strings into (context, target) character sequence numbers, using the
    ordinal value of each character.

    Every string is lowercased, stripped of surrounding whitespace and terminated
    with a null character. A window of context_length characters then slides over
    it; each window is one sample and the character right after it is the target.
    '''
    def __init__(self, context_length=16, maximum_ordinal=2**16):
        '''
        Parameters
        ----------
        context_length : int
            This number of characters will be used as a context to predict future characters.
        maximum_ordinal : int
            Limit total memory use in case you run into very high unicode characters.
            Ordinals above this value are encoded as maximum_ordinal itself.
        '''
        self.context_length = context_length
        self.maximum_ordinal = maximum_ordinal

    def fit(self, strings, y=None, **kwargs):
        '''
        No need to fit.

        Parameters
        ----------
        strings : iterable
            Ignored.
        y : object
            Ignored, accepted so the encoder can be called as fit(X, y), for
            example from fit_transform or a pipeline.

        Returns
        -------
        CharacterEncoder
            This encoder, unchanged.
        '''
        return self

    def transform(self, strings):
        '''
        Transform an iterable source of strings into a dense matrix
        of character identifiers.

        Each sample will be a string snippet of context_length characters.
        Strings shorter than context_length (after stripping) yield no samples.

        Parameters
        ----------
        strings : iterable or str
            An iterable of source strings. A single string is treated as a
            one element iterable.

        Returns
        -------
        (np.ndarray, np.ndarray)
            A tuple (X, Y) 2 dimensional [sample_index, character], with a 32 bit character identifier, and
            a one dimensional [sample_index] with a 32 bit character identifier to predict.
        '''
        # forgive passing a single string, including str subclasses
        if isinstance(strings, str):
            strings = [strings]
        width = self.context_length
        ceiling = self.maximum_ordinal
        # buffer up contexts and targets, we'll be predicting target characters
        # from context windows
        contexts = []
        targets = []
        for string in strings:
            # lowercase and stripped of surrounding whitespace, makes the model more
            # compact, then null character termination for each string
            string = string.lower().strip() + chr(0)
            # numerical encoding happens once per character rather than once per window
            ordinals = [min(ord(character), ceiling) for character in string]
            for start in range(len(ordinals) - width):
                contexts.append(ordinals[start:start + width])
                targets.append(ordinals[start + width])
        # the explicit reshape keeps X two dimensional even when there are no samples
        X = np.array(contexts, dtype=np.int32).reshape(len(contexts), width)
        Y = np.array(targets, dtype=np.int32)
        return X, Y


In [2]:
class CharacterLanguageModelVectorizer(BaseEstimator, TransformerMixin):
    '''
    Base language model uses a CharacterEncoder to create character ordinals
    and then applies a transformation in order to create one hot vectors.
    '''
    def __init__(self, context_length=16, maximum_ordinal=2**16):
        '''
        Parameters
        ----------
        context_length : int
            This number of characters will be used as a context to predict future characters.
        maximum_ordinal : int
            Limit total memory use in case you run into very high unicode characters.
            This is also the width of the one hot axis.
        '''
        # constructor arguments are stored under their own names so that
        # get_params / set_params / clone inherited from BaseEstimator work
        self.context_length = context_length
        self.maximum_ordinal = maximum_ordinal

    @property
    def sequencer(self):
        '''
        The CharacterEncoder producing character ordinals, built from the current
        parameters so it never goes stale after set_params.
        '''
        return CharacterEncoder(self.context_length, self.maximum_ordinal)

    def fit(self, strings, y=None):
        '''
        Nothing to fit.

        Parameters
        ----------
        strings : iterable
            Ignored.
        y : object
            Ignored, accepted so the vectorizer can be called as fit(X, y).

        Returns
        -------
        CharacterLanguageModelVectorizer
            This vectorizer, unchanged.
        '''
        return self

    def transform(self, strings):
        '''
        Transform strings into a dense (X, Y) pairing.

        Parameters
        ----------
        strings : iterable
            An iterable of source strings.

        Returns
        -------
        (np.ndarray, np.ndarray)
            A tuple (X, Y) three dimensional [sample_index, character_index, one_hot] context X and
            a two dimensional [sample_index, one_hot] target Y, both boolean.
        '''
        sequencer = self.sequencer
        depth = sequencer.maximum_ordinal
        # character sequence numbers
        X, Y = sequencer.transform(strings)
        # the encoder saturates at maximum_ordinal itself, which is one past the
        # last one hot slot, so fold any overflow into the final slot
        X = np.minimum(X, depth - 1)
        Y = np.minimum(Y, depth - 1)
        # one hot encoding; the builtin bool is used because the np.bool alias
        # no longer exists in current NumPy releases
        x = np.zeros((X.shape[0], sequencer.context_length, depth), dtype=bool)
        y = np.zeros((Y.shape[0], depth), dtype=bool)
        samples = np.arange(X.shape[0])
        # broadcast (sample, position) against the ordinals to set every hot bit at once
        x[samples[:, None], np.arange(X.shape[1]), X] = True
        y[samples, Y] = True
        return x, y

    def inverse_transform(self, X):
        '''
        Given one hot encodings (or per character scores), reverse the
        transformation and return the decoded characters.

        Parameters
        ----------
        X : np.ndarray
            Array whose last axis is the one hot axis; the highest entry along
            that axis selects the character.

        Returns
        -------
        str
            The decoded characters joined together. The null terminator
            (ordinal 0) decodes to nothing, so a terminating prediction
            contributes no character. Empty input gives an empty string.
        '''
        ordinals = np.asarray(X).argmax(-1)
        return ''.join(chr(ordinal) for ordinal in ordinals.ravel().tolist() if ordinal != 0)

Now we can build up a recurrent neural network to learn a language model.

In [3]:
from keras.models import Sequential
from keras.layers import LSTM, CuDNNLSTM, Dropout, Dense, Reshape, BatchNormalization

class RecurrentLanguageModel(BaseEstimator):
    '''
    Create a language model with a neural network and normalized character encoding.

    The network is two stacked recurrent layers followed by two dense relu layers,
    each followed by batch normalization, and a softmax over the target encoding.
    '''

    def __init__(self, vectorizer, hidden_layers=256, gpu_optimized=False):
        '''
        Parameters
        ----------
        vectorizer : transformer
            Object to transform input strings into numerical encodings.
        hidden_layers : int
            Size of the model's hidden layer, controls complexity.
        gpu_optimized : bool
            If True, use special code in keras to boost performance.
        '''
        self.hidden_layers = hidden_layers
        self.gpu_optimized = gpu_optimized
        self.vectorizer = vectorizer

    def fit(self, strings, epochs=256, batch_size=256):
        '''
        Create and fit a model to the passed in strings.

        Parameters
        ----------
        strings : iterable
            An iterable source of string text.
        epochs : int
            Number of passes over the training samples.
        batch_size : int
            Number of samples per gradient update.

        Returns
        -------
        RecurrentLanguageModel
            This model, so calls can be chained. The encoded training data is
            kept on self.X / self.Y and the network on self.model.

        Raises
        ------
        ValueError
            If the vectorizer produces no training samples.
        '''
        # CuDNNLSTM is the GPU only recurrent layer, LSTM runs anywhere
        RNN = CuDNNLSTM if self.gpu_optimized else LSTM
        X, Y = self.vectorizer.transform(strings)
        if X.shape[0] == 0:
            # fail with a clear message instead of an opaque error from keras
            raise ValueError('no training samples were produced from the passed in strings')
        self.X = X
        self.Y = Y
        self.model = model = Sequential()
        # input shape is represented as the shape of a single batch entry
        model.add(RNN(self.hidden_layers, return_sequences=True, input_shape=(X.shape[1], X.shape[2])))
        model.add(BatchNormalization())
        # second recurrent layer collapses the sequence into a single vector
        model.add(RNN(self.hidden_layers))
        model.add(BatchNormalization())
        model.add(Dense(self.hidden_layers, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dense(self.hidden_layers, activation='relu'))
        model.add(BatchNormalization())
        # one output per slot of the target encoding
        model.add(Dense(Y.shape[1], activation='softmax'))
        model.compile(loss='categorical_crossentropy', optimizer='adam')
        model.fit(X, Y, epochs=epochs, batch_size=batch_size)
        return self

Using TensorFlow backend.
  return f(*args, **kwds)


Train this model on city names. I'm using a GPU, and I very much recommend you do so too! If you need to run on a CPU, pass `gpu_optimized=False` when constructing the model.

City names aren't sentences, so we need to use a relatively short context length.

In [4]:
# three character contexts, one hot encoded over 128 slots
vectorizer = CharacterLanguageModelVectorizer(maximum_ordinal=128, context_length=3)
model = RecurrentLanguageModel(vectorizer=vectorizer, gpu_optimized=True, hidden_layers=512)
# one city name per line; the file is closed again before the long training run starts
with open('cities.txt', encoding='utf8') as cities:
    city_names = cities.readlines()
model.fit(city_names)

Epoch 1/256
Epoch 2/256
Epoch 3/256
Epoch 4/256
Epoch 5/256
Epoch 6/256
Epoch 7/256
Epoch 8/256
Epoch 9/256
Epoch 10/256
Epoch 11/256
Epoch 12/256
Epoch 13/256
Epoch 14/256
Epoch 15/256
Epoch 16/256
Epoch 17/256
Epoch 18/256
Epoch 19/256
Epoch 20/256
Epoch 21/256
Epoch 22/256
Epoch 23/256
Epoch 24/256
Epoch 25/256
Epoch 26/256
Epoch 27/256
Epoch 28/256
Epoch 29/256
Epoch 30/256
Epoch 31/256
Epoch 32/256
Epoch 33/256
Epoch 34/256
Epoch 35/256
Epoch 36/256
Epoch 37/256
Epoch 38/256
Epoch 39/256
Epoch 40/256
Epoch 41/256
Epoch 42/256
Epoch 43/256
Epoch 44/256
Epoch 45/256
Epoch 46/256
Epoch 47/256
Epoch 48/256
Epoch 49/256
Epoch 50/256
Epoch 51/256
Epoch 52/256
Epoch 53/256
Epoch 54/256
Epoch 55/256
Epoch 56/256
Epoch 57/256
Epoch 58/256
Epoch 59/256
Epoch 60/256
Epoch 61/256
Epoch 62/256
Epoch 63/256
Epoch 64/256
Epoch 65/256
Epoch 66/256
Epoch 67/256
Epoch 68/256
Epoch 69/256
Epoch 70/256
Epoch 71/256
Epoch 72/256
Epoch 73/256
Epoch 74/256
Epoch 75/256
Epoch 76/256
Epoch 77/256
Epoch 78

Epoch 92/256
Epoch 93/256
Epoch 94/256
Epoch 95/256
Epoch 96/256
Epoch 97/256
Epoch 98/256
Epoch 99/256
Epoch 100/256
Epoch 101/256
Epoch 102/256
Epoch 103/256
Epoch 104/256
Epoch 105/256
Epoch 106/256
Epoch 107/256
Epoch 108/256
Epoch 109/256
Epoch 110/256
Epoch 111/256
Epoch 112/256
Epoch 113/256
Epoch 114/256
Epoch 115/256
Epoch 116/256
Epoch 117/256
Epoch 118/256
Epoch 119/256
Epoch 120/256
Epoch 121/256
Epoch 122/256
Epoch 123/256
Epoch 124/256
Epoch 125/256
Epoch 126/256
Epoch 127/256
Epoch 128/256
Epoch 129/256
Epoch 130/256
Epoch 131/256
Epoch 132/256
Epoch 133/256
Epoch 134/256
Epoch 135/256
Epoch 136/256
Epoch 137/256
Epoch 138/256
Epoch 139/256
Epoch 140/256
Epoch 141/256
Epoch 142/256
Epoch 143/256
Epoch 144/256
Epoch 145/256
Epoch 146/256
Epoch 147/256
Epoch 148/256
Epoch 149/256
Epoch 150/256
Epoch 151/256
Epoch 152/256
Epoch 153/256
Epoch 154/256
Epoch 155/256
Epoch 156/256
Epoch 157/256
Epoch 158/256
Epoch 159/256
Epoch 160/256
Epoch 161/256
Epoch 162/256
Epoch 163/256


Epoch 181/256
Epoch 182/256
Epoch 183/256
Epoch 184/256
Epoch 185/256
Epoch 186/256
Epoch 187/256
Epoch 188/256
Epoch 189/256
Epoch 190/256
Epoch 191/256
Epoch 192/256
Epoch 193/256
Epoch 194/256
Epoch 195/256
Epoch 196/256
Epoch 197/256
Epoch 198/256
Epoch 199/256
Epoch 200/256
Epoch 201/256
Epoch 202/256
Epoch 203/256
Epoch 204/256
Epoch 205/256
Epoch 206/256
Epoch 207/256
Epoch 208/256
Epoch 209/256
Epoch 210/256
Epoch 211/256
Epoch 212/256
Epoch 213/256
Epoch 214/256
Epoch 215/256
Epoch 216/256
Epoch 217/256
Epoch 218/256
Epoch 219/256
Epoch 220/256
Epoch 221/256
Epoch 222/256
Epoch 223/256
Epoch 224/256
Epoch 225/256
Epoch 226/256
Epoch 227/256
Epoch 228/256
Epoch 229/256
Epoch 230/256
Epoch 231/256
Epoch 232/256
Epoch 233/256
Epoch 234/256
Epoch 235/256
Epoch 236/256
Epoch 237/256
Epoch 238/256
Epoch 239/256
Epoch 240/256
Epoch 241/256
Epoch 242/256
Epoch 243/256
Epoch 244/256
Epoch 245/256
Epoch 246/256
Epoch 247/256
Epoch 248/256
Epoch 249/256
Epoch 250/256
Epoch 251/256
Epoch 

Model converges relatively quickly -- more data or more training could be helpful.

And now for the interesting part -- generation. This uses a seed string that you supply to kick things off, and then pulls out one character at a time until we hit a terminator.

In [8]:
class NameLanguageModelGenerator():
    '''
    Given a language model, generate new name strings given a seed of your own design.
    '''

    def __init__(self, language_model):
        '''
        Parameters
        ----------
        language_model
            A trained language model used to generate predictions. It must expose
            a vectorizer (with transform, inverse_transform and
            sequencer.context_length) and a model with predict.
        '''
        self.language_model = language_model

    def generate(self, seed, max_length=32):
        '''
        Extend the seed one predicted character at a time.

        Parameters
        ----------
        seed : str
            A string to bootstrap generation, at least context_length characters long.
        max_length: int
            A guard value to prevent looping forever, at most this many
            characters are generated.

        Returns
        -------
        str
            The seed followed by the generated characters, capitalized.
        '''
        vectorizer = self.language_model.vectorizer
        network = self.language_model.model
        context_length = vectorizer.sequencer.context_length
        assert len(seed) >= context_length, 'seed must be at least context_length characters long'
        terminator = chr(0)
        # only the trailing context_length characters influence the next prediction,
        # so keep a sliding window rather than re-encoding the whole string each step
        window = seed[len(seed) - context_length:]
        generated = []
        for _ in range(max_length):
            X, _targets = vectorizer.transform([window])
            # the vectorizer may trim the window (for example surrounding whitespace),
            # leaving nothing to predict from
            if len(X) == 0:
                break
            decoded = vectorizer.inverse_transform(network.predict(X))
            # the last sample is the one whose context ends at the newest character
            next_character = decoded[-1:]
            # a terminator may come back as a null character or as nothing at all,
            # depending on how the vectorizer decodes ordinal zero
            if not next_character or next_character == terminator:
                break
            generated.append(next_character)
            # slide the window forward by the character just produced
            extended = window + next_character
            window = extended[len(extended) - context_length:]
        return (seed + ''.join(generated)).capitalize()

In a sense, this is a kind of machine-learned, made-up autocomplete: we'll start with a few characters and see what it tacks on!

In [9]:
# wrap the trained model, then extend a three character seed
generator = NameLanguageModelGenerator(language_model=model)
generator.generate(seed='Sam')

'Samont'