## Goal

Generate a baby name given the first two charaters

### Create data

In [1]:
import numpy as np
from keras.models import Model, Sequential
from keras.layers import Dense, Embedding, LSTM, BatchNormalization, Dropout, Bidirectional, GRU

Using TensorFlow backend.


In [2]:
## number of characters to use as input
input_length = 2
## append to all names at the end, so can learn when to stop generating characters
end_name_token = "</name>"

In [3]:
## read in data
data_set = "./small_data/baby_names_all.txt"
names = []
with open(data_set, "r") as f:
    for line in f:
        names.append(line.lower().strip())

In [4]:
names[:5]

['abagael', 'abagail', 'abbe', 'abbey', 'abbi']

In [5]:
print(len(names))

7944


In [6]:
## create lookup dictionaries so each unique character gets a unique number
unique_chars = list(set([char for name in names for char in name]))
unique_chars = unique_chars + [end_name_token]

In [7]:
unique_chars[:2]

['z', 'h']

In [8]:
print(len(unique_chars))

30


In [9]:
char2index = {c: i for i, c in enumerate(unique_chars)}
index2char = {i: c for i, c in enumerate(unique_chars)}

In [10]:
## convert names to their index values
names_indexes = []
for name in names:
    index_name = []
    for c in name:
        index_name.append(char2index[c])
    index_name.append(char2index[end_name_token])
    names_indexes.append(index_name)

In [11]:
names[0]

'abagael'

In [12]:
names_indexes[0]

[5, 28, 5, 4, 5, 15, 20, 29]

In [13]:
''.join([index2char[i] for i in names_indexes[0]])

'abagael</name>'

In [14]:
## create X and y data for training
X = []
y = []
for n in names_indexes:
    length_name = len(n)
    for i in range(0, length_name-input_length):
        X.append(n[i:i+input_length])
        y.append(n[i+input_length])
X = np.array(X)
y = np.expand_dims(np.array(y),1)

In [15]:
X.shape

(39980, 2)

In [16]:
y.shape

(39980, 1)

## Model

In [56]:
## we have very little data, so let's make a smalle model
vocab_size = len(unique_chars)
n_fac = 4
n_hidden = 50

In [57]:
model=Sequential([
        Embedding(vocab_size, n_fac, input_length=input_length),
        BatchNormalization(),
        LSTM(n_hidden, return_sequences=True),
        LSTM(n_hidden),
        GRU(n_hidden, return_sequences=True)
        GRU(n_hidden, return_sequences=True)
        Dense(vocab_size, activation='softmax'),
    ])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

In [58]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 2, 4)              120       
_________________________________________________________________
batch_normalization_5 (Batch (None, 2, 4)              16        
_________________________________________________________________
lstm_4 (LSTM)                (None, 2, 50)             11000     
_________________________________________________________________
lstm_5 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense_5 (Dense)              (None, 30)                1530      
Total params: 32,866
Trainable params: 32,858
Non-trainable params: 8
_________________________________________________________________


In [59]:
model.fit(X, y, batch_size=32, epochs=1)

Epoch 1/1


<keras.callbacks.History at 0x12ec265c0>

In [20]:
## some helper functions to generate names!

def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

def name_from_two_starting_letters(two_char_string, temperature=1.0):
    two_chars_indexes = []
    generated_name = two_char_string
    for c in two_char_string:
        two_chars_indexes.append(char2index[c])
    predict_input_array = np.array([two_chars_indexes])
    predicted_index = -1
    while (predicted_index != char2index[end_name_token]):
        predicted_index = sample(model.predict(predict_input_array)[0], temperature)
        if predicted_index != char2index[end_name_token]:
            generated_name = generated_name + index2char[predicted_index]
            predict_input_array = np.array([[predict_input_array[0][1]] + [predicted_index]])
    return generated_name

In [21]:
name_from_two_starting_letters('em', temperature=0.5)

'emee'

In [60]:
def generate_n_random_names(n, possible_starts=['ab', 'ma', 'as', 'ch', 'em', 'cl'],
                           possible_temperatures = [0.2, 0.5, 1.0, 1.2]):
    random_names = []
    for _ in range(n):
        temperatue = np.random.choice(possible_temperatures, 1)
        starting_chars = ''.join(np.random.choice(possible_starts,1))
        random_names.append(name_from_two_starting_letters(starting_chars, temperatue))
    return random_names

In [62]:
random_names = generate_n_random_names(10, ['ba'], [0.5])
for name in random_names:
    print(name)

barly
bar
baren
bar
ban
barune
barlie
babrigine
barne
barin
