## Goal

Generate a baby name given the first two charaters

### Create data

In [5]:
import numpy as np
from keras.models import Model, Sequential
from keras.layers import Dense, Embedding, LSTM, BatchNormalization, Dropout, Bidirectional, GRU

In [6]:
## number of characters to use as input
input_length = 2
## append to all names at the end, so can learn when to stop generating characters
end_name_token = "</name>"

In [7]:
## read in data
# http://www.cs.cmu.edu/afs/cs/project/ai-repository/ai/areas/nlp/corpora/names/
data_set = "./small_data/baby_names_all.txt"
names = []
with open(data_set, "r") as f:
    for line in f:
        names.append(line.lower().strip())

In [8]:
names[:5]

['abagael', 'abagail', 'abbe', 'abbey', 'abbi']

In [9]:
print(len(names))

7944


In [10]:
## create lookup dictionaries so each unique character gets a unique number
unique_chars = list(set([char for name in names for char in name]))
unique_chars = unique_chars + [end_name_token]

In [11]:
unique_chars[:2]

['s', 'p']

In [12]:
print(len(unique_chars))

30


In [13]:
char2index = {c: i for i, c in enumerate(unique_chars)}
index2char = {i: c for i, c in enumerate(unique_chars)}

In [14]:
## convert names to their index values
names_indexes = []
for name in names:
    index_name = []
    for c in name:
        index_name.append(char2index[c])
    index_name.append(char2index[end_name_token])
    names_indexes.append(index_name)

In [15]:
names[0]

'abagael'

In [16]:
names_indexes[0]

[19, 15, 19, 7, 19, 26, 3, 29]

In [17]:
''.join([index2char[i] for i in names_indexes[0]])

'abagael</name>'

In [18]:
## create X and y data for training
X = []
y = []
for n in names_indexes:
    length_name = len(n)
    for i in range(0, length_name-input_length):
        X.append(n[i:i+input_length])
        y.append(n[i+input_length])
X = np.array(X)
y = np.expand_dims(np.array(y),1)

In [19]:
X.shape

(39980, 2)

In [20]:
y.shape

(39980, 1)

## Model

In [21]:
## we have very little data, so let's make a smalle model
vocab_size = len(unique_chars)
n_fac = 4
n_hidden = 50

In [23]:
model=Sequential([
        Embedding(vocab_size, n_fac, input_length=input_length),
        BatchNormalization(),
        LSTM(n_hidden, return_sequences=True),
        LSTM(n_hidden),
        Dense(vocab_size, activation='softmax'),
    ])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

In [24]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 2, 4)              120       
_________________________________________________________________
batch_normalization_1 (Batch (None, 2, 4)              16        
_________________________________________________________________
lstm_1 (LSTM)                (None, 2, 50)             11000     
_________________________________________________________________
lstm_2 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense_1 (Dense)              (None, 30)                1530      
Total params: 32,866
Trainable params: 32,858
Non-trainable params: 8
_________________________________________________________________


In [40]:
model.fit(X, y, batch_size=128, epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7f6b00c25358>

In [41]:
## some helper functions to generate names!

def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

def name_from_two_starting_letters(two_char_string, temperature=1.0):
    two_chars_indexes = []
    generated_name = two_char_string
    for c in two_char_string:
        two_chars_indexes.append(char2index[c])
    predict_input_array = np.array([two_chars_indexes])
    predicted_index = -1
    while (predicted_index != char2index[end_name_token]):
        predicted_index = sample(model.predict(predict_input_array)[0], temperature)
        if predicted_index != char2index[end_name_token]:
            generated_name = generated_name + index2char[predicted_index]
            predict_input_array = np.array([[predict_input_array[0][1]] + [predicted_index]])
    return generated_name

In [42]:
name_from_two_starting_letters('em', temperature=0.5)

'emer'

In [43]:
def generate_n_random_names(n, possible_starts=['ab', 'ma', 'as', 'ch', 'em', 'cl'],
                           possible_temperatures = [0.2, 0.5, 1.0, 1.2]):
    random_names = []
    for _ in range(n):
        temperatue = np.random.choice(possible_temperatures, 1)
        starting_chars = ''.join(np.random.choice(possible_starts,1))
        random_names.append(name_from_two_starting_letters(starting_chars, temperatue))
    return random_names

In [47]:
random_names = generate_n_random_names(10, ['az'], [0.5])
for name in random_names:
    print(name)

azel
azie
azola
az
azie
azakim
azo
azebe
az
azi
