In [1]:
import pandas
import numpy
import random
import sys

from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM, GRU
from keras.optimizers import RMSprop, Adam, SGD

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Move the kernel's working directory up one level so the relative 'data/' and
# 'output/' paths used below resolve. NOTE(review): this is not idempotent —
# re-running the cell moves up another level each time.
cd ..

/Users/mmccurdy/code/starwars-namer


In [3]:
from swnamer.process import chunk_names, chunk_names_file, create_indices_file, create_indices, SampleNames, SampleNamesFile, create_training_vectors

In [4]:
# Load the male name list as a one-column frame whose column is called 'name'.
# header=5 marks row 5 as the header row, so the lines above it are not loaded
# as data.
# NOTE(review): the header row itself is consumed and replaced by `names` —
# confirm that row 5 of the file is not a real name.
male = pandas.read_csv(
    'data/male.txt',
    names=['name'],
    header=5,
)

In [5]:
# Load the female name list as a one-column frame whose column is called 'name'.
# header=5 marks row 5 as the header row, so the lines above it are not loaded
# as data.
# NOTE(review): the header row itself is consumed and replaced by `names` —
# confirm that row 5 of the file is not a real name.
female = pandas.read_csv(
    'data/female.txt',
    names=['name'],
    header=5,
)

In [6]:
# Sanity check: (rows, columns) of each of the two frames.
male.shape, female.shape

((2942, 1), (5000, 1))

In [7]:
# Confirm the column labels of the male frame.
male.columns

Index(['name'], dtype='object')

In [8]:
# Eyeball 20 randomly drawn male names. No random_state is passed, so the rows
# shown will typically differ from run to run.
male.sample(20)

Unnamed: 0,name
2486,Tailor
1942,Pascal
1646,Martino
2149,Robb
1137,Henderson
1246,Ira
1767,Morris
1055,Guthrey
2669,Ulick
412,Casey


In [9]:
# Eyeball 20 randomly drawn female names. No random_state is passed, so the rows
# shown will typically differ from run to run.
female.sample(20)

Unnamed: 0,name
409,Audrye
4575,Tabbitha
3447,Mead
1835,Francoise
4200,Roseann
900,Charlene
4165,Rona
4467,Sinead
1903,Gennifer
4830,Verine


In [10]:
# Stack the male and female frames into one. ignore_index is not passed, so
# every row keeps the index label it had in its source frame (labels can repeat).
combined = pandas.concat([male, female])

In [11]:
# Shuffle all rows: sampling frac=1 returns every row in random order, and the
# fixed random_state makes that order reproducible.
combined = combined.sample(frac=1, random_state=38974)

In [12]:
# Preview only the first rows of the shuffled frame rather than rendering the
# whole thing; a short preview is enough to confirm the shuffle.
combined.head(10)

Unnamed: 0,name
2896,Laurianne
721,Elbert
199,Austin
301,Anni
645,Dionis
2475,Tab
2211,Rubin
4037,Randene
728,Eliot
773,Carmelina


In [13]:
# Lower-case every name, writing the result back into the 'name' column.
combined.loc[:, 'name'] = combined['name'].str.lower()

In [14]:
# Preview only the first rows to confirm the names are now lower-case, rather
# than rendering the whole frame.
combined.head(10)

Unnamed: 0,name
2896,laurianne
721,elbert
199,austin
301,anni
645,dionis
2475,tab
2211,rubin
4037,randene
728,eliot
773,carmelina


In [16]:
# Sequence length used below: the number of '^' characters prepended to each
# name, the `timesteps` argument to the chunking helper, and the first
# dimension of the LSTM input_shape.
timesteps = 3

In [17]:
# Left-pad every name with `timesteps` copies of the '^' marker.
# NOTE(review): this overwrites the column from its own value, so re-running the
# cell prepends another run of '^' each time.
combined['name'] = '^' * timesteps + combined['name']

In [18]:
# Build the two lookup tables from the 'name' column with the project helper.
# Presumably token -> integer id and the inverse mapping — TODO confirm against
# swnamer.process.create_indices.
token_to_index, index_to_token = create_indices(combined, 'name')

In [19]:
# Cut the names into training examples with the project helper.
# Presumably `chunks` holds windows of `timesteps` characters and `next_char`
# the character following each window — TODO confirm against chunk_names.
chunks, next_char = chunk_names(combined, 'name', timesteps)

In [21]:
# Vocabulary size = number of entries in token_to_index; it is used below as the
# feature dimension of the LSTM input and as the width of the Dense layer.
vocab_size = len(token_to_index)
# Display the value as the cell output.
vocab_size

31

In [None]:
# Turn the chunks / next characters into model inputs X and targets y using the
# project helper. Presumably a one-hot encoding with X shaped
# (samples, timesteps, vocab_size) — TODO confirm against create_training_vectors.
X, y = create_training_vectors(chunks, next_char, token_to_index, timesteps, vocab_size)

In [None]:
# Display the shape of the input array.
X.shape

In [None]:
# Display the shape of the target array.
y.shape

In [None]:
# Two stacked LSTM layers followed by a softmax over the vocabulary.
#
# Stacking rule: every LSTM that feeds another LSTM must emit the full sequence
# (return_sequences=True) because the next LSTM needs 3-D input, while the last
# LSTM must emit only its final state so that Dense + softmax produce one
# distribution per sample. The flags were previously the other way round, which
# fails when the second layer is added. input_shape is only needed on the first
# layer.
# Assumes y holds one target vector of width vocab_size per sample — the
# single-layer model later in this notebook trains on the same helper's output
# with return_sequences=False; TODO confirm.
model = Sequential()
model.add(LSTM(128, input_shape=(timesteps, vocab_size), return_sequences=True))
model.add(LSTM(128))
model.add(Dense(vocab_size))
model.add(Activation('softmax'))
# Adam with element-wise gradient clipping at +/-5.
optimizer = Adam(lr=.01, clipvalue=5)
model.compile(optimizer, 'categorical_crossentropy')

In [None]:
# Train for 200 epochs in mini-batches of 128. The project's SampleNames
# callback is attached; presumably it prints generated names while training —
# TODO confirm against swnamer.process.SampleNames.
model.fit(
    X,
    y,
    epochs=200,
    batch_size=128,
    callbacks=[
        SampleNames(chunks, timesteps, vocab_size, token_to_index, index_to_token),
    ],
)

In [None]:
# Export the shuffled, lower-cased names, one per line, for the file-based
# pipeline that follows.
# By this point `combined.name` has been left-padded with '^' markers. The
# vocabulary recorded for the file-based run contains no '^', so strip that
# padding on the way out instead of writing it into the file. A copy is
# exported; `combined` itself is left untouched.
combined.assign(name=combined['name'].str.lstrip('^')).to_csv(
    'output/standard_names.csv', index=False, header=False
)

In [4]:
# Read the exported name list back as one string; the context manager closes
# the file handle as soon as the read completes.
with open('output/standard_names.csv', mode='r') as names_file:
    text = names_file.read()

In [5]:
# Build the lookup tables straight from the raw text with the project helper.
# Presumably token -> integer id and the inverse mapping — TODO confirm against
# swnamer.process.create_indices_file.
token_to_index, index_to_token = create_indices_file(text)

In [6]:
# Vocabulary size = number of entries in token_to_index; used below as the
# feature dimension of the LSTM input and the width of the Dense layer.
vocab_size = len(token_to_index)

In [7]:
# Sequence length for the file-based pipeline: passed to the chunking helper and
# used as the first dimension of the LSTM input_shape. Note this rebinds
# `timesteps`, which was 3 in the DataFrame-based run earlier in the notebook.
timesteps = 2

In [8]:
# Cut the raw text into training examples with the project helper.
# Presumably a sliding window of `timesteps` characters advanced by `stepsize`,
# paired with the character that follows each window — TODO confirm against
# chunk_names_file.
chunks, next_char = chunk_names_file(text, timesteps, stepsize=1)

In [9]:
# Turn the chunks / next characters into model inputs X and targets y using the
# project helper. Presumably a one-hot encoding with X shaped
# (samples, timesteps, vocab_size) — TODO confirm against create_training_vectors.
X, y = create_training_vectors(chunks, next_char, token_to_index, timesteps, vocab_size)

In [10]:
# Index of the first validation sample: the leading 80% of rows (rounded down)
# are used for training.
train_end = int(0.8 * X.shape[0])

In [11]:
# Positional hold-out split: rows before train_end train the model, the rest
# validate it. Inputs and targets are cut at the same index so they stay aligned.
X_train, X_valid = X[:train_end], X[train_end:]
y_train, y_valid = y[:train_end], y[train_end:]

In [12]:
# Single-layer character model: one LSTM that emits only its final state,
# projected to vocab_size units and normalised with softmax.
model = Sequential([
    LSTM(128, input_shape=(timesteps, vocab_size), return_sequences=False),
    Dense(vocab_size),
    Activation('softmax'),
])
# SGD with Nesterov momentum and element-wise gradient clipping at +/-1.
optimizer = SGD(lr=.01, momentum=.99, nesterov=True, clipvalue=1)
model.compile(optimizer=optimizer, loss='categorical_crossentropy')

In [13]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 128)               81408     
_________________________________________________________________
dense_1 (Dense)              (None, 30)                3870      
_________________________________________________________________
activation_1 (Activation)    (None, 30)                0         
Total params: 85,278
Trainable params: 85,278
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Train on the training split for 81 epochs in mini-batches of 128, scoring the
# held-out split after every epoch. The project's SampleNamesFile callback is
# attached; the recorded output shows names being generated at several
# "diversity" settings during training.
model.fit(
    X_train,
    y_train,
    epochs=81,
    batch_size=128,
    callbacks=[
        SampleNamesFile(timesteps, vocab_size, token_to_index, index_to_token, text),
    ],
    validation_data=(X_valid, y_valid),
)

Train on 44682 samples, validate on 11171 samples
Epoch 1/81

----- Generating text after Epoch: 0
----- Generating with seed: "r"
----- diversity: 0.2
re
----- diversity: 0.5
rriadna
----- diversity: 1.0
rkolno
----- diversity: 1.2
raewe
Epoch 2/81
Epoch 3/81
Epoch 4/81
Epoch 5/81
Epoch 6/81

----- Generating text after Epoch: 5
----- Generating with seed: "n"
----- diversity: 0.2
ne
----- diversity: 0.5
ne
----- diversity: 1.0
ntevaiailisha
----- diversity: 1.2
nayl
Epoch 7/81
Epoch 8/81
Epoch 9/81
Epoch 10/81
Epoch 11/81

----- Generating text after Epoch: 10
----- Generating with seed: "j"
----- diversity: 0.2
janna
----- diversity: 0.5
jandoda
----- diversity: 1.0
jee
----- diversity: 1.2
janane
Epoch 12/81
Epoch 13/81
Epoch 14/81
Epoch 15/81
Epoch 16/81

----- Generating text after Epoch: 15
----- Generating with seed: "x"
----- diversity: 0.2
xie
----- diversity: 0.5
xe
----- diversity: 1.0
xi
----- diversity: 1.2
xuiannick
Epoch 17/81
Epoch 18/81
Epoch 19/81
Epoch 20/81
Epoch 2

<keras.callbacks.History at 0x10ab310b8>

In [15]:
# Persist the trained model to disk. The "81-epochs" in the file name mirrors
# the epochs=81 passed to model.fit; keep the two in sync if either changes.
model.save('output/generic-names-81-epochs.hdf5')

In [16]:
# Inspect the vocabulary: the keys of the token -> index lookup table.
token_to_index.keys()

dict_keys(['l', 'a', 'u', 'r', 'i', 'n', 'e', '\n', 'b', 't', 's', 'd', 'o', 'c', 'm', 'p', 'g', 'k', 'y', 'j', 'v', 'h', '-', 'w', 'f', 'q', 'x', 'z', ' ', "'"])

In [None]:
# NOTE(review): unfinished cell. `com` is not defined anywhere in the visible
# notebook, so evaluating it would raise NameError on Restart & Run All. It
# looks like a truncated `combined` — complete or delete this cell.
com