In [None]:
import os
import random
import sys

import numpy
import pandas

from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Embedding
from keras.layers import LSTM, GRU, Dropout
from keras.optimizers import RMSprop, Adam, SGD

In [None]:
# Every path used below ('data/...', 'output/...') is relative, so the working
# directory must be the folder that contains `data/` (presumably the project
# root, one level above this notebook -- confirm against the repo layout).
# Only step up when `data/` is not visible yet: a bare `cd ..` climbs one more
# level every time the cell is re-run and breaks all later relative paths.
if not os.path.isdir('data'):
    os.chdir('..')
os.getcwd()

In [None]:
# Import only the helpers this notebook uses instead of `import *`, so it is
# clear where each name comes from and nothing from the package can silently
# shadow the names imported above.
from swnamer.process import (
    NameGenerator,
    SampleNames,
    chunk_names,
    create_indices,
    create_training_vectors,
)

In [None]:
# Load the first name source. Later cells read a `name` column from the
# concatenated frame, so this CSV is expected to provide one.
legend_names = pandas.read_csv('data/legend_names.csv')

In [None]:
# Second name source. Note the file on disk is spelled 'cannon_names.csv'.
canon_names = pandas.read_csv('data/cannon_names.csv')

In [None]:
# Third name source (presumably cast/actor names -- confirm against the CSV).
cast_names = pandas.read_csv('data/cast_names.csv')

In [None]:
# Fourth name source, read from data/clone_wars.csv.
clone_wars_names = pandas.read_csv('data/clone_wars.csv')

In [None]:
# Fifth name source, read from data/kotor.csv.
kotor_names = pandas.read_csv('data/kotor.csv')

In [None]:
# Stack all five sources row-wise into one frame. Each source keeps its own
# row labels here, so the index contains repeats until it is reset.
name_sources = [
    legend_names,
    canon_names,
    cast_names,
    clone_wars_names,
    kotor_names,
]
combined = pandas.concat(name_sources)

In [None]:
# Replace the per-source row labels left over from the concat with a fresh
# 0..n-1 index; drop=True discards the old labels instead of keeping them as a column.
combined = combined.reset_index(drop=True)

In [None]:
# (rows, columns) of the combined frame before de-duplication.
combined.shape

In [None]:
# Preview of the shape once exact duplicate rows are removed; `combined`
# itself is not modified by this cell.
combined.drop_duplicates().shape

In [None]:
# Keep the first occurrence of every fully identical row. This runs on the raw
# (not yet lower-cased) names, so the comparison is case-sensitive.
combined = combined.drop_duplicates()

In [None]:
# Eyeball ten random rows after de-duplication.
combined.sample(n=10)

In [None]:
# Normalise case, then de-duplicate again: the earlier drop_duplicates() ran on
# the raw mixed-case names, so rows that differ only in capitalisation would
# otherwise survive as repeated training examples. assign() returns a new frame
# rather than writing into a possibly-sliced one.
combined = combined.assign(name=combined.name.str.lower()).drop_duplicates()

In [None]:
# Length of the character window fed to the network (used as the first
# dimension of the LSTM input shape) and the number of padding characters
# added to each end of every name.
timesteps = 4

Since the RNN predicts the probability of the next character from the previous `timesteps` characters in the sequence,
we pad each name with `timesteps` special characters on both sides: `^` at the start and `$` at the end.

In [None]:
# Surround every name with `timesteps` start markers ('^') and end markers ('$').
# Any markers left by a previous run of this cell are stripped first, so
# re-running the cell does not stack extra padding onto the names.
combined['name'] = (
    ('^' * timesteps)
    + combined.name.str.lstrip('^').str.rstrip('$')
    + ('$' * timesteps)
)

In [None]:
# Character count of each padded name (padding included).
combined['length'] = combined['name'].str.len()

In [None]:
# Spot-check ten random rows now that names are padded and measured.
combined.sample(n=10)

In [None]:
# Longest padded name, in characters.
combined['length'].max()

In [None]:
# Persist the cleaned, padded names. The output directory is created on demand
# so a fresh checkout (where `output/` may not exist yet) does not fail here or
# later when checkpoints are written to the same directory.
if not os.path.isdir('output'):
    os.makedirs('output')
combined.to_csv('output/starwars_processed.csv', index=False)

In [None]:
# Build the two lookup tables over the `name` column. Presumably
# token -> integer id and its inverse, one entry per distinct character --
# confirm against swnamer.process.create_indices.
token_to_index, index_to_token = create_indices(combined, 'name')

In [None]:
# The tokens known to the lookup table (the keys of token_to_index).
chars = token_to_index.keys()

In [None]:
# Display the vocabulary for a quick sanity check.
chars

In [None]:
# Number of distinct tokens; `chars` is the key view of token_to_index, so the
# two lengths are the same. Used as the width of the network's input and of
# its softmax output.
vocab_size = len(token_to_index)
vocab_size

In [None]:
# Split the padded names into training examples. Presumably `chunks` holds
# windows of `timesteps` characters and `next_char` the character following
# each window -- confirm against swnamer.process.chunk_names.
chunks, next_char = chunk_names(combined, 'name', timesteps)

In [None]:
# Encode the windows and targets as arrays for Keras. The model below expects
# X with per-sample shape (timesteps, vocab_size) and is trained with
# categorical cross-entropy, so y is presumably one-hot over the vocabulary --
# confirm against swnamer.process.create_training_vectors.
X, y = create_training_vectors(chunks, next_char, token_to_index, timesteps, vocab_size)

In [None]:
# Two stacked 32-unit LSTM layers followed by a softmax over the vocabulary:
# given a window of `timesteps` encoded characters, the network outputs a
# probability for each possible next character.
model = Sequential()
# Only the first layer of a Sequential model needs an input shape. The first
# LSTM returns the full sequence so the second LSTM has a sequence to consume.
model.add(LSTM(32, input_shape=(timesteps, vocab_size), return_sequences=True))
# The second LSTM reads the first layer's output, so no input_shape is given
# (the one previously passed here was ignored by Keras and was misleading).
# It returns only its final state, which feeds the classifier.
model.add(LSTM(32, return_sequences=False))
model.add(Dense(vocab_size))
model.add(Activation('softmax'))
optimizer = SGD(lr=.01, momentum=.99, nesterov=True)
model.compile(optimizer=optimizer, loss='categorical_crossentropy')
model.summary()

In [None]:
# Callbacks handed to model.fit.
# SampleNames comes from swnamer.process; presumably it prints generated names
# during training -- confirm against its implementation.
sampler = SampleNames(chunks, timesteps, vocab_size, token_to_index, index_to_token)
# Stop once the monitored loss has failed to improve by at least 0.001 for
# 7 epochs in a row.
earlystopping = EarlyStopping(min_delta=.001, patience=7, mode='min')
# Write a checkpoint only when the monitored loss improves; the epoch number
# and validation loss are embedded in the file name.
checkpoint = ModelCheckpoint(
    filepath='output/starwars.{epoch:02d}-{val_loss:.2f}.hdf5',
    save_best_only=True
)
callbacks = [sampler, earlystopping, checkpoint]

In [None]:
# Train for at most 100 epochs, holding out 10% of the samples for validation.
# NOTE(review): Keras takes the validation slice from the end of X/y before
# shuffling -- worth confirming the training vectors are not ordered by source.
model.fit(
    x=X,
    y=y,
    batch_size=32,
    epochs=100,
    validation_split=.1,
    callbacks=callbacks
)

In [None]:
#model.save('output/starwars-2-lstm-64.hdf5')

In [None]:
# Reload one specific checkpoint written by the ModelCheckpoint callback; per
# its file-name pattern this is epoch 23 with a validation loss of 1.94.
# The name is tied to one particular training run, so it must be updated by
# hand after retraining.
model = load_model('output/starwars.23-1.94.hdf5')

In [None]:
# Wrap the loaded model together with the vocabulary lookups in the project's
# name generator (see swnamer.process.NameGenerator).
gen = NameGenerator(timesteps, vocab_size, token_to_index, index_to_token, model)

In [None]:
# Seed with a full window of start-of-name markers. Building the seed from
# `timesteps` keeps it in step with the padding applied to the training names
# instead of hard-coding four carets.
gen.generate(seed='^' * timesteps, diversity=1.)

In [None]:
# Generate with the seed 'dar'; the first argument is presumably the number of
# names to produce -- confirm against NameGenerator.generate.
# NOTE(review): this seed is shorter than `timesteps`; check how the generator
# pads or handles short seeds.
gen.generate(10, seed='dar')

In [None]:
# Generate using the generator's default seed and diversity (first argument
# presumably the number of names -- confirm against NameGenerator.generate).
gen.generate(10)

In [None]:
#commander papano sasnaphared borbilz

In [None]:
# Show every training name that contains the substring 'fn'.
contains_fn = combined['name'].str.contains('fn')
combined.loc[contains_fn]

In [None]:
# Ten random training names, for comparison with the generated ones.
combined.sample(n=10)