In [1]:
import pandas
import numpy
import random
import sys

from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding
from keras.layers import LSTM, GRU
from keras.optimizers import RMSprop, Adam
from keras.preprocessing.sequence import pad_sequences



  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
cd ..

/Users/mmccurdy/code/starwars-namer


In [3]:
from swnamer.process import chunk_names, create_indices

In [21]:
# Read data/legend_names.csv into a DataFrame.
legend_names = pandas.read_csv('data/legend_names.csv')

In [22]:
# Read the canon names into a DataFrame. Note the file on disk is spelled
# 'cannon_names.csv', which differs from the variable name.
canon_names = pandas.read_csv('data/cannon_names.csv')

In [23]:
# Read data/cast_names.csv into a DataFrame.
cast_names = pandas.read_csv('data/cast_names.csv')

In [24]:
# Read data/clone_wars.csv into a DataFrame.
clone_wars_names = pandas.read_csv('data/clone_wars.csv')

In [25]:
# Read data/kotor.csv into a DataFrame.
kotor_names = pandas.read_csv('data/kotor.csv')

In [26]:
# Stack the five source frames row-wise into one frame. Each source keeps its
# own index labels at this point, so labels repeat until the index is reset.
name_sources = [legend_names, canon_names, cast_names, clone_wars_names, kotor_names]
combined = pandas.concat(name_sources)

In [27]:
# Discard the per-file index labels carried over by concat and renumber rows.
combined = combined.reset_index(drop=True)

In [28]:
# (rows, columns) of the combined frame before de-duplication.
combined.shape

(1093, 1)

In [29]:
# Preview the size after dropping duplicate rows; `combined` itself is not
# modified by this cell.
combined.drop_duplicates().shape

(760, 1)

In [30]:
# Keep only unique rows. The index is not reset afterwards, so row labels keep
# gaps from the removed duplicates.
# NOTE(review): de-duplication runs before the names are lower-cased in a later
# cell, so rows that differ only by letter case would both survive — confirm
# whether that is intended.
combined = combined.drop_duplicates()

In [31]:
# Eyeball 20 random rows. No random_state is passed, so the rows shown differ
# from run to run.
combined.sample(20)

Unnamed: 0,name
274,Aiolin and Morit Astarte
63,Roan Fel
248,Fang Zar
481,Ruwee Naberrie
569,Admiral Terrinald Screed
298,Borvo the Hutt
27,Yomin Carr
372,Adi Gallia
386,Nute Gunray
456,Logray


In [32]:
# Overwrite the 'name' column with its lower-cased values.
combined.loc[:, 'name'] = combined.name.str.lower()

In [33]:
# Add a 'length' column holding the character count of each name.
combined.loc[:, 'length'] = combined.name.str.len()

In [34]:
# Length of the longest name, in characters.
combined.length.max()

27

In [35]:
# Show the longest names. The literal 27 is copied from the output of the
# previous cell, so it goes stale if the input data changes.
combined[combined.length == 27]

Unnamed: 0,name,length
312,lieutenant kaydel ko connix,27
478,admiral conan antonio motti,27
866,queen breha antilles organa,27


In [37]:
# Persist the processed frame (without the index column) for later use.
# The path is relative to the current working directory.
combined.to_csv('output/starwars_processed.csv', index=False)

In [38]:
# Build the token -> index and index -> token lookups from the 'name' column.
# create_indices comes from swnamer.process; how it tokenises is not visible
# in this notebook.
token_to_index, index_to_token = create_indices(combined, 'name')

In [39]:
# The vocabulary is the key set of token_to_index (presumably a dict, in which
# case this is a live view rather than a copy — TODO confirm).
chars = token_to_index.keys()

In [40]:
# Number of distinct tokens; used later as the width of the one-hot encoding
# and of the model's output layer.
vocab_size = len(chars)
vocab_size

46

In [47]:
# NOTE(review): `x_train` and `maxlen` are not defined in any cell visible in
# this notebook, and `padded` is not referenced by any later visible cell.
# The execution counter also jumps from 40 to 47 here, so this cell probably
# depends on cells that were since deleted and would raise NameError on a
# fresh Restart & Run All — confirm, then define the inputs or drop the cell.
padded = pad_sequences(x_train, maxlen=maxlen, padding='post', truncating='post')

In [56]:
# Window length: passed to chunk_names and used as the time dimension of the
# model's input shape.
timesteps = 3

In [57]:
# Split the names into input chunks plus, for each chunk, the token that
# follows it (chunk_names comes from swnamer.process; presumably every chunk
# holds `timesteps` tokens — TODO confirm).
# Note: on_epoch_end also uses a local variable called `next_char`; that one
# is a single token and does not touch this module-level sequence.
chunks, next_char = chunk_names(combined, 'name', timesteps)

In [None]:
# Allocate the all-zero training tensors that the next cell fills with
# one-hot values:
#   X -> (number of chunks, timesteps, vocab_size)
#   y -> (number of chunks, vocab_size)
n_chunks = len(chunks)
X = numpy.zeros((n_chunks, timesteps, vocab_size))
y = numpy.zeros((n_chunks, vocab_size))

In [None]:
# One-hot encode every training example: y marks the token that follows the
# chunk, X marks the token found at each timestep of the chunk.
for row, window in enumerate(chunks):
    y[row, token_to_index[next_char[row]]] = 1
    for step, token in enumerate(window):
        X[row, step, token_to_index[token]] = 1

In [None]:
# Two stacked 128-unit GRU layers over the one-hot windows, followed by a
# dense projection to vocabulary size and a softmax over the next token.
model = Sequential([
    # return_sequences=True so the second GRU receives the full sequence.
    GRU(128, input_shape=(timesteps, vocab_size), return_sequences=True),
    GRU(128),
    Dense(vocab_size),
    Activation('softmax'),
])
# NOTE(review): lr=.1 is two orders of magnitude above the Keras RMSprop
# default (0.001); gradients are clipped element-wise to [-12, 12]. Confirm
# both values are intentional.
optimizer = RMSprop(lr=.1, clipvalue=12)
model.compile(optimizer=optimizer, loss='categorical_crossentropy')

In [None]:
def sample(preds, temperature=1.0):
    """Sample one index from a probability array after temperature rescaling.

    Every probability is raised to the power ``1 / temperature`` (computed as
    ``exp(log(p) / temperature)``), the result is renormalised to sum to 1,
    and a single index is drawn from it with ``numpy.random.multinomial``.

    Args:
        preds: array-like of probabilities, one per candidate index. Entries
            that are exactly 0 are allowed and are never selected.
        temperature: divisor applied to the log-probabilities; must be
            non-zero. Values below 1 sharpen the distribution, values above 1
            flatten it.

    Returns:
        The sampled index, as returned by ``numpy.argmax``.
    """
    preds = numpy.asarray(preds).astype('float64')
    # log(0) is -inf, which exp() maps back to probability 0, so the maths is
    # already correct for zero entries; only silence the divide-by-zero
    # RuntimeWarning numpy would otherwise emit on every such call.
    with numpy.errstate(divide='ignore'):
        preds = numpy.log(preds) / temperature
    exp_preds = numpy.exp(preds)
    preds = exp_preds / numpy.sum(exp_preds)
    # One multinomial trial yields a one-hot row; argmax recovers its index.
    probas = numpy.random.multinomial(1, preds, 1)
    return numpy.argmax(probas)


def on_epoch_end(epoch, logs):
    """Epoch-end hook: on every 10th epoch, print text sampled from the model.

    A single seed chunk is picked at random from ``chunks`` and reused for
    each diversity (temperature) value. For every diversity, up to 50 tokens
    are generated one at a time: the current window is one-hot encoded, the
    model predicts the next-token distribution, ``sample`` draws from it, and
    the window slides forward by one token. Generation for that diversity
    stops early when a newline token is drawn.

    Args:
        epoch: zero-based epoch number supplied by the callback.
        logs: metrics dict supplied by the callback; unused here.
    """
    if epoch % 10:
        return

    print()
    print('----- Generating text after Epoch: %d' % epoch)

    # The same seed is used for every diversity so the outputs are comparable.
    seed = chunks[random.randint(0, len(chunks) - 1)]
    for diversity in (0.2, 0.5, 1.0, 1.2):
        print('----- diversity:', diversity)
        print('----- Generating with seed: "' + seed + '"')
        sys.stdout.write(seed)

        window = seed
        for _ in range(50):
            encoded = numpy.zeros((1, timesteps, vocab_size))
            for position, token in enumerate(window):
                encoded[0, position, token_to_index[token]] = 1.

            probabilities = model.predict(encoded, verbose=0)[0]
            predicted = index_to_token[sample(probabilities, diversity)]

            # Drop the oldest token and append the newly predicted one.
            window = window[1:] + predicted

            # A newline ends the text; it is not echoed.
            if predicted == '\n':
                break

            sys.stdout.write(predicted)
            sys.stdout.flush()

        print()

# Wrap on_epoch_end in a Keras callback so it is invoked after every epoch
# (the function itself only prints on every 10th epoch).
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

In [None]:
# Train for 200 epochs in mini-batches of 128; print_callback prints sampled
# text along the way. No random seed is set anywhere visible in this notebook,
# so results vary between runs.
model.fit(X, y, epochs=200, batch_size=128, callbacks=[print_callback])