# Generating Pokemon names with an RNN
Using character-level prediction of what the next character should be given the last X characters, this model eventually tries to generate realistic-sounding Pokemon names

In [1]:
from tensorflow.keras.layers import LSTM, Dense, Input, concatenate, Reshape, Dropout
from tensorflow.keras.models import Model, load_model, Sequential
from tp_datasets import pokemon
import numpy as np

(819, 256, 256, 3)


## Getting the Pokemon names
<i>Get tp_datasets (and therefore the Pokemon dataset) <a href="https://github.com/tadeaspaule/tp-datasets">here</a></i>
- First we get a list of names of all real Pokemon<br>
- To remove noise in the model, I turned the names to lowercase and removed ~5ish Pokemon that have odd symbols in their name
- We also want to know the length of the longest and shortest name, this will come up later


In [2]:
_,names,_ = pokemon.load_data(return_full_names=False)


names = [name.lower() for name in names]
chars = sorted(list(set(''.join(names))))


unwanted = [' ', "'", '-', '.', '2']
def has_unwanted(word):
    for char in word:
        if char in unwanted:
            return True
    return False
names = [name for name in names if not has_unwanted(name)]
chars = [char for char in chars if char not in unwanted]

char_index = dict([(chars[i],i) for i in range(len(chars))])
maxlen = max([len(name) for name in names])
minlen = min([len(name) for name in names])
print("Longest name is",maxlen,"characters long")
print("Shortest name is",minlen,"characters long")

Longest name is 11 characters long
Shortest name is 3 characters long


## Getting the X - long sequences
- This model basically works by looking at X characters (in this case 4), and predicting what the next character will be
- Changing this X value will affect what patterns the model learns, if we make X too big it can simply memorize valid Pokemon names, but if we make it too small, it won't be able to accurately predict the next character
- I played around with it a bit and settled on 4, but feel free to try out different values (you should only have to change the value below in seqlen = 4 and the rest of the code will adjust itself based on that)

In [3]:
# Making the input sequences

seqlen = 4
endchar = '/'
assert(endchar not in chars)
chars += endchar
char_index[endchar] = len(chars) - 1
sequences = []
lengths = []     # To have the model learn a more macro understanding, it also takes the word's length so far
                 # as input
nextchars = []


for name in names:
    if len(name) < seqlen:
        sequences.append(name + endchar*(seqlen - len(name)))
        nextchars.append(endchar)
        lengths.append(len(name))
    else:
        for i in range(0,len(name)-seqlen+1):
            sequences.append(name[i:i+seqlen])
            if i+seqlen < len(name):
                nextchars.append(name[i+seqlen])
            else:
                nextchars.append(endchar)
            lengths.append(i+seqlen)
                

print(len(sequences),"sequences of length",seqlen,"made")

3569 sequences of length 4 made


## One hot encoding the sequences, word lengths, and next characters
- One hot encoding means that, for example, if you have 5 characters that can appear, you turn the first character into [1 0 0 0 0], the second into [0 1 0 0 0], and so on
- Here we do it with the sequences and with word lengths, because this format is easy for the model to read (and we need to somehow turn the sequence strings into number values)

In [4]:
# One hot encoding the sequences
# Also adding another input: the length of the word at the end of the sequence

x = np.zeros(shape=(len(sequences),seqlen,len(chars)), dtype='float32')
x2 = np.zeros(shape=(len(lengths),maxlen))

for i, seq in enumerate(sequences):
    for j, char in enumerate(seq):
        x[i,j,char_index[char]] = 1.

for i, l in enumerate(lengths):
    x2[i,l-1] = 1.

y = np.zeros(shape=(len(nextchars),len(chars)))
for i, char in enumerate(nextchars):
    y[i,char_index[char]] = 1.

## Method for generating random starting sequences
- Looks at the probabilities letters appear after each other (for example, how often is 'a' third when 'f' is second, compared to other letters that occur after a second 'f')
- We will use this later to make brand new Pokemon names

In [5]:
dictchars = [{} for _ in range(seqlen)]
total = 0

for name in names:
    if len(name) < seqlen:
        continue
    total += 1
    dictchars[0][name[0]] = dictchars[0].get(name[0],0) + 1
    for i in range(1,seqlen):
        if dictchars[i].get(name[i-1],0) == 0:
            dictchars[i][name[i-1]] = {name[i]: 1}
        elif dictchars[i][name[i-1]].get(name[i],0) == 0:
            dictchars[i][name[i-1]][name[i]] = 1
        else:
            dictchars[i][name[i-1]][name[i]] += 1
    
'''
What is dictchars?
Basically, stores how often a letter occurs after another letter at a specific spot in a Pokemon name
dictchars[0] just stores how often each letter is first, {a: 3, b:4, etc}

dictchars[1+] store which letters (and how often) come after a certain letter.
For example, if dictchars[1]['a'] = {b:4,c:1}, that means that if 'a' was first, 
b followed 4 times, while c followed only once.

This is used in the method below to generate plausible-sounding starting sequences.
'''
    

def generate_start_seq():
    res = "" # The starting sequence will be stored here
    p = sum([n for n in dictchars[0].values()]) # total amount of letter occurences
    r = np.random.randint(0,p) # random number used to pick the next character
    tot = 0
    for key, item in dictchars[0].items():
        if r >= tot and r < tot + item:
            res += key
            break
        else:
            tot += item

    for i in range(1,seqlen):
        ch = res[-1]
        if dictchars[i].get(ch,0) == 0:
            l = list(dictchars[i].keys())
            ch = l[np.random.randint(0,len(l))]
        p = sum([n for n in dictchars[i][ch].values()])
        r = np.random.randint(0,p)
        tot = 0
        for key, item in dictchars[i][ch].items():
            if r >= tot and r < tot + item:
                res += key
                break
            else:
                tot += item
    return res


## Methods for generating text
- The methods below basically take care of 'I give X letters, I get the full name', so that we can easily monitor the progress of the model (and combined with the above-declared method that makes random starting sequences, we won't even need to provide anything to get random Pokemon names)
- There is one concept used below called <i>temperature</i>. Basically it's a measure randomness plays when selecting the next letter, with 0 being no randomness, always picking the most likely letter, and 1 being total randomness, and the letters are chosen based on their probability value
- Adjusting this changes how your generated names look, typically the closer to 0 you are the more coherent and closely resembling the training data the output is, and the closer to 1 you are the more novel but sometimes also less coherent the output is. This mainly affects large generated texts though, not so much names. Nevertheless, I tend to go for ~0.4 temperature usually, but feel free to try out different values

In [6]:
def sample(preds,wordlength,temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    if temperature == 0:
        # Avoiding a division by 0 error
        return np.argmax(preds)
    preds = np.log(preds) / temperature
    
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1,preds,1)
    return np.argmax(probas)

def generate_name(model,start,temperature=1.0,maxlength=maxlen):
    res = start
    inp = np.zeros(shape=(1,seqlen,len(chars)))
    for i, char in enumerate(start):
        inp[0,i,char_index[char]] = 1.
    l = np.zeros(shape=(1,maxlen))
    l[0,len(res)] = 1.
    i = sample(model.predict(x=[inp,l])[0],len(res),temperature)
    while i < len(chars)-1 and len(res) < maxlength:
        res += chars[i]
        start = start[1:] + chars[i]
        inp = np.zeros(shape=(1,seqlen,len(chars)))
        for i, char in enumerate(start):
            inp[0,i,char_index[char]] = 1.
        l = np.zeros(shape=(1,maxlen))
        l[0,len(res)-1] = 1.
        i = sample(model.predict(x=[inp,l])[0],len(res),temperature)
    return res.title()

def generate_random_name(model,temperature=0.3):
    start = ""
    while len(start) != seqlen:
        try:
            start = generate_start_seq()
        except: pass
    return generate_name(model,start,temperature)

## Building the model
- Here is where you can experiment and try out different approaches
- After some testing, I went with the below setup:
    - 2 Inputs (the sequence, and the one-hot-encoded length of the name at the end of that sequence)
    - 2 parallel LSTM layers, one normal with relu, the other backwards with tanh, both dropout 0.4
    - Concatenate the LSTM output with the one-hot-encoded length
    - Dense output layer with softmax activation

In [7]:
inp1 = Input(shape=x.shape[1:]) # sequence input
inp2 = Input(shape=x2.shape[1:]) # length input
lstm = LSTM(len(chars),activation='relu',dropout=0.3)(inp1)
lstm2 = LSTM(len(chars),dropout=0.3,go_backwards=True)(inp1)
concat = concatenate([lstm,lstm2,inp2])
dense = Dense(len(chars),activation='softmax')(concat)

model = Model([inp1,inp2],dense)
model.compile(optimizer='adam',loss='binary_crossentropy')


## Method for training a model and monitoring its progress
- Using this makes it easy to try out different model architectures and see what names they are able to generate
- For fast prototyping, just build and compile a model, then simply:
```python
try_model(model)
```

In [8]:
def try_model(model,*,total_epochs=200,print_every=40,temperature=0.3,verbose=True):
    for i in range(total_epochs//print_every):
        history = model.fit([x,x2],y,
                            epochs=print_every,
                            batch_size=64,
                            validation_split=0.05,
                            verbose=0)
        if verbose:
            print("Epoch",(i+1)*print_every)
            print("First loss:            %1.4f" % (history.history['loss'][0]))
            print("Last loss:             %1.4f" % (history.history['loss'][-1]))
            print("First validation loss: %1.4f" % (history.history['val_loss'][0]))
            print("Last validation loss:  %1.4f" % (history.history['val_loss'][-1]))
            print()
            print("Generating random names:")
            for _ in range(10):
                print(generate_random_name(model,temperature))
            print()

## And finally, training the model and seeing how it does

In [9]:
try_model(model)

Epoch 40
First loss:            0.1553
Last loss:             0.1125
First validation loss: 0.1505
Last validation loss:  0.1055

Generating random names:
Hoceon
Gimeron
Aruser
Deas
Manker
Exer
Nignle
Aerfan
Donchas
Haxita

Epoch 80
First loss:            0.1125
Last loss:             0.1062
First validation loss: 0.1062
Last validation loss:  0.0972

Generating random names:
Caphor
Lulle
Rumine
Caile
Misoron
Goteet
Spertich
Felett
Loltoe
Chawitl

Epoch 120
First loss:            0.1060
Last loss:             0.1015
First validation loss: 0.0969
Last validation loss:  0.0913

Generating random names:
Gayton
Abratto
Phiserit
Notaro
Colling
Palasto
Cocnoon
Barp
Chuckow
Dimino

Epoch 160
First loss:            0.1013
Last loss:             0.0975
First validation loss: 0.0907
Last validation loss:  0.0853

Generating random names:
Clepin
Gountar
Kyurea
Shrriza
Vanaur
Seedoin
Luboar
Tofatho
Beim
Mamitar

Epoch 200
First loss:            0.0981
Last loss:             0.0942
First validation

## To generate more afterwards:
```python
for _ in range(100):
    print(generate_random_name(model,temperature=0.4))
```

## To save the model and use it later:
```python
model.save('pokename.h5') # Save the model you like...
```
And at a later time just get it back like this:<br>
(This assumes the .h5 file is in the current directory, otherwise specify the path to it)
```python
m = load_model('pokename.h5')
for _ in range(10):
    print(generate_random_name(m,temperature=0.4))
```

## To use your own starting sequence:
```python
my_starting_sequence = "Tyrm"
pokename = generate_name(model,my_starting_sequence)
print(pokename)
```