In [1]:
import tensorflow as tf
import tensorflow.keras.layers as tfl
import numpy as np
import matplotlib.pyplot as plt
import os
import random

# print(tf.__version__)
# print(tf.config.list_physical_devices('GPU'))

# Data preparation

In [2]:
with open("dinos.txt", "r") as f:
    dinos_str = f.read().lower()

vocabular = sorted(set(dinos_str))
print(f"Vocabular [{len(vocabular)} chars]: {vocabular}\n")

dinos = [ds.strip() for ds in dinos_str.split("\n")]
lens = [(len(d), d) for d in dinos]
maxlen, d = max(lens)
maxidx = lens.index((maxlen, d))
print(f"maxlen = {maxlen}, dino={d}, idx = {maxidx}\n")

char_to_ix = { ch:i for i,ch in enumerate(vocabular) }
ix_to_char = { i:ch for i,ch in enumerate(vocabular) }

print(f"char_to_ix = {char_to_ix}\n")
print(f"ix_to_char = {ix_to_char}\n")

n_vocab_size = 27 # 26 lower english letters + \n
n_a = 50 # number of state units

inputs = [[None] + [char_to_ix[char] for char in dino] for dino in dinos]
outputs = [x[1:] + [0] for x in inputs]
print(f"len(inputs) = {len(inputs)}, len(outputs) = {len(outputs)}\n")
print(f"inputs[200] = {inputs[200]}")
print(f"outputs[200] = {outputs[200]}\n")


def indexes_to_one_hot_vectors(inputs, ohv_dim):
    vectors = []
    for item in inputs:
        vector = np.zeros((len(item), ohv_dim))
        
        for i, idx in enumerate(item):
            if idx is None:
                vector[i] = [0] * ohv_dim
            else:
                vector[i][idx] = 1

        vectors.append(vector)
        
    return vectors


x_sequences = indexes_to_one_hot_vectors(inputs, n_vocab_size)
y_sequences = indexes_to_one_hot_vectors(outputs, n_vocab_size)

X_padded = tf.keras.utils.pad_sequences(x_sequences[:], value=-1.0, padding='post', dtype='float32')
Y_padded = tf.keras.utils.pad_sequences(y_sequences[:], value=-1.0, padding='post', dtype='float32')

print(f"X_padded.shape = {X_padded.shape}")
print(f"Y_padded.shape = {Y_padded.shape}\n")

Vocabular [27 chars]: ['\n', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']

maxlen = 26, dino=lisboasaurusliubangosaurus, idx = 791

char_to_ix = {'\n': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26}

ix_to_char = {0: '\n', 1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z'}

len(inputs) = 1536, len(outputs) = 1536

inputs[200] = [None, 2, 9, 5, 14, 15, 19, 1, 21, 18, 21, 19]
outputs[200] = [2, 9, 5, 14, 15, 19, 1, 21, 18, 21, 19, 0]

X_padded.shape = (1536, 27, 27)
Y_padded.shape = (1536, 27, 27)



# Sampling

In [3]:
def sample_sequence(model, newline_char=0, max_len=27):
    counter = 0
    indices = []
    x = np.zeros((1, n_vocab_size))
    idx = -1
    state = None
    
    while idx != newline_char and counter < max_len:
        y_pred = model.predict(np.expand_dims(x, axis=0), verbose=0)
        probs = y_pred[0, -1, :]
        idx = np.random.choice(range(len(probs)), p=probs)
        if idx == 0:
            break
        indices.append(idx)
        new_x = np.zeros((n_vocab_size,))
        new_x[idx] = 1.0
        x = np.vstack([x, new_x])
        counter+=1
    
    return indices


def get_sample(model):
    indices = sample_sequence(model)
    name = "".join([ix_to_char[i] for i in indices])
    return name
        

# Model RNN with Masking & Train on Tensor Padded Dataset

In [6]:
X_padded = tf.keras.utils.pad_sequences(x_sequences[:], value=-1.0, padding='post', dtype='float32')
Y_padded = tf.keras.utils.pad_sequences(y_sequences[:], value=-1.0, padding='post', dtype='float32')

batch_size = 32
train_dataset = tf.data.Dataset.from_tensor_slices((X_padded, Y_padded)) \
    .batch(batch_size, drop_remainder=True) \
    .repeat() \
    .prefetch(tf.data.AUTOTUNE) 

kernel_initializer=tf.keras.initializers.RandomNormal(mean=0.0, stddev=0.01),
recurrent_initializer=tf.keras.initializers.RandomNormal(mean=0.0, stddev=0.01),
bias_initializer=tf.keras.initializers.Ones() 

inp = tf.keras.Input(shape=(None, n_vocab_size))
x = tfl.Masking(mask_value=-1.)(inp)
rnn_cell = tfl.SimpleRNNCell(
    n_a,
    kernel_initializer=tf.keras.initializers.RandomNormal(mean=0.0, stddev=0.01),
    recurrent_initializer=tf.keras.initializers.RandomNormal(mean=0.0, stddev=0.01),
    bias_initializer=tf.keras.initializers.Ones()
)
x = tfl.RNN(rnn_cell, return_sequences=True) (x)
out = tfl.Dense(n_vocab_size, activation="softmax",
                    kernel_initializer=tf.keras.initializers.RandomNormal(mean=0.0, stddev=0.01),
                    bias_initializer=tf.keras.initializers.Ones())(x)
custom_model = tf.keras.Model(inputs=inp, outputs=out)

# custom_model.summary()

optimizer = tf.keras.optimizers.Adam(learning_rate=0.01, clipvalue=10.0)
loss_fn = tf.keras.losses.CategoricalCrossentropy()
train_acc = tf.keras.metrics.CategoricalAccuracy()


def train(dataset, epochs):
    steps_per_epoch = X_padded.shape[0] // batch_size
    for epoch in range(epochs):
        num_batches = 0
        total_losses = []
        
        for step, (x_batch, y_batch) in enumerate(dataset):
            if step >= steps_per_epoch:
                break
            
            with tf.GradientTape() as tape:
                predictions = custom_model(x_batch, training=True)
                loss = loss_fn(y_batch, predictions)
                train_acc.update_state(y_batch, predictions)
                
            grads = tape.gradient(loss, custom_model.trainable_variables)
            optimizer.apply_gradients(zip(grads, custom_model.trainable_variables))
            
            total_losses.append(loss)
            num_batches += 1

        epoch_loss = tf.reduce_sum(total_losses)
        print(f"\n\nEpoch {epoch+1} | Loss: {epoch_loss} | Avr Loss: {epoch_loss/num_batches} | Num of Batches: {num_batches} | Accuracy: {train_acc.result().numpy()}")

        for i in range(7):
            name = get_sample(custom_model)
            print(name.title())

train(train_dataset, 15)
custom_model.save("new-dino_RNN_Masking_Batch_Trained.keras")



Epoch 1 | Loss: 133.52357482910156 | Avr Loss: 2.781741142272949 | Num of Batches: 48 | Accuracy: 0.2059769183397293
Oxitncaus
Sanpus
S
Aov
Tisltlaccibyekqals
Ssnducunh
Cnc


Epoch 2 | Loss: 107.12640380859375 | Avr Loss: 2.231800079345703 | Num of Batches: 48 | Accuracy: 0.2855599820613861
Tanyourus
Aurtivihbuemur
Auranydan
Ssinotia
Himusup
Gitninutosausurnurus
Ranipnusiurus


Epoch 3 | Loss: 95.86166381835938 | Avr Loss: 1.9971179962158203 | Num of Batches: 48 | Accuracy: 0.3295495808124542
Tanociiceicus
Rhariloscelicis
Onetetorostodo
Ereeusaurus
Wiclanosandipsaurus
Pinhtnraniur
Ihohtepa


Epoch 4 | Loss: 91.43601989746094 | Avr Loss: 1.9049171209335327 | Num of Batches: 48 | Accuracy: 0.3547587990760803
Enaltaur
Ecenanus
Plhanxis
Pvraxoxus
Fralisaurus
Kiavenymaentavon
Lalerui


Epoch 5 | Loss: 88.9822006225586 | Avr Loss: 1.8537958860397339 | Num of Batches: 48 | Accuracy: 0.37310895323753357
Con
Kegetie
Silichosaurus
Finhinsaurus
Ssinter
Dslopesauruc
Santenpa


Epoch 6 | Loss: 86

## Sampling with RNN batch trained

In [13]:
loaded_model = tf.keras.models.load_model("dino_RNN_Masking_Batch_Trained.keras")

for _ in range(10):
    name = get_sample(loaded_model)
    print(f"name = {name.title()}, len = {len(name)}")

name = Tapusaucasaurus, len = 15
name = Haenyceratops, len = 13
name = Meptceratops, len = 12
name = Udrazogyithus, len = 13
name = Diohenator, len = 10
name = Fursinia, len = 8
name = Nonsygapesauruchus, len = 18
name = Ceviliornphia, len = 13
name = Kolovennasaurus, len = 15
name = Mizangodon, len = 10


# Model RNN with no masking & Train with SGD on raw X, Y

In [11]:
X = indexes_to_one_hot_vectors(inputs[:], n_vocab_size)
Y = indexes_to_one_hot_vectors(outputs[:], n_vocab_size)

print(f"type(X) = {type(X)}, len(X) = {len(X)}")
print(f"type(Y) = {type(Y)}, len(Y) = {len(Y)}")

# for x, y in zip(X, Y):
#     for i in range(0, len(x)-1):
#         all_equal = all(a == b for a, b in zip(x[i+1], y[i]))
#         if not all_equal:
#             raise ValueError(f"x[{i+1}] = {x[i+1]}, y[{i}] = {y[i]}")


kernel_initializer=tf.keras.initializers.RandomNormal(mean=0.0, stddev=0.01),
recurrent_initializer=tf.keras.initializers.RandomNormal(mean=0.0, stddev=0.01),
bias_initializer=tf.keras.initializers.Ones() 

inp = tf.keras.Input(shape=(None, n_vocab_size))
rnn_cell = tfl.SimpleRNNCell(
    n_a,
    kernel_initializer=tf.keras.initializers.RandomNormal(mean=0.0, stddev=0.01),
    recurrent_initializer=tf.keras.initializers.RandomNormal(mean=0.0, stddev=0.01),
    bias_initializer=tf.keras.initializers.Ones()
)
x = tfl.RNN(rnn_cell, return_sequences=True) (inp)
out = tfl.Dense(n_vocab_size, activation="softmax",
                    kernel_initializer=tf.keras.initializers.RandomNormal(mean=0.0, stddev=0.01),
                    bias_initializer=tf.keras.initializers.Ones())(x)
custom_model = tf.keras.Model(inputs=inp, outputs=out)

# custom_model.summary()

optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, clipvalue=5.0)
train_acc = tf.keras.metrics.CategoricalAccuracy()
loss_fn = tf.keras.losses.CategoricalCrossentropy(reduction=tf.keras.losses.Reduction.SUM)


def train(X_train, Y_train, num_iterations):
    smoothLoss = -np.log(1.0/n_vocab_size)*7
    print(f"smoothLoss = {smoothLoss}")
    
    for step in range(num_iterations):
        idx = step % len(X_train)
        # x = tf.convert_to_tensor([X_train[idx]])
        # y = tf.convert_to_tensor([Y_train[idx]])
        
        x = np.expand_dims(X_train[idx], axis=1)
        y = np.expand_dims(Y_train[idx], axis=1)

        with tf.GradientTape() as tape:
            predictions = custom_model(x, training=True)
            loss = loss_fn(y, predictions)

            # custom_loss = 0
            # for t, pred in enumerate(predictions):
            #     yt_id = np.argmax(y[t].ravel())
            #     value = tf.reshape(pred, -1)[yt_id]
            #     # print(f"{t}: value = {value}")
            #     custom_loss -= np.log(value) 
            # print(f"custom_loss = {custom_loss}, custom_loss / num = {custom_loss/len(predictions)}")
                
            grads = tape.gradient(loss, custom_model.trainable_variables)
            optimizer.apply_gradients(zip(grads, custom_model.trainable_variables))
            train_acc.update_state(y, predictions)
            
            smoothLoss = smoothLoss * 0.999 + loss * 0.001

            if step % 10_000 == 0 or step == num_iterations-1:
                print(f"\n\nStep: {step+1}, Loss: {loss}, SmoothLoss: {smoothLoss},  Accuracy: {train_acc.result().numpy()}")
                for i in range(7):
                    name = get_sample(custom_model)
                    print(name.title())

train(X, Y, 35000)
custom_model.save("new-dino_RNN_No_masking_SGD_Trained.keras")

type(X) = <class 'list'>, len(X) = 1536
type(Y) = <class 'list'>, len(Y) = 1536
smoothLoss = 23.070858062030304


Step: 1, Loss: 45.90201187133789, SmoothLoss: 23.09368896484375,  Accuracy: 0.1428571492433548
Imfdllhutkoysfqlewpynwk
Danakapnvoxaufwhws
Pbcupntbvdog
Vzhawjlxiyogbhyf
Mutyujosbdsllijfpszodyciwhc
Aee
Ibzt


Step: 10001, Loss: 26.91754913330078, SmoothLoss: 27.82469367980957,  Accuracy: 0.318737268447876
Ojis
Depsaus
Telliaus
Kemiserusiecanas
Erus
Us
Taletosausamaurusalaur


Step: 20001, Loss: 32.59544372558594, SmoothLoss: 27.557518005371094,  Accuracy: 0.3304867148399353
Turuvhoneng
S
Ahaus
Osua
Tevin
Lochausaus
Osaus


Step: 30001, Loss: 31.2158145904541, SmoothLoss: 27.494462966918945,  Accuracy: 0.33551132678985596
S
Rongonosanfxfusosuruhlirus
Phianlanis
Auraus
Kanauceleloscandrur
Tonkulopsangusalinusausaus
Saurururusausbaurohangorunj


Step: 35000, Loss: 29.461299896240234, SmoothLoss: 27.48915672302246,  Accuracy: 0.3371524512767792
Ngoanyvaurhropocrusages
Trosaror
Do

## Sampling with RNN SGD trained

In [12]:
sgd_model = tf.keras.models.load_model("dino_RNN_No_masking_SGD_Trained.keras")

for _ in range(10):
    name = get_sample(sgd_model)
    print(f"name = {name.title()}, len = {len(name)}")

name = S, len = 1
name = Husatraustenos, len = 14
name = Sanes, len = 5
name = Pocatopesenatos, len = 15
name = S, len = 1
name = Los, len = 3
name = S, len = 1
name = Qisaultosalokrusapichupus, len = 25
name = Ylolerus, len = 8
name = Ateysatosas, len = 11
