In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow.keras.backend as K

In [None]:
# Load the MNIST dataset through Keras as (images, labels) pairs for the
# training split and the test split.
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

In [None]:
# Dataset constants: ten target classes, 28x28 images with a single channel.
num_classes = 10
input_shape = (28, 28, 1)

# Convert pixels to float32 in [0, 1] and append a trailing channel axis so
# every image has shape (28, 28, 1).
x_train = np.expand_dims(x_train.astype("float32") / 255, -1)
x_test = np.expand_dims(x_test.astype("float32") / 255, -1)
print("x_train shape:", x_train.shape)
print(x_train.shape[0], "train samples")
print(x_test.shape[0], "test samples")

# Turn the integer class labels into binary (one-hot) class matrices.
y_train, y_test = (
    keras.utils.to_categorical(labels, num_classes)
    for labels in (y_train, y_test)
)

In [None]:
# Small convnet assembled layer by layer: two convolution + max-pooling
# stages, then flatten, dropout and a softmax output over `num_classes`.
model = keras.Sequential()
model.add(keras.Input(shape=input_shape))
model.add(layers.Conv2D(32, kernel_size=(3, 3), activation="relu"))
model.add(layers.MaxPooling2D(pool_size=(2, 2)))
model.add(layers.Conv2D(64, kernel_size=(3, 3), activation="relu"))
model.add(layers.MaxPooling2D(pool_size=(2, 2)))
model.add(layers.Flatten())
model.add(layers.Dropout(0.5))
model.add(layers.Dense(num_classes, activation="softmax"))

model.summary()

In [None]:
# Hyper-parameters for the baseline run that uses the built-in loss.
batch_size, epochs = 128, 15

model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

# Train while holding out 10% of the training data for validation.
model.fit(
    x_train,
    y_train,
    validation_split=0.1,
    epochs=epochs,
    batch_size=batch_size,
)

In [2]:
def custom_catcross(y_true, y_pred):
    """Categorical cross-entropy between target and predicted distributions.

    Computes ``mean_over_samples(-sum_over_classes(y_true * log(y_pred)))``.

    Args:
        y_true: One-hot (or soft) targets; the last axis indexes the classes.
        y_pred: Predicted class probabilities, same shape as ``y_true``.

    Returns:
        Scalar tensor with the loss averaged over all samples.
    """
    y_pred = tf.convert_to_tensor(y_pred)
    y_true = tf.cast(y_true, y_pred.dtype)
    # Keep probabilities strictly inside (0, 1): log(0) is -inf and
    # 0 * -inf is NaN, which would poison the loss (and every gradient)
    # as soon as the softmax saturates on a single sample.
    eps = tf.keras.backend.epsilon()
    y_pred = tf.clip_by_value(y_pred, eps, 1.0 - eps)
    # Sum over the class axis (the last one) so the function is not tied to
    # rank-2 inputs, then average the per-sample losses.
    per_sample = -tf.math.reduce_sum(y_true * tf.math.log(y_pred), axis=-1)
    return tf.math.reduce_mean(per_sample)

In [None]:
# Same training setup as before, but with the hand-written cross-entropy
# passed as a callable instead of the built-in loss name.
batch_size, epochs = 128, 15

model.compile(
    optimizer="adam",
    loss=custom_catcross,
    metrics=["accuracy"],
)

model.fit(
    x_train,
    y_train,
    validation_split=0.1,
    epochs=epochs,
    batch_size=batch_size,
)

In [3]:
from tensorflow.keras.layers import Convolution2D

In [None]:
# NOTE(review): called with no arguments; the Keras Conv2D/Convolution2D
# constructor requires `filters` and `kernel_size`, so this cell presumably
# raises a TypeError. Looks like a leftover signature lookup — confirm
# before keeping it in the notebook.
Convolution2D()

In [9]:
# Reference example: two samples over three classes, with one-hot targets
# (y_true) and predicted probabilities (y_pred).
y_true = [[0., 1., 0.], [0., 0., 1.]]
y_pred = [[0.04, 0.95, 0.01], [0.1, 0.8, 0.1]]
# Using 'auto'/'sum_over_batch_size' reduction type.
cce = tf.keras.losses.CategoricalCrossentropy()
# Value of the built-in loss on the example, shown as a NumPy scalar.
cce(y_true, y_pred).numpy()

1.1769392

In [10]:
custom_catcross(y_true, y_pred).numpy()

1.1769392

In [None]:
# custom_catcross ends in reduce_mean without an axis, i.e. it already
# returns a scalar, so this reduce_sum leaves the value unchanged.
tf.math.reduce_sum(
    custom_catcross(y_true, y_pred)
)

In [None]:
tf.math.reduce_sum(y_true, axis=1)

In [5]:
def custom_loss_function(reward, action_prob):
    """Reward-weighted negative log-probability, averaged over all elements.

    The signature mirrors a Keras loss ``(y_true, y_pred)``: the first
    argument carries the per-action rewards and the second the predicted
    action probabilities.

    Args:
        reward: Reward weights, broadcastable against ``action_prob``.
        action_prob: Predicted action probabilities.

    Returns:
        Scalar tensor: ``mean(-log(action_prob) * reward)`` taken over every
        element (samples and actions alike).
    """
    # Floor the probabilities at epsilon: log(0) is -inf, and -inf * 0 (the
    # reward of an action that was not taken) is NaN, which would turn the
    # whole mean into NaN.
    safe_prob = K.maximum(action_prob, K.epsilon())
    return K.mean(-K.log(safe_prob) * reward)

In [11]:
# Same example inputs as the cross-entropy cells, but custom_loss_function
# averages over all 2x3 elements instead of summing over the classes first,
# so the value comes out smaller than the categorical cross-entropy.
custom_loss_function(y_true, y_pred).numpy()

0.3923131

In [16]:
-(y_true * tf.math.log(y_pred)).numpy()

array([[0.        , 0.05129331, 0.        ],
       [0.        , 0.        , 2.3025851 ]], dtype=float32)

In [23]:
- K.log(y_pred) * y_true

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[0.        , 0.05129331, 0.        ],
       [0.        , 0.        , 2.3025851 ]], dtype=float32)>

In [5]:
import math

math.log(0.99) * 0.5

-0.005025167926750725

In [3]:
math.log(0.01) * 15

-69.07755278982137

In [8]:
def format_rewards(action_history, reward_history, action_space=2):
    """Place each step's reward in the column of the action that was taken.

    Args:
        action_history: 1-D sequence of integer action indices, one per step.
        reward_history: 1-D sequence of rewards (or advantages), one per step.
            Plain lists and NumPy arrays are both accepted.
        action_space: Number of discrete actions, i.e. the output width.

    Returns:
        Array of shape ``(steps, action_space)``. Row ``i`` is zero except in
        column ``action_history[i]``, which holds ``reward_history[i]``.

    Raises:
        IndexError: If an action index does not fit in ``action_space``.
        ValueError: If the two histories cannot be broadcast together
            (different, non-unit lengths).
    """
    actions = np.asarray(action_history, dtype=int).ravel()
    rewards = np.asarray(reward_history).ravel()

    # Build the one-hot mask with NumPy alone (float32, the dtype Keras'
    # to_categorical returns by default) so the helper needs no Keras import
    # and does not depend on a later notebook cell having been run.
    steps = actions.shape[0]
    one_hot = np.zeros((steps, action_space), dtype=np.float32)
    one_hot[np.arange(steps), actions] = 1.0

    # Broadcasting the rewards as a column scales every row of the mask by
    # that step's reward.
    return one_hot * rewards[:, np.newaxis]

def compute_discounted_reward(reward_history, discount_rate=0.99):
    """Return the standardised discounted return for every step.

    The return at step ``t`` is ``r_t + discount_rate * return_{t+1}``,
    accumulated from the last step backwards. The returns are then centred
    on their mean and divided by their standard deviation (a simple
    baseline).

    Args:
        reward_history: 1-D sequence of per-step rewards.
        discount_rate: Discount factor applied per step.

    Returns:
        float64 array with one standardised return per step. An empty input
        yields an empty array; when all returns are identical (for example a
        single-step episode) the result is all zeros instead of NaN.
    """
    rewards = np.asarray(reward_history, dtype=np.float64)
    returns = np.zeros_like(rewards)

    # Fill a preallocated buffer back to front; the previous list.insert(0, …)
    # shifted the whole list on every step.
    running = 0.0
    for t in range(len(rewards) - 1, -1, -1):
        running = rewards[t] + discount_rate * running
        returns[t] = running

    if returns.size == 0:
        return returns

    # use simple Baseline
    centred = returns - np.mean(returns)
    spread = np.std(returns)
    # A zero spread would make the division produce NaN/inf; in that case the
    # centred values are already all zero, so return them as they are.
    if spread > 0:
        centred = centred / spread
    return centred

In [6]:
# Toy 12-step episode: the actions alternate 0, 1, 0, 1, ... and every step
# earns a reward of 1.
ah = np.tile([0, 1], 6)
rh = np.ones(12, dtype=int)

In [19]:
import tensorflow.keras.utils as utils
# Standardise the discounted returns of the toy episode, then spread them
# into one column per action: each row keeps its value only in the column
# of the action taken at that step.
format_rewards(ah, compute_discounted_reward(rh))

array([[ 1.56662532,  0.        ],
       [ 0.        ,  1.29126974],
       [ 1.01313279,  0.        ],
       [ 0.        ,  0.73218638],
       [ 0.44840212,  0.        ],
       [ 0.        ,  0.16175136],
       [-0.12779487, -0.        ],
       [-0.        , -0.4202658 ],
       [-0.71569099, -0.        ],
       [-0.        , -1.01410027],
       [-1.31552378, -0.        ],
       [-0.        , -1.61999198]])

In [16]:
def discount_rewards(rewards, gamma=0.99):
    """Discounted returns that restart after every non-zero reward.

    Walking backwards through ``rewards``, the running return is multiplied
    by ``gamma`` and the current reward is added. Whenever the current reward
    is non-zero the running return is reset to zero first, so no credit flows
    backwards across such a step (presumably a game/point boundary — confirm
    for your environment).

    Args:
        rewards: 1-D sequence of per-step rewards (list or NumPy array).
        gamma: Discount factor applied per step. Defaults to 0.99, the value
            that used to be hard-coded.

    Returns:
        float64 array of the same length as ``rewards``.
    """
    rewards = np.asarray(rewards)
    # Always accumulate in float: np.zeros_like on an integer reward array
    # yields an integer buffer, which silently truncates every discounted
    # value (e.g. 0.99 would be stored as 0).
    discounted_rewards = np.zeros(rewards.shape, dtype=np.float64)
    running_add = 0.0
    for t in reversed(range(len(rewards))):
        if rewards[t] != 0:
            # Non-zero reward: start a fresh accumulation from this step.
            running_add = 0.0
        running_add = running_add * gamma + rewards[t]
        discounted_rewards[t] = running_add
    return discounted_rewards

discount_rewards(rh)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [17]:
# One row per step and one column per action (two actions here): each row is
# zero except in the column of the action taken, which receives that step's
# discounted reward.
discounted_rewards = discount_rewards(rh)
advantages = np.zeros((len(ah), 2))
advantages[np.arange(len(ah)), ah] = discounted_rewards

advantages

array([[1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.]])

In [None]:
# NOTE(review): this cell reads `self` and `episode_length`, neither of which
# is defined in the visible notebook. It looks like a fragment pasted from an
# agent class's training method and will presumably raise NameError if run
# as-is — confirm before keeping it.
#
# Buffer holding one state per step of the episode.
update_inputs = np.zeros(((episode_length,) + self.state_size)) # Episode_lengthx64x64x4
# Similar to one-hot target but the "1" is replaced by discounted_rewards R_t
advantages = np.zeros((episode_length, self.action_size))

# Episode length is like the minibatch size in DQN
for i in range(episode_length):
    # Copy the i-th stored state into the batch and write that step's
    # discounted reward into the column of the action taken.
    update_inputs[i,:,:,:] = self.states[i]
    advantages[i][self.actions[i]] = discounted_rewards[i]

In [1]:
import tensorflow as tf
# Report how many physical GPU devices TensorFlow lists on this machine.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0
