In [302]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import MSE
from tensorflow.keras.regularizers import L2

Function Tests

In [303]:
from config import *
from game import *
from main import *
from mcts import *
from self_play import *
from networks import *
from networks_base import *
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### main

In [361]:
network = CartPoleNetwork(
    action_size=2, state_shape=(None, 4), embedding_size=4, max_value=200)
config = get_cartpole_config(50)  # Create Environment
env = gym.make('CartPole-v0')

# Create buffer to store games
replay_buffer = ReplayBuffer(config)
#self_play(env, config, replay_buffer, network)

In [362]:
print(config.__dict__)

{'action_space_size': 2, 'games_per_epoch': 2, 'num_epochs': 50, 'train_per_epoch': 30, 'episodes_per_test': 10, 'visit_softmax_temperature_fn': <function visit_softmax_temperature at 0x000001D7A0E6E3A0>, 'max_moves': 200, 'num_simulations': 50, 'discount': 0.997, 'root_dirichlet_alpha': 0.1, 'root_exploration_fraction': 0.25, 'pb_c_base': 19652, 'pb_c_init': 1.25, 'known_bounds': None, 'buffer_size': 200, 'batch_size': 512, 'num_unroll_steps': 5, 'td_steps': 10, 'lr_init': 0.01}


In [363]:
print(config.root_exploration_fraction, np.random.dirichlet([config.root_dirichlet_alpha]*2))

0.25 [0.5100251 0.4899749]


### Self_play

In [364]:
# config.py
min_max_stats= MinMaxStats(config.known_bounds)

In [365]:
start_state = env.reset()
# Create Game Objects
game = Game(config.action_space_size, config.discount, start_state)
root = Node(0)
game.curr_state

array([-0.00363736, -0.02077954,  0.00602416,  0.04816066], dtype=float32)

### MCTS

In [366]:
root = Node(0)
value = expand_root(root, list(range(config.action_space_size)),
                    network, current_state=game.curr_state)

In [367]:
backpropagate([root], value, config.discount, min_max_stats)

In [368]:
add_exploration_noise(config, root)

In [370]:
def run_mcts(config, root, network, min_max_stats):
    """
    Main loop for MCTS for config.num_simulations simulations

    root: the root node
    network: the network
    min_max_stats: the min max stats object for the simulation

    Hint:
    The MCTS should capture selection, expansion and backpropagation
    """
    for i in range(2):
        history = []
        node = root
        search_path = [node]  #  node object

        while node.expanded:
            action, node = select_child(config, node, min_max_stats)
            history.append(action)
            search_path.append(node)
        print(f"History:{history}, {node.value_sum}")
        parent = search_path[-2]
        action = history[-1]
        value = expand_node(node, list(range(config.action_space_size)),
                            network, parent.hidden_representation, action)
        
        backpropagate(search_path, value,
                      config.discount, min_max_stats)
        print(f"{node.value_sum}")
        ##
run_mcts(config, root, network, min_max_stats)

History:[1], 0
70.53214201288333
History:[1, 1], 0
70.21464903408993


In [371]:
games_played = 0
action = select_action(config, games_played, root, network)
print(action)

Select action| visit counts:[(0, 0), (2, 1)]
1


### self-Play, network

In [372]:
returns = 0
game = play_game(config, network, env, games_played)
replay_buffer.save_game(game)
returns += sum(game.reward_history)
print(returns)

Select action| visit counts:[(5, 0), (45, 1)]
Select action| visit counts:[(7, 0), (43, 1)]
Select action| visit counts:[(4, 0), (46, 1)]
Select action| visit counts:[(27, 0), (23, 1)]
Select action| visit counts:[(4, 0), (46, 1)]
Select action| visit counts:[(4, 0), (46, 1)]
Select action| visit counts:[(4, 0), (46, 1)]
Select action| visit counts:[(4, 0), (46, 1)]
Select action| visit counts:[(4, 0), (46, 1)]
Total reward for game: 9.0
9.0


In [373]:
batch = replay_buffer.sample_batch()
(state_batch, targets_init_batch, targets_recurrent_batch, actions_batch) = batch
h, v, pi_logits = network.initial_model(np.stack(state_batch))
target_value_batch, _, target_policy_batch = zip(*targets_init_batch)
target_value_batch = network._scalar_to_support(
            tf.convert_to_tensor(target_value_batch))
print(v.shape, target_value_batch.shape)

(512, 16) (512, 16)


In [286]:
for actions_batch_, targets_batch_ in zip(actions_batch, targets_recurrent_batch):
    target_value_batch, target_reward_batch, target_policy_batch = zip(
                *targets_batch_)
    print(type(target_value_batch), type(target_reward_batch), type(target_policy_batch))
    break
target_value_batch = tf.convert_to_tensor(target_value_batch)
target_value_batch = network._scalar_to_support(target_value_batch)

target_policy_batch = tf.convert_to_tensor(target_policy_batch)
target_reward_batch = tf.convert_to_tensor(target_reward_batch)
print(len(actions_batch_), len(target_value_batch), len(target_reward_batch), len(target_policy_batch))

# game.state_history

<class 'tuple'> <class 'tuple'> <class 'tuple'>
512 512 512 512


In [282]:
regress_critierion = tf.keras.losses.MeanSquaredError()
ce_criterion = tf.nn.softmax_cross_entropy_with_logits
# loss
v_loss = tf.math.reduce_mean(scale_gradient(ce_criterion(target_value_batch, v), 1/4))
r_loss = regress_critierion(target_reward_batch, r)
pi_loss = tf.math.reduce_mean(ce_criterion(target_policy_batch, pi_logits))
print(v_loss, r_loss, pi_loss)

tf.Tensor(1.1814256, shape=(), dtype=float32) tf.Tensor(0.7892106, shape=(), dtype=float32) tf.Tensor(0.6429982, shape=(), dtype=float32)


In [178]:
#network._conditioned_hidden_state(h[0:1], actions_batch_[0])

In [287]:
h_con = np.apply_along_axis(lambda x: tf.squeeze(network._conditioned_hidden_state(np.expand_dims(x[:-1], 0), x[-1]), 0), 
                    axis=1, arr=tf.concat((h, np.expand_dims(np.array(actions_batch_),1)), axis=1))

In [288]:
h, r, v, pi_logits = network.recurrent_model(h_con)
print(h.shape, r.shape, v.shape, pi_logits.shape)

(512, 4) (512, 1) (512, 16) (512, 2)


In [296]:
train_results = TrainResults()
optimizer = Adam(learning_rate=config.lr_init)
update_weights(config, network, optimizer, replay_buffer.sample_batch(), train_results)

Loss:7.308935642242432


### Self-play full

In [298]:
optimizer = Adam(learning_rate=config.lr_init)
games_played = 0
test_rewards = TestResults()
train_results = TrainResults()
for i in range(1):  # Number of Steps of train/play alternations
    print(f"Epoch Number {i}")
    for _ in range(1):
        game = play_game(config, network, env, games_played)
        replay_buffer.save_game(game)
        returns += sum(game.reward_history)
        games_played += 1
    print("Train score:", returns/1)
    for _ in range(1):
        batch = replay_buffer.sample_batch()
        update_weights(config, network, optimizer, batch, train_results)

Epoch Number 0
Select action| visit counts:[(37, 0), (13, 1)]
Select action| visit counts:[(9, 0), (41, 1)]
Select action| visit counts:[(9, 0), (41, 1)]
Select action| visit counts:[(37, 0), (13, 1)]
Select action| visit counts:[(9, 0), (41, 1)]
Select action| visit counts:[(9, 0), (41, 1)]
Select action| visit counts:[(9, 0), (41, 1)]
Select action| visit counts:[(13, 0), (37, 1)]
Select action| visit counts:[(35, 0), (15, 1)]
Select action| visit counts:[(7, 0), (43, 1)]
Select action| visit counts:[(7, 0), (43, 1)]
Select action| visit counts:[(34, 0), (16, 1)]
Select action| visit counts:[(35, 0), (15, 1)]
Select action| visit counts:[(7, 0), (43, 1)]
Select action| visit counts:[(7, 0), (43, 1)]
Total reward for game: 15.0
Train score: 41.0


IndexError: index 16 is out of bounds for axis 1 with size 16