In [1]:
import random
import copy
from collections import namedtuple
from dataclasses import dataclass
import datetime
import typing
import functools
from pprint import pprint

import jax
import jax.numpy as jnp
from jax import grad, value_and_grad, jit, vmap
from jax.experimental import optimizers
from jax.experimental import stax
import optax
import haiku as hk
from jax.tree_util import tree_flatten

import pyspiel
import open_spiel
import dm_env
import acme
import acme.wrappers
import acme.jax.utils
import acme.jax.variable_utils
from acme.agents import agent as acme_agent
from acme.agents import replay as acme_replay
from acme.environment_loops.open_spiel_environment_loop import OpenSpielEnvironmentLoop
from acme.wrappers.open_spiel_wrapper import OpenSpielWrapper

from tqdm.notebook import tqdm
import numpy as np
import trueskill

import moozi as mz

In [2]:
seed = 0
key = jax.random.PRNGKey(seed)

In [3]:
raw_env = open_spiel.python.rl_environment.Environment('catch(columns=7,rows=5)')
env = acme.wrappers.open_spiel_wrapper.OpenSpielWrapper(raw_env)
env = acme.wrappers.SinglePrecisionWrapper(env)
env_spec = acme.specs.make_environment_spec(env)
max_game_length = env.environment.environment.game.max_game_length()
dim_action = env_spec.actions.num_values
dim_image = env_spec.observations.observation.shape
dim_repr = 3
print(env_spec)
# mz.utils.print_traj_in_env(env)

EnvironmentSpec(observations=OLT(observation=Array(shape=(35,), dtype=dtype('float32'), name=None), legal_actions=Array(shape=(3,), dtype=dtype('float32'), name=None), terminal=Array(shape=(1,), dtype=dtype('float32'), name=None)), actions=DiscreteArray(shape=(), dtype=int32, name=None, minimum=0, maximum=2, num_values=3), rewards=BoundedArray(shape=(), dtype=dtype('float32'), name=None, minimum=-1.0, maximum=1.0), discounts=BoundedArray(shape=(), dtype=dtype('float32'), name=None, minimum=0.0, maximum=1.0))


In [4]:
env_spec.actions.num_values

3

In [5]:
nn_spec = mz.nn.NeuralNetworkSpec(
    dim_image=dim_image,
    dim_repr=dim_repr,
    dim_action=dim_action
)
print(nn_spec)
network = mz.nn.get_network(nn_spec)
learning_rate = 1e-4
optimizer = optax.adam(learning_rate)

NeuralNetworkSpec(dim_image=(35,), dim_repr=3, dim_action=3, repr_net_sizes=(16, 16), pred_net_sizes=(16, 16), dyna_net_sizes=(16, 16))


In [6]:
batch_size = 16
n_steps=5
reverb_replay = acme_replay.make_reverb_prioritized_nstep_replay(
    env_spec, batch_size=batch_size, n_step=n_steps)

In [7]:
learner = mz.learner.MooZiLearner(
    network=network,
    loss_fn=mz.loss.initial_inference_value_loss,
    optimizer=optimizer,
    data_iterator=reverb_replay.data_iterator,
    random_key=jax.random.PRNGKey(996),
)

In [8]:
key, new_key = jax.random.split(key)
variable_client = acme.jax.variable_utils.VariableClient(learner, None)

In [10]:
key, new_key = jax.random.split(key)
actor = mz.actor.PriorPolicyActor(
    environment_spec=env_spec,
    network=network,
    adder=reverb_replay.adder,
    variable_client=variable_client,
    random_key=new_key,
    epsilon=0.1,
    temperature=1
)

In [11]:
agent = acme_agent.Agent(
    actor=actor,
    learner=learner,
    min_observations=100,
    observations_per_step=1
)

In [12]:
loop = OpenSpielEnvironmentLoop(environment=env, actors=[agent])
loop.run_episode()

AssertionError: 

In [10]:
mode = 'test'

In [14]:
# if mode == 'test':
random_actor = mz.actor.RandomActor(reverb_replay.adder)
loop = OpenSpielEnvironmentLoop(environment=env, actors=[random_actor])
loop.run_episode()

ValueError: Field ('observation', 'observation') has already been set in the active step by previous (partial) append call and thus must be omitted or set to None but got: [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]

In [None]:
# num_episodes = 1000
# result = loop.run(num_episodes=num_episodes)

In [None]:
olt = next(reverb_replay.data_iterator).data.observation

In [33]:
olt.legal_actions

array([[1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.]], dtype=float32)

In [30]:
actor.select_action(olt)

AssertionError: Function _policy_fn is traced > 1 times!

In [26]:
jax.api??

In [22]:
jax.tree_map(lambda x: x + jax.random.normal(key), learner._state.params)

FlatMapping({
  '__neural_network_haiku/~dyna_net/dyna_reward': FlatMapping({
                                                    'b': DeviceArray([-0.48762512], dtype=float32),
                                                    'w': DeviceArray([[-0.52710617],
                                                                      [-0.2306205 ],
                                                                      [-0.81166697],
                                                                      [-0.35118422],
                                                                      [-0.45461407],
                                                                      [-0.5575229 ],
                                                                      [-0.86381185],
                                                                      [-0.12772408],
                                                                      [-0.3639292 ],
                                                         