<a href="https://colab.research.google.com/github/verma7/AdventOfCode2021/blob/master/RL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# RL

In [1]:
!pip install tf-agents[reverb]
!pip install tf-keras

Collecting tf-agents[reverb]
  Downloading tf_agents-0.19.0-py3-none-any.whl.metadata (12 kB)
Collecting gym<=0.23.0,>=0.17.0 (from tf-agents[reverb])
  Downloading gym-0.23.0.tar.gz (624 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m624.4/624.4 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting typing-extensions==4.5.0 (from tf-agents[reverb])
  Downloading typing_extensions-4.5.0-py3-none-any.whl.metadata (8.5 kB)
Collecting pygame==2.1.3 (from tf-agents[reverb])
  Downloading pygame-2.1.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.3 kB)
Collecting tensorflow-probability~=0.23.0 (from tf-agents[reverb])
  Downloading tensorflow_probability-0.23.0-py2.py3-none-any.whl.metadata (13 kB)
Collecting rlds (from tf-agents[reverb])
  Downloading rlds-0.1.8-py3-n

In [2]:
import os
# Keep using keras-2 (tf-keras) rather than keras-3 (keras).
os.environ['TF_USE_LEGACY_KERAS'] = '1'

In [6]:
!pip install galois

Collecting galois
  Downloading galois-0.4.4-py3-none-any.whl.metadata (14 kB)
Downloading galois-0.4.4-py3-none-any.whl (4.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/4.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/4.2 MB[0m [31m46.3 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m4.2/4.2 MB[0m [31m75.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m50.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: galois
Successfully installed galois-0.4.4


In [7]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import abc
import tensorflow as tf
import numpy as np
import random
import galois

from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.environments import wrappers
from tf_agents.environments import suite_gym
from tf_agents.trajectories import time_step as ts

In [10]:
def get_num_ones(w):
    "Returns a vector containing the number of ones in the bit representation in each element of the GF(2^w) field."
    N = 2 ** w
    GF = galois.GF(N)
    num_ones = np.zeros(N)
    for i in range(N):
        for j in range(w):
            v = np.multiply(GF(i), GF(2 ** j))
            num_ones[i] += bin(v).count("1")
    return num_ones

def count_ones(GC, n, m, ones):
    sum_ones = 0
    for i in range(n):
        for j in range(m):
            sum_ones += ones[int(GC[i][j])]
    return sum_ones

In [62]:
class CauchyGameEnv(py_environment.PyEnvironment):

  def __init__(self):
    self._W = 3
    self._N = 2 ** self._W
    self._m = 3
    self._GF = galois.GF(self._N)
    self._ONES = get_num_ones(self._W)

    self._action_spec = array_spec.BoundedArraySpec(
        shape=(), dtype=np.int8, minimum=0, maximum=2, name='action')
    self._observation_spec = array_spec.BoundedArraySpec(
        shape=(3, self._N, ), dtype=np.int8, minimum=np.full((3, self._N), 0), maximum=np.full((3, self._N), 1), name='observation')
    self._initialize()

  def _good_cauchy_matrix(self):
    GC = np.zeros((self._m, self._m))
    x = 0
    y = 0
    for i in range(self._N):
       if self._state[0][i] == 1:
          y = 0
          for j in range(self._N):
              if self._state[1][j] == 1:
                sum = self._GF(i) + self._GF(j)
                if sum == 0:
                    continue
                GC[x][y] = sum ** -1
                y += 1
          x += 1
    return GC

  def _initialize(self):
    self._vec = [ i for i in range(self._N)]
    random.shuffle(self._vec)
    self._state = np.full((3, self._N), 0)
    for i in self._vec[:self._m]:
        self._state[0][i] = 1
    for i in self._vec[self._m:2*self._m]:
        self._state[1][i] = 1
    for i in self._vec[2*self._m:]:
        self._state[2][i] = 1
    self._GC = self._good_cauchy_matrix()
    self._num_ones = count_ones(self._GC, self._m, self._m, self._ONES)
    self._time_steps = 0

  def action_spec(self):
    return self._action_spec

  def observation_spec(self):
    return self._observation_spec

  def _reset(self):
    self._initialize()
    return ts.restart(self._state)

  def _swap(self, x, y, nx, ny):
    i = random.randint(0, nx - 1)
    j = random.randint(0, ny - 1)

    ri = 0
    for xi in range(self._N):
      if self._state[x][xi] == 1:
        if ri == i:
          break
        ri += 1
    rj = 0
    for xj in range(self._N):
      if self._state[y][xj] == 1:
        if rj == j:
          break
        rj += 1

    assert self._state[x][xi] == 1
    self._state[x][xi] = 0
    assert self._state[y][xi] == 0
    self._state[y][xi] = 1
    assert self._state[y][xj] == 1
    self._state[y][xj] = 0
    assert self._state[x][xj] == 0
    self._state[x][xj] = 1

  def _step(self, action):
    if self._time_steps == 10:
        return self.reset()

    if action == 0:
      self._swap(0, 1, self._m, self._m)
    elif action == 1:
      self._swap(1, 2, self._m, self._N - 2 * self._m)
    elif action == 2:
      self._swap(2, 0, self._N - 2 * self._m, self._m)
    else:
      raise ValueError('`action` should be 0 or 1 or 2.')

    self._GC = self._good_cauchy_matrix()
    new_num_ones = count_ones(self._GC, self._m, self._m, self._ONES)
    reward = new_num_ones - self._num_ones
    self._num_ones = new_num_ones
    self._time_steps = 0

    if self._time_steps == 10:
      return ts.termination(self._state, reward)
    else:
      return ts.transition(self._state, reward=reward, discount=1.0)

In [63]:
environment = CauchyGameEnv()
time_step = environment.reset()
print(time_step)
cumulative_reward = time_step.reward

for _ in range(3):
  time_step = environment.step(0)
  print(time_step)
  cumulative_reward += time_step.reward

time_step = environment.step(1)
print(time_step)
cumulative_reward += time_step.reward
print('Final Reward = ', cumulative_reward)

TimeStep(
{'step_type': array(0, dtype=int32),
 'reward': array(0., dtype=float32),
 'discount': array(1., dtype=float32),
 'observation': array([[1, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 1, 1, 0],
       [0, 0, 1, 1, 0, 0, 0, 0]])})
TimeStep(
{'step_type': array(1, dtype=int32),
 'reward': array(-7., dtype=float32),
 'discount': array(1., dtype=float32),
 'observation': array([[1, 0, 0, 0, 1, 0, 0, 1],
       [0, 1, 0, 0, 0, 1, 1, 0],
       [0, 0, 1, 1, 0, 0, 0, 0]])})
TimeStep(
{'step_type': array(1, dtype=int32),
 'reward': array(0., dtype=float32),
 'discount': array(1., dtype=float32),
 'observation': array([[0, 1, 0, 0, 1, 0, 0, 1],
       [1, 0, 0, 0, 0, 1, 1, 0],
       [0, 0, 1, 1, 0, 0, 0, 0]])})
TimeStep(
{'step_type': array(1, dtype=int32),
 'reward': array(9., dtype=float32),
 'discount': array(1., dtype=float32),
 'observation': array([[0, 0, 0, 0, 1, 0, 1, 1],
       [1, 1, 0, 0, 0, 1, 0, 0],
       [0, 0, 1, 1, 0, 0, 0, 0]])})
TimeStep(
{'step_type': array(1, dt

In [64]:
env = CauchyGameEnv()
utils.validate_py_environment(env, episodes=5)

ValueError: Given `time_step`: TimeStep(
{'step_type': array(0, dtype=int32),
 'reward': array(0., dtype=float32),
 'discount': array(1., dtype=float32),
 'observation': array([[0, 0, 1, 1, 0, 0, 0, 1],
       [1, 1, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1, 1, 0, 0]])}) does not match expected `time_step_spec`: TimeStep(
{'step_type': ArraySpec(shape=(), dtype=dtype('int32'), name='step_type'),
 'reward': ArraySpec(shape=(), dtype=dtype('float32'), name='reward'),
 'discount': BoundedArraySpec(shape=(), dtype=dtype('float32'), name='discount', minimum=0.0, maximum=1.0),
 'observation': BoundedArraySpec(shape=(3, 8), dtype=dtype('int8'), name='observation', minimum=[[0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]], maximum=[[1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 1]])})