In [319]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.optimizers import Adam
import numpy as np

In [321]:
class Policy:
    """Small feed-forward network with a 25-feature input and a 5-way softmax output."""

    def __init__(self):
        """Build the three-layer network and compile it with an Adam optimizer."""
        # Learning rate handed to Adam below.
        self.alpha = 0.01
        layer_stack = [
            keras.layers.Dense(25, activation="relu", input_shape=[25]),
            keras.layers.Dense(10, activation="relu"),
            keras.layers.Dense(5, activation="softmax"),
        ]
        self.policy = keras.models.Sequential(layer_stack)
        # No loss is passed to compile(); only the optimizer is configured here.
        optimizer = Adam(learning_rate=self.alpha)
        self.policy.compile(optimizer=optimizer)

    def __call__(self, state):
        """Run the network on `state` and return its softmax output."""
        return self.policy(state)
    
def get_obs(array, pos, radius=2):
    """Return the square window of `array` centred on `pos`, wrapping at the edges.

    Args:
        array: 2-D grid (nested lists or a NumPy array) indexed as array[row][col].
        pos: (row, col) of the window centre.
        radius: number of cells taken on each side of the centre; the window is
            (2 * radius + 1) cells wide. The default of 2 yields a 5x5 window.

    Returns:
        A list of rows (each a list) holding the window's cell values.
    """
    i, j = pos
    n_rows = len(array)
    span = range(-radius, radius + 1)
    window = []
    for di in span:
        source_row = array[(i + di) % n_rows]
        # Wrap columns with the row's own length so non-square grids work;
        # wrapping with the row count would read the wrong columns there.
        n_cols = len(source_row)
        window.append([source_row[(j + dj) % n_cols] for dj in span])
    return window

def apply_action(array, pos, action):
    """Swap the cell at `pos` with one of its four neighbours, in place.

    The grid is treated as a torus: moving off one edge wraps to the opposite one.

    Args:
        array: 2-D grid (nested lists or a NumPy array) indexed as array[row][col].
        pos: (row, col) of the cell to move.
        action: 0 = swap with the right neighbour, 1 = left, 2 = up, 3 = down.
            Any other value leaves the grid untouched.

    Returns:
        None. `array` is modified in place.
    """
    i, j = pos
    if action == 0:
        ti, tj = i, (j + 1) % len(array[i])
    elif action == 1:
        ti, tj = i, (j - 1) % len(array[i])
    elif action == 2:
        ti, tj = (i - 1) % len(array), j
    elif action == 3:
        # Wrap the target row as well; an unwrapped i + 1 runs off the last row.
        ti, tj = (i + 1) % len(array), j
    else:
        return
    array[i][j], array[ti][tj] = array[ti][tj], array[i][j]

In [322]:
import gym
import random

In [323]:
# Cell values that the random grids below are sampled from.
types = [0, 1, 2]

In [316]:
# Build a random 5x5 nested list of cell values, then force the centre cell to 1.
obs = [[random.choice(types) for i in range(5)] for j in range(5)]
obs[2][2] = 1

In [317]:
# Random 10x10 grid of cell values, stored as a NumPy array.
array = np.array([[random.choice(types) for i in range(10)] for j in range(10)])

In [318]:
array

array([[2, 0, 2, 0, 1, 1, 0, 1, 2, 2],
       [2, 1, 2, 0, 0, 2, 0, 0, 0, 2],
       [1, 1, 0, 2, 0, 0, 0, 0, 0, 1],
       [2, 0, 2, 2, 0, 0, 0, 1, 1, 1],
       [2, 0, 0, 2, 1, 1, 0, 1, 2, 0],
       [1, 2, 2, 2, 0, 0, 2, 1, 0, 1],
       [2, 1, 1, 2, 2, 1, 0, 2, 2, 2],
       [0, 2, 1, 2, 1, 1, 2, 2, 0, 1],
       [0, 2, 2, 1, 1, 1, 1, 2, 0, 1],
       [2, 1, 2, 0, 2, 0, 2, 2, 1, 0]])

## Create Environment

In [324]:
from gym import Env
from gym.spaces import Discrete, Box
import random

In [353]:
class PPEnv(Env):
    """Toroidal L x L grid whose cells each hold one of three values (0, 1, 2).

    An agent sitting on a cell can swap places with one of its four direct
    neighbours (actions 0-3) or stay put (action 4). It observes the 5x5
    window centred on itself, and the reward is the number of neighbours
    whose value is 2.
    """

    def __init__(self, L=10, time_limit=1000):
        """Create a random grid.

        Args:
            L: side length of the square grid.
            time_limit: number of steps after which `step` reports `done`.
        """
        self.L = L
        self.action_space = Discrete(5)
        # One Discrete(3) entry per cell of the 5x5 observation window.
        self.observation_space = [Discrete(3)] * 25
        self.types = Discrete(3)
        self.time_limit = time_limit
        self.time = 0
        self.array = self._random_grid()

    def _random_grid(self):
        """Return a fresh L x L array with every cell sampled from `self.types`."""
        return np.array(
            [[self.types.sample() for _ in range(self.L)] for _ in range(self.L)]
        )

    def _neighbour_cells(self, pos):
        """Return the wrapped (row, col) of the right, left, up and down neighbours."""
        i, j = pos
        n = self.L
        return [
            (i, (j + 1) % n),
            (i, (j - 1) % n),
            ((i - 1) % n, j),
            ((i + 1) % n, j),
        ]

    def get_obs(self, array, pos):
        """Return the 5x5 window of `array` centred on `pos`, wrapping at the edges.

        Args:
            array: grid to read from, indexed as array[row][col].
            pos: (row, col) of the window centre.

        Returns:
            A list of five rows, each a list of five cell values.
        """
        i, j = pos
        n = self.L
        offsets = range(-2, 3)
        return [[array[(i + di) % n][(j + dj) % n] for dj in offsets] for di in offsets]

    # Jump: swap the agent with a neighbouring cell.
    def apply_action(self, pos, action):
        """Swap the cell at `pos` with the neighbour selected by `action`.

        Args:
            pos: (row, col) of the cell to move.
            action: 0 = right, 1 = left, 2 = up, 3 = down; any other value
                (including 4) leaves the grid unchanged.

        Returns:
            The (row, col) where the moved cell ends up; `pos` itself when the
            action does not move it.
        """
        if action not in (0, 1, 2, 3):
            return pos
        i, j = pos
        ti, tj = self._neighbour_cells(pos)[int(action)]
        self.array[i][j], self.array[ti][tj] = self.array[ti][tj], self.array[i][j]
        return (ti, tj)

    def get_neighboors(self, pos):
        """Return the values of the right, left, up and down neighbours of `pos`."""
        return [self.array[r][c] for r, c in self._neighbour_cells(pos)]

    def step(self, action, pos):
        """Apply `action` to the agent at `pos` and advance the clock by one.

        Args:
            action: action index, see `apply_action`.
            pos: (row, col) of the agent before the move.

        Returns:
            (obs, reward, done, info) where
            obs is the 5x5 window around the agent's new position,
            reward is the number of its four neighbours equal to 2,
            done is True once the time limit is reached or all four
            neighbours equal 2, and
            info["pos"] is the agent's position after the move.
        """
        new_pos = self.apply_action(pos, action)
        # Without advancing the clock the time-limit check could never fire.
        self.time += 1
        ngbs = self.get_neighboors(new_pos)
        obs = self.get_obs(self.array, new_pos)
        reward = ngbs.count(2)
        done = self.time >= self.time_limit or reward == 4
        info = {"pos": new_pos}
        return obs, reward, done, info

    def render(self):
        """Rendering is not implemented."""
        pass

    def reset(self):
        """Start a new episode: zero the clock and draw a fresh random grid.

        Returns None; the environment stores no agent position, so there is
        no window to observe until the caller supplies one to `get_obs`/`step`.
        """
        self.time = 0
        self.array = self._random_grid()

In [354]:
# Instantiate the environment with its defaults (L=10, time_limit=1000).
env = PPEnv()

In [355]:
env.observation_space

[Discrete(3),
 Discrete(3),
 Discrete(3),
 Discrete(3),
 Discrete(3),
 Discrete(3),
 Discrete(3),
 Discrete(3),
 Discrete(3),
 Discrete(3),
 Discrete(3),
 Discrete(3),
 Discrete(3),
 Discrete(3),
 Discrete(3),
 Discrete(3),
 Discrete(3),
 Discrete(3),
 Discrete(3),
 Discrete(3),
 Discrete(3),
 Discrete(3),
 Discrete(3),
 Discrete(3),
 Discrete(3)]

In [356]:
Discrete(3).sample()

1

In [357]:
env.array

array([[0, 1, 2, 1, 0, 2, 2, 1, 0, 2],
       [1, 0, 1, 2, 0, 1, 0, 0, 0, 2],
       [2, 2, 2, 2, 0, 0, 1, 2, 0, 0],
       [2, 2, 0, 2, 0, 1, 0, 0, 1, 0],
       [1, 2, 2, 0, 1, 1, 2, 0, 0, 2],
       [0, 1, 1, 1, 0, 0, 0, 0, 1, 0],
       [1, 2, 0, 0, 0, 0, 1, 2, 1, 1],
       [0, 2, 1, 2, 0, 2, 1, 2, 0, 1],
       [2, 0, 2, 2, 2, 1, 1, 0, 2, 1],
       [2, 0, 2, 2, 0, 1, 1, 2, 0, 2]])

In [358]:
# Scratch list for trying out list.count, which PPEnv.step uses to compute the reward.
lista = [1, 2, 1]


In [359]:
lista.count(1)

2