In [111]:
import numpy as np
import random

In [104]:
class Gridworld:

    def __init__(self, shape=(10,10), num_negative_tiles=0, starting_point=(1,1),goal=(8,8),num_walls=20):


        self.size = shape
        self.num_negative_tiles = num_negative_tiles
        self.current_state = np.array(starting_point)
        self.grid = np.zeros(shape, dtype=np.int64)
        self.stochastic_transistions = np.zeros(shape=(shape[0]-1,shape[1]-1), dtype=np.float32)
        self.goal = np.array(goal)
        self.acc_reward = 0 

        # set values for entries in gridworld (rewards, goal, penalties) non-doable tiles should yield -100
        self.grid[goal[0]][goal[1]] = 100


        # setting borders
        self.grid[0,:] = -100
        self.grid[shape[1]-1,:] = -100
        self.grid[:,0] = -100
        self.grid[:,shape[1]-1] = -100



        # implementing random negative tiles 

        
        # check if indexes are unique
        unique_wall_indexes = False
        valid_grid = False
        
        while not unique_wall_indexes:
            random_walls = np.random.randint(1,shape[0]-1,size=(num_walls,2))
            random_walls_unique = np.unique(random_walls,axis=0)
            # check that the indeces neither describe the starting point nor the goal state or are doubled
            if len(random_walls) == len(random_walls_unique) and not goal in random_walls.tolist() and not starting_point in random_walls.tolist():
                unique_wall_indexes = True
        # apply for all unique indexes 
        for r in random_walls:
            self.grid[r[0],r[1]] = -100
        
        valid_grid = self.grid_is_valid()
        
          
    # depth first search to find out if the generated grid world is valid i.e there is a path from start to goal
    def grid_is_valid(self):
        visited = []
        visited = self.dfs(visited, self.current_state)     
        return np.any(np.all(self.goal == visited, axis=1))

    def dfs(self, visited, node):
        visited.append(node)
        neighbors = self.get_neighbors(node)
        for neighbor in neighbors:
            if not np.any(np.all(neighbor == np.array(visited), axis=1)):
            # neighbor is not in visited
                if self.grid[neighbor[0]][neighbor[1]] != -100:
                # neighbor is not a wall
                    self.dfs(visited, neighbor)
        return visited
            
    def get_neighbors(self, node):
        edges = [(1,0), (0,1), (-1,0), (0,-1)]
        neighbors = []
        for edge in edges:
            neighbor = (node[0] + edge[0], node[1] + edge[1])
            neighbors.append(np.array(neighbor))
        return neighbors
        
        
    # reset the actor to starting state
    def reset(self):
        self.current_state = (1,1)
        self.acc_reward = 0



    def step(self,action):
        '''
        Args:
        action(): 0: right, 1: left, 2, up, 3, down
        throws error if move is invalid due to wall
        '''
        
        # anders ugly mit if elif statements, switch erst ab python 3.10
        # right
        if action == 0:
            # get current state 
            y, x = self.current_state
            # check that current state is accessable
            new_y,new_x = y, x+1
            if self.grid[new_y,new_x] != -100:
                # update current state and collect rewward
                self.current_state = ((new_y, new_x))
                self.acc_reward += self.grid[new_y,new_x]
            else:
                raise ValueError('Could not move there due to wall.')


        # left step
        elif action == 1:
            # get current state 
            y, x = self.current_state
            # check that current state is accessable
            new_y,new_x = y, x-1
            if self.grid[new_y,new_x] != -100:
                # update current state and collect rewward
                self.current_state = ((new_y, new_x))
                self.acc_reward += self.grid[new_y,new_x]
            else:
                raise ValueError('Could not move there due to wall.')

        # upwards step
        elif action == 2:
            # get current state 
            y, x = self.current_state
            # check that current state is accessable
            new_y,new_x = y-1, x
            if self.grid[new_y,new_x] != -100:
                # update current state and collect rewward
                self.current_state = ((new_y, new_x))
                self.acc_reward += self.grid[new_y,new_x]
            else:
                raise ValueError('Could not move there due to wall.')
        
        # downwards step
        elif action == 3:
            # get current state 
            y, x = self.current_state
            # check that current state is accessable
            new_y,new_x = y+1, x
            if self.grid[new_y,new_x] != -100:
                # update current state and collect rewward
                self.current_state = ((new_y, new_x))
                self.acc_reward += self.grid[new_y,new_x]
            else:
                raise ValueError('Could not move there due to wall.')
        else:
            raise ValueError('Action index out of bounds. Actions-space = (0,1,2,3)')
    
        
    # print the gridworld as an array 
    def visualize(self):
        print(self.grid)






In [149]:
class SarsAgent:
    def __init__(self, grid_world, state, epsilon=0.9, alpha=0.5, gamma=0.95):
        self.learning_rate = alpha
        self.discount_factor = gamma
        self.epsilon = epsilon
        self.current_state = state
        self.grid_world = grid_world
        self.size = tuple(np.append(np.subtract(grid_world.size, (2,2)), np.array(4)))
        print(self.size)
        self.q_table = np.zeros((self.size), dtype=np.float32)
        
    def get_reward(self,state):
        return self.grid_world.grid[state[0]][state[1]]
    
    def get_valid_actions(self, state):
        actions = []
        grid = self.grid_world
        neighbors = grid.get_neighbors(state)
        for idx, neighbor in enumerate(neighbors):
            if grid.grid[neighbor[0]][neighbor[1]] != -100:
                actions.append(idx)
        return actions
      
        
    def choose_action(self, state):
        # get a list of all actions we can do for the next step
        actions = self.get_valid_actions(state)
        
        # choose random action
        if np.random.uniform(0,1) < self.epsilon:
            action = np.random.choice(actions)
        # choose highest q-value action
        else:
            valid_states = []
            for action in actions:
                valid_states.append(q_table[state, action])     
            action = np.argmax(valid_states)
        return action
            
    def learn(self, n_steps):
        self.q_table[self.current_state, action] = self.q_table[self.current_state, action] + self.learning_rate*((reward+self.discount_factor*self.q_table[next_state, next_action])-self.q_table[self.current_state,action])
        Q = self.q_table[self.current_state, action]
        
        reward = self.get_reward(state)
        
        
        target = reward + self.discount_factor**n * q_table(state, action)
        
        reward + self.discount_factor**n * 
        
        state = self.current_state
        for n in range(n_steps):
            reward = self.get_reward(state) * (self.discount_factor** n)
            rewards.append(reward)
            action = choose_action(state)
            state = do_fake_step(action)
        final_q = self.q_table[state, action] * (self.discount_factor** n_steps)
        Q = rewards + final_q - Q
        
    def fake_step(self, action):
        
        return state
            
            
        
    

In [148]:
grid = Gridworld()
#grid.step(0)
valid_states = grid.grid_is_valid()
grid.visualize()
#print(grid.stochastic_transistions)
agent = SarsAgent(grid)
print(f'q_table:\n {agent.q_table}')

[[-100 -100 -100 -100 -100 -100 -100 -100 -100 -100]
 [-100 -100    0 -100    0    0    0 -100    0 -100]
 [-100    0    0    0    0    0 -100 -100 -100 -100]
 [-100    0 -100 -100    0 -100    0 -100    0 -100]
 [-100    0 -100    0    0    0    0    0    0 -100]
 [-100 -100    0    0    0    0    0    0    0 -100]
 [-100 -100    0    0 -100    0 -100 -100 -100 -100]
 [-100    0    0    0    0    0 -100    0 -100 -100]
 [-100    0    0 -100    0    0    0    0  100 -100]
 [-100 -100 -100 -100 -100 -100 -100 -100 -100 -100]]
(8, 8, 4)
q_table:
 [[[0. 0. 0. 0.]
  [0. 0. 0. 0.]
  [0. 0. 0. 0.]
  [0. 0. 0. 0.]
  [0. 0. 0. 0.]
  [0. 0. 0. 0.]
  [0. 0. 0. 0.]
  [0. 0. 0. 0.]]

 [[0. 0. 0. 0.]
  [0. 0. 0. 0.]
  [0. 0. 0. 0.]
  [0. 0. 0. 0.]
  [0. 0. 0. 0.]
  [0. 0. 0. 0.]
  [0. 0. 0. 0.]
  [0. 0. 0. 0.]]

 [[0. 0. 0. 0.]
  [0. 0. 0. 0.]
  [0. 0. 0. 0.]
  [0. 0. 0. 0.]
  [0. 0. 0. 0.]
  [0. 0. 0. 0.]
  [0. 0. 0. 0.]
  [0. 0. 0. 0.]]

 [[0. 0. 0. 0.]
  [0. 0. 0. 0.]
  [0. 0. 0. 0.]
  [0. 0. 0.

In [106]:
valid_states

False

In [None]:
import tensorflow as tf

In [127]:
x = (2,3)
y = np.array((2,3))
x == y
x = tuple(2,3)
x


TypeError: tuple expected at most 1 arguments, got 2