<a href="https://colab.research.google.com/github/vbipin/Awesome-model-compression-and-acceleration/blob/master/GridSoc_single_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import random
import itertools #for some itertools.product(a,b)
#
#We plan to implement the Gridworld Soccer
#

In [0]:
#Let us have a gridworld
#ref: Chapter 17, Artificial Intelligence a Modern Approach
#ref: CS188 https://inst.eecs.berkeley.edu/~cs188/fa19/
#ref: https://inst.eecs.berkeley.edu/~cs188/fa19/assets/slides/lec8.pdf


#We play Soccer in a grid world.
#In this version we have only one player and one ball.

#Action is a tuple of (ball action, player action)
#both actions are applied simultaneously

class GridSoc :
    """Single-player gridworld soccer: one ball and one player on a grid.

    A state is a tuple (ball_cell, player_cell); each cell is a (row, column)
    tuple. An action is a tuple (ball_action, player_action) of action names;
    both parts are applied in the same step.
    """
    def __init__(self, 
                 rows    = 6, 
                 columns = 6, 
                 walls   = None, 
                 terminals     = None, 
                 transitions   = None,
                 gamma         = 1.0, 
                 living_reward = -0.04,
                 noise         = 0.2
                ) :
        """We dont expect these parameters to change during the agent run

        rows, columns : grid dimensions.
        walls         : collection of blocked (row, column) cells; defaults
                        to no walls.
        terminals     : dict mapping terminal cell -> reward; defaults to
                        {(2,0): +10.0, (3,0): +10.0}.
        transitions   : accepted for compatibility; not used by this class.
        gamma         : stored as self.gamma; not used inside this class.
        living_reward : base per-step reward (normally a small cost).
        noise         : stored as self.noise; transitions() in this version
                        is deterministic and does not read it.
        """
        #None sentinels instead of mutable defaults: a default list/dict in the
        #signature would be shared (and mutated) across all instances.
        if walls is None :
            walls = []
        if terminals is None :
            terminals = {(2,0):+10.0, (3,0):+10.0}
        
        #first the action space
        #Action is a tuple of (ball action, player action)
        #both actions are applied simultaneously

        #Actions #The tuples are just row and column offsets for that action.
        self.soc_player_actions = {
        #Player Movements
        'MoveUp'    : (-1, 0),
        'MoveDown'  : (1,  0),
        'MoveRight' : (0,  1),
        'MoveLeft'  : (0, -1),
        'Stay'      : (0, 0),
        'Goal'      : (0, 0), #dummy action for Exit
        }

        #### Ball movements
        self.soc_ball_actions = {  
        'KickUp'    : (-1, 0),
        'KickDown'  : (1,  0),
        'KickRight' : (0,  1),
        'KickLeft'  : (0, -1),

        'KickUp2'    : (-2, 0),
        'KickDown2'  : (2, 0),
        'KickRight2' : (0, 2),
        'KickLeft2'  : (0, -2),

        'KickRightUp'    : (-1,  1),
        'KickRightDown'  : (1,   1),
        'KickLeftUp'     : (-1,  -1),
        'KickLeftDown'   : (1, -1),

        'KickRightUp2'    : (-2, 2),
        'KickRightDown2'  : (2,  2),
        'KickLeftUp2'     : (-2, -2),
        'KickLeftDown2'   : (2,-2),
        'Stay'            : (0, 0),
        'Goal'            : (0, 0),
        }

        
        self.rows      = rows
        self.columns   = columns
        self.N         = rows * columns #total cells
        self.walls     = walls
        self.terminals = terminals #dictionary of terminal cells and their rewards.
        self.gamma     = gamma
        
        #These are actually costs
        self.living_reward = living_reward
        self.ball_reward   = living_reward *3 #added when the ball action is not 'Stay'
        self.player_reward = living_reward *2 #added when the player action is not 'Stay'
        
        #every (ball action, player action) combination
        self.all_actions   = list(itertools.product(self.soc_ball_actions.keys(), self.soc_player_actions.keys()))
        self.end_state     = ((-1,-1), (-1, -1)) #a dummy state to reach after taking Exit
        self.valid_cells   = [(r,c) for r in range(rows) for c in range(columns) if (r,c) not in walls]
        #same cells as valid_cells, kept as a frozenset for O(1) membership tests in _valid_cell
        self._valid_cell_set = frozenset(self.valid_cells)
        #every (ball cell, player cell) pair plus the dummy end state
        self.all_states    = list(itertools.product(self.valid_cells, self.valid_cells)) + [self.end_state]
        self.noise         = noise
        
        #XXX for debugging info
        self.debug = {}
    
    def actions(self, state) :
        """In Soc all actions are possible, but some actions result in NoOperation.

        Returns an empty list for the end state, otherwise self.all_actions.
        """
        if state == self.end_state :
            return [] #No action available.
        return self.all_actions
    
    #XXX We will see
    def reward(self, state, action, next_state=None) :
        """reward is the instantaneous reward. It is usually R(s,a,s')

        Here it is computed from state and action only; next_state is
        accepted but not read.
        """
        ball_cell, actor_cell = state
        ball_action, player_action = action
        
        #scoring: ball sits on a terminal cell and the player plays 'Goal'
        if ball_cell in self.terminals and player_action == 'Goal':
            return self.terminals[ball_cell] #reward configured for that terminal cell
            
        if state == self.end_state :
            return 0.0
        
        #Any action name other than 'Stay' (including 'Goal') adds its cost.
        r = self.living_reward
        if ball_action != 'Stay' :
            r += self.ball_reward #usually a cost
            
        if player_action != 'Stay' :
            r += self.player_reward#usually a cost
        
        return r  #usually a small -ve value
    
    #XXX
    def transitions(self, state, action) :
        """returns three parallel lists: ([next_state], [action], [probability])"""
        #In this version we keep everything with probability 1
        probs = 1.0
        return [self._next_state(state, action)], [action], [probs]
    
    def move(self, state, action) :
        """Take the action and return the tuple(new_state, reward, is_terminal)"""
        #here state is a 2tuple, cell of (Ball, Player)
        
        new_state   = self._next_state(state, action)
        reward      = self.reward(state, action) #reward of the state we are leaving
        
        #the episode is over once we land in the dummy end state
        is_terminal = new_state == self.end_state
            
        return new_state, reward, is_terminal #keep the same format as OpenAI gym.
    
    def _next_state(self, state, action) : 
        """takes the action and returns the next state

        If either resulting cell is invalid the whole move is rejected and
        the original state is returned unchanged.
        """
        ball_cell, player_cell = state      
        ball_action, player_action = action
        
        #NOTE: the end state has no available actions, so this also rejects it.
        assert action in self.actions(state)
        
        if ball_cell in self.terminals : #Doesn't matter which action; any action will do.
            return self.end_state
        
        player_target = self._move_player(player_cell, player_action)
        
        #the ball can be kicked only if the player is at the ball location
        #(checked against the player's position before this step's move)
        ball_target = ball_cell
        if ball_cell == player_cell:
            ball_target = self._kick_ball(ball_cell, ball_action) 
           
        if self._valid_cell(player_target) and self._valid_cell(ball_target):
            return ball_target, player_target
        
        return state #stay put. the target is invalid.
    
    def _valid_cell(self, cell) :
        """Returns true if the cell is a valid cell (inside the grid and not a wall)"""
        return cell in self._valid_cell_set
    
    def _move_player(self, player_cell, action) :
        """Returns the cell the player would reach; validity is not checked here"""
        coord = self.soc_player_actions[action]
        player_target = player_cell[0] + coord[0], player_cell[1] + coord[1]
        return player_target
    
    def _kick_ball(self, ball_cell, action) :
        """Kicks the ball and return the target cell; validity is not checked here"""        
        #we designed the actions as tuples of increments.
        coord = self.soc_ball_actions[action]
        ball_target = ball_cell[0] + coord[0], ball_cell[1] + coord[1]
        return ball_target 

    
    #pretty print the grid and agent if given.
    def print(self, state=None) :
        """Print the grid row by row, followed by a dashed separator line.

        Symbols: '#' wall, '@' player, 'O' ball, '+'/'-' terminal with
        positive / non-positive reward, '.' any other cell. When several
        apply to one cell the first in that order is shown.
        """
        if state:
            ball_cell, agent_cell = state
        else :
            ball_cell, agent_cell = None, None
            
        for r in range(self.rows) :
            for c in range(self.columns) :
                cell = (r,c)
                if cell in self.walls :
                    symbol = '#'
                elif cell == agent_cell :
                    symbol = '@'
                elif cell == ball_cell :
                    symbol = 'O'
                elif cell in self.terminals :
                    symbol = '+' if self.terminals[cell] > 0 else '-'
                else :
                    symbol = '.'
                print(symbol, end=' ')
            print("")
        print("--" * self.columns)

In [4]:
# Demo: walk the player to the ball, kick it next to the goal, then score.
gs = GridSoc(rows=5, columns=5)

#Action is a tuple of (ball action, player action)
#both actions are applied simultaneously

start = ((0,1), (0,3))  # (ball cell, player cell)

# Steps played before the final 'Goal'; the board is printed after each one.
approach_and_kick = [
    ('Stay', 'MoveLeft'),
    ('Stay', 'MoveLeft'),
    ('KickDown2', 'MoveDown'),
    ('Stay', 'MoveDown'),
    ('KickLeft', 'Stay'),
]

n = start
for action in approach_and_kick:
    n, r, d = gs.move(n, action)
    print(r, n)
    gs.print(n)

# Final step: only draw the board if the episode has not ended.
action = ('Stay', 'Goal')
n, r, d = gs.move(n, action)
print(r, n)
if not d:
    gs.print(n)

-0.12 ((0, 1), (0, 2))
. O @ . . 
. . . . . 
+ . . . . 
+ . . . . 
. . . . . 
----------
-0.12 ((0, 1), (0, 1))
. @ . . . 
. . . . . 
+ . . . . 
+ . . . . 
. . . . . 
----------
-0.24 ((2, 1), (1, 1))
. . . . . 
. @ . . . 
+ O . . . 
+ . . . . 
. . . . . 
----------
-0.12 ((2, 1), (2, 1))
. . . . . 
. . . . . 
+ @ . . . 
+ . . . . 
. . . . . 
----------
-0.16 ((2, 0), (2, 1))
. . . . . 
. . . . . 
O @ . . . 
+ . . . . 
. . . . . 
----------
10.0 ((-1, -1), (-1, -1))
