<a href="https://colab.research.google.com/github/vbipin/aip/blob/master/mdp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#we plan to implement some of the algorithms related to MDPs and RL
#MDP study
#%matplotlib inline
#import matplotlib
#import numpy as np
#import matplotlib.pyplot as plt

#I am trying to avoid the numpy dependencies

import random
#
#We plan to implement the gridworld class 
#


In [0]:
#ref: Chapter 17, Artificial Intelligence a Modern Approach
#ref: CS188 https://inst.eecs.berkeley.edu/~cs188/fa19/
#ref: https://inst.eecs.berkeley.edu/~cs188/fa19/assets/slides/lec8.pdf

#This class will create a 2D grid of row x colums 
#Some of the cells can be disabled by putting it into walls
#cells are addressed just like 2d arrays (r,c)
#There are possibly many terminal states
#terminal states have only one action available: Exit 
#Transistion is as per the book 80% action and 20% sideways

#Actions #just some alias
Up    = 0
Right = 1
Down  = 2
Left  = 3
Exit  = 4

class GridWorld :
    def __init__(self, rows=3, columns=4, walls=[(1,1)], terminals= {(0,3):+1.0, (1,3):-1.0}, gamma=1.0, living_reward=0 ) :
        """We dont expect these parameters to change during the agent run"""
        self.rows      = rows
        self.columns   = columns
        self.N         = rows * columns #total cells
        self.walls     = walls
        self.terminals = terminals #dictionary of terminal celss and their rewards.
        self.gamma     = gamma
        self.living_reward = living_reward
        self.all_actions   = [ Up, Down, Right, Left, Exit ]
        self.all_states    = [(r,c) for r in range(rows) for c in range(columns) if (r,c) not in walls ]
  
    #pretty print the grid and agent if given.
    def print(self, agent_state=None) :
        for r in range(self.rows) :
            for c in range(self.columns) :
                cell = (r,c)
                if cell in self.walls :
                    print('# ', end='')
                elif cell in self.terminals :
                    if self.terminals[cell] > 0 :
                        print('+', end=' ')
                    else :
                        print('-', end=' ')
                elif cell == agent_state :
                    print('@ ', end='')
                else :
                    print('. ', end='')
            print("")
                
        
#Let me try this design where the agent holds the GridWorld
class GridWorldAgent :
    
    def __init__(self, state=(0,0), grid_world=None ) : 
        self.grid_world  = grid_world #the common world
        #state is the agent position
        self.state       = state
        self.all_actions = grid_world.all_actions
      
    def actions(self) :
        """returns all valid actions from the current state"""
        if self._is_terminal() :
            return [Exit]
        return [ Up, Down, Right, Left ]
    
    def children(self) :
        """returns a list of tuple of (new_node, reward, is_terminal?)"""
        return [self.move(a) for a in self.actions()]
    
    def move(self, action) :
        """Take the action and return the tuple(new_state, reward, is_terminal)""" 
        
        if action == Exit :
            new_node    = None
            reward      = self.grid_world.terminals[self.state] #terminals got the rewards as well
            is_terminal = True
            
        else :
            #we find the new_cell of the slippery action according to the prbabilities
            #but this cell may be invalid. If invalid, we stay in the current state.
            cell   = self._do_action(self._slipery(action))
            if self._valid_cell(cell) :
                new_node    = GridWorldAgent( state=cell, grid_world=self.grid_world )
            else : #cannot move; we stay at the same place
                new_node    = GridWorldAgent( state=self.state, grid_world=self.grid_world )
                
            reward      = self.grid_world.living_reward
            is_terminal = False
            
        return new_node, reward, is_terminal #keep the same for mat as OpenAI gym.
    
    ##### Internal functions. Should not be called from outside the class ########
        
    def _do_action(self, action) : 
        """Blindly takes the action without checking anything and returns the position"""
        r,c = self.state #row & column
        if action == Up :
            return r-1, c  
        if action == Down :
            return r+1, c
        if action == Right :
            return r, c+1  
        if action == Left :
            return r, c-1 
        return None #should not reach here
    
    def _slipery(self, action) :
        """try to take the action according to transition probabilities"""
        transitions = { Up:[Up, Left, Right], Down:[Down,Left,Right], Left:[Left,Up,Down], Right:[Right,Up, Down]}
        #a = np.random.choice(transitions[action],1, [0.8,0.1,0.1]) #choose an index(0-2) according to probability
        a = random.choices( transitions[action], weights=[0.8,0.1,0.1])[0] #only one; we take index 0
        return a
            
    def _valid_cell(self, cell) :
        """Returns true if the cell is a valid cell"""
        r, c = cell #this may be an illegal node; we need to check
        
        #is it any of the walls?
        if (r,c) in self.grid_world.walls :
            return False
        
        #is it outside the grid?
        if r < 0 or r >= self.grid_world.rows or c < 0 or c >= self.grid_world.columns :
            return False
        
        return True
    
    def _is_terminal(self) :
        """To check if this is a terminal state; We havent yet taken Exit"""
        return self.state in self.grid_world.terminals #Returns true if the agent is a terminal
    
    def print(self) :
        # + and - are the terminal states. @ is our agent.
        self.grid_world.print(self.state)

In [0]:
grid_world = GridWorld(gamma=0.9, living_reward=-0.04)
start = (2,0) #as in the book
a = GridWorldAgent( state=start, grid_world=grid_world )

In [4]:
# + and - are the terminal states. @ is our agent.
a.print()

. . . + 
. # . - 
@ . . . 


In [0]:
############# I might change the API later ##########


In [0]:
class Policy :
    def __init__(self, grid_world=None) :
        """Holds one policy and returns actions according to it"""
        self.grid_world = grid_world
        self.policy     = { } #{ state: policy_action}
        
    def __getitem__(self, state) :
        return self.policy[state]
    
    def __setitem__(self, state, action) :
        self.policy[state] = action

In [0]:
###### Some test code. 

In [0]:
def random_policy(grid_world=None) :
    """returns a random action function"""
    def f(agent) :
        actions = agent.actions()
        return actions[random.randint(0, len(actions)-1)]
    return f

def fixed_policy(grid_world) :
    p = Policy()
    p.policy = {(r,c):Up for r in range(grid_world.rows) for c in range(grid_world.columns) 
                if (r,c) not in grid_world.terminals}
    p.policy.update({a:Exit for a in grid_world.terminals})
    #print(p.policy)
    def f(agent) :
        return p[agent.state]
    return f

def good_policy(grid_world) :
    p = Policy()
    p.policy = {
        (0,0):Right, (0,1): Right, (0,2): Right, (0,3) : Exit,
        (1,0):Up,    (1,1): Right, (1,2): Up,    (1,3) : Exit,
        (2,0):Up,    (2,1): Left,  (2,2): Up,    (2,3) : Left,
               }
    p.policy.update({a:Exit for a in grid_world.terminals})
    #print(p.policy)
    def f(agent) :
        return p[agent.state]
    return f
    
def run(agent, policy=None) :
    """runs a full episode and return the total reward"""
    rewards = []
    a = agent
    gamma = agent.grid_world.gamma
    time_step = 0
    while True :
        action = policy(a)
        #a.print()
        #print(action)
        a, r, exited = a.move(action)
        rewards.append(r * (gamma**time_step) ) #the further we go down, the less we value the reward
        if exited :
            break
    
    time_step += 1
    return rewards


def expected_utility(agent, policy, N=100) :
    """run the policy till completion several times and return the expected utility"""
    s = 0.0
    for _ in range(N) :
        #from the same start state we run till completion, N times
        s += sum( run(a, policy) )
    return s/N

In [8]:
#page  651; AIMA Book
for cell in grid_world.all_states :
    a.state = cell
    print (expected_utility(a, good_policy(grid_world), 10000))

0.8095760000001052
0.8707560000000645
0.9181279999999582
1.0
0.7603760000001115
0.6568960000000649
-1.0
0.7062280000001091
0.6543200000000846
0.5820160000000824
0.366540000000036
