In [12]:
pip install gymnasium[toy-text]

Defaulting to user installation because normal site-packages is not writeable
Collecting pygame>=2.1.3 (from gymnasium[toy-text])
  Downloading pygame-2.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading pygame-2.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.0 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.0/14.0 MB[0m [31m840.5 kB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hInstalling collected packages: pygame
Successfully installed pygame-2.6.0
Note: you may need to restart the kernel to use updated packages.


In [4]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
import random

In [5]:
env = gym.make('MountainCar-v0')

In [6]:
env.observation_space.dtype

dtype('float32')

In [7]:

# Discretization function
def discretize_state(state, bins):
    position, velocity = state
    position_bin = np.digitize(position, bins['position']) - 1
    velocity_bin = np.digitize(velocity, bins['velocity']) - 1
    return position_bin, velocity_bin

# Create bins for discretization
def create_bins(num_bins):
    position_bins = np.linspace(-1.2, 0.6, num_bins)
    velocity_bins = np.linspace(-0.07, 0.07, num_bins)
    return {'position': position_bins, 'velocity': velocity_bins}


In [15]:

    num_bins = 10
    bins = create_bins(num_bins)
bins


{'position': array([-1.2, -1. , -0.8, -0.6, -0.4, -0.2,  0. ,  0.2,  0.4,  0.6]),
 'velocity': array([-0.07      , -0.05444444, -0.03888889, -0.02333333, -0.00777778,
         0.00777778,  0.02333333,  0.03888889,  0.05444444,  0.07      ])}

In [41]:
def policy_eval(env, policy, value_function, bins, gamma):
    value_function = np.zeros((len(bins['position']), len(bins['velocity'])))
    while True:
        delta = 0
        for position_bin in range(len(bins['position'])-1):
            for velocity_bin in range(len(bins['velocity'])-1):
                state = (position_bin, velocity_bin)
                v = 0
                for action in range(env.action_space.n):
                    state=env.reset()
                    next_state,reward,terminated,truncated,info=env.step(action)
                    next_position_bin, next_velocity_bin = discretize_state(next_state, bins)
                    v += policy[position_bin, velocity_bin,action] * (reward + gamma * value_function[next_position_bin,next_velocity_bin])
                value_function[position_bin, velocity_bin] = v
                delta = max(delta, abs(v - value_function[position_bin, velocity_bin]))
        if delta < 1e-5:
            break
    return value_function

In [42]:

def policy_improvement(env,value_function,policy,bins,gamma):
    
    policy_stable = False
    while not policy_stable:
        policy_stable = True
           
        for position_bin in range(len(bins['position'])-1):
            for velocity_bin in range(len(bins['velocity'])-1):
                state = (position_bin, velocity_bin)
                old_action = np.argmax(policy[state])
                action_values = []
                for action in range(env.action_space.n):
                    env.reset()
                    next_state, reward, terminated, truncated, info = env.step(action)
                    next_position_bin, next_velocity_bin = discretize_state(next_state, bins)
                    action_values.append(reward + gamma * value_function[next_position_bin][next_velocity_bin])
                new_action = np.argmax(action_values)
                policy[state] = np.eye(env.action_space.n)[new_action]
                if old_action != new_action:
                    policy_stable = False
    
    return policy,policy_stable 
                    
                

In [43]:


def policy_iteration(env, bins, gamma=0.9):
    # Initialize value function and policy
    value_function = np.zeros((len(bins['position'])-1, len(bins['velocity'])-1))
    policy = np.ones((len(bins['position'])-1, len(bins['velocity'])-1, env.action_space.n)) / env.action_space.n
    while True:
        # Policy Evaluation
        value_function = policy_eval(env, policy, value_function, bins, gamma)
        
        # Policy Improvement
        policy, policy_stable = policy_improvement(env, value_function, policy, bins, gamma)
        
        if policy_stable:
            break
    
    return policy, value_function
    

In [45]:
gamma = 0.99  # Discount factor
policy, value_function = policy_iteration(env, bins, gamma)
policy

array([[[1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.]],

       [[1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.]],

       [[1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.]],

       [[1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.]],

       [[1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.]],

