In [1]:
import numpy as np
import matplotlib.pyplot as plt
import gymnasium as gym
from collections import defaultdict as defaultdict
import pygame
from tqdm import trange

In [2]:
env = gym.make("CliffWalking-v0")

In [3]:
def TDZero(env, runs, gamma, alpha = 0.5):
    policy = np.ones((env.observation_space.n , env.action_space.n))/env.action_space.n
    v = np.zeros((env.observation_space.n))
    step = 0
    for i in trange(runs):
        state, info = env.reset()
        while True:
            step += 1
            action = np.random.choice(env.action_space.n , p = policy[state])
            nextState, reward, done, _, _ = env.step(action)
            
            v[state] += 1/step * (reward + gamma * v[nextState] - v[state])
            
            state = nextState 
            if done:
                break
    return v

In [4]:
V = TDZero(env, 50, 1, 0.5)
print(V)

100%|██████████| 50/50 [00:15<00:00,  3.31it/s]

[-1.16408956e+00 -9.45662592e-01 -5.50944902e-01 -2.93282373e-01
 -1.18187214e-01 -1.08310780e-01 -8.04874220e-02 -7.28516794e-02
 -4.08162201e-02 -1.55512831e-02 -7.75294968e-03 -4.53265295e-03
 -1.96476188e+00 -1.71227780e+00 -7.66297484e-01 -3.78732520e-01
 -1.41473472e-01 -1.40517801e-01 -9.09286713e-02 -4.26375387e-02
 -2.31758879e-02 -1.06627846e-02 -5.34296103e-03 -3.43817842e-03
 -8.64670860e+00 -1.29192297e+01 -6.30504473e+00 -6.00133026e+00
 -1.97044854e+00 -1.68569724e+00 -6.63022548e-01 -5.39464594e-01
 -1.95191085e-01 -1.83574861e-01 -1.26358024e-01 -1.63234700e-03
 -4.79752757e+01  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]





In [5]:
Q = np.zeros((env.observation_space.n, env.action_space.n))
gamma = 1
for state in range(env.observation_space.n):
    for action in range(env.action_space.n):
        acc = 0.0 
        for prob, next_state, reward, done in env.unwrapped.P[state][action]:
            acc += prob * (reward + gamma * V[next_state])
        Q[state, action] = acc

In [6]:
env = gym.make("CliffWalking-v0", render_mode = "human")
state , info = env.reset()
while True:
    action = np.argmax(Q[state])
    nextState, reward, done, _, _ = env.step(action)
    if done:
        break
    state = nextState

In [7]:
pygame.quit()