In [10]:
import numpy as np
import matplotlib.pyplot as plt
import gymnasium as gym
from collections import defaultdict as defaultdict
import pygame
from tqdm import trange

In [11]:
env = gym.make("CliffWalking-v0")

In [12]:
def TDZero(env, runs, gamma, alpha = 0.5):
    policy = np.ones((env.observation_space.n , env.action_space.n))/env.action_space.n
    v = np.zeros((env.observation_space.n))
    step = 0
    for i in trange(runs):
        state, info = env.reset()
        while True:
            step += 1
            action = np.random.choice(env.action_space.n , p = policy[state])
            nextState, reward, done, _, _ = env.step(action)
            
            v[state] += 1/step * (reward + gamma * v[nextState] - v[state])
            
            state = nextState 
            if done:
                break
    return v

In [13]:
V = TDZero(env, 50, 1, 0.5)
print(V)

100%|██████████| 50/50 [00:07<00:00,  6.65it/s]

[-1.11324355e+00 -8.37443990e-01 -5.92106033e-01 -5.22236594e-01
 -2.28432466e-01 -1.20088977e-01 -9.41585945e-02 -6.70279752e-02
 -2.60983770e-02 -1.77520083e-02 -1.42666820e-02 -1.56439483e-02
 -1.71775928e+00 -1.78830058e+00 -7.31260383e-01 -3.98957464e-01
 -2.18771723e-01 -1.00119184e-01 -1.05517719e-01 -6.56496234e-02
 -3.17626319e-02 -2.09515878e-02 -9.65965031e-03 -1.38021886e-02
 -8.00253881e+00 -1.43017400e+01 -7.31023974e+00 -3.04448277e+00
 -1.94234863e+00 -1.50289266e+00 -9.75663687e-01 -4.44515189e-01
 -4.58750898e-01 -1.25355409e+00 -2.66419142e-01 -5.73969835e-03
 -4.41917102e+01  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]





In [14]:
Q = np.zeros((env.observation_space.n, env.action_space.n))
gamma = 1
for state in range(env.observation_space.n):
    for action in range(env.action_space.n):
        acc = 0.0 
        for prob, next_state, reward, done in env.unwrapped.P[state][action]:
            acc += prob * (reward + gamma * V[next_state])
        Q[state, action] = acc

In [19]:
env = gym.make("CliffWalking-v0", render_mode = "human")
state , info = env.reset()
while True:
    action = np.argmax(Q[state])
    nextState, reward, done, _, _ = env.step(action)
    if done:
        break
    state = nextState

In [20]:
pygame.quit()