In [14]:
import numpy as np
import matplotlib.pyplot as plt
import gymnasium as gym
from collections import defaultdict as defaultdict
import pygame
from tqdm import trange

In [15]:
env = gym.make("CliffWalking-v0")

In [16]:
def nTD(env, n, runs, gamma = 1, alpha = 0.5):
    policy = np.ones((env.observation_space.n, env.action_space.n))/env.action_space.n
    v = np.zeros((env.observation_space.n))
    
    totalstep = 0
    for i in trange(runs):
        state, info = env.reset()
        R = [0]
        S = [state]
        step = 0

        while True:
            step += 1
            totalstep += 1
            
            action = np.random.choice(env.action_space.n , p = policy[state])
            nextState, reward, done, _, _ = env.step(action)
            
            R.append(reward)
            S.append(nextState)
            
            if step >= n:
                tempReward = R[step - n : step]
                gamma_ = [gamma ** i for i in range(n)]
                    
                G = np.dot(tempReward, gamma_)
                G += (gamma ** n) * v[S[step]]
                
                v[S[step - n]] += 1/totalstep * (G - v[S[step - n]])
                
            if done:
                for i in range(max(0, step - n + 1), step):
                    tempEpisode = R[i + 1 : step + 1]
                    gamma_ = [gamma ** i for i in range(len(tempEpisode))]
                    
                    G = np.dot(tempEpisode, gamma_)
                    v[S[i]] += 1/totalstep * (G - v[S[i]])
                break
            state = nextState 
            
    return v

In [17]:
V = nTD(env, 20, 50, 1, 0.5)
print(V)

100%|██████████| 50/50 [00:12<00:00,  3.99it/s]

[-114.22071335 -101.7093928   -81.67571623  -52.19097836  -37.07483867
  -27.27467772  -32.05971512  -18.01387435  -10.23672396   -5.78869562
  -21.58243172   -6.39819562 -160.69978006 -114.50153033  -80.72342315
  -49.31732846  -39.50080875  -32.67198499  -26.12046681  -10.80355276
   -9.96172537   -8.77264288   -7.65995644   -0.99364855 -268.5795557
 -160.34873762  -85.6747507   -49.67510266  -24.06640839  -12.22547718
  -17.61908633   -5.10297351   -2.86302324   -1.60976549   -1.48196941
   -0.58065365 -447.95896557    0.            0.            0.
    0.            0.            0.            0.            0.
    0.            0.            0.        ]





In [18]:
Q = np.zeros((env.observation_space.n, env.action_space.n))
gamma = 1
for state in range(env.observation_space.n):
    for action in range(env.action_space.n):
        acc = 0.0 
        for prob, next_state, reward, done in env.unwrapped.P[state][action]:
            acc += prob * (reward + gamma * V[next_state])
        Q[state, action] = acc 

In [19]:
env = gym.make("CliffWalking-v0", render_mode = "human")
state , info = env.reset()
while True:
    action = np.argmax(Q[state])
    nextState, reward, done, _, _ = env.step(action)
    if done:
        break
    state = nextState

In [20]:
pygame.quit()