In [91]:
import numpy as np
import matplotlib.pyplot as plt
import gymnasium as gym
from collections import defaultdict as defaultdict
import pygame
from tqdm import trange

In [92]:
env = gym.make("CliffWalking-v0")

In [93]:
def nTD(env, n, runs, gamma = 1, alpha = 0.5):
    policy = np.ones((env.observation_space.n, env.action_space.n))/env.action_space.n
    v = np.zeros((env.observation_space.n))
    
    totalstep = 0
    for i in trange(runs):
        state, info = env.reset()
        R = [0]
        S = [state]
        step = 0

        while True:
            step += 1
            totalstep += 1
            
            action = np.random.choice(env.action_space.n , p = policy[state])
            nextState, reward, done, _, _ = env.step(action)
            
            R.append(reward)
            S.append(nextState)
            
            if step >= n:
                tempReward = R[step - n : step]
                gamma_ = [gamma ** i for i in range(n)]
                    
                G = np.dot(tempReward, gamma_)
                G += (gamma ** n) * v[S[step]]
                
                v[S[step - n]] += 1/totalstep * (G - v[S[step - n]])
                
            if done:
                for i in range(max(0, step - n + 1), step):
                    tempEpisode = R[i + 1 : step + 1]
                    gamma_ = [gamma ** i for i in range(len(tempEpisode))]
                    
                    G = np.dot(tempEpisode, gamma_)
                    v[S[i]] += 1/totalstep * (G - v[S[i]])
                break
            state = nextState 
            
    return v

In [94]:
V = nTD(env, 20, 50, 1, 0.5)
print(V)

100%|██████████| 50/50 [00:19<00:00,  2.56it/s]

[-1.07498136e+02 -8.49735727e+01 -6.30668479e+01 -5.18425866e+01
 -4.62566447e+01 -3.48551695e+01 -1.57959663e+01 -8.32942082e+00
 -4.66735489e+00 -1.95422536e+00 -1.12940009e+00 -7.39684377e-01
 -1.48957758e+02 -1.11840429e+02 -7.94569106e+01 -6.84336925e+01
 -5.70932944e+01 -3.95445126e+01 -2.26185261e+01 -1.49521963e+01
 -6.83550295e+00 -2.02729735e+00 -1.25407403e+00 -4.86997798e-01
 -2.32148681e+02 -1.33344084e+02 -7.03115822e+01 -5.46047616e+01
 -3.51377657e+01 -2.36125810e+01 -1.31364380e+01 -6.10948073e+00
 -3.30029833e+00 -2.45188557e+00 -9.06164958e-01 -1.93855956e-01
 -4.10183568e+02  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]





In [95]:
Q = np.zeros((env.observation_space.n, env.action_space.n))
gamma = 1
for state in range(env.observation_space.n):
    for action in range(env.action_space.n):
        acc = 0.0 
        for prob, next_state, reward, done in env.unwrapped.P[state][action]:
            acc += prob * (reward + gamma * V[next_state])
        Q[state, action] = acc 

In [96]:
env = gym.make("CliffWalking-v0", render_mode = "human")
state , info = env.reset()
while True:
    action = np.argmax(Q[state])
    nextState, reward, done, _, _ = env.step(action)
    if done:
        break
    state = nextState

In [97]:
pygame.quit()