In [22]:
import gymnasium as gym
import numpy as np
import torch
from torch import nn as nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torch.distributions import Categorical
from tqdm import tqdm
from torch.optim import Adam

In [58]:
# Policy function (actor): 8 input features -> 4 outputs, normalized by a
# softmax over the last dimension so they form a probability vector.
network = nn.Sequential(
    nn.Linear(8, 30),
    nn.ReLU(),
    nn.Linear(30, 40),
    nn.ReLU(),
    nn.Linear(40, 4),
    nn.Softmax(-1),
)

# State-value function (critic): 8 input features -> a single scalar output.
state_value = nn.Sequential(
    nn.Linear(8, 40),
    nn.GELU(),
    nn.Linear(40, 30),
    nn.GELU(),
    nn.Linear(30, 1),
)

In [59]:
@torch.no_grad()
def choose_action(state:np.ndarray, network:nn.Module = network)->int:
    """Sample one action index from the policy's output distribution.

    Runs with gradient tracking disabled, so the call never builds an
    autograd graph.

    Args:
        state: A single observation (not a batch); it is fed to `network`
            as-is after conversion to a float32 tensor.
        network: Module whose output for `state` is passed to `Categorical`
            as its probabilities. Defaults to the module-level `network` as
            it was bound when this function was defined.

    Returns:
        The sampled action index as a Python int.
    """
    # Convert explicitly to float32: torch.tensor(state) would inherit the
    # NumPy dtype, and a float64 observation would fail against the float32
    # parameters that nn.Linear creates by default.
    obs = torch.as_tensor(state, dtype=torch.float32)
    probs = network(obs)
    action = Categorical(probs).sample()
    return action.item()

In [2]:
# One-step actor-critic training on LunarLander: after every environment step
# the critic (`state_value`) and the actor (`network`) are each updated from
# the TD error δ of that single transition.
env = gym.make("LunarLander-v2",
               gravity = -10.0,
               enable_wind = True,
               wind_power = 2.0)
γ = 0.99  # discount factor

opt1 = Adam(network.parameters(), 0.0001)     # optimizer for the policy (actor)
opt2 = Adam(state_value.parameters(), 0.001)  # optimizer for the state value (critic)

num_iters = tqdm(range(1500))

# Switching to train mode is loop-invariant, so do it once up front.
network.train()
state_value.train()

try:
    for i in num_iters:
        last_state, info = env.reset()
        I = 1           # running discount γ^t that scales the actor update
        cum_reward = 0  # exponential moving average of the step reward (progress bar only)
        cum_delta = 0   # exponential moving average of the TD error (progress bar only)
        for c in range(4000):

            action = choose_action(last_state, network)
            current_state, reward, terminated, truncated, info = env.step(action)

            # Explicit float32 so the inputs match the default dtype of the
            # nn.Linear parameters regardless of the observation dtype.
            last_state_t = torch.as_tensor(last_state, dtype=torch.float32)

            # Evaluate the critic once with gradients enabled; the same value
            # is reused for the TD error and for the critic loss below.
            value_last = state_value(last_state_t)

            with torch.no_grad():
                if terminated:
                    # True terminal state: nothing to bootstrap from.
                    δ = float(reward) - value_last
                else:
                    # Non-terminal step. A time-limit truncation is NOT a
                    # terminal state, so the next-state value must still be
                    # bootstrapped even though the episode is about to end.
                    current_state_t = torch.as_tensor(current_state, dtype=torch.float32)
                    δ = float(reward) + γ*state_value(current_state_t) - value_last

            cum_reward = (0.9)*cum_reward+(0.1)*(reward)
            cum_delta = (0.9)*cum_delta+(0.1)*(δ)

            opt1.zero_grad()
            opt2.zero_grad()

            # Critic: δ carries no gradient, so minimizing -δ·v(s) is the
            # semi-gradient TD(0) step.
            z = -δ*value_last
            z.backward()
            # Actor: policy-gradient step weighted by δ and the running discount I.
            z_ = -δ*I*network(last_state_t).squeeze()[action].log()
            z_.backward()

            opt1.step()
            opt2.step()

            I = γ*I

            if terminated or truncated:
                break
            last_state = current_state

        num_iters.set_description(f"{cum_reward, cum_delta.item()}")
finally:
    # Always release the environment, even if training is interrupted.
    env.close()

NameError: name 'gym' is not defined

In [63]:
# Roll out one rendered episode with the current policy for visual inspection.
env = gym.make("LunarLander-v2", 
               max_episode_steps = 700,  
               gravity = -10.0,
               enable_wind = False,
               wind_power =1.0,
               render_mode = "human"
              )
γ = 0.99  # not used in this cell; kept so the global stays defined as before

# Switching to eval mode is loop-invariant, so do it once before the rollout.
network.eval()

try:
    state, info = env.reset()
    for c in range(700):
        action = choose_action(state, network)  # sample an action from the policy
        state, reward, terminated, truncated, info = env.step(action)
        if terminated or truncated:
            # Episode finished; no reset here because the environment is
            # closed immediately afterwards.
            break
finally:
    # Always close the environment, even if the rollout is interrupted.
    env.close()

In [None]:
TT