In [1]:
import torch
from torch import nn 
import numpy as np
from collections import deque
import random
import gym

In [2]:
class Model(nn.Module):
    """Actor-critic network with a shared trunk and two output heads.

    The trunk is two fully connected ReLU layers. The actor head maps the
    trunk features to a softmax distribution over actions; the critic head
    maps the same features to a single scalar state value.

    Args:
        state_dim: Size of the last dimension of the input (default 8).
        hidden_dim: Width of both trunk layers (default 48).
        n_actions: Number of entries in the actor's output distribution
            (default 4).

    The defaults reproduce the previously hard-coded 8 -> 48 -> 48 -> (4, 1)
    layout, and the sub-module names are unchanged, so existing state dicts
    still load.
    """

    def __init__(self, state_dim=8, hidden_dim=48, n_actions=4):
        super().__init__()
        # Layers are created in the same order as before so that a seeded
        # default construction yields the same initial weights.
        self.common=nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU()
        )
        self.actor=nn.Sequential(
            nn.Linear(hidden_dim, n_actions),
            # Softmax over the last dim so both single and batched inputs work.
            nn.Softmax(dim=-1)
        )

        self.critic=nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Return ``(action_probabilities, state_value)`` for input ``x``.

        ``x`` must have ``state_dim`` as its last dimension; any leading
        dimensions are carried through to both outputs.
        """
        features=self.common(x)
        return self.actor(features), self.critic(features)

In [3]:
def _cuda_tensor(x):
    return torch.tensor(x, dtype=torch.float32, device='cuda')

class Agent:
    """One-step (TD(0)) actor-critic agent trained online on LunarLander-v2.

    After every environment transition the shared network is updated once:
    the critic is regressed towards the one-step TD target and the actor
    follows the policy gradient weighted by the (detached) TD error.

    Attributes:
        gamma: Discount factor applied to the next state's value.
        device: ``torch.device`` that holds the model and all input tensors.
        model: Network whose forward pass returns
            ``(action_probabilities, state_value)``.
        optimizer: Adam optimiser over all model parameters.
        env: The gym environment being played.
    """

    def __init__(self, device=None):
        """Build the model, optimiser and environment.

        Args:
            device: Optional torch device (string or ``torch.device``).
                When omitted, CUDA is used if available, otherwise the CPU.
        """
        self.gamma=.99
        if device is None:
            device='cuda' if torch.cuda.is_available() else 'cpu'
        self.device=torch.device(device)
        self.model=Model().to(self.device)
        self.optimizer=torch.optim.Adam(self.model.parameters(), 5e-5)
        self.env=gym.make('LunarLander-v2')

    def _tensor(self, x):
        """Copy ``x`` into a float32 tensor on the agent's device."""
        return torch.tensor(x, dtype=torch.float32, device=self.device)

    def _learn_from_transition(self, state):
        """Sample one action from ``state``, act, and do one gradient update.

        Returns:
            ``(next_state, done)`` as reported by the environment.
        """
        probs_tensor, value=self.model(self._tensor(state))
        probs=probs_tensor.detach().cpu().numpy()
        # Derive the action count from the policy output instead of
        # hard-coding it; int() gives the env a plain Python integer.
        action=int(np.random.choice(len(probs), p=probs))

        # Assumes the classic gym API (4-tuple from step, observation from
        # reset), exactly as the rest of this class does.
        f_state, reward, done, _=self.env.step(action)

        # The bootstrap target is treated as a constant (semi-gradient TD).
        with torch.no_grad():
            _, next_state_value=self.model(self._tensor(f_state))
        # (1 - done) drops the bootstrap term on terminal transitions.
        td_error=float(reward)+self.gamma*next_state_value*(1-float(done))-value

        critic_loss=td_error.pow(2)
        # Policy gradient: increase log-prob of the action in proportion to
        # the TD error. The error is detached so the actor term does not push
        # gradients into the critic, and the minus sign turns ascent into a
        # loss. (Adding the bare log-prob, as before, always decreased the
        # probability of whichever action was taken.)
        actor_loss=-torch.log(probs_tensor[action])*td_error.detach()
        loss=(critic_loss+actor_loss).sum()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # The returned frame is discarded; the call is kept for whatever
        # side effects the environment's renderer has.
        # NOTE(review): confirm per-step rendering is wanted during training.
        self.env.render('rgb_array')
        return f_state, done

    def step(self, i_state:np.ndarray):
        """Train from ``i_state`` until the environment reports ``done``.

        Runs as a loop rather than recursing once per transition, so episode
        length is not capped by the interpreter's recursion limit. Errors
        raised while stepping (including KeyboardInterrupt) propagate to the
        caller instead of being silently swallowed.
        """
        state, done=i_state, False
        while not done:
            state, done=self._learn_from_transition(state)

    def play(self):
        """Reset the environment and train through one full episode."""
        self.step(self.env.reset())

In [4]:
# Instantiate the agent; its constructor builds the model, the optimizer and
# the LunarLander-v2 environment.
agent=Agent()

In [5]:
# Training run: 1500 calls to agent.play(). Each call starts from
# env.reset() and updates the network online while stepping the environment.
for episode in range(1500):
    agent.play()

