In [None]:
import gym
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
import torch.nn.functional as F
from torch.distributions import Categorical

from tqdm import tqdm, trange

In [None]:
# Goal: learn the setting of the network weights that maximizes the expected reward.
# We want a neural network that outputs a probability distribution over actions.
class Agent(nn.Module):
    """Policy network: maps a state vector to a probability distribution over actions.

    Args:
        n_actions: Number of discrete actions (size of the output distribution).
        n_states: Size of the input state vector.
        hidden_size: Number of units in the single hidden layer.
    """

    def __init__(self, n_actions, n_states, hidden_size):
        super(Agent, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(n_states, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions),
            # Normalise over the last (action) axis explicitly. Without `dim`,
            # PyTorch emits a deprecation warning and picks the axis from the
            # input rank, which is not the action axis for every input shape.
            nn.Softmax(dim=-1),
        )

    def forward(self, X):
        """Return action probabilities for ``X``.

        Args:
            X: Float tensor whose last dimension has size ``n_states``; any
                number of leading (batch) dimensions is accepted.

        Returns:
            Tensor with the same leading dimensions and a last dimension of
            size ``n_actions`` whose entries sum to 1.
        """
        return self.net(X)

class Val(nn.Module):
    """One-hidden-layer MLP over state vectors with a linear output head.

    VPG instantiates it with ``out_size=1``; presumably it is meant to serve
    as a state-value estimator (TODO confirm -- nothing in this notebook
    calls it yet).

    Args:
        n_states: Size of the input state vector.
        hidden_size: Number of units in the hidden layer.
        out_size: Size of the output layer.
    """

    def __init__(self, n_states, hidden_size, out_size):
        super().__init__()

        # The position of each layer inside the Sequential determines its
        # parameter names (net.0.*, net.2.*).
        layers = [
            nn.Linear(n_states, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, out_size),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, X):
        """Run ``X`` through the MLP and return the raw (unactivated) output."""
        output = self.net(X)
        return output


class VPG():
    """Vanilla policy gradient learner that trains on one episode at a time.

    Transitions are collected with ``add_observation``; ``update`` then takes
    a single gradient step on the policy and ``clear`` empties the buffers.

    Args:
        n_actions: Number of discrete actions.
        n_states: Size of the state vector.
        hidden_size: Hidden-layer width of the policy and value networks.
        buffer_size: Accepted for interface compatibility; currently unused.
        learning_rate: Adam learning rate for the policy network.
        batch_size: Accepted for interface compatibility; currently unused.
        gamma: Discount factor applied to future rewards.
    """

    def __init__(self,n_actions, n_states, hidden_size, buffer_size, learning_rate, batch_size, gamma):
        self.agent = Agent(n_actions, n_states, hidden_size)
        # NOTE: the value network is created but update() neither trains nor
        # uses it; the optimizer below only covers the policy parameters.
        self.val = Val(n_states, hidden_size, 1)
        self.gamma = gamma
        self.episode_rewards = []  # undiscounted total reward of each finished episode
        self.clear()               # creates the (empty) per-episode buffers

        self.optim = torch.optim.Adam(self.agent.parameters(), lr=learning_rate)

    def add_observation(self, state, action, chosen_action, reward, next_state):
        """Append one transition to the per-episode buffers.

        Args:
            state: Observation the action was chosen from.
            action: Policy output for ``state`` (the action probabilities).
            chosen_action: Index of the action that was actually taken.
            reward: Reward received for the step.
            next_state: Observation after the step.
        """
        # update() recomputes the probabilities from the stored states, so the
        # stored copy does not need its autograd graph; detaching avoids
        # keeping one graph per step alive for the whole episode.
        if torch.is_tensor(action):
            action = action.detach()
        self.states.append(state)
        self.actions.append(action)
        self.chosen_actions.append(chosen_action)
        self.rewards.append(reward)
        self.next_states.append(next_state)

    def get_avg_reward_for_episode(self):
        """Record and return the episode's total reward.

        Despite the name, this is the undiscounted *sum* of the buffered
        rewards, not an average. The value is also appended to
        ``self.episode_rewards``.
        """
        reward = sum(self.rewards)
        self.episode_rewards.append(reward)
        return reward

    def _rewards_to_go(self):
        """Return the discounted reward-to-go for every buffered step.

        Entry ``t`` is ``sum_{k >= t} gamma**(k - t) * rewards[k]``, computed
        in one backward pass over the buffer.

        Returns:
            1-D float32 tensor with one entry per buffered reward.
        """
        returns = [0.0] * len(self.rewards)
        running = 0.0
        for ix in reversed(range(len(self.rewards))):
            running = float(self.rewards[ix]) + self.gamma * running
            returns[ix] = running
        return torch.tensor(returns, dtype=torch.float32)

    def update(self):
        """Take one policy-gradient step on the buffered episode.

        Minimises ``-sum_t log pi(a_t | s_t) * G_t`` where ``G_t`` is the
        discounted reward-to-go from step ``t``. Does nothing when the
        buffers are empty.
        """
        if not self.states:
            return

        # Stack into one ndarray first: building a tensor directly from a
        # list of arrays is very slow.
        states = torch.as_tensor(np.asarray(self.states), dtype=torch.float32)
        chosen = torch.as_tensor(np.asarray(self.chosen_actions), dtype=torch.int64)
        rewards_to_go = self._rewards_to_go()

        self.optim.zero_grad()
        sampler = Categorical(self.agent(states))
        # Each step's log-probability is weighted by its own reward-to-go, so
        # an action is only credited with rewards that came after it.
        loss = -torch.sum(sampler.log_prob(chosen) * rewards_to_go)
        loss.backward()
        self.optim.step()

    def clear(self):
        """Empty the per-episode buffers (``episode_rewards`` is kept)."""
        self.states = []
        self.rewards = []
        self.next_states = []
        self.actions = []
        self.chosen_actions = []



        
    

In [None]:
# Problem dimensions; these must match the action and observation spaces of
# the environment created in the training cell.
n_actions = 4
n_states = 8

# Network / optimisation hyperparameters.
hidden_size = 64
buffer_size = 1000000   # passed to VPG, which currently ignores it
learning_rate = 0.001
batch_size = 64         # passed to VPG, which currently ignores it
gamma = 0.99            # discount factor

# Seed PyTorch before the networks are built so weight initialisation (and
# the action sampling that follows) is reproducible after Restart & Run All.
seed = 42
torch.manual_seed(seed)

model = VPG(
    n_actions=n_actions,
    n_states=n_states,
    hidden_size=hidden_size,
    buffer_size=buffer_size,
    learning_rate=learning_rate,
    batch_size=batch_size,
    gamma=gamma,
)


In [None]:

env = gym.make("LunarLander-v2", render_mode="human")
try:
    for ep in (pbar := tqdm(range(1000))):  # num. episodes
        # Seed only the first reset. Passing the same seed on every reset
        # re-initialises the environment RNG each time, so every episode
        # would start from the identical initial state.
        observation, info = env.reset(seed=42) if ep == 0 else env.reset()
        terminated, truncated = False, False
        while not (terminated or truncated):
            # Rollout only: gradients are recomputed from the stored states
            # in model.update(), so no autograd graph is needed here.
            with torch.no_grad():
                actions = model.agent(torch.as_tensor(observation, dtype=torch.float32))
            chosen_action = Categorical(actions).sample().item()
            next_observation, reward, terminated, truncated, info = env.step(chosen_action)

            model.add_observation(observation, actions, chosen_action, reward, next_observation)
            observation = next_observation

        # One policy update per finished episode, then reset the buffers.
        ep_reward = model.get_avg_reward_for_episode()
        model.update()
        model.clear()

        # Fixed-point format: "{:.3}" means 3 significant digits and switches
        # to scientific notation for rewards of magnitude >= 1000.
        pbar.set_description("Ep: {} Reward: {:.3f}".format(ep, ep_reward))
finally:
    # Always release the render window, even if training is interrupted.
    env.close()

In [None]:
# Learning curve: undiscounted total reward collected in each training episode.
fig, ax = plt.subplots()
ax.plot(model.episode_rewards)
ax.set(
    title="VPG on LunarLander-v2",
    xlabel="Episode",
    ylabel="Total episode reward",
)
plt.show()