In [14]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical
from collections import deque

In [4]:
def run_episode(env, agent):
    """Play one rendered episode in ``env`` and discard the experience.

    Args:
        env: Environment exposing ``reset()``, ``render()``, ``step(action)``
            (returning ``(obs, reward, done, info)``) and ``action_space``.
        agent: Object with ``get_action(obs)`` or ``None``. With ``None`` the
            actions are drawn from ``env.action_space.sample()``.
            ``get_action`` may return either the bare action or a tuple whose
            first element is the action (e.g. ``(action, log_prob)``).

    Returns:
        None. The function is only used for watching the agent act.
    """
    done = False
    obs = env.reset()

    while not done:
        env.render()
        if agent is None:
            action = env.action_space.sample()
        else:
            obs = torch.from_numpy(obs).float()
            action = agent.get_action(obs)
            # Agent.get_action returns (action, log_prob); env.step must only
            # receive the action itself, so drop any extra tuple members.
            if isinstance(action, tuple):
                action = action[0]

        # Reward and info are intentionally ignored: nothing is learned here.
        obs, _reward, done, _ = env.step(action)
        

In [5]:
# Watch the agent play a few rendered episodes.
# NOTE(review): this cell needs `env`, `Agent` and `Policy` to be defined
# first; in a fresh kernel they are created by cells further down.
# Agent.__init__ takes (env, action_dim, state_dim, batch_size), so the batch
# size must be supplied as well.
agent = Agent(env, 2, 4, 16)
try:
    for i in range(5):
        run_episode(env, agent)
        print(f'Episode {i} done')
finally:
    # Close the environment even when an episode raises.
    env.close()

NameError: name 'Agent' is not defined

In [6]:
# Close the environment to release whatever resources it holds
# (presumably the render window; confirm against the gym version in use).
env.close()

In [7]:
def rewards_to_go(rewards, discount_factor=0.99):
    """Compute the discounted reward-to-go for every timestep of an episode.

    Element ``t`` of the result is
    ``rewards[t] + discount_factor * rewards[t + 1] + discount_factor**2 * ...``.

    Args:
        rewards: Sequence of per-step rewards in chronological order.
        discount_factor: Multiplicative discount applied per step.

    Returns:
        1-D ``torch.float`` tensor with one entry per reward (empty for an
        empty input).
    """
    # from https://spinningup.openai.com/en/latest/spinningup/rl_intro3.html

    r2g = []
    discounted_reward = 0

    # Accumulate from the last step backwards; append + one final reverse keeps
    # this linear instead of the quadratic cost of inserting at the front.
    for reward in reversed(rewards):
        discounted_reward = reward + discount_factor * discounted_reward
        r2g.append(discounted_reward)
    r2g.reverse()

    return torch.tensor(r2g, dtype=torch.float)

In [8]:
def compute_advantage(rewards, state_values):
    """Return the normalized difference ``rewards - state_values``.

    The raw difference is passed through ``normalize`` before being returned.
    """
    return normalize(rewards - state_values)

In [9]:
def normalize(x, eps=1e-7):
    """Standardize ``x`` along dimension 0 (zero mean, unit std).

    Args:
        x: Floating-point tensor.
        eps: Small constant added to the standard deviation to avoid division
            by zero.

    Returns:
        Tensor of the same shape as ``x``. When there are fewer than two
        entries along dimension 0 the sample standard deviation is undefined
        (NaN), so only the mean is removed and the centered values are
        returned instead of NaNs.
    """
    centered = x - x.mean(0)
    if x.dim() == 0 or x.shape[0] < 2:
        return centered
    return centered / (x.std(0) + eps)

In [54]:
class Trajectory(torch.utils.data.Dataset):
    """Per-timestep record of one episode, indexable as a torch ``Dataset``.

    Every timestep contributes one entry to each of the parallel lists
    ``states``, ``actions``, ``rewards``, ``dones``, ``log_probs`` and
    ``state_values``.
    """

    def __init__(self):
        # Create every buffer (including state_values) in one place so that
        # __getitem__ never hits a missing attribute.
        self.clear_memory()

    def __len__(self):
        """Return the number of stored timesteps."""
        return len(self.states)

    def convert_rewards_to_go(self, discount_factor=0.99):
        """Replace ``self.rewards`` with discounted rewards-to-go.

        The stored rewards are overwritten, so calling this a second time
        discounts the already-discounted values again.

        Args:
            discount_factor: Multiplicative discount applied per step.
        """
        # from https://spinningup.openai.com/en/latest/spinningup/rl_intro3.html
        r2g = []
        discounted_reward = 0

        # Walk backwards, then reverse once: linear time instead of the
        # quadratic cost of inserting at the front of the list.
        for reward in reversed(self.rewards):
            discounted_reward = reward + discount_factor * discounted_reward
            r2g.append(discounted_reward)
        r2g.reverse()

        self.rewards = r2g

    def store_timestep(self, state, action, reward, done, log_prob, state_value=None):
        """Append one timestep to every buffer.

        Args:
            state: Observation the action was chosen from.
            action: Action that was taken.
            reward: Reward received for the action.
            done: Whether the episode ended after this step.
            log_prob: Log-probability of ``action`` under the acting policy.
            state_value: Optional value estimate for ``state``; stored as
                ``None`` when omitted so all buffers stay the same length.
        """
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)
        self.dones.append(done)
        self.log_probs.append(log_prob)
        self.state_values.append(state_value)

    def clear_memory(self):
        """Reset all buffers to empty lists."""
        self.states = []
        self.log_probs = []
        self.state_values = []
        self.actions = []
        self.rewards = []
        self.dones = []

    def __getitem__(self, index):
        """Return ``(state, action, reward, done, log_prob, state_value)`` at ``index``."""
        state = self.states[index]
        action = self.actions[index]
        reward = self.rewards[index]
        done = self.dones[index]
        log_prob = self.log_probs[index]
        state_value = self.state_values[index]

        return state, action, reward, done, log_prob, state_value

In [55]:
class Agent:
    """Policy-gradient agent that collects trajectories from an environment.

    Attributes are set in ``__init__``: ``env``, ``actor`` (a ``Policy``),
    ``batch_size``, ``trajectories`` (the five most recent episodes) and
    ``discount_factor``.
    """

    def __init__(self, env, action_dim, state_dim, batch_size):
        """Build the actor network and an empty trajectory buffer.

        Args:
            env: Environment exposing ``reset()``, ``step(action)`` and
                ``render()``.
            action_dim: Number of discrete actions (actor output size).
            state_dim: Size of an observation vector (actor input size).
            batch_size: Stored on the instance; not used by any method yet.
        """
        self.env = env
        self.actor = Policy(action_dim, state_dim)
        self.batch_size = batch_size
        # Only the five most recent episodes are retained.
        self.trajectories = deque(maxlen=5)
        self.discount_factor = 0.99

    def get_action(self, obs):
        """Sample an action for ``obs`` from the actor's categorical policy.

        Args:
            obs: Float tensor observation fed to the actor.

        Returns:
            Tuple ``(action, log_prob)``: the sampled action as a Python int
            and a tensor holding its log-probability.
        """
        # The actor's raw outputs are treated as unnormalized logits.
        logits = self.actor(obs)
        action_dist = Categorical(logits=logits)
        action = action_dist.sample()

        return action.item(), action_dist.log_prob(action)

    def train(self):
        """Placeholder; training is not implemented yet."""
        pass

    def run_episode(self, render=False):
        """Roll out one episode and append its trajectory to ``self.trajectories``.

        Rewards are converted to discounted rewards-to-go (using
        ``self.discount_factor``) before the trajectory is stored.

        Args:
            render: When true, call ``env.render()`` before every step.
        """
        trajectory = Trajectory()

        done = False
        obs = self.env.reset()

        while not done:
            if render:
                self.env.render()

            obs = torch.from_numpy(obs).float()
            # Use this instance, not whatever global `agent` the notebook
            # kernel happens to hold.
            action, log_prob = self.get_action(obs)
            next_obs, reward, done, _ = self.env.step(action)

            trajectory.store_timestep(obs, action, reward, done, log_prob)
            obs = next_obs

        trajectory.convert_rewards_to_go(self.discount_factor)
        self.trajectories.append(trajectory)
        

In [56]:
class Policy(nn.Module):
    """Two-layer MLP mapping a state vector to one raw score per action.

    The network is ``Linear(state_dim, 100) -> ReLU -> Linear(100, action_dim)``;
    no softmax is applied, so the outputs are unnormalized.
    """

    def __init__(self, action_dim, state_dim):
        super().__init__()
        # Hidden layer: state -> 100 ReLU units.
        hidden = [nn.Linear(state_dim, 100), nn.ReLU()]
        self.fc1 = nn.Sequential(*hidden)

        # Output layer: hidden features -> one score per action.
        head = [nn.Linear(100, action_dim)]
        self.fc2 = nn.Sequential(*head)

    def forward(self, x):
        """Return the per-action scores for input ``x``."""
        return self.fc2(self.fc1(x))

In [57]:
# Create the Gym CartPole-v0 environment that the notebook's `env` name refers to.
env = gym.make("CartPole-v0")

In [58]:
# Build the agent: 2 discrete actions, 4-dimensional observations, batch size 16.
agent = Agent(env, action_dim=2, state_dim=4, batch_size=16)

In [59]:
# Roll out one episode without rendering; the resulting trajectory is
# appended to agent.trajectories.
agent.run_episode()

In [62]:
# Inspect the first retained trajectory's rewards; run_episode has already
# converted them to discounted rewards-to-go, so the final entry is the raw
# last-step reward.
agent.trajectories[0].rewards

[32.42709509397166,
 31.744540498961268,
 31.05509141309219,
 30.358678195042614,
 29.655230500043047,
 28.944677272770754,
 28.22694674017248,
 27.501966404214627,
 26.76966303456023,
 26.02996266117195,
 25.282790566840355,
 24.528071279636723,
 23.76572856528962,
 22.995685419484467,
 22.21786406008532,
 21.4321859192781,
 20.638571635634445,
 19.8369410460954,
 19.027213177874142,
 18.209306240276913,
 17.383137616441328,
 16.54862385499124,
 15.705680661607312,
 14.854222890512437,
 13.994164535871148,
 13.12541872310217,
 12.247897700103202,
 11.361512828387072,
 10.466174574128356,
 9.561792499119552,
 8.64827525163591,
 7.72553055720799,
 6.793465209301,
 5.8519850599,
 4.90099501,
 3.9403989999999998,
 2.9701,
 1.99,
 1.0]