In [2]:
import gym
import numpy as np
import matplotlib.pyplot as plt
import torch
from tqdm import tqdm
from rlgym.policy import Policy

# Use the "bmh" matplotlib style sheet for every figure drawn in this notebook.
plt.style.use("bmh")

In [3]:
# Environment selection: leave exactly one `env_name` assignment uncommented.
## DISCRETE ENV
# env_name = "CartPole-v1"
# env_name = "LunarLander-v2"
## CONTINUOUS ENV
# env_name = "Pendulum-v1"
env_name = "LunarLanderContinuous-v2"
# env_name = "MountainCarContinuous-v0"

# Single module-level environment; `train` below reads it as a global.
# NOTE(review): the recorded output of this cell is a DependencyNotInstalled
# error asking for `pip install gym[box2d]`, so the selected environment
# appears to need the Box2D extra -- confirm it is installed before running.
env = gym.make(env_name)
# Number of training episodes run by every call to `train`.
n_episode = 1500
# Episode stride between logged returns (about 100 samples per run).
# Integer division keeps the stride a whole number of episodes: the previous
# float stride (n_episode / 100) made `x` and the episodes selected by
# `t % log_every_n` disagree in length whenever n_episode was not a multiple
# of 100. The max() guard keeps the stride valid for runs under 100 episodes.
log_every_n = max(1, n_episode // 100)
# Episode indices at which a return is logged; shared x-axis of every plot.
x = np.arange(0, n_episode, log_every_n)


def train(algo,
          learning_rate,
          hidden_size,
          number_of_layers,
          is_shared_network=True,
          device=None):
    """Train a fresh ``Policy`` on the module-level ``env`` and sample returns.

    Runs ``n_episode`` episodes. Each episode is rolled out to the end
    (``terminated`` or ``truncated``), then the whole trajectory is passed to
    ``policy.update_policy`` in a single call.

    Reads the notebook globals ``env``, ``n_episode`` and ``log_every_n``.

    Args:
        algo: Algorithm identifier forwarded to ``Policy`` (this notebook
            passes "reinforce", "a2c" and "ppo").
        learning_rate: Forwarded to ``Policy``.
        hidden_size: Forwarded to ``Policy``.
        number_of_layers: Forwarded to ``Policy``.
        is_shared_network: Forwarded to ``Policy``.
        device: Device (``torch.device`` or string) the trajectory tensors are
            moved to. ``None`` selects CUDA when it is available and CPU
            otherwise.

    Returns:
        list: Summed reward of every episode whose index is a multiple of
        ``log_every_n``, in episode order.
    """
    if device is None:
        # "cuda" used to be hard-coded here, which raised on CPU-only machines.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def to_tensor(values):
        # Stack the per-step values into one array, then move it to `device`.
        return torch.from_numpy(np.array(values)).to(device)

    observation_size = env.observation_space.shape[0]
    policy = Policy(algo, observation_size, env.action_space, learning_rate,
                    hidden_size, number_of_layers, is_shared_network)

    log_rewards = []

    for t in tqdm(range(n_episode)):
        current_state, _ = env.reset()
        states, actions, next_states = [], [], []
        rewards, flags, logprobs = [], [], []
        terminated, truncated = False, False

        while not (terminated or truncated):
            action, log_prob = policy.get_action(current_state)

            if policy.is_continuous:
                # Ensure the action has at least one dimension before stepping.
                action = np.array(action, ndmin=1)

            next_state, reward, terminated, truncated, _ = env.step(action)

            states.append(current_state)
            actions.append(action)
            next_states.append(next_state)
            rewards.append(reward)
            # Only true termination is flagged; a truncated step records 0.
            flags.append(int(terminated))
            logprobs.append(log_prob)

            current_state = next_state

        policy.update_policy({
            "states": to_tensor(states),
            "actions": to_tensor(actions),
            "next_states": to_tensor(next_states),
            "rewards": to_tensor(rewards),
            "flags": to_tensor(flags),
            # Log-probs are stacked exactly as the policy returned them; they
            # are not moved to `device` here.
            "logprobs": torch.stack(logprobs).squeeze(),
        })

        if not t % log_every_n:
            log_rewards.append(np.sum(rewards))

    # policy.save("model-" + env_name + "-" + algo + ".pt")

    return log_rewards

DependencyNotInstalled: box2D is not installed, run `pip install gym[box2d]`

---

# Reinforce


In [None]:
# REINFORCE, learning rate 3e-3: train, then plot the sampled episode returns.
log_rewards = train(algo="reinforce",
                    learning_rate=3e-3,
                    hidden_size=128,
                    number_of_layers=3)

# Label the figure so it can be told apart from the other runs when skimmed.
fig, ax = plt.subplots()
ax.plot(x, log_rewards)
ax.set(title="REINFORCE (lr=3e-3)", xlabel="Episode", ylabel="Episode reward")
plt.show()


In [None]:
# REINFORCE, learning rate 1e-3: train, then plot the sampled episode returns.
log_rewards = train(algo="reinforce",
                    learning_rate=1e-3,
                    hidden_size=128,
                    number_of_layers=3)

# Label the figure so it can be told apart from the other runs when skimmed.
fig, ax = plt.subplots()
ax.plot(x, log_rewards)
ax.set(title="REINFORCE (lr=1e-3)", xlabel="Episode", ylabel="Episode reward")
plt.show()

In [None]:
# REINFORCE, learning rate 1e-4: train, then plot the sampled episode returns.
log_rewards = train(algo="reinforce",
                    learning_rate=1e-4,
                    hidden_size=128,
                    number_of_layers=3)

# Label the figure so it can be told apart from the other runs when skimmed.
fig, ax = plt.subplots()
ax.plot(x, log_rewards)
ax.set(title="REINFORCE (lr=1e-4)", xlabel="Episode", ylabel="Episode reward")
plt.show()

---

# A2C


In [None]:
# A2C, learning rate 3e-3: train, then plot the sampled episode returns.
log_rewards = train(algo="a2c",
                    learning_rate=3e-3,
                    hidden_size=128,
                    number_of_layers=3)

# Label the figure so it can be told apart from the other runs when skimmed.
fig, ax = plt.subplots()
ax.plot(x, log_rewards)
ax.set(title="A2C (lr=3e-3)", xlabel="Episode", ylabel="Episode reward")
plt.show()

In [None]:
# A2C, learning rate 1e-3: train, then plot the sampled episode returns.
log_rewards = train(algo="a2c",
                    learning_rate=1e-3,
                    hidden_size=128,
                    number_of_layers=3)

# Label the figure so it can be told apart from the other runs when skimmed.
fig, ax = plt.subplots()
ax.plot(x, log_rewards)
ax.set(title="A2C (lr=1e-3)", xlabel="Episode", ylabel="Episode reward")
plt.show()

In [None]:
# A2C, learning rate 3e-4: train, then plot the sampled episode returns.
log_rewards = train(algo="a2c",
                    learning_rate=3e-4,
                    hidden_size=128,
                    number_of_layers=3)

# Label the figure so it can be told apart from the other runs when skimmed.
fig, ax = plt.subplots()
ax.plot(x, log_rewards)
ax.set(title="A2C (lr=3e-4)", xlabel="Episode", ylabel="Episode reward")
plt.show()

---

# PPO


In [None]:
# PPO, learning rate 1e-3: train, then plot the sampled episode returns.
log_rewards = train(algo="ppo",
                    learning_rate=1e-3,
                    hidden_size=128,
                    number_of_layers=3)

# Label the figure so it can be told apart from the other runs when skimmed.
fig, ax = plt.subplots()
ax.plot(x, log_rewards)
ax.set(title="PPO (lr=1e-3)", xlabel="Episode", ylabel="Episode reward")
plt.show()

In [None]:
# PPO, learning rate 3e-4: train, then plot the sampled episode returns.
log_rewards = train(algo="ppo",
                    learning_rate=3e-4,
                    hidden_size=128,
                    number_of_layers=3)

# Label the figure so it can be told apart from the other runs when skimmed.
fig, ax = plt.subplots()
ax.plot(x, log_rewards)
ax.set(title="PPO (lr=3e-4)", xlabel="Episode", ylabel="Episode reward")
plt.show()

In [None]:
# PPO, learning rate 1e-4: train, then plot the sampled episode returns.
log_rewards = train(algo="ppo",
                    learning_rate=1e-4,
                    hidden_size=128,
                    number_of_layers=3)

# Label the figure so it can be told apart from the other runs when skimmed.
fig, ax = plt.subplots()
ax.plot(x, log_rewards)
ax.set(title="PPO (lr=1e-4)", xlabel="Episode", ylabel="Episode reward")
plt.show()