In [1]:
import torch
from torch import nn
from torch import optim
import gym
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter

from octopus.nn.policy.rewards import discount_rewards, calculate_discounted_return_an_episode
from octopus.nn.policy.loss import loss_func
from octopus.nn.policy.env import extract_transitions, generate_transitions
from octopus.utils import evaluate_policy_agent, simulate_policy_agent

In [2]:
writer = SummaryWriter(log_dir="../runs/deep_policy")

In [3]:
class DeepPolicy(nn.Module):
    def __init__(self, n_observations, n_actions, hidden_size):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(n_observations, hidden_size),
            nn.LeakyReLU(),
            # nn.Dropout(),
            # nn.Linear(hidden_size, hidden_size),
            # nn.ReLU(),
            # nn.Dropout(),
            nn.Linear(hidden_size, n_actions),
            nn.Softmax(dim=0)
        )
    
    def forward(self, x):
        return self.layers(x)

### Training Loop

In [4]:
env_id = "CartPole-v1"

In [5]:
LEARNING_RATE = 0.009
HIDDE_SIZE = 150

env = gym.make(env_id)
n_actions = env.action_space.n
n_observations = env.observation_space.shape[0]


In [6]:
model = DeepPolicy(n_observations, n_actions, HIDDE_SIZE)
optimizier = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [11]:
N_EPISODE = 400
DISCOUNT_FACTOR = 0.99

losses = []
epsiode_len = []

for episode in tqdm(range(N_EPISODE)):
    transitions = generate_transitions(model=model, env=env)

    states, actions, rewards = extract_transitions(transitions)
        
    # convert to torch tensor
    states = torch.tensor(states)
    rewards = torch.tensor(rewards).flip(dims=(0,))
    
    # discounted_rewards = discount_rewards(rewards, discount_factor=DISCOUNT_FACTOR)
    discounted_return = calculate_discounted_return_an_episode(rewards, discount_factor=DISCOUNT_FACTOR)
    discounted_return = torch.tensor([discounted_return]).squeeze()
    
    # make prediction over a batch of states
    predicted_actions = model(states)
    selected_actions = predicted_actions[range(len(predicted_actions)), actions]
    
    loss = loss_func(selected_actions, discounted_return)
    losses.append(loss.detach().numpy())
    
    writer.add_scalar("episode loss", loss.detach().numpy(), episode)
    writer.add_scalar("episode length", len(transitions), episode)
    
    optimizier.zero_grad()
    loss.backward()
    optimizier.step()

  1%|          | 3/400 [01:06<1:55:13, 17.41s/it]

In [11]:
arr = torch.arange(10)

In [12]:
arr.flip(dims=(0,))

tensor([9, 8, 7, 6, 5, 4, 3, 2, 1, 0])

### Evaluate Agent

In [10]:
mean_reward, std_reward = evaluate_policy_agent(env, model, 1)
mean_reward, std_reward

100%|██████████| 1/1 [00:00<00:00, 47.74it/s]


(tensor(501.), tensor(nan))

In [9]:
# simulate_policy_agent(env_id=env_id, model=model)

# wrong discount_return: mean_reward = 501

### Upload to HuggingFace

In [45]:
# from huggingface_hub import notebook_login
# from huggingface_sb3 import package_to_hub

In [46]:
# import gym
# from stable_baselines3.common.vec_env import DummyVecEnv
# from stable_baselines3.common.env_util import make_vec_env

# from huggingface_sb3 import package_to_hub

# ## TODO: Define a repo_id
# ## repo_id is the id of the model repository from the Hugging Face Hub (repo_id = {organization}/{repo_name} for instance ThomasSimonini/ppo-LunarLander-v2
# repo_id = "cartpole-vanila-policy-gradient"

# # TODO: Define the name of the environment
# env_id = "CartPole-v1"

# # Create the evaluation env
# eval_env = DummyVecEnv([lambda: gym.make(env_id)])


# # TODO: Define the model architecture we used
# model_architecture = "Policy Gradient"

# ## TODO: Define the commit message
# commit_message = "Beat you all"

# # method save, evaluate, generate a model card and record a replay video of your agent before pushing the repo to the hub
# package_to_hub(model=model, # Our trained model
#                model_name="cartpole-vanila-policy-gradient", # The name of our trained model 
#                model_architecture=model_architecture, # The model architecture we used: in our case PPO
#                env_id=env_id, # Name of the environment
#                eval_env=eval_env, # Evaluation Environment
#                repo_id=repo_id, # id of the model repository from the Hugging Face Hub (repo_id = {organization}/{repo_name} for instance ThomasSimonini/ppo-LunarLander-v2
#                commit_message=commit_message)

# # Note: if after running the package_to_hub function and it gives an issue of rebasing, please run the following code
# # cd <path_to_repo> && git add . && git commit -m "Add message" && git pull 
# # And don't forget to do a "git push" at the end to push the change to the hub.