Install and load all dependencies (first time only) \
NOTE: you may need to restart the runtime afterwards (CTRL+M .).

In [None]:
# System packages (OpenGL / OSMesa / GLEW development files and patchelf) installed
# ahead of free-mujoco-py. The package index is refreshed first so the install does
# not fail on stale package lists.
!apt-get update
!apt-get install -y \
    libgl1-mesa-dev \
    libgl1-mesa-glx \
    libglew-dev \
    libosmesa6-dev \
    software-properties-common \
    patchelf

# gym is pinned below 0.26 because the training loop in this notebook uses the
# classic API: `state = env.reset()` and a 4-tuple from `env.step(...)`.
# gym 0.26 changed these to `(obs, info)` and a 5-tuple, which breaks the unpacking.
# NOTE(review): tighten the pin to the exact version the custom Hopper env was
# written against, if known.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install "gym<0.26"
%pip install free-mujoco-py

Set up the custom Hopper environment



1.   Upload `classes.zip` to the current session's file storage
2.   Unzip it by running the cell below


In [None]:
# -o overwrites already-extracted files without asking, so re-running this cell
# (e.g. after uploading a new classes.zip) does not stall on unzip's "replace?" prompt.
!unzip -o classes.zip



---



\

**Train an RL agent on the OpenAI Gym Hopper environment using REINFORCE and Actor-critic algorithms**

\


TASK 2 and 3: interleave data collection with policy updates

In [None]:
import argparse

import torch
import gym

# NOTE(review): this wildcard import is presumably what makes the 'CustomHopper-*'
# environment ids available to gym.make() (registration as an import side effect) —
# confirm in env/custom_hopper.py before replacing it with explicit names.
from env.custom_hopper import *
from agent import Agent, Policy

In [None]:
# Training configuration (edit here; the cells below only read these values).
n_episodes = 100_000     # total number of training episodes
print_every = 20_000     # report progress once every `print_every` episodes
device = "cpu"           # forwarded to Agent(..., device=device)
algorithm = "reinforce"  # choices=['reinforce', 'reinforce_baseline']

In [None]:
# Create the environment registered under the id 'CustomHopper-source-v0'.
# Swap in the commented-out line to use the 'CustomHopper-target-v0' id instead.
env = gym.make('CustomHopper-source-v0')
# env = gym.make('CustomHopper-target-v0')

# Sanity-check output: the spaces the policy must match, plus the env's parameters.
# NOTE(review): get_parameters() is not part of the standard gym.Env interface —
# presumably provided by the custom Hopper class; confirm in env/custom_hopper.py.
print('Action space:', env.action_space)
print('State space:', env.observation_space)
print('Dynamics parameters:', env.get_parameters())

In [None]:
"""
  Training
"""
# Trains `agent` on `env` for `n_episodes` episodes, updating the policy once per
# episode with the selected `algorithm`, then saves the policy weights to "model.mdl".
# Relies on `env`, `n_episodes`, `print_every`, `device` and `algorithm` from earlier cells.

# Policy input/output sizes are taken from the last axis of the env's space shapes.
observation_space_dim = env.observation_space.shape[-1]
action_space_dim = env.action_space.shape[-1]

policy = Policy(observation_space_dim, action_space_dim)
agent = Agent(policy, device=device)

for episode in range(n_episodes):
  done = False
  train_reward = 0  # running sum of the rewards collected in this episode
  state = env.reset()  # Reset the environment and observe the initial state

  while not done:  # Loop until the episode is over

    action, action_probabilities = agent.get_action(state)
    previous_state = state

    # The env is stepped with a detached, CPU-side NumPy version of the action.
    state, reward, done, info = env.step(action.detach().cpu().numpy())

    # Hand the transition to the agent (presumably buffered for update_policy below).
    agent.store_outcome(previous_state, state, action_probabilities, reward, done)

    train_reward += reward

  agent.update_policy(algorithm)  # Update the policy after each episode

  if (episode+1)%print_every == 0:
    # The trigger counts episodes from 1, so report the same 1-based number
    # (previously the 20000th episode was printed as 19999).
    print('Training episode:', episode + 1)
    print('Episode return:', train_reward)

# Persist the trained policy's parameters once training has finished.
torch.save(agent.policy.state_dict(), "model.mdl")