# PPO for Cart Pole and Mountain Car

# Introduction
This notebook contains the PPO agent used in the paper "Go-Explore with a guide: Speeding up search in sparse reward domains with goal-directed intrinsic rewards"

The PPO agent was implemented using Stable-Baselines 3, and this notebook was modified from one of Stable-Baselines 3's tutorial notebooks.

Github repo: https://github.com/araffin/rl-tutorial-jnrr19

Stable-Baselines3: https://github.com/DLR-RM/stable-baselines3

It also contains the 2 continuous state environments used:
- Cart Pole
- Mountain Car

In [None]:
!apt-get install ffmpeg freeglut3-dev xvfb  # For visualization
!pip install stable-baselines3[extra]

In [None]:
import stable_baselines3
stable_baselines3.__version__

## Imports

Stable-Baselines works on environments that follow the [gym interface](https://stable-baselines.readthedocs.io/en/master/guide/custom_env.html).
You can find a list of available environments [here](https://gym.openai.com/envs/#classic_control).

It is also recommended to check the [source code](https://github.com/openai/gym) to learn more about the observation and action space of each env, as gym does not have proper documentation.
Not all algorithms work with all action spaces; you can find more details in this [recap table](https://stable-baselines.readthedocs.io/en/master/guide/algos.html).

In [None]:
import gym
import numpy as np

The first thing you need to import is the RL model. Check the documentation to find out which algorithm can be used on which problem.

In [None]:
from stable_baselines3 import PPO
import torch

The next thing you need to import is the policy class that will be used to create the networks (for the policy/value functions).
This step is optional as you can directly use strings in the constructor: 

```PPO('MlpPolicy', env)``` instead of ```PPO(MlpPolicy, env)```

Note that some algorithms like `SAC` have their own `MlpPolicy`; that's why using a string for the policy is the recommended option.

In [None]:
# from stable_baselines3.ppo import MlpPolicy
from stable_baselines3.common.evaluation import evaluate_policy
import copy

# Helper functions
- FirstSolve: Determine first solve run of an environment after running model
- ManyRuns: Compute the first solve run across a few evaluation trials

In [None]:
def FirstSolve(env, seed = None, max_runs = 100, steps_per_run = 200):
    """Train a fresh PPO agent and report the first run after which it solves ``env``.

    A "run" is one call to ``model.learn`` followed by a single evaluation
    episode. The environment counts as solved when that episode's total
    reward equals 1.

    Args:
        env: Environment to train on. It is deep-copied for training and for
            every evaluation, so the object passed in is never stepped.
        seed: If not ``None``, seeds NumPy and PyTorch before the model is
            created.
        max_runs: Maximum number of train/evaluate runs before giving up.
        steps_per_run: ``total_timesteps`` passed to each ``model.learn`` call.

    Returns:
        The 1-based index of the first run whose evaluation episode scored 1,
        or -1 if no run within ``max_runs`` did.
    """
    # Seed *before* building the model: the policy network is initialised in
    # the PPO constructor, so seeding afterwards would leave the initial
    # weights unseeded and the trial non-reproducible.
    if seed is not None:
        np.random.seed(seed)
        torch.manual_seed(seed)

    model = PPO('MlpPolicy', copy.deepcopy(env), verbose=0)

    # NOTE(review): PPO collects whole rollouts of ``n_steps`` transitions
    # (2048 by default in Stable-Baselines3), so each learn() call likely
    # trains on more than ``steps_per_run`` environment steps. Confirm
    # against the SB3 version in use.
    for run in range(1, max_runs + 1):
        model.learn(total_timesteps=steps_per_run)
        # Evaluate on a fresh copy so evaluation never disturbs the training env.
        mean_reward, _ = evaluate_policy(model, copy.deepcopy(env), n_eval_episodes=1)
        if mean_reward == 1:
            print(f'Solved at Run {run}')
            return run

    print('Unsolved')
    return -1

In [None]:
def ManyRuns(env, trials=5):
    """Run ``FirstSolve`` for several seeded trials and print a summary.

    Trial ``i`` (0-based) is run on a deep copy of ``env`` with seed ``i``.
    Trials that return a positive run number are counted as solved.

    If at least one trial solved the environment, two things are printed:
    the average first-solve run number, and the individual run numbers
    followed by the average, all joined by ' & ' (presumably for pasting
    into a LaTeX table row, TODO confirm). Nothing extra is printed when no
    trial solved it.

    Args:
        env: Environment passed (as a deep copy) to each ``FirstSolve`` call.
        trials: Number of trials to run.
    """
    solvedlist = []
    for i in range(trials):
        trial_num = FirstSolve(copy.deepcopy(env), i)
        # FirstSolve returns -1 for an unsolved trial; keep only real run numbers.
        if trial_num > 0:
            solvedlist.append(trial_num)

    if solvedlist:
        avgsolve = sum(solvedlist)/len(solvedlist)
        print(f'Average solved run number is {avgsolve}')
        print(*solvedlist, sep = ' & ', end = ' & '+str(avgsolve))

# Cart Pole

In [None]:
class CartPoleEnv:
    """Gym-style CartPole wrapper with a sparse, goal-directed reward.

    The wrapped environment's own reward is ignored. Every step returns
    reward 0 until the episode terminates; on termination the reward is 1
    if the episode lasted more than ``goal_steps`` steps and 0 otherwise.

    Args:
        env_name: Id passed to ``gym.make``.
        goal_steps: An episode must last strictly more than this many steps
            to count as a success.
        start_state: If not ``None``, the underlying environment's state is
            overwritten with this value after every reset.
        normalizer: Per-dimension bin widths used to discretise the state by
            floor division. ``None`` returns the raw state unchanged.
        cap: Discretised values are clipped to the range ``[-cap, cap]``.
    """
    metadata = {'render.modes': ['human']}

    def __init__(self, env_name = 'CartPole-v0', goal_steps = 50, start_state = None, normalizer = np.array([0.01, 0.001, 0.01, 0.001]), cap = np.array([42])):
        self.name = env_name
        self.numsteps = 0
        self.reward = 0
        self.done = False
        self.start_state = start_state
        self.goal_steps = goal_steps
        self.normalizer = normalizer
        self.cap = cap
        self.env = gym.make(env_name)
        self.env.reset()
        # Expose the wrapped env's spaces so RL libraries can inspect them.
        self.action_space = self.env.action_space
        self.observation_space = self.env.observation_space
        self._apply_start_state()

    def _raw_state(self):
        """Return the underlying environment's current state."""
        # Always go through ``unwrapped`` so reads and writes of the state
        # target the same object regardless of gym wrappers.
        return self.env.unwrapped.state

    def _apply_start_state(self):
        """Overwrite the underlying state with ``start_state`` when one is set."""
        if self.start_state is not None:
            self.env.unwrapped.state = self.start_state

    def render(self):
        """Render the wrapped environment and return whatever it returns."""
        return self.env.render()

    def reset(self):
        """Reset the episode and return the initial state representation."""
        self.env.reset()
        self._apply_start_state()
        self.reward = 0
        self.done = False
        self.numsteps = 0
        return self.staterep()

    # returns state representation
    def staterep(self):
        """Return the discretised state, or the raw state if ``normalizer`` is None."""
        state = self._raw_state()
        if self.normalizer is None:
            return state
        return (state // self.normalizer).clip(-self.cap, self.cap)

    # gets a valid move
    def getvalidmoves(self):
        """Return the list of valid discrete actions."""
        return list(range(self.env.action_space.n))

    def step(self, move):
        """Apply ``move`` and return ``(state, reward, done, info)``.

        Once the episode is done, further calls do not step the environment;
        they return the current state representation with the final reward.
        An invalid ``move`` is replaced by a uniformly sampled valid one.
        """
        if self.done:
            return self.staterep(), self.reward, self.done, {}

        validmoves = self.getvalidmoves()
        # randomly sample a move if not in validmoves
        if move not in validmoves:
            move = validmoves[np.random.randint(len(validmoves))]
        self.numsteps += 1

        # Only the termination flag is used; the wrapped env's observation
        # and reward are replaced by staterep() and the sparse reward.
        _, _, done, _ = self.env.step(move)

        # success only if the episode lasted more than goal_steps steps
        if done:
            self.done = True
            self.reward = 1 if self.numsteps > self.goal_steps else 0
            self.env.close()

        return self.staterep(), self.reward, done, {}

    def sample(self):
        """Return a uniformly sampled valid action."""
        return np.random.choice(self.getvalidmoves())

    def print(self):
        """Print the underlying environment's raw state."""
        print(self._raw_state())

## Discrete states

In [None]:
TRIAL_NUM = 10 # number of FirstSolve trials ManyRuns performs for each environment configuration below

In [None]:
# Discretised CartPole: an episode succeeds only if it lasts more than 50 steps.
env = CartPoleEnv(goal_steps = 50, start_state = np.array([0, 0, 0, 0]))
ManyRuns(env, TRIAL_NUM)

In [None]:
# Discretised CartPole: an episode succeeds only if it lasts more than 100 steps.
env = CartPoleEnv(goal_steps = 100, start_state = np.array([0, 0, 0, 0]))
ManyRuns(env, TRIAL_NUM)

In [None]:
# Discretised CartPole: an episode succeeds only if it lasts more than 175 steps.
env = CartPoleEnv(goal_steps = 175, start_state = np.array([0, 0, 0, 0]))
ManyRuns(env, TRIAL_NUM)

## Continuous States

In [None]:
# Continuous CartPole (normalizer=None returns the raw state); success requires more than 50 steps.
env = CartPoleEnv(goal_steps = 50, start_state = np.array([0, 0, 0, 0]), normalizer = None)
ManyRuns(env, TRIAL_NUM)

In [None]:
# Continuous CartPole (normalizer=None returns the raw state); success requires more than 100 steps.
env = CartPoleEnv(goal_steps = 100, start_state = np.array([0, 0, 0, 0]), normalizer = None)
ManyRuns(env, TRIAL_NUM)

In [None]:
# Continuous CartPole (normalizer=None returns the raw state); success requires more than 175 steps.
env = CartPoleEnv(goal_steps = 175, start_state = np.array([0, 0, 0, 0]), normalizer = None)
ManyRuns(env, TRIAL_NUM)

# Mountain Car

In [None]:
class MountainCarEnv:
    """Gym-style Mountain Car wrapper with a sparse, goal-directed reward.

    The wrapped environment's own reward is ignored. Every step returns
    reward 0 until the episode terminates; on termination the reward is 1
    if the first state component is at least 0.5 and 0 otherwise.

    Args:
        env_name: Id passed to ``gym.make``.
        start_state: If not ``None``, the underlying environment's state is
            overwritten with this value after every reset.
        normalizer: Per-dimension bin widths used to discretise the state by
            floor division. ``None`` returns the raw state unchanged.
        cap: Discretised values are clipped to the range ``[-cap, cap]``.
    """
    metadata = {'render.modes': ['human']}

    def __init__(self, env_name = 'MountainCar-v0', start_state = None, normalizer = np.array([0.01, 0.001]), cap = np.array([50])):
        self.env_name = env_name
        self.numsteps = 0
        self.reward = 0
        self.done = False
        self.start_state = start_state
        self.normalizer = normalizer
        self.cap = cap
        self.env = gym.make(env_name)
        self.env.reset()
        # Expose the wrapped env's spaces so RL libraries can inspect them.
        self.action_space = self.env.action_space
        self.observation_space = self.env.observation_space
        self._apply_start_state()

    def _raw_state(self):
        """Return the underlying environment's current state."""
        # Always go through ``unwrapped`` so reads and writes of the state
        # target the same object regardless of gym wrappers.
        return self.env.unwrapped.state

    def _apply_start_state(self):
        """Overwrite the underlying state with ``start_state`` when one is set."""
        if self.start_state is not None:
            self.env.unwrapped.state = self.start_state

    def render(self):
        """Render the wrapped environment and return whatever it returns."""
        return self.env.render()

    def reset(self):
        """Reset the episode and return the initial state representation."""
        self.env.reset()
        self._apply_start_state()
        self.reward = 0
        self.done = False
        self.numsteps = 0
        return self.staterep()

    # returns state representation
    def staterep(self):
        """Return the discretised state, or the raw state if ``normalizer`` is None."""
        state = self._raw_state()
        if self.normalizer is None:
            return state
        return (state // self.normalizer).clip(-self.cap, self.cap)

    # gets a valid move
    def getvalidmoves(self):
        """Return the list of valid discrete actions."""
        return list(range(self.env.action_space.n))

    def step(self, move):
        """Apply ``move`` and return ``(state, reward, done, info)``.

        Once the episode is done, further calls do not step the environment;
        they return the current state representation with the final reward.
        An invalid ``move`` is replaced by a uniformly sampled valid one.
        """
        if self.done:
            return self.staterep(), self.reward, self.done, {}

        validmoves = self.getvalidmoves()
        # randomly sample a move if not in validmoves
        if move not in validmoves:
            move = validmoves[np.random.randint(len(validmoves))]
        self.numsteps += 1

        # Only the termination flag is used; the wrapped env's observation
        # and reward are replaced by staterep() and the sparse reward.
        _, _, done, _ = self.env.step(move)

        # only if past x=0.5, then it is success
        if done:
            self.done = True
            self.reward = 1 if self._raw_state()[0] >= 0.5 else 0
            self.env.close()

        return self.staterep(), self.reward, done, {}

    def sample(self):
        """Return a uniformly sampled valid action."""
        return np.random.choice(self.getvalidmoves())

    def print(self):
        """Print the underlying environment's raw state."""
        print(self._raw_state())

## Discrete State

In [None]:
# Discretised Mountain Car with the start state fixed to [-0.5, 0] after every reset.
env = MountainCarEnv(start_state = np.array([-0.5, 0]))
ManyRuns(env, TRIAL_NUM)

## Continuous State

In [None]:
# Continuous Mountain Car (normalizer=None returns the raw state), start state fixed to [-0.5, 0].
env = MountainCarEnv(start_state = np.array([-0.5, 0]), normalizer = None)
ManyRuns(env, TRIAL_NUM)