# Reinforcement Learning with Function Approximation

In [0]:
!pip install torch==1.2.0 torchvision pyvirtualdisplay matplotlib seaborn pandas numpy pathlib gym
!sudo apt-get install xvfb
!git clone https://github.com/rlgammazero/mvarl_hands_on.git > /dev/null 2>&1
!cd mvarl_hands_on && git pull origin master > /dev/null 2>&1

In [0]:
!apt-get install x11-utils
from pathlib import Path
import base64

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical 
from torch import optim

import numpy as np
import pandas as pd
import seaborn as sns
import itertools
import math

import gym
from gym.wrappers import Monitor
from pprint import pprint
from pyvirtualdisplay import Display
from IPython import display as ipythondisplay
from IPython.display import clear_output

# The following code will be used to visualize the environments.

def show_video(directory):
    """Embed every ``*.mp4`` file found in ``directory`` in the notebook output.

    Each video is base64-encoded and inlined in an HTML ``<video>`` tag, so the
    rendered output does not depend on the files staying on disk.

    Parameters
    ----------
    directory : str or Path
        Folder that is searched (non-recursively) for ``*.mp4`` files.
    """
    html = []
    # glob() yields files in arbitrary order; sort so that several recordings
    # are always displayed in the same (lexicographic) order.
    for mp4 in sorted(Path(directory).glob("*.mp4")):
        video_b64 = base64.b64encode(mp4.read_bytes())
        html.append('''<video alt="{}" autoplay 
                      loop controls style="height: 400px;">
                      <source src="data:video/mp4;base64,{}" type="video/mp4" />
                 </video>'''.format(mp4, video_b64.decode('ascii')))
    ipythondisplay.display(ipythondisplay.HTML(data="<br>".join(html)))
    
# Start an invisible (visible=0) virtual display of 1400x900 pixels through
# pyvirtualdisplay. NOTE: this `display` object is distinct from the
# `ipythondisplay` module alias imported above.
display = Display(visible=0, size=(1400, 900))
display.start();

def make_seed(seed):
    """Seed NumPy's global random state and PyTorch's default generator.

    Parameters
    ----------
    seed : int
        Value handed to both ``np.random.seed`` and ``torch.manual_seed``.
    """
    for seeder in (np.random.seed, torch.manual_seed):
        seeder(seed)

PyTorch is a Python package that provides two high-level features:
- Tensor computation (like NumPy) with strong GPU acceleration
- Deep neural networks built on a tape-based autograd system

At a granular level, PyTorch is a library that consists of the following components:

| Component | Description |
| ---- | --- |
| [**torch**](https://pytorch.org/docs/stable/torch.html) | a Tensor library like NumPy, with strong GPU support |
| [**torch.autograd**](https://pytorch.org/docs/stable/autograd.html) | a tape-based automatic differentiation library that supports all differentiable Tensor operations in torch |
| [**torch.jit**](https://pytorch.org/docs/stable/jit.html) | a compilation stack (TorchScript) to create serializable and optimizable models from PyTorch code  |
| [**torch.nn**](https://pytorch.org/docs/stable/nn.html) | a neural networks library deeply integrated with autograd designed for maximum flexibility |
| [**torch.multiprocessing**](https://pytorch.org/docs/stable/multiprocessing.html) | Python multiprocessing, but with magical memory sharing of torch Tensors across processes. Useful for data loading and Hogwild training |
| [**torch.utils**](https://pytorch.org/docs/stable/data.html) | DataLoader and other utility functions for convenience |


**Tutorials on PyTorch:** https://pytorch.org/tutorials/


## OpenAI gym
We will consider environments provided by OpenAI gym.
This library provides a large number of environments to test RL algorithms.

We will focus on the **CartPole-v1** environment in this lab but we encourage you to also test your code on:
* **Acrobot-v1**
* **MountainCar-v0**

| Env Info          	| CartPole-v1 	| Acrobot-v1                	| MountainCar-v0 	|
|-------------------	|-------------	|---------------------------	|----------------	|
| **Observation Space** 	| Box(4)      	| Box(6)                    	| Box(2)         	|
| **Action Space**      	| Discrete(2) 	| Discrete(3)               	| Discrete(3)    	|
| **Rewards**           	| 1 per step  	| -1 if not terminal else 0 	| -1 per step    	|

A gym environment is loaded with the command `env = gym.make(env_id)`. Once the environment is created, you need to reset it with `observation = env.reset()` and then you can interact with it using the method step: `observation, reward, done, info = env.step(action)`.

In [0]:
# Load the CartPole-v1 environment.
env = gym.make('CartPole-v1')
# Wrap it in a Monitor that writes its recordings to ./gym-results
# (force=True is passed so that the directory can be reused).
env = Monitor(env, "./gym-results", force=True)

In [0]:
# Play one episode with uniformly sampled actions, then display the recording.
done = False
obs = env.reset()
while not done:
    # Random policy: draw an action from the action space
    action = env.action_space.sample()
    obs, reward, done, info = env.step(action)
# Close the monitored environment before reading the video files back
env.close()
show_video("./gym-results")

## REINFORCE

**Q1: Implement the REINFORCE algorithm**

The code is split into two parts:
* The Model class defines the architecture of our neural network which takes as input the current state and returns the policy,
* The Agent class is responsible for the training and evaluation procedure. You will need to code the method `optimize_model`.

In [0]:
class Model(nn.Module):
    """Small MLP policy: maps an observation to a distribution over actions.

    Parameters
    ----------
    dim_observation : int
        Size of the observation vector fed to the first linear layer.
    n_actions : int
        Number of discrete actions (size of the output distribution).
    """

    def __init__(self, dim_observation, n_actions):
        super(Model, self).__init__()

        self.n_actions = n_actions
        self.dim_observation = dim_observation

        self.net = nn.Sequential(
            nn.Linear(in_features=self.dim_observation, out_features=16),
            nn.ReLU(),
            nn.Linear(in_features=16, out_features=8),
            nn.ReLU(),
            nn.Linear(in_features=8, out_features=self.n_actions),
            # Normalise over the action axis (the last one). For a single 1-D
            # observation this is identical to dim=0, but for a batch of shape
            # (B, n_actions) dim=0 would wrongly normalise across the batch.
            nn.Softmax(dim=-1)
        )

    def forward(self, state):
        """Return action probabilities for ``state`` (shape ``(..., dim_observation)``)."""
        return self.net(state)

    def select_action(self, state):
        """Sample an action from the policy.

        Parameters
        ----------
        state : array-like or torch.Tensor
            Observation(s); converted to a float32 tensor.

        Returns
        -------
        action : torch.Tensor
            The sampled action index.
        log_proba : torch.Tensor
            Log-probability of the sampled action (differentiable w.r.t. the
            network parameters).
        """
        # as_tensor avoids the copy (and the UserWarning) that torch.tensor
        # triggers when `state` is already a tensor.
        probs = self.forward(torch.as_tensor(state, dtype=torch.float32))
        c = Categorical(probs)
        action = c.sample()
        log_proba = c.log_prob(action)
        return action, log_proba

Create the model based on the properties of the MDP you want to solve

In [0]:
# Size the network from the environment: one input per observation component,
# one output per discrete action.
env_id = 'CartPole-v1'
env = gym.make(env_id)
model = Model(env.observation_space.shape[0], env.action_space.n)
print(f'The model we created correspond to:\n{model}')

We provide a base agent that you will need to extend in the next cell with your implementation of `optimize_model`.

In [0]:
class BaseAgent:
    """REINFORCE agent: Monte-Carlo policy gradient on a gym environment.

    Parameters
    ----------
    config : dict
        Must provide the keys ``'env_id'``, ``'seed'``, ``'gamma'`` and
        ``'learning_rate'``.
    """

    def __init__(self, config):
        self.config = config
        self.env = gym.make(config['env_id'])
        make_seed(config['seed'])
        self.env.seed(config['seed'])
        self.model = Model(self.env.observation_space.shape[0], self.env.action_space.n)
        self.gamma = config['gamma']

        # the optimizer used by PyTorch (Stochastic Gradient, Adagrad, Adam, etc.)
        self.optimizer = torch.optim.Adam(self.model.net.parameters(), lr=config['learning_rate'])
        # Wrap the agent's own environment rather than a notebook-level global
        # `env`, so the agent does not depend on which cells were run before.
        self.monitor_env = Monitor(self.env, "./gym-results", force=True, video_callable=lambda episode: True)

    # Method to implement
    def compute_G_t(self, rewards):
        """Returns the cumulative discounted rewards at each time step

        Parameters
        ----------
        rewards : array
            The array of rewards of one episode

        Returns
        -------
        array
            The cumulative discounted rewards at each time step

        Example
        -------
        for rewards=[1, 2, 3] this method outputs [1 + 2 * gamma + 3 * gamma**2, 2 + 3 * gamma, 3]
        """
        n_steps = len(rewards)
        returns = np.zeros(n_steps)
        # Backward recursion G_t = r_t + gamma * G_{t+1}: linear time instead
        # of re-summing the tail of the episode for every t.
        running_return = 0.0
        for t in reversed(range(n_steps)):
            running_return = rewards[t] + self.gamma * running_return
            returns[t] = running_return
        return returns

    # Method to implement
    def optimize_model(self, n_trajectories):
        """Perform a gradient update using n_trajectories

        Parameters
        ----------
        n_trajectories : int
            The number of trajectories sampled to estimate the policy gradient

        Returns
        -------
        list
            The total (undiscounted) reward collected on each trajectory

        Raises
        ------
        ValueError
            If ``n_trajectories`` is smaller than 1.
        """
        if n_trajectories < 1:
            raise ValueError("n_trajectories must be at least 1")

        G_t = []
        batch_log_probs = []
        global_rewards = []

        for _ in range(n_trajectories):
            rewards = []
            log_probs = []
            done = False
            observation = self.env.reset()
            while not done:
                action, log_proba = self.model.select_action(observation)
                observation, reward, done, info = self.env.step(action.item())
                rewards.append(reward)
                log_probs.append(log_proba)

            G_t.extend(self.compute_G_t(rewards))
            batch_log_probs.extend(log_probs)
            global_rewards.append(sum(rewards))

        # REINFORCE loss: minus the average over all visited steps of
        # log pi(a_t | s_t) * G_t (minimising it ascends the expected return).
        returns_tensor = torch.as_tensor(np.asarray(G_t, dtype=np.float32))
        loss = -(torch.stack(batch_log_probs) * returns_tensor).sum() / len(G_t)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return global_rewards

    def train(self, n_trajectories, n_update):
        """Training method

        Parameters
        ----------
        n_trajectories : int
            The number of trajectories used to approximate the expected gradient
        n_update : int
            The number of gradient updates

        """

        rewards = []
        for episode in range(n_update):
            # Keep the history as plain lists; only the latest batch is needed
            # as an array for the progress statistics.
            batch_rewards = np.asarray(self.optimize_model(n_trajectories))
            rewards.append(batch_rewards.tolist())
            print(f'Episode {episode + 1}/{n_update}: rewards {round(batch_rewards.mean(), 2)} +/- {round(batch_rewards.std(), 2)}')

        # Plotting: one (update index, trajectory reward) row per trajectory
        r = pd.DataFrame((itertools.chain(*(itertools.product([i], rewards[i]) for i in range(len(rewards))))), columns=['Epoch', 'Reward'])
        sns.lineplot(x="Epoch", y="Reward", data=r, ci='sd');

    def evaluate(self, render=False):
        """Evaluate the agent on a single trajectory

        The episode is always played in the monitored environment; the
        recording is shown only when ``render`` is True. The total reward is
        printed once.
        """
        with torch.no_grad():
            observation = self.monitor_env.reset()
            reward_episode = 0
            done = False

            while not done:
                # select_action converts the raw observation to a tensor itself
                action, _ = self.model.select_action(observation)
                observation, reward, done, info = self.monitor_env.step(int(action))
                reward_episode += reward

            self.monitor_env.close()
            if render:
                show_video("./gym-results")
            print(f'Reward: {reward_episode}')

In [0]:
# Hyper-parameters of the REINFORCE experiment.
env_id = 'CartPole-v1'
learning_rate = 0.01
gamma = 1
seed = 1235

# Bundle them in the dictionary read by BaseAgent.
config = dict(
    env_id=env_id,
    learning_rate=learning_rate,
    seed=seed,
    gamma=gamma,
)

print("Current config is:")
pprint(config)


In [0]:
# Train REINFORCE: 70 gradient updates, each estimated from 50 sampled trajectories.
agent = BaseAgent(config)
agent.train(n_trajectories=50, n_update=70)

Evaluate the trained agent on one recorded episode

In [0]:
# render=True: show the recorded video of the evaluation episode.
agent.evaluate(True)

## Policy Evaluation as Supervised Learning

**Q2: Implement batched gradient algorithm**

Define the networks for the state-value function $V$ (ValueNetwork) and the policy (ActorNetwork)


In [0]:
class ValueNetwork(nn.Module):
    """Three-layer fully connected network with ReLU on the two hidden layers.

    Parameters
    ----------
    input_size : int
        Number of input features.
    hidden_size : int
        Width of both hidden layers.
    output_size : int
        Number of outputs of the final (linear, un-activated) layer.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return the raw output of the last linear layer for input ``x``."""
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

    def predict(self, x):
        """Return, as a NumPy value detached from the graph, the first entry of the output."""
        values = self(x).detach().numpy()
        return values[0]

In [0]:
class ActorNetwork(nn.Module):
    """Policy network: maps observations to a categorical distribution over actions.

    Parameters
    ----------
    input_size : int
        Size of one observation vector.
    hidden_size : int
        Width of both hidden layers.
    action_size : int
        Number of discrete actions.
    """

    def __init__(self, input_size, hidden_size, action_size):
        super(ActorNetwork, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, action_size)

    def forward(self, x):
        """Return action probabilities (softmax over the last axis)."""
        out = F.relu(self.fc1(x))
        out = F.relu(self.fc2(out))
        out = F.softmax(self.fc3(out), dim=-1)
        return out

    def select_action(self, x):
        """Sample an action for the observation(s) ``x``.

        Parameters
        ----------
        x : array-like or torch.Tensor
            One observation (any shape with ``input_size`` elements) or a
            batch of observations.

        Returns
        -------
        action : torch.Tensor
            Sampled action indices, shape ``(batch,)`` (``(1,)`` for a single
            observation).
        log_proba : torch.Tensor
            Log-probabilities of the sampled actions, same shape as ``action``.
        """
        # Use the network's own input width instead of a hard-coded 4, so the
        # actor also works on environments whose observation size differs
        # from CartPole's (e.g. Acrobot-v1, MountainCar-v0).
        input_ = torch.as_tensor(x, dtype=torch.float32).reshape(-1, self.fc1.in_features)
        output = self.forward(input_)
        c = Categorical(output)
        action = c.sample()
        log_proba = c.log_prob(action)
        return action, log_proba

Implement your (batched) gradient algorithm



In [0]:
class EvalAgent:
    """Policy evaluation: fit a value network to a fixed policy with TD(0) targets.

    Parameters
    ----------
    config : dict
        Must provide ``'env_id'``, ``'seed'``, ``'gamma'`` and
        ``config['value_network']['learning_rate']``.
    policy : object
        The policy being evaluated; only its ``select_action(observation)``
        method is used, and the first element of its result is taken as the
        action.
    """

    def __init__(self, config, policy):
        self.config = config
        self.env = gym.make(config['env_id'])
        make_seed(config['seed'])
        self.env.seed(config['seed'])
        self.monitor_env = Monitor(self.env, "./gym-results", force=True, video_callable=lambda episode: True)
        self.gamma = config['gamma']
        self.policy = policy

        # Our network
        self.value_network = ValueNetwork(self.env.observation_space.shape[0], 16, 1)

        # optimizers
        self.value_network_optimizer = optim.RMSprop(self.value_network.parameters(),
                                                   lr=config['value_network']['learning_rate'])

    def optimize_model(self, observations, returns, next_observations, batch_dones):
        """Run one TD(0) regression step of the value network.

        Parameters
        ----------
        observations : sequence of arrays
            States $s_t$ visited in the batch.
        returns : sequence of float
            Immediate rewards $r_t$ (one per state; the name is historical).
        next_observations : sequence of arrays
            Successor states $s_{t+1}$.
        batch_dones : sequence of bool
            True where $s_{t+1}$ ended the episode.
        """
        obs = torch.as_tensor(np.asarray(observations, dtype=np.float32))
        next_obs = torch.as_tensor(np.asarray(next_observations, dtype=np.float32))
        rewards = torch.as_tensor(np.asarray(returns, dtype=np.float32)).reshape(-1, 1)
        # 1 for non-terminal transitions, 0 for terminal ones
        not_done = 1.0 - torch.as_tensor(np.asarray(batch_dones, dtype=np.float32)).reshape(-1, 1)

        # Bootstrapped target $r_t + \gamma V(s_{t+1})$, treated as a constant;
        # the value of a terminal state is zero.
        with torch.no_grad():
            td_target = rewards + self.gamma * self.value_network(next_obs) * not_done

        # $V(s_t)$
        values = self.value_network(obs)
        # The MSE loss corresponds to $(r_t + \gamma V(s_{t+1}) - V(s_t))^2$ for each t, i.e. for each state s_t visited
        critic_loss = F.mse_loss(values, td_target)
        # We train the critic
        self.value_network_optimizer.zero_grad()
        critic_loss.backward()
        self.value_network_optimizer.step()

    def training_batch(self, epochs, batch_size):
        """Train the value network and plot its Monte-Carlo test error.

        Parameters
        ----------
        epochs : int
            Number of gradient updates.
        batch_size : int
            Number of episodes collected for each update.
        """
        mse_test = []      # one list of 10 MSE values per test point
        test_epochs = []   # epoch at which each test point was measured

        for epoch in range(epochs):

            batch_observations, batch_next_observations, batch_returns, batch_dones = [], [], [], []

            for _ in range(batch_size):
                observation = self.env.reset()
                done = False
                while not done:
                    # Row vector of whatever size the environment provides
                    observation_tensor = torch.as_tensor(observation, dtype=torch.float32).reshape(1, -1)
                    action = self.policy.select_action(observation_tensor)[0][0].item()
                    next_observation, reward, done, info = self.env.step(action)

                    batch_observations.append(observation)
                    batch_next_observations.append(next_observation)
                    batch_dones.append(done)
                    batch_returns.append(reward)

                    observation = next_observation

            self.optimize_model(batch_observations, batch_returns, batch_next_observations, batch_dones)

            # Test it every 25 epochs (and on the last one)
            if epoch % 25 == 0 or epoch == epochs - 1:
                mse_runs = []
                for _ in range(10):
                    obs_states, y_mc = self.evaluate()
                    obs_states_tensor = torch.as_tensor(obs_states, dtype=torch.float32)
                    y_mc = torch.as_tensor(y_mc, dtype=torch.float32)
                    with torch.no_grad():
                        y_hat = self.value_network(obs_states_tensor)
                    err = (y_mc - y_hat).numpy()
                    mse_runs.append(np.mean(err**2).item())
                mse_test.append(mse_runs)
                test_epochs.append(epoch)
                # Report the average over the 10 test episodes, not just the last one
                print(f'Epoch {epoch}/{epochs}: MSE: {np.mean(mse_runs)}')

        # Plotting: x is the real epoch number of each test point
        records = [(ep, mse) for ep, runs in zip(test_epochs, mse_test) for mse in runs]
        r = pd.DataFrame(records, columns=['Epoch', 'MSE'])
        sns.lineplot(x="Epoch", y="MSE", data=r, ci='sd');

    def evaluate(self, render=False):
        """Play one episode with the policy and compute its Monte-Carlo returns.

        Parameters
        ----------
        render : bool
            If True the episode is played in the monitored environment, the
            recording is displayed and the total reward is printed.

        Returns
        -------
        states : array
            Visited observations, shape (T, observation_dim)
        returns : array
            Discounted return observed from each of these states, shape (T, 1)
        """
        env = self.monitor_env if render else self.env
        observation = env.reset()
        states = [observation.copy()]
        rewards = []
        done = False

        while not done:
            action = self.policy.select_action(torch.tensor(observation, dtype=torch.float))[0]
            observation, reward, done, info = env.step(int(action))
            rewards.append(reward)
            # The final observation has no return attached to it
            if not done:
                states.append(observation.copy())

        env.close()

        # Discounted returns via the backward recursion G_t = r_t + gamma * G_{t+1}
        returns = np.zeros(len(rewards))
        running_return = 0.0
        for t in reversed(range(len(rewards))):
            running_return = rewards[t] + self.gamma * running_return
            returns[t] = running_return

        if render:
            show_video("./gym-results")
            reward_episode = sum(rewards)
            print(f'Reward: {reward_episode}')
        states = np.array(states).reshape(-1, self.env.observation_space.shape[0])
        return states, returns.reshape(-1, 1)

Define configuration

In [0]:
# Hyper-parameters of the policy-evaluation experiment.
env_id = 'CartPole-v1'
value_learning_rate = 0.001
gamma = 0.99
seed = 1

# Bundle them in the dictionary read by EvalAgent.
config_td = dict(
    env_id=env_id,
    gamma=gamma,
    seed=seed,
    value_network=dict(learning_rate=value_learning_rate, reference='./CartPole_value.pt'),
)

print("Current config_td is:")
pprint(config_td)


Create policy

In [0]:
# A temporary environment is created only to read the observation/action sizes.
env = gym.make(config_td['env_id'])
policy = ActorNetwork(env.observation_space.shape[0], 16, env.action_space.n)
# Load the actor weights shipped in the cloned mvarl_hands_on repository.
policy.load_state_dict(torch.load('./mvarl_hands_on/data/CartPole_actor.pt'))
# Sanity check: sample one (action, log-probability) pair for an initial state.
state = torch.tensor(np.array(env.reset(), dtype=np.float32))
print("pi(state) = ", policy.select_action(state))
del env

Run agent

In [0]:
# Fit the value network of the loaded policy: 1000 updates, 2 episodes per update.
agent = EvalAgent(config=config_td, policy=policy)
agent.training_batch(epochs=1000, batch_size=2)

## Advantage Actor Critic (A2C)
**Q3: Implement the A2C method**

As usual we provide a structure you can use as starting point.



**Note:** try to reuse parts of the previous code.



In [0]:
class A2CAgent:
    """Advantage Actor-Critic agent with a TD(0) critic.

    Parameters
    ----------
    config : dict
        Must provide ``'env_id'``, ``'seed'``, ``'gamma'``,
        ``config['value_network']['learning_rate']`` and
        ``config['actor_network']['learning_rate']``.
    """

    def __init__(self, config):
        self.config = config
        self.env = gym.make(config['env_id'])
        make_seed(config['seed'])
        self.env.seed(config['seed'])
        self.monitor_env = Monitor(self.env, "./gym-results", force=True, video_callable=lambda episode: True)
        self.gamma = config['gamma']

        # Our two networks
        self.value_network = ValueNetwork(self.env.observation_space.shape[0], 16, 1)
        self.actor_network = ActorNetwork(self.env.observation_space.shape[0], 16, self.env.action_space.n)

        # Their optimizers
        self.value_network_optimizer = optim.RMSprop(self.value_network.parameters(), lr=config['value_network']['learning_rate'])
        self.actor_network_optimizer = optim.RMSprop(self.actor_network.parameters(), lr=config['actor_network']['learning_rate'])

    def optimize_model(self, observations, actions, returns, next_observations, log_probs, dones):
        """Run one critic update followed by one actor update.

        Parameters
        ----------
        observations : sequence of arrays
            States $s_t$ visited in the batch.
        actions : sequence
            Actions taken; not used by the update (their log-probabilities
            are passed in ``log_probs``).
        returns : sequence of float
            Immediate rewards $r_t$ (the name is historical).
        next_observations : sequence of arrays
            Successor states $s_{t+1}$.
        log_probs : sequence of torch.Tensor
            $\log \pi_\theta(a_t | s_t)$ for each step, still attached to the
            actor's computation graph.
        dones : sequence of bool
            True where $s_{t+1}$ ended the episode.
        """
        obs = torch.as_tensor(np.asarray(observations, dtype=np.float32))
        next_obs = torch.as_tensor(np.asarray(next_observations, dtype=np.float32))
        rewards = torch.as_tensor(np.asarray(returns, dtype=np.float32)).reshape(-1, 1)
        # 1 for non-terminal transitions, 0 for terminal ones
        not_done = 1.0 - torch.as_tensor(np.asarray(dones, dtype=np.float32)).reshape(-1, 1)

        # Bootstrapped target $r_t + \gamma V(s_{t+1})$, treated as a constant;
        # the value function of a terminal state is zero.
        with torch.no_grad():
            td_target = rewards + self.gamma * self.value_network(next_obs) * not_done

        # Here we compute $V(s_t)$
        values = self.value_network(obs)
        # The MSE loss corresponds to $(r_t + \gamma V(s_{t+1}) - V(s_t))^2$ for each t, i.e. for each state s_t visited
        critic_loss = F.mse_loss(values, td_target)
        # We train the critic
        self.value_network_optimizer.zero_grad()
        critic_loss.backward()
        self.value_network_optimizer.step()

        # Now we train the actor
        # We compute $\delta_t = r_t + \gamma V(s_{t+1}) - V(s_t)$ with the
        # values obtained before the critic step.
        advantage = td_target - values.detach()
        # Force a column so the product below is element-wise even if the
        # individual log-probabilities are 0-dim tensors (otherwise
        # (N,) * (N, 1) would silently broadcast to (N, N)).
        stacked_log_probs = torch.stack(list(log_probs)).reshape(-1, 1)
        # The loss for the actor is defined as $-\frac{1}{N}\sum_t \delta_t \log \pi_{\theta}(a_t | s_t)$
        actor_loss = -(stacked_log_probs * advantage).sum() / advantage.shape[0]
        # optimization step for the actor
        self.actor_network_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_network_optimizer.step()

    def training_batch(self, epochs, batch_size):
        """Train actor and critic, testing the policy every 50 epochs.

        Parameters
        ----------
        epochs : int
            Maximum number of updates.
        batch_size : int
            Number of episodes collected for each update.

        Returns
        -------
        list of np.ndarray
            One array of 50 episode rewards per test point, in chronological
            order.
        """
        returns_test = []
        test_epochs = []   # epoch at which each test point was measured

        for epoch in range(epochs):

            batch_observations, batch_next_observations, batch_actions, batch_returns, batch_log_probs, batch_dones = [], [], [], [], [], []

            for _ in range(batch_size):
                observation = self.env.reset()
                done = False
                while not done:
                    action, log_proba = self.actor_network.select_action(observation)
                    next_observation, reward, done, info = self.env.step(action.item())

                    batch_observations.append(observation)
                    batch_next_observations.append(next_observation)
                    batch_actions.append(action)
                    batch_dones.append(done)
                    batch_returns.append(reward)
                    batch_log_probs.append(log_proba)

                    observation = next_observation

            self.optimize_model(batch_observations, batch_actions, batch_returns, batch_next_observations, batch_log_probs, batch_dones)

            # Test it every 50 epochs
            if epoch % 50 == 0 or epoch == epochs - 1:
                returns_test.append(np.array([self.evaluate() for _ in range(50)]))
                test_epochs.append(epoch)
                print(f'Epoch {epoch}/{epochs}: Mean returns: {round(returns_test[-1].mean(), 2)}, Std: {round(returns_test[-1].std(), 2)}')

                # Early stopping
                if returns_test[-1].mean() > 490 and epoch != epochs - 1:
                    print('Early stopping !')
                    break

        # Plotting: x is the real epoch number of each test point
        records = [(ep, reward) for ep, rewards in zip(test_epochs, returns_test) for reward in rewards]
        r = pd.DataFrame(records, columns=['Epoch', 'Reward'])
        sns.lineplot(x="Epoch", y="Reward", data=r, ci='sd');

        return returns_test

    def evaluate(self, render=False):
        """Play one episode with the current actor and return its total reward.

        Parameters
        ----------
        render : bool
            If True the episode is played in the monitored environment, the
            recording is displayed and the reward is printed.

        Returns
        -------
        float
            Sum of the rewards collected during the episode.
        """
        env = self.monitor_env if render else self.env
        observation = env.reset()
        reward_episode = 0
        done = False

        while not done:
            action, _ = self.actor_network.select_action(torch.tensor(observation, dtype=torch.float))
            observation, reward, done, info = env.step(int(action))
            reward_episode += reward

        env.close()
        if render:
            show_video("./gym-results")
            print(f'Reward: {reward_episode}')
        return reward_episode


 

Create configuration for A2C

In [0]:
# Hyper-parameters of the A2C experiment.
env_id = 'CartPole-v1'
value_learning_rate = 0.01
actor_learning_rate = 0.01
gamma = 0.99
entropy = 1
seed = 1

# Bundle them in the dictionary passed to A2CAgent.
config_a2c = dict(
    env_id=env_id,
    gamma=gamma,
    seed=seed,
    value_network=dict(learning_rate=value_learning_rate),
    actor_network=dict(learning_rate=actor_learning_rate),
    entropy=entropy,
)

print("Current config_a2c is:")
pprint(config_a2c)

Run the learning agent

In [0]:
# Train A2C for at most 1000 updates, collecting 2 episodes per update.
agent = A2CAgent(config_a2c)
rewards = agent.training_batch(1000, 2)

Evaluate the trained agent on one recorded episode

In [0]:
# render=True: play the episode in the monitored environment and show the video.
agent.evaluate(True)