# Hill Climbing with Adaptive Noise Scaling

## 1. Import packages


In [2]:
import gym
import math
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

!python -m pip install pyvirtualdisplay
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()

is_ipython = 'inline' in plt.get_backend()
if is_ipython:
    from IPython import display

plt.ion()



ModuleNotFoundError: No module named 'pyvirtualdisplay'

## 2. Instantiate Env and Agent

In [3]:
device = torch.device('cpu')
env = gym.make('MountainCarContinuous-v0')
env.seed(101)
np.random.seed(101)

print('observation space:', env.observation_space)
print('action space:', env.action_space)
print('  - low:', env.action_space.low)
print('  - high:', env.action_space.high)

class Agent(nn.Module):
    def __init__(self, env, h_size=16):
        super(Agent, self).__init__()
        self.env = env
        # state, hidden layer, action sizes
        self.s_size = env.observation_space.shape[0]
        self.h_size = h_size
        self.a_size = env.action_space.shape[0]
        # define layers
        self.fc1 = nn.Linear(self.s_size, self.h_size)
        self.fc2 = nn.Linear(self.h_size, self.a_size)
        
    def set_weights(self, weights):
        s_size = self.s_size
        h_size = self.h_size
        a_size = self.a_size
        # separate the weights for each layer
        fc1_end = (s_size*h_size)+h_size
        fc1_W = torch.from_numpy(weights[:s_size*h_size].reshape(s_size, h_size))
        fc1_b = torch.from_numpy(weights[s_size*h_size:fc1_end])
        fc2_W = torch.from_numpy(weights[fc1_end:fc1_end+(h_size*a_size)].reshape(h_size, a_size))
        fc2_b = torch.from_numpy(weights[fc1_end+(h_size*a_size):])
        # set the weights for each layer
        self.fc1.weight.data.copy_(fc1_W.view_as(self.fc1.weight.data))
        self.fc1.bias.data.copy_(fc1_b.view_as(self.fc1.bias.data))
        self.fc2.weight.data.copy_(fc2_W.view_as(self.fc2.weight.data))
        self.fc2.bias.data.copy_(fc2_b.view_as(self.fc2.bias.data))
    
    def get_weights_dim(self):
        return (self.s_size+1)*self.h_size + (self.h_size+1)*self.a_size
        
    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.tanh(self.fc2(x))
        return x.cpu().data #action
        
    def evaluate(self, weights, gamma=1.0, max_t=5000):
        self.set_weights(weights) #update agent nn weights
        episode_return = 0.0
        state = self.env.reset()
        for t in range(max_t):
            state = torch.from_numpy(state).float().to(device)
            action = self.forward(state)
            state, reward, done, _ = self.env.step(action)
            episode_return += reward * math.pow(gamma, t)
            if done:
                break
        return episode_return
    
agent = Agent(env).to(device)

observation space: Box(2,)
action space: Box(1,)
  - low: [-1.]
  - high: [1.]


In [4]:
env.reset()

array([-0.48382922,  0.        ])

## 3. Train the Agent with Hill Climbing with Adaptive Noise Scaling

As explained [here](https://towardsdatascience.com/three-aspects-of-deep-rl-noise-overestimation-and-exploration-122ffb4bb92b), 

>The adaptive noise scaling for our model is realized as follows. If the current value of the target function is better than the best value obtained for the target function, we divide the noise scale by 2, and this noise is added to the weight matrix. If the current value of the target function is worse than the best obtained value, we multiply the noise scale by 2, and this noise is added to the best obtained value of the weight matrix. In both cases, a noise scale is added with some random factor different for any element of the matrix.

In [14]:
"""
env = gym.make('CartPole-v0')
env.seed(0)
np.random.seed(0)

policy = Policy()
"""
def hill_climbing(agent, n_episodes=1000, max_t=1000, gamma=1.0, print_every=10, noise_scale=0.5, sigma=0.5):
    """Implementation of hill climbing with adaptive noise scaling.
        
    Params
    ======
        - n_episodes (int): maximum number of training episodes
        - max_t (int): maximum number of timesteps per episode
        - gamma (float): discount rate
        - print_every (int): how often to print average score (over last 100 episodes)
        - noise_scale (float): standard deviation of additive noise
        - sigma (float): standard deviation of additive noise
    """
    scores_deque = deque(maxlen=100)
    scores = []
    best_R = -np.Inf
    #replace: best_w = policy.w
    best_w = noise_scale*np.random.randn(agent.get_weights_dim())
    agent.set_weights(best_w)
    
    for i_episode in range(1, n_episodes+1):
        rewards = []
        state = agent.env.reset()
        rewards = np.array([agent.evaluate(best_w, gamma, max_t) for weights in weights_pop])
        
        scores_deque.append(sum(rewards))
        scores.append(sum(rewards))

        discounts = [gamma**i for i in range(len(rewards)+1)]
        R = sum([discount*reward for discount,reward in zip(discounts, rewards)])
        print('noise_scale=', noise_scale)
        if R >= best_R: # found better weights
            best_R = R
            noise_scale = max(1e-3, noise_scale / 2)
            best_w += noise_scale * np.random.rand(agent.get_weights_dim()) 
            agent.set_weights(best_w)
    
        else: # did not find better weights
            noise_scale = min(2, noise_scale * 2)
            best_w += noise_scale * np.random.rand(agent.get_weights_dim())
            agent.set_weights(best_w)
    
        if i_episode % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))
        if np.mean(scores_deque)>=195.0:
            print('Environment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_deque)))
            agent.set_weights(best_w)
            break
        
    return scores

def cem_adaptive_scaling_noise(n_iterations=500, max_t=1000, gamma=1.0, print_every=10, pop_size=50, elite_frac=0.2, sigma=0.5, is_adaptive=False):
    """PyTorch implementation of the cross-entropy method.
        
    Params
    ======
        n_iterations (int): maximum number of training iterations
        max_t (int): maximum number of timesteps per episode
        gamma (float): discount rate
        print_every (int): how often to print average score (over last 100 episodes)
        pop_size (int): size of population at each iteration
        elite_frac (float): percentage of top performers to use in update
        sigma (float): standard deviation of additive noise
    """
    n_elite=int(pop_size*elite_frac)

    scores_deque = deque(maxlen=100)
    scores = []
    best_weight = sigma*np.random.randn(agent.get_weights_dim())
    best_reward = -np.Inf
    
    for i_iteration in range(1, n_iterations+1):
        weights_pop = [best_weight + (sigma*np.random.randn(agent.get_weights_dim())) for i in range(pop_size)]
        rewards = np.array([agent.evaluate(weights, gamma, max_t) for weights in weights_pop])

        elite_idxs = rewards.argsort()[-n_elite:]
        elite_weights = [weights_pop[i] for i in elite_idxs]
        best_weight = np.array(elite_weights).mean(axis=0)

        reward = agent.evaluate(best_weight, gamma=1.0)
        scores_deque.append(reward)
        scores.append(reward)
        

        
        torch.save(agent.state_dict(), 'checkpoint.pth')
        
        if i_iteration % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(i_iteration, np.mean(scores_deque)))
            if is_adaptive:
                # adaptive scaling noise
                if np.mean(scores_deque) >= best_reward:
                    best_reward = reward
                    sigma = max(1e-3, sigma / 2)
                    print('performance improving=>decreasing sigma',sigma)
                else:
                    sigma = min(2, sigma*2)
                    print('performance worsen=>increasing sigma',sigma)

        if np.mean(scores_deque)>=90.0:
            print('\nEnvironment solved in {:d} iterations!\tAverage Score: {:.2f}'.format(i_iteration-100, np.mean(scores_deque)))
            break
    return scores

scores = cem_adaptive_scaling_noise()

Episode 10	Average Score: -1.49
performance improving=>decreasing sigma 0.25
Episode 20	Average Score: -1.08
performance improving=>decreasing sigma 0.125
Episode 30	Average Score: -0.73
performance worsen=>increasing sigma 0.25
Episode 40	Average Score: -0.60
performance worsen=>increasing sigma 0.5
Episode 50	Average Score: -0.78
performance worsen=>increasing sigma 1.0
Episode 60	Average Score: -1.72
performance worsen=>increasing sigma 2
Episode 70	Average Score: -8.17
performance worsen=>increasing sigma 2
Episode 80	Average Score: -8.31
performance worsen=>increasing sigma 2
Episode 90	Average Score: -8.22
performance worsen=>increasing sigma 2
Episode 100	Average Score: -8.03
performance worsen=>increasing sigma 2
Episode 110	Average Score: -8.23
performance worsen=>increasing sigma 2
Episode 120	Average Score: -9.00
performance worsen=>increasing sigma 2
Episode 130	Average Score: -9.53
performance worsen=>increasing sigma 2
Episode 140	Average Score: -9.93
performance worsen=>

In [None]:
# plot the scores
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(1, len(scores)+1), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()