# Walker2D

Your aim in this task is to train an agent to win the Walker2D game with Actor-Critic, Advantage Actor-Critic (A2C), Trust-Region Policy Optimization (TRPO) or Proximal Policy Optimization (PPO). 
To solve the task feel free to transform the state and reward from the environment.

**Scoring**: The average reward over 50 episodes is calculated. Your goal is to score more than 1000 points.

**Submission format**: send your notebook and trained model in a **zipped** folder.


### Read me, please:
***I had to make some changes to the code so that everything is computed on the 'GPU' device. This made training much faster and gave 30%+ better results in the same amount of time compared with using the 'CPU'.***

In [2]:
import pybullet_envs
from gym import make
import numpy as np
import torch
from torch import nn
from torch.distributions import Normal
from torch.nn import functional as F
from torch.optim import Adam
import random
import os

# Pick the compute device once and make newly created tensors default to it,
# so modules and tensors built later in the notebook land on the same device.
if torch.cuda.is_available():
    device = torch.device("cuda")
    torch.set_default_tensor_type(torch.cuda.FloatTensor)
else:
    device = torch.device("cpu")
    torch.set_default_tensor_type(torch.FloatTensor)
print(device)

cuda


***The cell below tests GPU allocation; it is taken from the [pybullet guide](https://colab.research.google.com/drive/1u6j7JOqM05vUUjpVp5VNk0pd8q-vqGlx ).***

In [3]:
# Purpose of this cell: connect to pybullet without a GUI, report the GPUs that
# GPUtil can see, and try to load pybullet's EGL renderer plugin when at least
# one GPU is reported.
#you can enable the GPU by changing the runtime
import os
# These overrides are set before pybullet is imported.
# NOTE(review): presumably they request OpenGL 3.3 / GLSL 3.30 from Mesa - confirm.
os.environ['MESA_GL_VERSION_OVERRIDE'] = '3.3'
os.environ['MESA_GLSL_VERSION_OVERRIDE'] = '330'
import pybullet as p
import pybullet_data as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import pylab
%matplotlib inline
# DIRECT connection mode (NOTE(review): presumably headless, no GUI window - confirm).
p.connect(p.DIRECT)
#allow to find the assets (URDF, obj, textures etc)
p.setAdditionalSearchPath(pd.getDataPath())
#optionally enable GPU for faster rendering in pybullet.getCameraImage
enableGPU = False
# Setup commands from the original guide, kept for reference (not executed here):
# !ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
# !pip install gputil
import GPUtil as GPU
import sys
# Get all device ids and their processing and memory utilization
# (deviceIds, gpuUtil, memUtil) = GPU.getGPUs()

# Print os and python version information
print('OS: ' + sys.platform)
print(sys.version)

# Print package name and version number
print(GPU.__name__ + ' ' + GPU.__version__)

# Show the utilization of all GPUs in a nice table
GPU.showUtilization()

# Show all stats of all GPUs in a nice table
GPU.showUtilization(all=True)

# NOTE: If all your GPUs currently have a memory consumption larger than 1%,
# this step will fail. It's not a bug! It is intended to do so, if it does not
# find an available GPU.
GPUs = GPU.getGPUs()
# GPU.getGPUs() is queried a second time here; `GPUs` above is not used again in this cell.
numGPUs = len(GPU.getGPUs())
print("numGPUs=",numGPUs)
if numGPUs > 0:
  enableGPU = True
# -1 means "no EGL plugin loaded"; the check at the end treats any id >= 0 as success.
eglPluginId = -1
if enableGPU:
  import pkgutil
  # Prefer an importable 'eglRenderer' module; otherwise ask pybullet to load
  # the plugin by its bare name.
  egl = pkgutil.get_loader('eglRenderer')
  if (egl):
    eglPluginId = p.loadPlugin(egl.get_filename(), "_eglRendererPlugin")
  else:
    eglPluginId = p.loadPlugin("eglRendererPlugin")

# A GPU being present does not guarantee the plugin loads: the output recorded
# with this notebook shows numGPUs= 1 followed by the CPU renderer message.
if eglPluginId>=0:
  print("Using GPU hardware (eglRenderer)")  
else:
  print("using CPU renderer (TinyRenderer)")

OS: win32
3.10.6 | packaged by conda-forge | (main, Aug 22 2022, 20:30:19) [MSC v.1929 64 bit (AMD64)]
GPUtil 1.4.0
| ID | GPU | MEM |
------------------
|  0 | 18% | 14% |
| ID | Name                               | Serial | UUID                                     || GPU temp. | GPU util. | Memory util. || Memory total | Memory used | Memory free || Display mode | Display active |
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|  0 | NVIDIA GeForce RTX 3070 Laptop GPU | [N/A]  | GPU-b3f26e7e-2bdc-a2de-6d3b-d86b9359a503 ||       42C |       18% |          14% ||       8192MB |      1142MB |      6901MB || Enabled      | Enabled        |
numGPUs= 1
using CPU renderer (TinyRenderer)


# Train

In [4]:
# Environment id passed to gym's make().
ENV_NAME = "Walker2DBulletEnv-v0"

# Mixing weight (LAMBDA) and discount factor (GAMMA) used by
# compute_lambda_returns_and_gae when building lambda-returns and advantages.
LAMBDA = 0.95
GAMMA = 0.99

# Learning rates handed to the Adam optimizers of the actor and the critic.
ACTOR_LR = 2e-5
CRITIC_LR = 1e-5

# The probability ratio is clamped to [1 - CLIP, 1 + CLIP] in the surrogate loss.
CLIP = 0.2
# Weight of the entropy term subtracted from the actor loss.
ENTROPY_COEF = 1e-2
# Number of mini-batch gradient steps performed by one PPO.update() call.
BATCHES_PER_UPDATE = 2048
# Number of transitions drawn (uniformly, with replacement) for each mini-batch.
BATCH_SIZE = 64

# Episode sampling before an update continues until BOTH minimums are reached.
MIN_TRANSITIONS_PER_UPDATE = 2048
MIN_EPISODES_PER_UPDATE = 4

# Number of sample-then-update iterations; the training loop evaluates the
# policy every ITERATIONS // 100 iterations.
ITERATIONS = 1000

    
def compute_lambda_returns_and_gae(trajectory):
    """Turn one sampled episode into PPO training transitions.

    Args:
        trajectory: sequence of 5-tuples
            ``(state, action, reward, old_action_prob, value_estimate)``
            in time order, as collected by ``sample_episode``.

    Returns:
        A list, in time order, of tuples
        ``(state, action, old_action_prob, lambda_return, advantage)`` where
        ``advantage = lambda_return - value_estimate``.

    The lambda-return is built backwards with the recursion
    ``G_t = r_t + GAMMA * ((1 - LAMBDA) * V_{t+1} + LAMBDA * G_{t+1})``;
    both ``V`` and ``G`` past the last step are taken to be 0.
    """
    horizon = len(trajectory)
    targets = [0.] * horizon
    advantages = [0.] * horizon

    # Quantities belonging to step t + 1 while step t is being processed.
    next_return = 0.
    next_value = 0.
    for t in range(horizon - 1, -1, -1):
        _, _, reward, _, value = trajectory[t]
        next_return = reward + GAMMA * (next_value * (1 - LAMBDA) + next_return * LAMBDA)
        next_value = value
        targets[t] = next_return
        advantages[t] = next_return - value

    transitions = []
    for (state, action, _, old_prob, _), target, advantage in zip(trajectory, targets, advantages):
        transitions.append((state, action, old_prob, target, advantage))
    return transitions
    


class Actor(nn.Module):
    """Gaussian policy network.

    An MLP with two ELU hidden layers of width 256 maps a state to the mean of
    a diagonal Normal distribution over (pre-tanh) actions. The spread is a
    single learned vector shared by all states: ``self.sigma`` is passed
    through ``exp`` to obtain the standard deviation, so it acts as a log-std
    and is initialised to ones.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        hidden = 256
        layers = [
            nn.Linear(state_dim, hidden),
            nn.ELU(),
            nn.Linear(hidden, hidden),
            nn.ELU(),
            nn.Linear(hidden, action_dim),
        ]
        self.model = nn.Sequential(*layers)
        self.sigma = nn.Parameter(torch.ones(action_dim))

    def _distribution(self, state):
        """Build the Normal action distribution of the current policy for ``state``."""
        mean = self.model(state)
        return Normal(mean, torch.exp(self.sigma))

    def compute_proba(self, state, action):
        """Return ``(probability, distribution)`` for ``action`` in ``state``.

        ``probability`` is the density of ``action`` under the current policy:
        per-dimension log-probabilities are summed over the last axis and
        exponentiated. ``action`` is evaluated directly under the Normal, so
        it is expected to be the non-transformed (pre-tanh) action.
        """
        policy = self._distribution(state)
        log_prob = policy.log_prob(action).sum(-1)
        return torch.exp(log_prob), policy

    def act(self, state):
        """Sample an action for ``state``.

        Returns:
            ``(tanh(raw_action), raw_action, distribution)`` where
            ``raw_action`` is drawn from ``distribution``.
        """
        policy = self._distribution(state)
        raw_action = policy.sample()
        return torch.tanh(raw_action), raw_action, policy
        
        
class Critic(nn.Module):
    """State-value network: an MLP with two ELU hidden layers of width 256
    that maps a state to a single scalar output."""

    def __init__(self, state_dim):
        super().__init__()
        widths = (state_dim, 256, 256)
        layers = []
        # One Linear + ELU pair per hidden layer, created in input-to-output order.
        for fan_in, fan_out in zip(widths, widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ELU())
        layers.append(nn.Linear(widths[-1], 1))
        self.model = nn.Sequential(*layers)

    def get_value(self, state):
        """Return the network output for ``state`` (last dimension has size 1)."""
        estimate = self.model(state)
        return estimate


class PPO:
    """Proximal Policy Optimization agent with separate actor and critic.

    The actor is a Gaussian policy (``Actor``), the critic a state-value
    network (``Critic``); each has its own Adam optimizer.
    """

    def __init__(self, state_dim, action_dim):
        # Moving the networks to `device` explicitly keeps them on the same
        # device as the mini-batches built in update(), independent of the
        # process-wide default tensor type.
        self.actor = Actor(state_dim, action_dim).to(device)
        self.critic = Critic(state_dim).to(device)
        self.actor_optim = Adam(self.actor.parameters(), ACTOR_LR)
        self.critic_optim = Adam(self.critic.parameters(), CRITIC_LR)

    def update(self, trajectories):
        """Run BATCHES_PER_UPDATE mini-batch updates of the actor and critic.

        Args:
            trajectories: list of trajectories, each a list of transitions
                ``(state, action, old_action_prob, target_value, advantage)``
                as produced by ``compute_lambda_returns_and_gae``.

        Does nothing when there are no transitions at all.
        """
        transitions = [t for traj in trajectories for t in traj] # Turn a list of trajectories into list of transitions
        if not transitions:
            return
        state, action, old_prob, target_value, advantage = zip(*transitions)
        state = np.array(state)
        action = np.array(action)
        old_prob = np.array(old_prob)
        target_value = np.array(target_value)
        advantage = np.array(advantage)
        # Normalise advantages over the whole update set. The result must be
        # bound back to `advantage`: the mini-batches below index that array.
        advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-8)

        for _ in range(BATCHES_PER_UPDATE):
            idx = np.random.randint(0, len(transitions), BATCH_SIZE) # Choose random batch
            s = torch.tensor(state[idx], device=device).float()
            a = torch.tensor(action[idx], device=device).float()
            op = torch.tensor(old_prob[idx], device=device).float() # Probability of the action in state s.t. old policy
            v = torch.tensor(target_value[idx], device=device).float() # Estimated by lambda-returns
            adv = torch.tensor(advantage[idx], device=device).float() # Estimated by generalized advantage estimation

            # ---- Actor: clipped surrogate objective with an entropy bonus ----
            new_prob, dist = self.actor.compute_proba(s, a)
            ratio = new_prob / op

            surr_loss = ratio * adv
            clipped_surr_loss = (
                torch.clamp(ratio, 1.0 - CLIP, 1.0 + CLIP) * adv
            )

            entropy = dist.entropy().mean()

            actor_loss = (
                - torch.min(surr_loss, clipped_surr_loss).mean()
                - entropy * ENTROPY_COEF
            )

            # The actor graph is rebuilt every iteration and is not shared
            # with the critic, so it does not need to be retained.
            self.actor_optim.zero_grad()
            actor_loss.backward()
            self.actor_optim.step()

            # ---- Critic: regress the value estimate onto the lambda-return ----
            value = self.critic.get_value(s).flatten()
            critic_loss = F.smooth_l1_loss(value, v)

            self.critic_optim.zero_grad()
            critic_loss.backward()
            self.critic_optim.step()

    def get_value(self, state):
        """Return the critic's estimate for a single state as a Python float."""
        with torch.no_grad():
            state = torch.tensor(np.array([state]), device=device).float()
            value = self.critic.get_value(state)
        return value.cpu().item()

    def act(self, state):
        """Sample an action for a single state.

        Returns:
            ``(action, pure_action, prob)``: the tanh-squashed action and the
            non-transformed action as numpy arrays, and the probability
            (density) of the non-transformed action under the current policy
            as a Python float.
        """
        with torch.no_grad():
            state = torch.tensor(np.array([state]), device=device).float()
            action, pure_action, distr = self.actor.act(state)
            prob = torch.exp(distr.log_prob(pure_action).sum(-1))
        return action.cpu().numpy()[0], pure_action.cpu().numpy()[0], prob.cpu().item()

    def save(self):
        """Pickle the whole actor module to "agent.pkl" in the working directory."""
        torch.save(self.actor, "agent.pkl")


def evaluate_policy(env, agent, episodes=5):
    """Play ``episodes`` full episodes and return the list of their total rewards.

    Each step uses the first element of ``agent.act(state)`` as the action
    passed to ``env.step``; an episode ends when the environment reports done.
    """

    def play_one_episode():
        observation = env.reset()
        score = 0.
        finished = False
        while not finished:
            chosen_action = agent.act(observation)[0]
            observation, reward, finished, _ = env.step(chosen_action)
            score += reward
        return score

    return [play_one_episode() for _ in range(episodes)]
   

def sample_episode(env, agent):
    """Roll out one episode with ``agent`` and convert it to training transitions.

    At every step the agent is asked for an action and then for a value
    estimate of the current state; the tuple
    ``(state, pure_action, reward, action_prob, value)`` is recorded. The
    finished trajectory is passed through ``compute_lambda_returns_and_gae``
    and that function's result is returned.
    """
    steps = []
    observation = env.reset()
    while True:
        action, raw_action, action_prob = agent.act(observation)
        value = agent.get_value(observation)
        next_observation, reward, done, _ = env.step(action)
        steps.append((observation, raw_action, reward, action_prob, value))
        if done:
            break
        observation = next_observation
    return compute_lambda_returns_and_gae(steps)

In [13]:
# Training driver: repeatedly sample episodes with the current policy, run a
# PPO update on them, and periodically evaluate and checkpoint the actor.
env = make(ENV_NAME)
ppo = PPO(state_dim=env.observation_space.shape[0], action_dim=env.action_space.shape[0])
# Initial reset; the returned state is not used (sample_episode resets the env itself).
state = env.reset()
episodes_sampled = 0
steps_sampled = 0

# Evaluate roughly 100 times over the run. max(1, ...) keeps the modulo below
# valid when ITERATIONS < 100 (ITERATIONS // 100 would be 0).
eval_every = max(1, ITERATIONS // 100)
rewards = []

for i in range(ITERATIONS):
    trajectories = []
    steps_ctn = 0

    # Keep sampling until both the episode and the transition minimums are met.
    while len(trajectories) < MIN_EPISODES_PER_UPDATE or steps_ctn < MIN_TRANSITIONS_PER_UPDATE:
        traj = sample_episode(env, ppo)
        steps_ctn += len(traj)
        trajectories.append(traj)
    episodes_sampled += len(trajectories)
    steps_sampled += steps_ctn

    ppo.update(trajectories)

    if (i + 1) % eval_every == 0:
        rewards = evaluate_policy(env, ppo, 5)
        print(f"Step: {i+1}, Reward mean: {np.mean(rewards)}, Reward std: {np.std(rewards)}, Episodes: {episodes_sampled}, Steps: {steps_sampled}")
        ppo.save()
        # Stop early once the evaluation mean reaches the target score.
        if np.mean(rewards) >= 1000: break

# `rewards` stays empty only when the loop body never ran (ITERATIONS == 0),
# in which case there is nothing to report and `i` is undefined.
if rewards:
    print (f'\nModel was evaluated on step {i+1} with results:\n\tReward mean: {np.mean(rewards)}, Reward std: {np.std(rewards)}, Episodes: {episodes_sampled}, Steps: {steps_sampled}')

  logger.warn(
  logger.warn(
  logger.warn(
  logger.deprecation(

Step: 10, Reward mean: 32.76445962179802, Reward std: 13.257043339272526, Episodes: 1280, Steps: 20634
Step: 20, Reward mean: 38.772235054442135, Reward std: 13.820719706283594, Episodes: 1606, Steps: 41524
Step: 30, Reward mean: 58.101272445375514, Reward std: 13.331322066129085, Episodes: 1872, Steps: 62481
Step: 40, Reward mean: 54.69662415395578, Reward std: 6.798903638883825, Episodes: 2145, Steps: 83382
Step: 50, Reward mean: 64.48028512960767, Reward std: 7.989975867493289, Episodes: 2418, Steps: 104143
Step: 60, Reward mean: 66.52254532065476, Reward std: 5.044019096295827, Episodes: 2693, Steps: 124965
Step: 70, Reward mean: 75.18439597885387, Reward std: 12.587622258590557, Episodes: 2977, Steps: 145822
Step: 80, Reward mean: 71.91264332223625, Reward std: 15.715663688913914, Episodes: 3244, Steps: 166785
Step: 90, Reward mean: 75.4565837763998, Reward std: 17.194107402261448, Episodes: 3480, Steps: 187709
St

# Agent

In [8]:
class Agent:
    """Inference wrapper around the actor pickled to "agent.pkl" by ``PPO.save``.

    The environment's actions are continuous vectors, so ``act`` returns the
    tanh-squashed mean action of the policy rather than a discrete index.
    """

    def __init__(self):
        # map_location lets a checkpoint saved on a GPU be loaded on a
        # CPU-only machine (and vice versa).
        # NOTE: torch.load unpickles arbitrary Python objects - only load
        # checkpoints you trust. NOTE(review): recent PyTorch releases default
        # to weights_only=True, which rejects whole pickled modules - confirm
        # against the installed version.
        self.model = torch.load("agent.pkl", map_location=device)

    def act(self, state):
        """Return the deterministic action for ``state`` as a numpy array.

        The saved object is an ``Actor``, which defines no ``forward``; its
        mean network lives in the ``.model`` attribute. The mean is squashed
        with tanh, the same squashing applied to sampled actions during
        training, and moved to the CPU before conversion to numpy.
        """
        with torch.no_grad():
            state = torch.tensor(np.array(state), device=device).float()
            mean_action = self.model.model(state)
            return torch.tanh(mean_action).cpu().numpy()

    def reset(self):
        """No per-episode state to clear."""
        pass

# Smoke check: constructing Agent loads "agent.pkl" from the working directory.
Agent()

<__main__.Agent at 0x1ff6f667fa0>