# Imitation Learning: Behavioural Cloning and the DAGGER Algorithm

## 1. Import the Necessary Packages

In [1]:
# Add a parent directory to the module search path so the package can be found.
# Only needed for a source-code build; a pip install doesn't need it.
import os, inspect
# Directory of the file the current frame was loaded from.
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
# Two directory levels above currentdir.
parentdir = os.path.dirname(os.path.dirname(currentdir))
# Insert at position 0 so this location is searched before the existing entries.
os.sys.path.insert(0,parentdir)

import gym
import numpy as np
import pybullet_envs
import os.path
import time


## 2. Instantiate the Environment, Agent, and Expert Demonstrator

In [2]:
from flagrun_expert_demonstrator import *

# When True, request 'human' rendering right after the environment is created;
# the rollout loop below also sleeps 1/60 s per frame while this flag is set.
gui = True
env = gym.make("HumanoidFlagrunBulletEnv-v0")
if gui:
    env.render(mode="human")

import torch
import torch.nn as nn
import torch.nn.functional as f 
                            
class StudentPolicy(nn.Module):
    """Stateless multi-layer perceptron mapping an observation to an action.

    Architecture: Linear(obs_dim, 256) -> ReLU -> Linear(256, 128) -> ReLU
    -> Linear(128, act_dim). Weights are Xavier-uniform initialised and every
    bias is set to 0.01.
    """

    def __init__(self, observation_space, action_space):
        """Size the network from ``observation_space.shape[0]`` and
        ``action_space.shape[0]``."""
        super().__init__()
        n_inputs = observation_space.shape[0]
        n_outputs = action_space.shape[0]

        self.weights_dense1 = nn.Linear(n_inputs, 256)
        self.weights_dense2 = nn.Linear(256, 128)
        self.weights_dense_final = nn.Linear(128, n_outputs)

        layers = (self.weights_dense1, self.weights_dense2, self.weights_dense_final)
        # Weights are re-initialised in layer order so the random draws happen
        # in the same sequence regardless of how this loop is written.
        for layer in layers:
            torch.nn.init.xavier_uniform_(layer.weight)
        for layer in layers:
            layer.bias.data.fill_(0.01)

    def forward(self, x):
        """Return the raw (un-squashed) output of the final linear layer."""
        hidden = f.relu(self.weights_dense1(x))
        hidden = f.relu(self.weights_dense2(hidden))
        return self.weights_dense_final(hidden)

                            
        
def rollout_for_one_episode(policy= ExpertPolicy(env.observation_space, env.action_space)):
    '''
    Rollout a particular policy for a single episode.

    Steps the module-level ``env`` with actions chosen by ``policy`` and records
    every (observation, action) pair along the way.

    Args:
        policy: callable that takes a torch tensor observation and returns a
            torch tensor action. The default expert instance is created once,
            when this function is defined, and reused on every call.

    Returns:
        dict with keys 'observations' and 'actions', each a list holding one
        entry per environment step. If the render call reports that the window
        is no longer open, the samples gathered so far are returned.
    '''
    from itertools import count

    rollout_data = {'observations':[], 'actions':[]}

    frame = 0
    score = 0
    restart_delay = 0
    obs = env.reset()
    for _step in count():
        rollout_data['observations'].append(obs)
        # Inference only: no autograd graph is needed to pick an action.
        with torch.no_grad():
            a = policy(torch.Tensor(obs)).detach().numpy()
        rollout_data['actions'].append(a)
        obs, r, done, _ = env.step(a)
        score += r
        frame += 1
        if gui:
            time.sleep(1./60)

        still_open = env.render("human")

        if still_open==False:
            # Previously a bare `return` (None), which made callers that index
            # the result fail; hand back what was recorded instead.
            return rollout_data
        if not done: continue
        if restart_delay==0:
            print("score=%0.2f in %i frames" % (score, frame))
            if still_open!=True:      # not True in multiplayer or non-Roboschool environment
                break
            restart_delay = 60*2  # 2 sec at 60 fps
        # NOTE(review): while restart_delay counts down, the loop keeps stepping
        # the finished episode and recording those samples -- confirm that these
        # post-terminal samples are wanted in the training data.
        restart_delay -= 1
        if restart_delay==0: break
    return rollout_data


def rollout_for_n_episodes(n, policy= ExpertPolicy(env.observation_space, env.action_space)):
    '''
    Rollout a particular policy for n episodes.

    Args:
        n: number of episodes to run.
        policy: policy passed through to ``rollout_for_one_episode``. The
            default expert instance is created once, when this function is
            defined.

    Returns:
        dict with keys 'observations' and 'actions'; each list is the
        concatenation, in episode order, of the per-episode lists.
    '''
    rollout_data = {'observations':[], 'actions':[]}
    for i in range(n):
        print('episode', i)
        recent_rollout_data = rollout_for_one_episode(policy)
        if recent_rollout_data is None:
            # A rollout can come back as None (an early bare return when
            # rendering stops); keep what was collected so far instead of
            # failing on a None subscript.
            break
        rollout_data['observations'].extend(recent_rollout_data['observations'])
        rollout_data['actions'].extend(recent_rollout_data['actions'])
    return rollout_data

WalkerBase::__init__ start




## 3. Train the Agent with Behavioural Cloning

Press 'w' in the pybullet GUI to turn on wireframe mode to render more quickly.

In [3]:
from torch.utils import data
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as f

class Dataset(data.Dataset):
    """PyTorch dataset pairing each input in ``X`` with the target at the same index in ``Y``."""

    def __init__(self, X, Y):
        """Keep references to the inputs ``X`` and the targets ``Y``."""
        self.X, self.Y = X, Y

    def __len__(self):
        """Number of samples, taken from the length of ``X``."""
        return len(self.X)

    def __getitem__(self, index):
        """Return the ``(X[index], Y[index])`` pair."""
        return self.X[index], self.Y[index]

def train_model(policy, training_data, n_epochs=100, batch_size=128, lr=1e-3, shuffle=False):
    '''
    Given a dict of training data, train a policy network
    using supervised learning.

    The policy is fitted in place by minimising the mean squared error between
    its predictions and the recorded actions with the Adam optimiser.

    Args:
        policy: torch module to train; its parameters are updated in place.
        training_data: dict with keys 'observations' and 'actions' holding
            equally long sequences of samples.
        n_epochs: number of passes over the data.
        batch_size: mini-batch size handed to the DataLoader.
        lr: Adam learning rate.
        shuffle: whether the DataLoader reshuffles the samples every epoch.
            The default keeps the recorded order.

    Returns:
        The same ``policy`` object, after training.
    '''

    dataset = Dataset(training_data['observations'], training_data['actions'])
    dataloader = data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

    optimizer = optim.Adam(policy.parameters(), lr=lr)
    mse = nn.MSELoss()
    for ne in range(n_epochs):
        epoch_loss = 0.0
        for obs, act in dataloader:
            # The optimiser owns every policy parameter, so one zero_grad is enough.
            optimizer.zero_grad()

            # Cast both sides to float32 so float64 samples match the network weights.
            predicted_action = policy(obs.float())
            loss = mse(predicted_action, act.float())

            # The graph is rebuilt for every batch, so it does not need to be retained.
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        # Report once per epoch: the sum of the per-batch losses of that epoch.
        print("Epoch: {}, Total loss: {}".format(ne, epoch_loss))
    return policy
            
def evaluate_model(policy, data):
    '''
    Evaluate a policy on a list of recorded observations.

    Args:
        policy: callable that takes a torch tensor observation and returns a
            torch tensor action.
        data: iterable of observations, each convertible with ``torch.Tensor``.

    Returns:
        list of numpy arrays, one predicted action per observation, in order.
    '''
    actions = []
    # Inference only: skip autograd bookkeeping while querying the policy.
    with torch.no_grad():
        for obs in data:
            predicted_action = policy(torch.Tensor(obs))
            actions.append(predicted_action.detach().numpy())
    return actions

def behavioural_cloning(expert_policy, student_policy, n_expert_episodes=10):
    '''
    Given an expert demonstrator and a student policy, train the student
    to imitate the expert with behavioural cloning.

    Args:
        expert_policy: policy rolled out to collect demonstrations.
        student_policy: policy network fitted to the demonstrations.
        n_expert_episodes: number of expert episodes to collect.

    Returns:
        The student policy returned by ``train_model``.
    '''
    # collect initial expert demonstrations
    print('Rolling Out Expert')
    expert_rollout_data = rollout_for_n_episodes(n_expert_episodes, expert_policy)
    # train initial student model with behavioural cloning
    student_policy = train_model(student_policy, expert_rollout_data)
    return student_policy


# Build the expert demonstrator and an untrained student from the environment's
# spaces, then fit the student to the expert's demonstrations.
expert_policy = ExpertPolicy(env.observation_space, env.action_space)
student_policy = StudentPolicy(env.observation_space, env.action_space)

behavioural_cloning(expert_policy, student_policy)


Rolling Out Expert
episode 0




score=655.09 in 1000 frames




episode 1
score=913.55 in 1000 frames
episode 2
score=-413.89 in 1000 frames
episode 3
score=1197.03 in 1000 frames
episode 4
score=657.30 in 1000 frames
episode 5
score=483.65 in 1000 frames
episode 6
score=-108.99 in 1000 frames
episode 7
score=504.91 in 1000 frames
episode 8
score=-55.65 in 58 frames
episode 9


error: Not connected to physics server.

In [None]:
# Roll out the student policy for 10 episodes and keep the recorded observations and actions.
student_rollout_data = rollout_for_n_episodes(10, student_policy)

## 4. Train the Agent with the DAGGER Algorithm

In [None]:
def dagger(expert_policy, student_policy, n_dagger_iterations,
           n_expert_episodes=10, n_student_episodes=3):
    '''
    Given an expert demonstrator and a student policy, perform
    n iterations of dagger.

    The student is first trained on expert demonstrations. Each iteration then
    rolls out the student, asks the expert for the action it would have taken
    in every state the student visited, adds those corrected samples to one
    growing dataset, and retrains the student on all of it.

    Args:
        expert_policy: policy used for the demonstrations and the corrections.
        student_policy: policy network being trained.
        n_dagger_iterations: number of rollout / relabel / retrain rounds.
        n_expert_episodes: number of initial expert episodes.
        n_student_episodes: number of student episodes rolled out per round.

    Returns:
        The student policy returned by the last ``train_model`` call.
    '''
    # collect initial expert demonstrations
    expert_rollout_data = rollout_for_n_episodes(n_expert_episodes, expert_policy)
    # train initial student model with behavioural cloning
    student_policy = train_model(student_policy, expert_rollout_data)

    # Aggregated dataset; lists are copied so the expert rollout is not mutated.
    aggregated_data = {'observations': list(expert_rollout_data['observations']),
                       'actions':      list(expert_rollout_data['actions'])}

    for _ in range(n_dagger_iterations):
        # rollout student model
        student_rollout_data = rollout_for_n_episodes(n_student_episodes, student_policy)
        # evaluate expert actions on student's trajectories and add to dataset
        expert_corrections = evaluate_model(expert_policy, student_rollout_data['observations'])
        # Keep the corrections from every earlier round as well; previously only
        # the latest round's samples were combined with the expert data.
        aggregated_data['observations'].extend(student_rollout_data['observations'])
        aggregated_data['actions'].extend(expert_corrections)
        # train student model with behavioural cloning
        student_policy = train_model(student_policy, aggregated_data)

    return student_policy

In [None]:
# Start from a fresh expert and an untrained student, then run two DAGGER rounds.
expert_policy = ExpertPolicy(env.observation_space, env.action_space)
student_policy = StudentPolicy(env.observation_space, env.action_space)
dagger(expert_policy, student_policy, n_dagger_iterations=2)

## 5. Explore

In this exercise, we have implemented the behavioural cloning and DAGGER algorithms, and demonstrated how to use them to solve a pybullet Gym environment. To continue your learning, you are encouraged to complete any (or all!) of the following tasks:

- Plot the behavioural cloning student policy's average reward for a variety of numbers of episodes of expert data, and compare it to the expert's.
- Change the environment to 'HumanoidFlagrunHarderBulletEnv-v0', and run behavioural cloning on 10 episodes of expert data. Watch the visualization. Does behavioural cloning work better than it did for the previous environment? Why?
- Try to reduce the amount of expert data needed for DAGGER to work on 'HumanoidFlagrunHarderBulletEnv-v0'. Can you reach an average reward of 500 over ten episodes, using only a total of 100 frames of expert data?

solutions: 


- The student's average reward increases slowly as the number of expert episodes grows.
- It works better because the blocks push the expert off-distribution, which widens the expert's trajectory distribution: the demonstrations now include the expert correcting itself, since it knows how to recover.
- The trick is to (1) run many iterations of DAGGER and (2) use temporally distant examples.