In [1]:
import torch
import gym
import argparse
import os
import pickle
import numpy as np
import torch.optim as optim
from torch.utils.data import DataLoader, ConcatDataset
from torch.distributions import Normal
from models import *
from utils import *
from dataset import *
import pybullet_envs
import random


def load_demos(DEMO_DIR):
    try:
        trajs = np.load("experts/states_expert_walker_0.npy")[:10]
    except:
        with open(DEMO_DIR, 'rb') as f:
            trajs = pickle.load(f)

    demos = []
    for t_id, traj in enumerate(trajs):
        demo =[]
        #print(t_id)
        for item in traj:    
            obs = item['observation']
            #obs = list(obs)
            #print(obs)
            demo.append(obs)
        #print(np.array(demo).shape)
        demos.append(np.array(demo))

    print(np.array(demos).shape)
    demos = demos[:10]
    return demos


In [2]:
env_list = ["Pendulum-v0", "BipedalWalker-v3", "Walker2DBulletEnv-v0", "HopperBulletEnv-v0", "HalfCheetahBulletEnv-v0", "AntBulletEnv-v0", "HumanoidBulletEnv-v0"]

runs = 20
inv_samples = 1000
max_steps = 800
expert_path='experts/'
weight_path="weights/"
        
test_rewards_envs = []
record_folder = "records/bco/"
init_seeds = [0]
itr_per_env = len(init_seeds)

for itr_id in range(itr_per_env):
    seed = init_seeds[itr_id]
    for en in env_list[2:]:
        print("############# start "+en+" training ###################")

        ENV_NAME = en#env_list[3]
        env=ENV_NAME
        
        DEMO_DIR = os.path.join(expert_path, env+'.pkl')
        M = inv_samples

        record_fn = record_folder + ENV_NAME + str(itr_id) + ".txt"

        """load demonstrations"""
        demos = load_demos(DEMO_DIR)

        """create environments"""
        env = gym.make(ENV_NAME)
        obs_dim, act_dim = env.observation_space.shape[0], env.action_space.shape[0]

        """init random seeds for reproduction"""
        torch.manual_seed(seed)
        env.seed(seed)
        np.random.seed(seed)
        random.seed(seed)
        env.action_space.seed(seed)

        policy = policy_continuous(env.observation_space.shape[0],64,env.action_space.shape[0])#.cuda()
        inv_model = inv_dynamics_continuous(env.observation_space.shape[0],100,env.action_space.shape[0])#.cuda()
        
        """start training"""
        inv_dataset_list = []
        use_policy = False

        transitions = []
        for steps in range(runs):
            print('######## STEP %d #######'%(steps+1))
            ### GET SAMPLES FOR LEARNING INVERSE MODEL
            print('Collecting transitions for learning inverse model....')
            if steps > 0:
                use_policy = True


            trans_samples, avg_reward = gen_inv_samples(env, policy.cpu(), M, 'continuous', use_policy,max_steps=max_steps)
            #print(np.array(trans_samples).shape, transitions)
            transitions = transitions+trans_samples
            
            f = open(record_fn, "a+")
            f.write(str(avg_reward) + "\n")
            f.close()
            
            print('Done!', np.array(transitions).shape)

            ### LEARN THE INVERSE MODEL
            print('Learning inverse model....')
            inv_dataset = transition_dataset(transitions)
            inv_dataset_list.append(inv_dataset)
            inv_dataset_final = ConcatDataset(inv_dataset_list)
            inv_loader = DataLoader(inv_dataset_final, batch_size=1024, shuffle=True, num_workers=4)

            inv_opt = optim.Adam(inv_model.parameters(), lr=1e-3, weight_decay=0.0001)
            inv_loss = nn.MSELoss()
            #inv_loss = nn.L1Loss()

            for epoch in range(100): 
                running_loss = 0
                for i, data in enumerate(inv_loader):
                    s, a, s_prime = data
                    inv_opt.zero_grad()
                    a_pred = inv_model(s.float(), s_prime.float())
                    loss = inv_loss(a_pred, a.float())
                    loss.backward()
                    running_loss += loss.item()
                    if i%100000 == 0:
                        print('Epoch:%d Batch:%d Loss:%.5f'%(epoch, i+1, running_loss/100))
                        running_loss = 0
                    inv_opt.step()
                if epoch%10 == 0:
                        print('Epoch:%d Batch:%d Loss:%.5f'%(epoch, i+1, loss))
            print('Done!')

            ### GET ACTIONS FOR DEMOS
            inv_model.cpu()
            print('Getting labels for demos....')
            trajs = get_action_labels(inv_model, demos, 'continuous')
            print('Done!')
            bc_dataset = imitation_dataset(trajs, type='action')
            bc_loader = DataLoader(bc_dataset, batch_size=1024, shuffle=True, num_workers=4)
            inv_model

            ### PERFORM BEHAVIORAL CLONING
            print('Learning policy....')
            policy
            bc_opt = optim.Adam(policy.parameters(), lr=1e-3, weight_decay=0.0001)
            bc_loss = nn.MSELoss()
            # bc_loss = nn.L1Loss()

            for epoch in range(50):  
                running_loss = 0
                for i, data in enumerate(bc_loader):
                    s, a = data
                    bc_opt.zero_grad()
                    """
                    a_mu, a_sigma = policy(s.float())
                    a_pred = Normal(loc=a_mu, scale=a_sigma).rsample()
                    """
                    a_pred = policy(s.float())
                    loss = bc_loss(a_pred, a)
                    running_loss += loss.item()
                    loss.backward()
                    if i%20 == 19:
                        running_loss = 0
                    bc_opt.step()
                if epoch%10==0:
                    print('Epoch:%d Batch:%d Loss:%.3f'%(epoch, i+1, loss))

            print('Done!')

        torch.save(policy, weight_path+ENV_NAME+str(itr_id)+'_bco.pt')

############# start Walker2DBulletEnv-v0 training ###################
(49, 1000, 22)




######## STEP 1 #######
Collecting transitions for learning inverse model....
reward: 13.612366641615518 setps: 9 count: 9
reward: 15.333661909894728 setps: 11 count: 20
reward: 14.06518672312959 setps: 10 count: 30
reward: 22.825537883289506 setps: 26 count: 56
reward: 20.030772349816107 setps: 20 count: 76
reward: 16.999050246212573 setps: 16 count: 92
reward: 15.545413191551052 setps: 14 count: 106
reward: 16.226392472333103 setps: 13 count: 119
reward: 14.878472657156816 setps: 18 count: 137
reward: 17.093303792305232 setps: 15 count: 152
reward: 25.38579114184395 setps: 28 count: 180
reward: 15.62311114539043 setps: 12 count: 192
reward: 25.67762851235602 setps: 21 count: 213
reward: 27.122905671574703 setps: 28 count: 241
reward: 16.43561664125446 setps: 12 count: 253
reward: 19.666378375614293 setps: 19 count: 272
reward: 13.895775478085852 setps: 9 count: 281
reward: 15.099087875016266 setps: 10 count: 291
reward: 17.484739339732915 setps: 15 count: 306
reward: 16.4583119884278



Done!
Getting labels for demos....
Done!
Learning policy....
Epoch:0 Batch:10 Loss:0.070
Epoch:10 Batch:10 Loss:0.043
Epoch:20 Batch:10 Loss:0.034
Epoch:30 Batch:10 Loss:0.027
Epoch:40 Batch:10 Loss:0.023
Done!
######## STEP 2 #######
Collecting transitions for learning inverse model....
reward: 27.70992305003019 setps: 23 count: 23
reward: 24.30870364998409 setps: 20 count: 43
reward: 28.14831889427005 setps: 23 count: 66
reward: 31.856974145905404 setps: 27 count: 93
reward: 25.19174199342378 setps: 21 count: 114
reward: 24.67039610021747 setps: 21 count: 135
reward: 23.50047305035987 setps: 20 count: 155
reward: 29.111291222188456 setps: 25 count: 180
reward: 24.745713377342323 setps: 21 count: 201
reward: 25.20011719787435 setps: 21 count: 222
reward: 22.684262043521446 setps: 19 count: 241
reward: 22.75751888943487 setps: 19 count: 260
reward: 24.91400860860449 setps: 21 count: 281
reward: 24.381351894572433 setps: 21 count: 302
reward: 23.57206429193902 setps: 21 count: 323
rewar

reward: 83.21225123064798 setps: 64 count: 112
reward: 87.34980113455528 setps: 69 count: 181
reward: 95.48670055959231 setps: 72 count: 253
reward: 94.84628457068175 setps: 78 count: 331
reward: 75.3364926222508 setps: 60 count: 391
reward: 76.83131012528024 setps: 63 count: 454
reward: 29.172565433582342 setps: 23 count: 477
reward: 88.4841854680373 setps: 72 count: 549
reward: 89.33303137749607 setps: 70 count: 619
reward: 81.04748217390004 setps: 63 count: 682
reward: 103.54271847527998 setps: 84 count: 766
reward: 95.38515274180531 setps: 76 count: 842
reward: 99.31917582450987 setps: 79 count: 921
reward: 98.23324892977396 setps: 79 count: 1000
avg rewards: 78.84952174374374
Done! (7000, 3)
Learning inverse model....
Done!
Getting labels for demos....
Done!
Learning policy....
Epoch:0 Batch:10 Loss:0.018
Epoch:10 Batch:10 Loss:0.017
Epoch:20 Batch:10 Loss:0.012
Epoch:30 Batch:10 Loss:0.015
Epoch:40 Batch:10 Loss:0.015
Done!
######## STEP 8 #######
Collecting transitions for learn

Epoch:0 Batch:100 Loss:0.01254
Epoch:1 Batch:100 Loss:0.00847
Epoch:2 Batch:100 Loss:0.00850
Epoch:3 Batch:100 Loss:0.00856
Epoch:4 Batch:100 Loss:0.00868
Epoch:5 Batch:100 Loss:0.00863
Epoch:6 Batch:100 Loss:0.00867
Epoch:7 Batch:100 Loss:0.00866
Epoch:8 Batch:100 Loss:0.00861
Epoch:9 Batch:100 Loss:0.00874
Epoch:10 Batch:100 Loss:0.00875
Epoch:11 Batch:100 Loss:0.00878
Epoch:12 Batch:100 Loss:0.00877
Epoch:13 Batch:100 Loss:0.00875
Epoch:14 Batch:100 Loss:0.00872
Epoch:15 Batch:100 Loss:0.00880
Epoch:16 Batch:100 Loss:0.00880
Epoch:17 Batch:100 Loss:0.00870
Epoch:18 Batch:100 Loss:0.00877
Epoch:19 Batch:100 Loss:0.00873
Epoch:20 Batch:100 Loss:0.00878
Epoch:21 Batch:100 Loss:0.00875
Epoch:22 Batch:100 Loss:0.00878
Epoch:23 Batch:100 Loss:0.00882
Epoch:24 Batch:100 Loss:0.00874
Epoch:25 Batch:100 Loss:0.00884
Epoch:26 Batch:100 Loss:0.00871
Epoch:27 Batch:100 Loss:0.00870
Epoch:28 Batch:100 Loss:0.00887
Epoch:29 Batch:100 Loss:0.00877
Epoch:30 Batch:100 Loss:0.00868
Epoch:31 Batch:100

reward: 87.85264659883546 setps: 66 count: 751
reward: 89.03585727064198 setps: 67 count: 818
reward: 84.3591761335061 setps: 64 count: 882
reward: 76.64350509285433 setps: 59 count: 941
avg rewards: 81.78622169141745
Done! (16000, 3)
Learning inverse model....
Epoch:0 Batch:100 Loss:0.01095
Epoch:1 Batch:100 Loss:0.00782
Epoch:2 Batch:100 Loss:0.00792
Epoch:3 Batch:100 Loss:0.00787
Epoch:4 Batch:100 Loss:0.00799
Epoch:5 Batch:100 Loss:0.00794
Epoch:6 Batch:100 Loss:0.00796
Epoch:7 Batch:100 Loss:0.00802
Epoch:8 Batch:100 Loss:0.00799
Epoch:9 Batch:100 Loss:0.00815
Epoch:10 Batch:100 Loss:0.00799
Epoch:11 Batch:100 Loss:0.00804
Epoch:12 Batch:100 Loss:0.00815
Epoch:13 Batch:100 Loss:0.00796
Epoch:14 Batch:100 Loss:0.00788
Epoch:15 Batch:100 Loss:0.00805
Epoch:16 Batch:100 Loss:0.00803
Epoch:17 Batch:100 Loss:0.00810
Epoch:18 Batch:100 Loss:0.00804
Epoch:19 Batch:100 Loss:0.00796
Epoch:20 Batch:100 Loss:0.00805
Epoch:21 Batch:100 Loss:0.00806
Epoch:22 Batch:100 Loss:0.00795
Epoch:23 Bat

KeyboardInterrupt: 