In [22]:
import os
import pickle
import tensorflow as tf
import gym
import numpy as np
import matplotlib.pyplot as plt

# TASK = "Ant-v2"
# TASK = "HalfCheetah-v2"
# TASK = "Hopper-v2"
# TASK = "Humanoid-v2"
# TASK = "Reacher-v2"
# TASK = "Walker2d-v2"

# TASK_LIST = ["Ant-v2", "HalfCheetah-v2", "Hopper-v2", "Humanoid-v2", "Reacher-v2", "Walker2d-v2"]
# Gym environment ids processed by the training and evaluation loops below.
TASK_LIST = ["Ant-v2", "Humanoid-v2"]

# Directory names used to build file paths.
# NOTE(review): DIRNAME_experts and DIRNAME_output are not referenced in the
# visible cells; presumably used elsewhere in the notebook -- confirm.
DIRNAME_experts = "experts"
# Expert rollouts are read from "<DIRNAME_expert_data>/<task>.pkl".
DIRNAME_expert_data = "expert_data"
# Trained models are saved to / loaded from "<DIRNAME_MODELS>/BC_model_<task>.h5".
DIRNAME_MODELS = "BC_models_bonus"
DIRNAME_output = "output"

# Default training hyperparameters for build_train_save_model.
BATCH_SIZE = 64
EPOCHS = 100

#### Problem 4
    Two new models:
    1. Replace the ReLU activations with tanh
    2. Add one more hidden layer and change the size of all hidden layers from (64,) to (128,)

In [20]:
def build_train_save_model(X_train, Y_train, model_path, batch_size=BATCH_SIZE, epochs=EPOCHS):
    """Build, train and save a fully connected behavioral-cloning model.

    The network has three hidden Dense layers of 128 units, each followed by
    a ReLU activation, and a linear output layer. It is trained with mean
    squared error and the Adam optimizer, then written to ``model_path``.

    Args:
        X_train: Training inputs; the last axis is used as the input size.
        Y_train: Training targets; the last axis is used as the output size.
        model_path: File path the trained model is saved to. Its parent
            directory is created if it does not exist yet.
        batch_size: Mini-batch size passed to ``model.fit``.
        epochs: Number of training epochs passed to ``model.fit``.
    """
    print("The shapes of training input and output: X:{}, Y:{}".format(X_train.shape, Y_train.shape))
    input_size = X_train.shape[-1]
    output_size = Y_train.shape[-1]

    model = tf.keras.Sequential()

    # Hidden layer 1: input -> (128,)
    model.add( tf.keras.layers.Dense(128, input_dim=input_size) )
    model.add( tf.keras.layers.Activation("relu") )

    # Hidden layer 2: (128,) -> (128,)
    model.add( tf.keras.layers.Dense(128) )
    model.add( tf.keras.layers.Activation("relu") )

    # Hidden layer 3: (128,) -> (128,)
    model.add( tf.keras.layers.Dense(128) )
    model.add( tf.keras.layers.Activation("relu") )

    # Output layer: (128,) -> output, no activation
    model.add( tf.keras.layers.Dense(output_size) )

    model.compile(loss="mse", optimizer="Adam")
    print("Training model ...")
    # Use the caller-supplied batch_size; the global BATCH_SIZE is only its default.
    model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs, verbose=1)

    # A bare file name has an empty dirname, and os.makedirs('') would raise.
    model_dir = os.path.dirname(model_path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)

    model.save(model_path)

    
# Train one bonus behavioral-cloning model per task; a task whose model file
# is already on disk is reported and skipped.
for task in TASK_LIST:
    model_name = "BC_model_" + task + ".h5"
    model_path = os.path.join(DIRNAME_MODELS, model_name)

    if os.path.exists(model_path):
        print("Model of " + task + " exists in the directory " + DIRNAME_MODELS + '.')
        print()
        continue

    print("Task {}. Behavioral Cloning bonus.".format(task))
    datafile = os.path.join(DIRNAME_expert_data, task + ".pkl")

    # NOTE: pickle.load can execute arbitrary code -- only load trusted expert data.
    with open(datafile, 'rb') as pkl_file:
        expert_data = pickle.load(pkl_file)

    # Observations are the inputs; actions (singleton axes removed) are the targets.
    X_train = expert_data["observations"]
    Y_train = np.squeeze( expert_data["actions"] )

    build_train_save_model(X_train, Y_train, model_path)
    print()

Task Ant-v2. Behavioral Cloning bonus.
The shapes of training input and output: X:(20000, 111), Y:(20000, 8)
Training model ...
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/1

Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100

Task Humanoid-v2. Behavioral Cloning bonus.
The shapes of training input and output: X:(20000, 376), Y:(20000, 17)
Training model ...
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Ep

Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100



In [24]:
def load_test_model(task, model_path, num_rollouts=20, render=False):
    """Load a saved model and evaluate it as a policy in a Gym environment.

    Runs ``num_rollouts`` episodes in the environment named ``task``, using
    the model's prediction on each observation as the action, and prints and
    returns the per-episode total rewards with their mean and standard
    deviation.

    Args:
        task: Gym environment id passed to ``gym.make``.
        model_path: Path of the saved Keras model to load.
        num_rollouts: Number of episodes to run (default 20).
        render: If True, call ``env.render()`` after every step.

    Returns:
        dict with keys ``"rewards"`` (list of per-episode total rewards),
        ``"mean reward"`` and ``"std of reward"``.
    """
    model = tf.keras.models.load_model(model_path)

    env = gym.make(task)

    # Newer Gym releases expose the step limit as `max_episode_steps`;
    # fall back to the older `timestep_limit` name when it is absent.
    max_steps = getattr(env.spec, "max_episode_steps", None)
    if max_steps is None:
        max_steps = env.spec.timestep_limit

    rewards = []
    try:
        for i in range(num_rollouts):
            print("iter", i)
            obs = env.reset()
            done = False
            total_reward = 0
            steps = 0
            while not done:
                # The model expects a batch axis, so feed a (1, obs_dim) array.
                action = model.predict(obs.reshape(1, -1), verbose=0)
                obs, reward, done, _ = env.step(action)
                total_reward += reward
                steps += 1
                if render:
                    env.render()
                # Cap the episode length at the environment's step limit.
                if steps >= max_steps:
                    break
            rewards.append(total_reward)
    finally:
        # Always release the environment, even if a rollout fails.
        env.close()

    mean_reward, std_reward = np.mean(rewards), np.std(rewards)
    print("rewards", rewards)
    print("mean reward", mean_reward)
    print("std of reward", std_reward)

    ret_dict = {"rewards": rewards, "mean reward": mean_reward, "std of reward": std_reward}

    return ret_dict


# Evaluate the saved bonus model of every task; a task without a trained
# model file is reported and skipped.
for task in TASK_LIST:
    model_name = "BC_model_" + task + ".h5"
    model_path = os.path.join(DIRNAME_MODELS, model_name)

    if not os.path.exists(model_path):
        print("Model of " + task + " doest not exist. Build and train it first.")
        print()
        continue

    print("Task {}. Behavioral Cloning bonus".format(task))
    load_test_model(task, model_path)
    print()

Task Ant-v2. Behavioral Cloning bonus
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
iter 0
iter 1
iter 2
iter 3
iter 4
iter 5
iter 6
iter 7
iter 8
iter 9
iter 10
iter 11
iter 12
iter 13
iter 14
iter 15
iter 16
iter 17
iter 18
iter 19
rewards [4914.105459113547, 4874.280496122403, 4649.264697694852, 4735.383648726234, 4743.513090966455, 4937.337810790904, 4874.82003013037, 4723.360965866456, 4897.6961228759465, 4661.103690603662, 4793.506741975252, 4770.019226850782, 4749.182726033616, 4847.972421813718, 4912.251795760525, 4747.5160645845435, 4579.10091213297, 4836.944451576873, 4857.078340684965, 4871.440878757997]
mean reward 4798.793978653104
std of reward 96.81645668813218

Task Humanoid-v2. Behavioral Cloning bonus
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtyp