In [1]:
# imports
import matplotlib.pyplot as plt
import numpy as np
import os
import torch
from collections import deque
import time

os.environ['KMP_DUPLICATE_LIB_OK']='True'



from mlagents_envs.environment import UnityEnvironment
print("Imported Unity Enironment, press run")

%matplotlib inline

#self.env = ObstacleTowerEnv('/home/df/sources/obstacle-tower-challenge/ObstacleTower/obstacletower.x86_64', 
  #      worker_id=get_worker_id(), retro=False)


def get_worker_id(filename="worker_id.dat"):
    with open(filename, 'a+') as f:
        f.seek(0)
        val = int(f.read() or 0) + 1
        f.seek(0)
        f.truncate()
        f.write(str(val))
        return val
    
env_name = "Project"
train_mode = True
env = UnityEnvironment(file_name=None, worker_id=0, seed=1, side_channels=[])
print(str(env))

env.reset()


behavior_name = list(env.behavior_specs)[0] 
print(f"Name of the behavior : {behavior_name}")
spec = env.behavior_specs[behavior_name]
decision_steps, terminal_steps = env.get_steps(behavior_name)


brain_name = list(env.behavior_specs)[0] 
brain = env.behavior_specs[behavior_name]

print(brain)

# reset the environment
env_info = env.reset()

# number of actions
                   
action_size = len(brain[-1][1])*len(brain[-1][1][:])
print('Number of actions:', action_size)

# examine the state space 
state = brain.observation_specs[0][0]
print('States look like:', state)
state_size = brain.observation_specs[0][0][0]
print('States have length:', state_size)

print("Number of observations : ", len(spec.observation_specs))

# Is the Action continuous or multi-discrete ?
if spec.action_spec.continuous_size > 0:
    print(f"There are {spec.action_spec.continuous_size} continuous actions")
if spec.action_spec.is_discrete():
    print(f"There are {spec.action_spec.discrete_size} discrete actions")


    
    
# How many actions are possible ?
#print(f"There are {spec.action_size} action(s)")

# For discrete actions only : How many different options does each action has ?
if spec.action_spec.discrete_size > 0:
    for action, branch_size in enumerate(spec.action_spec.discrete_branches):
        print(f"Action number {action} has {branch_size} different options")

decision_steps, terminal_steps = env.get_steps(behavior_name)
env.set_actions(behavior_name, spec.action_spec.empty_action(len(decision_steps)))
env.step()
    
for index, obs_spec in enumerate(spec.observation_specs):
    if len(obs_spec.shape) == 3:
        print("Here is the first visual observation")
        plt.imshow(decision_steps.obs[index][0,:,:,:])
        plt.show()

for index, obs_spec in enumerate(spec.observation_specs):
    if len(obs_spec.shape) == 1:
        print("First vector observations : ", decision_steps.obs[index][0,:])

        
"""        
for episode in range(3):
    env.reset()
    decision_steps, terminal_steps = env.get_steps(behavior_name)
    tracked_agent = -1 # -1 indicates not yet tracking
    done = False # For the tracked_agent
    episode_rewards = 0 # For the tracked_agent
    while not done:
        # Track the first agent we see if not tracking 
        # Note : len(decision_steps) = [number of agents that requested a decision]
        if tracked_agent == -1 and len(decision_steps) >= 1:
            tracked_agent = decision_steps.agent_id[0] 

        # Generate an action for all agents
        action = spec.action_spec.random_action(len(decision_steps))

        # Set the actions
        env.set_actions(behavior_name, action)

        # Move the simulation forward
        env.step()

        # Get the new simulation results
        decision_steps, terminal_steps = env.get_steps(behavior_name)
        if tracked_agent in decision_steps: # The agent requested a decision
            episode_rewards += decision_steps[tracked_agent].reward
        if tracked_agent in terminal_steps: # The agent terminated its episode
            episode_rewards += terminal_steps[tracked_agent].reward
            done = True
    print(f"Total rewards for episode {episode} is {episode_rewards}")
"""

from deepqagent import Agent

def deep_q_network(n_episodes=3, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.0995):
    
    eps = eps_start
    scores = []
    scores_window = deque(maxlen=100)  # last 100 scores
    
    agent = Agent(state_size=state_size, action_size=action_size, seed=0)
    for i in range(1, n_episodes+1): # episodes
        decision_steps, terminal_steps = env.get_steps(behavior_name)
        env_info = env.reset()
        state = np.array([])
        for index, obs_spec in enumerate(spec.observation_specs):
            if len(obs_spec.shape) == 1:
                state = np.concatenate((state,decision_steps.obs[index][0,:]))
        state = decision_steps.obs[0][0,:]
        print(state)
        #state = env_info.vector_observations[0]
        score = 0

        for t in range(max_t): # maximum steps per episode
            action = agent.act(state)
            #action = action.astype(str)
            print(action)
            
            #env.set_actions(behavior_name, action)
            env_info = env.step()                    # send the action to the environment
            next_state = decision_steps.obs[0][0,:]   # get the next state
            #print(decision_steps.reward)
            reward = decision_steps.reward[0]                   # get the reward
            done = terminal_steps.interrupted                  # see if episode has finished
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward                                # update the score
            if done: break 

        scores_window.append(score)       # save most recent score
        scores.append(score)              # save most recent score
        eps = max(eps_end, eps_decay*eps) # decrease epsilon
        max_score = max(max_score, score)
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i, np.mean(scores_window)), end="")
        if i % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i, np.mean(scores_window)))

        # output a checkpoint of the highest scoring episodes
        if max_score > score and score > 13:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')

    return scores
    
scores = deep_q_network()

# plot the scores
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(len(scores)), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()



time.sleep(30)

agent = Agent(state_size=37, action_size=4, seed=0)

# load the weights from file
agent.local_dqnet.load_state_dict(torch.load('checkpoint.pth'))

env_info = env.reset(train_mode=False)[brain_name] # reset the environment
score = 0
while True:
    state = decision_steps.obs[0][0,:]
    action = agent.act(state)
    #action = action.astype(str)
    env_info = env.step()
    reward = decision_steps.reward                 # get the reward
    score += reward                                # update the score
    done = terminal_steps.interrupted
    if done: break
        
print("Final Score: {}".format(score))

    
    


env.close()
print("Closed the enviroment :) ")






Imported Unity Enironment, press run
<mlagents_envs.environment.UnityEnvironment object at 0x7fd62006a090>
Name of the behavior : PedestrianAgent?team=0
BehaviorSpec(observation_specs=[ObservationSpec(shape=(36,), dimension_property=(<DimensionProperty.NONE: 1>,), observation_type=<ObservationType.DEFAULT: 0>, name='RayPerceptionSensorEyes'), ObservationSpec(shape=(16,), dimension_property=(<DimensionProperty.NONE: 1>,), observation_type=<ObservationType.DEFAULT: 0>, name='VectorSensor_size16')], action_spec=ActionSpec(continuous_size=0, discrete_branches=(3, 3, 3)))
Number of actions: 9
States look like: (36,)
States have length: 36
Number of observations :  2
There are 3 discrete actions
Action number 0 has 3 different options
Action number 1 has 3 different options
Action number 2 has 3 different options
First vector observations :  [1.         0.         0.         0.42       1.         0.
 0.         0.12791343 1.         0.         0.         0.05696705
 1.         0.         0. 



8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8


  loss = functional.mse_loss(Q_expected, Q_targets)


8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8


UnboundLocalError: local variable 'max_score' referenced before assignment