In [1]:
# Plan: train agent in environment
# 0. Import and initialize the environment, get info
# 0. Import and set up the agent(s)
#    Define train_dqn(env, agent, ...)
# 1. Train Top Agent
#    log: episode, avg. score per 100 episodes, min/max/avg, epsilon, duration
#    persist: scores, epsilons
#    save: /run_name/: checkpoints, final, scores, epsilons
#    plot: scores, avg over 100, epsilons, solved_line
# 2. Grid Search Training
#    input: set of dictionaries 'Param' -> Value or default
#    log+: solved?, episode_solved
#    persist: top_agent

In [7]:
import numpy as np
import time
from collections import namedtuple, deque

from util import env_initialize, env_reset, state_reward_done_unpack
from util import EpsilonService
from dqn_agent import DQN_Params, DQN_Agent

from unityagents import UnityEnvironment

import matplotlib.pyplot as plt
%matplotlib inline

# Seed value passed to the DQN_Agent constructor (seed=seed) in a later cell
seed = 0

In [4]:
# Start the Unity environment from the bundled Banana executable
env = UnityEnvironment(
    file_name="Banana_Windows_x86_64/Banana.exe",
)

# Collect scenario information; these names are used as globals
# throughout the rest of the notebook
(brain, brain_name, state,
 action_size, state_size) = env_initialize(env)

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: BananaBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 37
        Number of stacked Vector Observation: 1
        Vector Action space type: discrete
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [5]:
# Optional: uncomment lines below to demo a uniform-random agent
#from util import demo_random_agent
#demo_random_agent(env, n_episodes=2, train_mode=False)

In [6]:
# todo: support creating the agent without passing params
# Build the default parameter set, then the agent sized to the
# environment's state/action spaces gathered above
params = DQN_Params(
    name='Default DQN',
)
agent = DQN_Agent(
    state_size,
    action_size,
    seed=seed,
    params=params,
)
#scores, ep_stats = agent.train(env, n_episodes=500)

## Training Function

In [9]:
def train_dqn(env, agent, run_name='default_dqn', n_episodes=1000, goal_score=15, print_every=100,
              eps_start=1.0, eps_end=0.001, eps_decay=0.995):
    """Train ``agent`` in ``env`` episode by episode, printing progress as it goes.

    Each episode resets the environment, then repeatedly asks the agent for an
    action, steps the environment, and feeds the transition back to the agent
    until the environment reports ``done``. Training stops early as soon as the
    mean score over the (up to) 100 most recent episodes exceeds ``goal_score``.

    Note: relies on the notebook-level global ``brain_name`` (set by
    ``env_initialize``) to reset the environment and to index ``env.step`` results.

    Params
    ======
        env: environment object exposing ``step(action)``, indexable by brain name
        agent: object exposing ``act(state, epsilon)`` and
            ``step(state, action, reward, next_state, done)``
        run_name (str): label for this run; currently unused, reserved for the
            checkpoint-saving todos below
        n_episodes (int): maximum number of training episodes
        goal_score (float): average score that must be exceeded to count as solved
        print_every (int): emit a persistent progress line every this many episodes
        eps_start (float): forwarded to EpsilonService as ``start_value``
        eps_end (float): forwarded to EpsilonService as ``end_value``
        eps_decay (float): forwarded to EpsilonService as ``decay_rate``

    Returns
    =======
        list: the score of every episode that was run, in order
    """
    scores = []                        # score of every episode, in order
    scores_window = deque(maxlen=100)  # last 100 episode scores
    duration_window = deque(maxlen=5)  # last 5 episode durations (seconds)

    # initialize epsilon
    epsilon_svc = EpsilonService(
        method='decay', start_value=eps_start, end_value=eps_end,
        decay_rate=eps_decay, n_episodes=n_episodes)
    epsilon = epsilon_svc.get_value()

    training_start_time = time.time()
    for i_episode in range(1, n_episodes+1):
        # reset for new episode
        score = 0
        state = env_reset(env, brain_name, train_mode=True)

        # run episode
        episode_start_time = time.time()
        while True:
            # choose and take action
            action = int(agent.act(state, epsilon))
            env_info = env.step(action)[brain_name]
            next_state, reward, done = state_reward_done_unpack(env_info)

            # update agent with new state and reward
            agent.step(state, action, reward, next_state, done)
            score += reward

            state = next_state  # update state for next timestep
            if done:
                break

        # decay
        epsilon = epsilon_svc.update(i_episode)

        # track progress (duration is measured per episode, not since training start)
        duration = time.time() - episode_start_time
        scores.append(score)
        scores_window.append(score)
        duration_window.append(duration)

        # display progress and save checkpoints
        avg_score = np.mean(scores_window)
        avg_duration = np.mean(duration_window)
        progress = '\rEpisode {}\tAverage Score: {:.2f}\tAvg. Duration: {:.4f}s'.format(
            i_episode, avg_score, avg_duration)
        # transient line, overwritten in place by the next episode's line
        print(progress, end="")

        if i_episode % print_every == 0:
            # same line again, but newline-terminated so it stays in the output
            print(progress)
            # todo: save checkpoint weights

        if avg_score > goal_score:
            print('\rEnvironment solved in {} episodes!'.format(i_episode))
            print('\rAverage Score for last 100 episodes: {:.2f}'.format(avg_score))
            # elapsed time is in seconds; convert to match the "m" (minutes) label
            print('\rTotal Duration: {:.2f}m'.format((time.time() - training_start_time) / 60))
            # todo: save solved model weights and print
            return scores

    # finished all episodes
    print('\rComplete training on {} episodes.'.format(n_episodes))
    print('\rAverage Score for last 100 episodes: {:.2f}'.format(np.mean(scores_window)))
    # elapsed time is in seconds; convert to match the "m" (minutes) label
    print('\rTotal Duration: {:.2f}m'.format((time.time() - training_start_time) / 60))
    # todo: save trained model weights and print
    return scores
            