### 4. It's Your Turn!

Now it's your turn to train your own agent to solve the environment! When training the agent, set `train_mode=True`, so that the line for resetting the environment looks like the following:
```python
env_info = env.reset(train_mode=True)[brain_name]
```

In [1]:
from unityagents import UnityEnvironment
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
import torch
%matplotlib inline

# Start the Unity environment from the "Banana.app" build. The path is relative to the
# working directory; presumably this is the macOS build -- adjust file_name on other
# platforms (TODO confirm).
unity_env = UnityEnvironment(file_name="Banana.app")

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: BananaBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 37
        Number of stacked Vector Observation: 1
        Vector Action space type: discrete
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [2]:
class EnvWrapper:
    """Single-agent reset/step adapter around a Unity environment.

    Only the first brain (``env.brain_names[0]``) and the first agent
    (index 0 of every per-agent list) are ever consulted.
    """

    def __init__(self, env, train_mode=False):
        """Store the environment, the name of its first brain and the reset mode.

        Params
        ======
            env: object exposing ``brain_names``, ``brains``, ``reset`` and ``step``
            train_mode (bool): forwarded as ``train_mode=`` on every ``env.reset`` call
        """
        first_brain = env.brain_names[0]
        self.env = env
        self.brain_name = first_brain
        self.train_mode = train_mode
        self.action_size = env.brains[first_brain].vector_action_space_size

    def reset(self):
        """Reset the environment and return the first agent's observation vector."""
        all_info = self.env.reset(train_mode=self.train_mode)
        return all_info[self.brain_name].vector_observations[0]

    def state_size(self):
        """Return the length of one observation.

        Note: this is measured by calling ``reset()``, so the environment is
        reset every time this method is invoked.
        """
        return len(self.reset())

    def step(self, action):
        """Apply ``action`` and return ``(next_state, reward, done)`` for the first agent."""
        info = self.env.step(action)[self.brain_name]
        return (
            info.vector_observations[0],
            info.rewards[0],
            info.local_done[0],
        )

In [3]:
# Wrap the raw Unity environment; train_mode=True is forwarded to every env.reset() call.
env = EnvWrapper(env=unity_env, train_mode=True)
# NOTE: state_size() resets the environment as a side effect to measure one observation.
state_size = env.state_size()
# Read from the first brain's vector_action_space_size in EnvWrapper.__init__.
action_size = env.action_size

In [4]:
def train(agent, n_episodes=2000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995,
          target_score=13.0, checkpoint_path='checkpoint.pth'):
    """Deep Q-Learning.

    Runs epsilon-greedy episodes against the module-level ``env`` wrapper
    (it must provide ``reset()`` and ``step(action)``), feeding every
    transition to ``agent.step``. Training stops early once the mean score
    over the last 100 episodes reaches ``target_score``; at that point the
    local Q-network weights are written to ``checkpoint_path``.

    Params
    ======
        agent: object providing ``act(state, eps)``,
            ``step(state, action, reward, next_state, done)`` and a
            ``qnetwork_local`` attribute with ``state_dict()``
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
        target_score (float): 100-episode average score at which training stops
        checkpoint_path (str): file the local Q-network state dict is saved to on success

    Returns
    =======
        list: the score of every episode that was played, in order
    """
    window_size = 100                          # episodes averaged for the "solved" criterion
    scores = []                                # list containing scores from each episode
    scores_window = deque(maxlen=window_size)  # last `window_size` scores
    eps = eps_start                            # initialize epsilon
    for i_episode in range(1, n_episodes+1):
        state = env.reset()
        score = 0
        for _ in range(max_t):
            action = agent.act(state, eps)
            next_state, reward, done = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_window.append(score)       # save most recent score
        scores.append(score)              # save most recent score
        eps = max(eps_end, eps_decay*eps) # decrease epsilon

        avg_score = np.mean(scores_window)
        # Overwrite the progress line in place; keep a permanent line every `window_size` episodes.
        line_end = "\n" if i_episode % window_size == 0 else ""
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, avg_score), end=line_end)

        # Only a *full* window counts: otherwise one lucky early episode would end
        # training and report a negative "solved in" episode count.
        if len(scores_window) == window_size and avg_score >= target_score:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-window_size, avg_score))
            torch.save(agent.qnetwork_local.state_dict(), checkpoint_path)
            break
    return scores

In [5]:
def plot_scores(scores):
    """Show a line chart of the score obtained in each episode.

    Params
    ======
        scores (sequence of numbers): one score per episode, in episode order
    """
    _, axes = plt.subplots()
    episode_index = np.arange(len(scores))
    axes.plot(episode_index, scores)
    axes.set_ylabel('Score')
    axes.set_xlabel('Episode #')
    plt.show()

In [None]:
# See: https://github.com/cstorm125/bananavigator

In [6]:
from ddqn_prioritized_agent import Agent
from ddq_network import QNetwork

def create_ddq_network(state_size, action_size, seed):
    """Factory handed to Agent as ``create_network``; forwards its arguments to QNetwork."""
    return QNetwork(state_size, action_size, seed)


# Build the agent around the network factory, train it, then chart the per-episode scores.
agent = Agent(
    create_network=create_ddq_network,
    state_size=state_size,
    action_size=action_size,
    seed=0,
)
scores = train(agent)
plot_scores(scores)


agent = Agent(state_size=state_size, action_size=action_size, seed=0)

errors: tensor(1.00000e-02 *
       [ 1.4095,  1.0848,  0.9783,  1.0502,  0.0712,  0.6644,  1.3470,
         1.3470,  6.7007,  1.0192,  0.0103,  0.0103,  0.3623,  3.1738,
         0.8503,  0.3508,  1.1029,  0.5072,  2.7079,  2.7079,  0.0573,
         0.1666,  2.0118,  1.5115,  2.5257,  0.7268,  0.5698,  0.1049,
         0.1049,  4.4758,  0.0276,  0.0608,  7.9648,  2.9922,  0.0040,
         0.1041,  0.8989,  1.4764,  0.0189,  2.3956,  1.1005,  0.7754,
         0.7754,  3.2650,  0.0057,  1.7169,  1.7169,  0.0192,  9.0603,
         0.1258,  0.0370,  7.8600,  3.6239,  2.1978,  0.0032,  0.0032,
         0.0014,  0.3828,  0.4318,  0.0696,  0.0696,  0.0663,  5.1128,
         0.0811])
w_IS: tensor([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.

loss: 0.007721276953816414
errors: tensor([ 5.0747e-04,  7.2910e-04,  5.7949e-03,  5.5932e-03,  1.1908e-02,
         8.6445e-03,  5.2771e-07,  2.9903e-03,  8.5963e-03,  5.0208e-03,
         2.9212e-03,  8.3777e-04,  1.0206e-02,  3.7747e-03,  1.2508e-02,
         1.2508e-02,  1.6247e-02,  1.4219e-02,  1.0822e-03,  2.7899e-03,
         6.1471e-03,  1.3880e-02,  2.9419e-03,  2.9419e-03,  4.6191e-04,
         2.6909e-07,  3.2772e-03,  2.8049e-04,  3.3642e-03,  1.8671e-02,
         4.3239e-02,  4.3158e-03,  2.6241e-02,  3.6718e-02,  1.5482e-02,
         3.8790e-03,  1.5681e-11,  1.9639e-03,  2.4624e-03,  2.4624e-03,
         4.2786e-03,  9.3458e-03,  9.3458e-03,  2.3121e-03,  4.0433e-03,
         1.6359e-02,  2.9968e-02,  4.7055e-02,  5.0504e-03,  1.8020e-03,
         1.8020e-03,  5.4844e-03,  7.4398e-04,  2.0766e-03,  3.6958e-03,
         7.5471e-03,  6.8946e-03,  3.2449e-03,  3.0802e-05,  1.6149e-03,
         1.6149e-03,  6.9211e-03,  2.0828e-04,  2.0828e-04])
w_IS: tensor([ 0.9358,  0.93

Exception: stop here

In [None]:
# Close the Unity environment that was created in the first cell.
unity_env.close()