In [1]:
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline
from unityagents import UnityEnvironment
import sys

In [2]:
from agent import Agent  # project-local module providing the Agent class

# Build the learning agent. The sizes match the environment summary printed by
# UnityEnvironment: a 37-value vector observation and 4 discrete actions.
# NOTE(review): seed=0 is presumably used to make runs repeatable; confirm in agent.py.
agent = Agent(
    state_size=37,
    action_size=4,
    seed=0,
)


cpu


In [3]:
# Launch the Unity build that matches the host operating system.
if sys.platform == "darwin":
    # macOS: use the Banana.app bundle
    env = UnityEnvironment(file_name="Banana.app")
elif sys.platform.startswith("linux"):
    # Linux: use the Banana.x86 binary. startswith() also matches the legacy
    # "linux2" value that an exact comparison with "linux" would miss.
    env = UnityEnvironment(file_name="Banana_Linux/Banana.x86")
else:
    # Fail here with a clear message rather than leaving `env` undefined and
    # hitting a NameError in a later cell.
    raise RuntimeError(
        "Unsupported platform {!r}: no Banana environment build is configured "
        "for it in this notebook".format(sys.platform)
    )

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: BananaBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 37
        Number of stacked Vector Observation: 1
        Vector Action space type: discrete
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [4]:
# Set up the brain: take the first brain name the environment reports and look
# up the matching brain object. `brain_name` is also the key used to index the
# results of env.reset() / env.step() in the training loop.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

In [5]:
def dqn(n_episodes=200000, eps_start=1.0, eps_end=0.01, eps_decay=0.995, target_score=13.0):
    """Deep Q-Learning training loop.

    Plays episodes with epsilon-greedy action selection until the average
    score over the last 100 episodes exceeds ``target_score`` or
    ``n_episodes`` episodes have been played. Uses the notebook-level
    ``env``, ``agent`` and ``brain_name`` objects.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
        target_score (float): average score over 100 episodes that must be
            exceeded for training to stop as "solved"

    Returns
    =======
        scores (list): score of every episode played
        scores_means (list): after each episode, mean score of the last (up to) 100 episodes
        blues_means (list): after each episode, mean number of blue bananas
            collected over the last (up to) 100 episodes
        yellows_means (list): after each episode, mean number of yellow bananas
            collected over the last (up to) 100 episodes
    """
    window_size = 100                           # number of episodes averaged over

    scores = []                                 # score of each episode
    scores_window = deque(maxlen=window_size)   # scores of the last 100 episodes
    scores_means = []                           # rolling mean score after each episode
    eps = eps_start                             # initialize epsilon

    # Yellow and blue banana counts per episode, and their rolling means over
    # the same 100-episode window as the score.
    blues_window = deque(maxlen=window_size)
    blues_means = []
    yellows_window = deque(maxlen=window_size)
    yellows_means = []

    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
        state = env_info.vector_observations[0]            # get the current state
        score = 0                                          # initialize the score
        blues = 0                                          # blue bananas collected this episode
        yellows = 0                                        # yellow bananas collected this episode
        while True:                                        # continue the episode until done
            action = agent.act(state, eps)                 # choose an action (epsilon-greedy)
            env_info = env.step(action)[brain_name]        # send the action to the environment
            next_state = env_info.vector_observations[0]   # get the next state
            reward = env_info.rewards[0]                   # get the reward
            done = env_info.local_done[0]                  # see if episode has finished
            if reward == 1:
                yellows += 1                               # positive reward: yellow banana
            elif reward == -1:
                blues += 1                                 # negative reward: blue banana
            agent.step(state, action, reward, next_state, done)  # let the agent record/learn
            state = next_state                             # next_state becomes current state
            score += reward                                # accumulate the episode score
            if done:
                break

        blues_window.append(blues)
        yellows_window.append(yellows)
        scores_window.append(score)
        scores.append(score)

        mean_score = np.mean(scores_window)      # computed once, reused below
        scores_means.append(mean_score)
        blues_means.append(np.mean(blues_window))
        yellows_means.append(np.mean(yellows_window))

        eps = max(eps_end, eps_decay * eps)      # decrease epsilon

        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, mean_score))
        # Only declare the task solved once the window is full; otherwise one
        # lucky early episode could end training prematurely.
        if len(scores_window) == window_size and mean_score > target_score:
            print("Solved in {} episodes".format(i_episode))
            break
    return scores, scores_means, blues_means, yellows_means

# Run training with the default hyperparameters and keep the per-episode scores
# plus the rolling means of score, blue bananas and yellow bananas.
scores, scores_means, blues_means, yellows_means = dqn()

# Save the trained model parameters. This only runs if dqn() returns normally;
# an interrupted run (e.g. KeyboardInterrupt) never reaches this line.
agent.save() # save the trained model parameters

# Plot the raw score of every training episode.
fig, ax = plt.subplots()
ax.plot(np.arange(len(scores)), scores)
ax.set_title('Score per episode')
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()

# Plot the rolling means (over the last 100 episodes) of the score and of the
# blue / yellow banana counts. A legend is needed because the three series
# share one axis.
fig, ax = plt.subplots()
ax.plot(np.arange(len(scores_means)), scores_means, 'k-', label='Score')
ax.plot(np.arange(len(blues_means)), blues_means, 'b-', label='Blue bananas')
ax.plot(np.arange(len(yellows_means)), yellows_means, 'y-', label='Yellow bananas')
ax.set_title('Averages over the last 100 episodes')
ax.set_ylabel('Average per episode')
ax.set_xlabel('Episode #')
ax.legend()
plt.show()

Episode 100	Average Score: 0.92
Episode 200	Average Score: 4.39
Episode 300	Average Score: 7.82
Episode 400	Average Score: 10.72


KeyboardInterrupt: 

Initial commit
A basic port of the DQN solution to the new environment, plus a few additions such as a platform check for choosing the Unity build.  
Default Performance:  
BATCH_SIZE = 64, Deep Model  
Episode 100	Average Score: 0.45  
Episode 200	Average Score: 2.93  
Episode 300	Average Score: 6.74  
Episode 400	Average Score: 9.01  
Episode 500	Average Score: 12.92  
Solved in 502 episodes  

![b64deeper.png](Plots/b64deeper.png)
![b64deeper-average.png](Plots/b64deeper-average.png)

Tuning

BATCH_SIZE = 64, Standard Model  
Episode 100	Average Score: 0.31  
Episode 200	Average Score: 3.35  
Episode 300	Average Score: 8.00  
Episode 400	Average Score: 10.55  
Solved in 484 episodes  

![b64standard.png](Plots/b64standard.png)
![b64standard-average.png](Plots/b64standard-average.png)

BATCH_SIZE = 256, Standard Model  
Episode 100	Average Score: 0.58  
Episode 200	Average Score: 4.28  
Episode 300	Average Score: 6.50  
Episode 400	Average Score: 10.46  
Solved in 475 episodes  

![b256standard.png](Plots/b256standard.png)
![b256standard-average.png](Plots/b256standard-average.png)

BATCH_SIZE = 256, Standard Model, 16 Banana solution  
Episode 100	Average Score: 0.58  
Episode 200	Average Score: 4.28  
Episode 300	Average Score: 6.50  
Episode 400	Average Score: 10.46  
Episode 500	Average Score: 13.42  
Episode 600	Average Score: 14.67  
Episode 700	Average Score: 15.02  
Episode 800	Average Score: 14.96  
Solved in 859 episodes  

![b256standard-16.png](Plots/b256standard-16.png)
![b256standard-average-16.png](Plots/b256standard-average-16.png)

BATCH_SIZE = 256, Deep Model  
Episode 100	Average Score: 0.91  
Episode 200	Average Score: 3.69  
Episode 300	Average Score: 7.45  
Episode 400	Average Score: 10.17  
Solved in 499 episodes  

![b256deeper.png](Plots/b256deeper.png)
![b256deeper-average.png](Plots/b256deeper-average.png)

BATCH_SIZE = 256, Deep Model, 16 Banana Solution  
Episode 100	Average Score: 0.88  
Episode 200	Average Score: 3.27  
Episode 300	Average Score: 7.24  
Episode 400	Average Score: 9.66  
Episode 500	Average Score: 13.23  
Episode 600	Average Score: 14.31  
Episode 700	Average Score: 14.44  
Episode 800	Average Score: 14.31  
Episode 900	Average Score: 15.00  
Episode 1000	Average Score: 15.55  
Episode 1100	Average Score: 15.76  
Episode 1200	Average Score: 14.69  
Episode 1300	Average Score: 14.28  
Episode 1400	Average Score: 14.41  
Episode 1500	Average Score: 15.42  
Episode 1600	Average Score: 15.11  
Episode 1700	Average Score: 15.68  
Episode 1800	Average Score: 14.11  
Episode 1900	Average Score: 14.43  
Episode 2000	Average Score: 14.12  
Episode 2100	Average Score: 15.39  
Episode 2200	Average Score: 15.17  
Episode 2300	Average Score: 14.74  
Episode 2400	Average Score: 14.72  
Episode 2500	Average Score: 15.01  
Episode 2600	Average Score: 14.93  
Episode 2700	Average Score: 14.82  
Episode 2800	Average Score: 14.93  
Episode 2900	Average Score: 14.44  
Episode 3000	Average Score: 14.83  
Episode 3100	Average Score: 14.13  
Episode 3200	Average Score: 13.84  
Episode 3300	Average Score: 14.53  
Episode 3400	Average Score: 14.04  
Episode 3500	Average Score: 13.56  

![b256deeper-16.png](Plots/b256deeper-16.png)
![b256deeper-average-16.png](Plots/b256deeper-average-16.png)

BATCH_SIZE = 256, Standard Model, 20 Banana Solution  

Episode 100	Average Score: 0.58  
Episode 200	Average Score: 4.28  
Episode 300	Average Score: 6.50  
Episode 400	Average Score: 10.46  
Episode 500	Average Score: 13.42  
Episode 600	Average Score: 14.67  
Episode 700	Average Score: 15.02  
Episode 800	Average Score: 14.96  
Episode 900	Average Score: 16.15  
Episode 1000	Average Score: 15.26  
Episode 1100	Average Score: 15.86  
Episode 1200	Average Score: 15.74  
Episode 1300	Average Score: 16.49  
Episode 1400	Average Score: 16.44  
Episode 1500	Average Score: 16.40  
Episode 1600	Average Score: 15.66  
Episode 1700	Average Score: 16.41  
Episode 1800	Average Score: 15.62  
Episode 1900	Average Score: 14.98  
Episode 2000	Average Score: 15.21  
Episode 2100	Average Score: 15.72  
Episode 2200	Average Score: 15.22  
Episode 2300	Average Score: 15.13  
Episode 2400	Average Score: 14.74  
Episode 2500	Average Score: 15.32  
Episode 2600	Average Score: 15.00  
Episode 2700	Average Score: 15.68  
Episode 2800	Average Score: 16.13  
Episode 2900	Average Score: 15.38  
Episode 3000	Average Score: 15.62  
Episode 3100	Average Score: 15.50  
Episode 3200	Average Score: 14.91  
Episode 3300	Average Score: 16.10  
Episode 3400	Average Score: 16.41  
Episode 3500	Average Score: 15.94  

![b256deeper-20.png](Plots/b256deeper-20.png)
![b256deeper-average-20.png](Plots/b256deeper-average-20.png)
