In [1]:
from models import Actor, Critic
from DDPG import DDPG_Agent
from tools import PlotTool

from unityagents import UnityEnvironment
import torch
# Use the first CUDA GPU when PyTorch reports one is available; otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")



import numpy as np
import random
from collections import deque

from tqdm import tqdm_notebook as tqdm
from ipywidgets import *
from IPython.display import display
import matplotlib.pyplot as plt
%matplotlib notebook

## Load the Environment

In [2]:
# Launch the Unity Tennis build and grab the first brain exposed by the environment.
# NOTE(review): the executable path is hard-coded relative to the notebook's working
# directory and points at the Linux build; adjust it when running elsewhere.
# no_graphics=True presumably runs the simulator without a render window -- confirm
# against the unityagents documentation.
env         = UnityEnvironment(file_name="../Tennis_Linux/Tennis.x86_64", no_graphics=True)
brain_name  = env.brain_names[0]
brain       = env.brains[brain_name]
# Reset once so that env_info can be inspected for the problem dimensions in the next cell.
env_info    = env.reset(train_mode=True)[brain_name]

DEBUG:unityagents:The true file name is Tennis
DEBUG:unityagents:This is the launch string /home/yousof/AI/courses/deep_RL_udacity/projects/DRLND_Collaboration_Competition_P3/../Tennis_Linux/Tennis.x86_64
INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


In [3]:
# Read the problem dimensions from the brain and from the env_info returned by the
# initial reset; these globals are used when building the agent and inside ddpg().
num_agents  = len(env_info.agents)
action_size = brain.vector_action_space_size
# Length of the first agent's observation vector (24 in the recorded run below).
state_size  = len(env_info.vector_observations[0])
print("state size:",state_size, "action size:", action_size)
# One observation row per agent; the recorded run printed (2, 24).
print("state shape: ", env_info.vector_observations.shape)

state size: 24 action size: 2
state shape:  (2, 24)


## Train the Agent

In [4]:
# Build the DDPG agent from the project-local Actor/Critic network classes.
# The hyperparameter meanings below are inferred from their names; DDPG_Agent is
# defined in DDPG.py, which is not shown here -- confirm there before tuning.
# NOTE(review): tau=3e-1 is presumably the soft-update factor for the target
# networks; if so it is far larger than the 1e-3 used in the original DDPG
# paper -- confirm this is intentional.
agent = DDPG_Agent(state_size=state_size, 
                   action_size=action_size, 
                   actor_model=Actor,
                   critic_model=Critic,
                   device=device,
                   num_agents= num_agents, 
                   seed=1,
                   tau=3e-1,
                   batch_size=2048,
                   discount_factor = 0.99,
                   actor_learning_rate=1e-4,
                   critic_learning_rate=1e-3)

In [5]:
def ddpg(n_episodes=1000, max_t=300, print_every=100,
         target_score=0.5, checkpoint_name='DDPG_agent'):
    """Run the DDPG training loop against the notebook's Unity environment.

    The function reads these notebook globals, all of which must exist before
    it is called: ``env``, ``brain_name``, ``agent``, ``num_agents``,
    ``state_size``, ``action_size`` and the four ``PlotTool`` instances
    ``statePlotter``, ``actionPlotter``, ``actionPlotterWithoutNoise`` and
    ``rewardPlot``.

    Args:
        n_episodes: Maximum number of episodes to run.
        max_t: Maximum number of environment steps per episode.
        print_every: Length of the rolling window of episode scores used for
            the minimum/average shown in the status label and for the
            early-stop test.
        target_score: Training stops (and the agent is saved) as soon as the
            rolling average exceeds this value. Defaults to the previously
            hard-coded 0.5.
        checkpoint_name: Name passed to ``agent.save_agent`` when the target
            is reached. Defaults to the previously hard-coded 'DDPG_agent'.

    Returns:
        list: one array per completed episode holding the summed reward of
        each agent (``num_agents`` entries).
    """
    scores_deque = deque(maxlen=print_every)
    scores = []

    # Live status label plus a notebook progress bar.
    desc = Label('Episode {}\tAverage Score: {:.2f}'.format(0,0))
    display(desc)
    progress = tqdm(range(1, n_episodes+1))
    for i_episode in progress:
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        agent.reset()
        # Start from a per-agent array (not the int 0) so that the reshape
        # below still works when the step loop runs zero times (max_t == 0).
        score = np.zeros(num_agents)
        for _ in range(max_t):
            # Only the first agent's state and actions are plotted.
            statePlotter.push_date(states[0,:].reshape(1,state_size))
            actions_with_noise, actions_without_noise = agent.act(states, add_noise=True)
            actionPlotterWithoutNoise.push_date(actions_without_noise[0,:].reshape(1,action_size))
            # The noisy action is clipped to [-1, 1] before it is plotted,
            # sent to the environment, and handed to agent.step.
            actions = np.clip(actions_with_noise, -1, 1)
            actionPlotter.push_date(actions[0,:].reshape(1,action_size))
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            reward = env_info.rewards
            done = env_info.local_done
            agent.step(state      = states,
                       action     = actions,
                       reward     = reward,
                       next_state = next_states,
                       done       = done
                      )
            states = next_states
            score = score + np.array(reward)
            # End the episode as soon as any agent reports done.
            if np.any(done):
                break

        # Refresh the per-episode plots, then record the episode score.
        actionPlotter.draw(reset_for_next_time=True)
        actionPlotterWithoutNoise.draw(reset_for_next_time=True)
        statePlotter.draw(reset_for_next_time=True)
        rewardPlot.push_date(score.reshape(1, num_agents))
        rewardPlot.draw()
        scores_deque.append(score)
        scores.append(score)

        # np.mean / np.min reduce over every agent and every episode currently
        # in the window (the deque holds one per-agent array per episode).
        score_mean_circular = np.mean(scores_deque)
        desc.value = 'Episode {},'.format(i_episode)+ \
                     '\tMinimum Score: {:.2f},'.format(np.min(scores_deque))+ \
                     '\tAverage Score: {:.2f},'.format(score_mean_circular)

        # NOTE(review): this test can trigger before the window holds
        # print_every episodes, and it averages across agents rather than
        # taking the per-episode best agent -- confirm that this matches the
        # intended "solved" criterion.
        if score_mean_circular > target_score:
            # enough training
            agent.save_agent(checkpoint_name)
            break

    return scores

# NOTE(review): this magic repeats the one already issued in the first (imports) cell.
%matplotlib notebook
# ddpg() reads these four plotters as globals, so they must be created before
# the call below. One line is drawn per agent / action component / state component.
rewardPlot = PlotTool(number_of_lines=num_agents, desc = "R" )
# "AC" tracks the actions returned without exploration noise; "NAC" tracks the
# clipped noisy actions that are actually sent to the environment (see ddpg()).
actionPlotterWithoutNoise = PlotTool(number_of_lines=action_size, desc = "AC")
actionPlotter = PlotTool(number_of_lines=action_size, desc = "NAC")
statePlotter = PlotTool(number_of_lines=state_size, desc = "S")
# Start training, overriding the defaults: up to 10000 episodes of at most 1000 steps.
scores = ddpg(n_episodes=10000, max_t=1000)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Label(value='Episode 0\tAverage Score: 0.00')

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))

In [6]:
# Close the Unity environment once training is finished. Presumably this also
# terminates the simulator process -- confirm against the unityagents documentation.
env.close()