# 3-Collaborative Reinforcement Learning

---

In this notebook, we will train and evaluate our model for the third project of the [Deep Reinforcement Learning Nanodegree](https://www.udacity.com/course/deep-reinforcement-learning-nanodegree--nd893).

### 1. Start the Environment

We begin by importing some necessary packages.  If the code cell below returns an error, please revisit the project instructions to double-check that you have installed [Unity ML-Agents](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Installation.md) and [NumPy](http://www.numpy.org/).

In [None]:
from unityagents import UnityEnvironment
import numpy as np
import gym
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Path to the Linux build of the Tennis environment, relative to this notebook.
TENNIS_BUILD = "./Tennis_Linux/Tennis.x86_64"
env = UnityEnvironment(file_name=TENNIS_BUILD)

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


### 2. Define the Agent 
We first define and initialize a multi-agent DDPG agent, which creates two instances of a DDPG agent (one for each player). The agent is imported directly from the `ma_ddgp_agent.py` script. Please refer to `ma_ddgp_agent.py` and `ddpg_agent.py` to define a new agent or modify the existing one.

In [3]:
from ddpg_agent import Agent
from ma_ddgp_agent import maddpgagent

# The environment reports a single brain; use it for every env call below.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Reset once in training mode so the problem dimensions can be read off.
env_info = env.reset(train_mode=True)[brain_name]
num_agents = len(env_info.agents)                    # number of agents
state_dim = env_info.vector_observations.shape[1]    # per-agent observation width
action_dim = brain.vector_action_space_size          # per-agent action width

# Multi-agent wrapper around the per-player DDPG agents.
agent = maddpgagent(
    state_dim=state_dim,
    action_dim=action_dim,
    num_agents=num_agents,
    seed=0,
)

### 3. Train the agent 
Run the code below to train the agent from scratch


In [4]:
from collections import deque
import torch
import numpy as np
import math
import time


def _save_agent_weights(agent, log_dir, prefix):
    """Save the local actor/critic weights of every sub-agent in ``agent.agents``.

    Files are written as ``<log_dir>/<prefix>_actor_<i>.pth`` and
    ``<log_dir>/<prefix>_critic_<i>.pth``, where ``i`` is the sub-agent index.
    """
    import os  # local import so the helper does not depend on another cell

    for idx, sub_agent in enumerate(agent.agents):
        torch.save(sub_agent.actor_local.state_dict(),
                   os.path.join(log_dir, '{}_actor_{}.pth'.format(prefix, idx)))
        torch.save(sub_agent.critic_local.state_dict(),
                   os.path.join(log_dir, '{}_critic_{}.pth'.format(prefix, idx)))


def interact(env, state_dim, brain_name, agent, num_agents, max_t=1500, num_episodes=1500, window=100,
             log_dir='./logging', target_score=0.5):
    """Train the multi-agent wrapper on the Unity environment and log progress.

    Each episode's score is the maximum of the per-agent cumulative rewards.
    Weights are checkpointed whenever the moving average reaches
    ``target_score`` and whenever it matches or beats the best value so far.

    Args:
        env: environment exposing ``reset(train_mode=...)`` and ``step(actions)``,
            both returning a mapping indexed by ``brain_name``.
        state_dim: unused; kept so existing callers keep working.
        brain_name: key selecting the brain info in the env's return value.
        agent: object exposing ``reset()``, ``ma_act(states)``,
            ``ma_step(states, actions, rewards, next_states, dones)`` and an
            ``agents`` sequence whose items have ``actor_local`` / ``critic_local``.
        num_agents: number of agents; length of the per-episode score vector.
        max_t: upper bound on environment steps per episode. The previous
            implementation accepted this argument but never applied it.
        num_episodes: number of training episodes.
        window: number of most recent episodes used for the moving average.
        log_dir: directory receiving checkpoints and CSV logs (created if missing).
        target_score: moving-average threshold that triggers the regular
            checkpoint and CSV dump.

    Returns:
        list: per-episode scores (max over agents), one entry per episode.
    """
    import os  # local import so the function does not depend on another cell

    PRINT_EVERY = 10

    # Without this the first torch.save fails when the directory is absent.
    os.makedirs(log_dir, exist_ok=True)

    scores = []
    rolling_avg = []
    best_score = 0
    scores_window = deque(maxlen=window)

    for i_episode in range(1, num_episodes + 1):
        # Reset env and get the current state, flattened into a single row
        # holding every agent's observation (width inferred, not hard-coded).
        env_info = env.reset(train_mode=True)[brain_name]
        states = np.reshape(env_info.vector_observations, (1, -1))
        score = np.zeros(num_agents)
        agent.reset()

        for _ in range(max_t):
            actions = agent.ma_act(states)
            env_info = env.step(actions)[brain_name]
            next_states = np.reshape(env_info.vector_observations, (1, -1))
            rewards = env_info.rewards
            dones = env_info.local_done
            agent.ma_step(states, actions, rewards, next_states, dones)
            states = next_states
            score += rewards
            # The episode ends as soon as any agent reports done.
            if any(dones):
                break

        episode_score = np.max(score)
        scores.append(episode_score)
        scores_window.append(episode_score)
        moving_avg = np.mean(scores_window)
        rolling_avg.append(moving_avg)

        # print results
        if i_episode % PRINT_EVERY == 0:
            print('Episodes {:0>4d}\tMax Reward: {:.3f}\tMoving Average: {:.3f}'.format(i_episode, episode_score, moving_avg))

        # Target reached: checkpoint the weights and dump the score history.
        if moving_avg >= target_score:
            _save_agent_weights(agent, log_dir, 'checkpoint')
            scores_filename = os.path.join(log_dir, 'ma_ddpg_agent_score_{}.csv'.format(i_episode))
            rolling_avg_filename = os.path.join(log_dir, 'ma_ddpg_agent_rolling_avg_{}.csv'.format(i_episode))
            np.savetxt(scores_filename, scores, delimiter=",")
            np.savetxt(rolling_avg_filename, rolling_avg, delimiter=",")

        # Track the weights belonging to the best moving average seen so far.
        if moving_avg >= best_score:
            _save_agent_weights(agent, log_dir, 'best_checkpoint')
            best_score = moving_avg

        if i_episode % window == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, moving_avg))
    return scores

# Train with the default step/episode/window settings and keep the per-episode scores.
scores = interact(
    env=env,
    state_dim=state_dim,
    brain_name=brain_name,
    agent=agent,
    num_agents=num_agents,
)

Episodes 0010	Max Reward: 0.000	Moving Average: 0.000
Episodes 0020	Max Reward: 0.000	Moving Average: 0.005
Episodes 0030	Max Reward: 0.000	Moving Average: 0.003
Episodes 0040	Max Reward: 0.000	Moving Average: 0.003
Episodes 0050	Max Reward: 0.000	Moving Average: 0.002
Episodes 0060	Max Reward: 0.000	Moving Average: 0.002
Episodes 0070	Max Reward: 0.000	Moving Average: 0.001
Episodes 0080	Max Reward: 0.000	Moving Average: 0.001
Episodes 0090	Max Reward: 0.000	Moving Average: 0.001
Episodes 0100	Max Reward: 0.000	Moving Average: 0.001
Episode 100	Average Score: 0.00
Episodes 0110	Max Reward: 0.000	Moving Average: 0.003
Episodes 0120	Max Reward: 0.000	Moving Average: 0.002
Episodes 0130	Max Reward: 0.000	Moving Average: 0.002
Episodes 0140	Max Reward: 0.000	Moving Average: 0.005
Episodes 0150	Max Reward: 0.090	Moving Average: 0.008
Episodes 0160	Max Reward: 0.000	Moving Average: 0.010
Episodes 0170	Max Reward: 0.000	Moving Average: 0.013
Episodes 0180	Max Reward: 0.000	Moving Average: 0.

Episodes 1450	Max Reward: 1.500	Moving Average: 0.664
Episodes 1460	Max Reward: 0.300	Moving Average: 0.653
Episodes 1470	Max Reward: 1.200	Moving Average: 0.591
Episodes 1480	Max Reward: 0.500	Moving Average: 0.592
Episodes 1490	Max Reward: 0.100	Moving Average: 0.519
Episodes 1500	Max Reward: 0.100	Moving Average: 0.530
Episode 1500	Average Score: 0.53


In [2]:
from plot import plot_results

# Same 0.5 threshold the training loop uses for its moving-average checkpoint condition.
# NOTE(review): plot_results lives in plot.py (not shown here); presumably it plots the
# logged training scores against this benchmark — confirm.
BENCHMARK_SCORE = 0.5
plot_results(benchmark_score=BENCHMARK_SCORE)

### 4. Watch a smart agent
Run the code below to watch the trained agents play in the environment.


In [None]:
env = UnityEnvironment(file_name="./Tennis_Linux/Tennis.x86_64")
# reset env and extract state_dim and action_dim
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
env_info = env.reset(train_mode=False)[brain_name]    # reset the environment
state_dim = env_info.vector_observations.shape[1]
action_dim = brain.vector_action_space_size
# number of agents
num_agents = len(env_info.agents)
# NOTE(review): this freshly built wrapper is not used by the loop below, which
# acts through agent_0 / agent_1. It is kept so the global `agent` binding is
# unchanged — confirm whether it can be dropped.
agent = maddpgagent(state_dim=state_dim, action_dim=action_dim,num_agents = num_agents, seed=0)

# Watch a smart agent
# Load the saved training parameters into one stand-alone DDPG agent per player.
agent_0 = Agent(state_dim, action_dim, 1, random_seed=0)
agent_1 = Agent(state_dim, action_dim, 1, random_seed=0)
agent_0.actor_local.load_state_dict(torch.load('logging/checkpoint_actor_0.pth', map_location='cpu'))
agent_0.critic_local.load_state_dict(torch.load('logging/checkpoint_critic_0.pth', map_location='cpu'))
agent_1.actor_local.load_state_dict(torch.load('logging/checkpoint_actor_1.pth', map_location='cpu'))
agent_1.critic_local.load_state_dict(torch.load('logging/checkpoint_critic_1.pth', map_location='cpu'))

# Flatten every agent's observation into one row; the width is inferred
# instead of being hard-coded.
states = np.reshape(env_info.vector_observations, (1, -1))
score = np.zeros(num_agents)                          # initialize the score (for each agent)
while True:
    # Both players act on the same flattened joint observation, without exploration noise.
    action_0 = agent_0.act(states, add_noise=False)
    action_1 = agent_1.act(states, add_noise=False)
    # Stack both players' actions into the single row passed to env.step.
    actions = np.reshape(np.concatenate((action_0, action_1), axis=0), (1, -1))
    env_info = env.step(actions)[brain_name]
    score += env_info.rewards
    states = np.reshape(env_info.vector_observations, (1, -1))  # flatten states
    if any(env_info.local_done):
        break

# Report the outcome; previously the accumulated score was never shown.
print('Score (max over agents) this episode: {:.2f}'.format(np.max(score)))

In [None]:
# Close the Unity environment opened in the cell above.
env.close()