# Collaboration and Competition

---

Congratulations for completing the third project of the [Deep Reinforcement Learning Nanodegree](https://www.udacity.com/course/deep-reinforcement-learning-nanodegree--nd893) program!  In this notebook, you will learn how to control agents in a more challenging environment, where the goal is to train a team of agents to play soccer.  **Note that this exercise is optional!**

### 1. Start the Environment

We begin by importing the necessary packages.  If the code cell below returns an error, please revisit the project instructions to double-check that you have installed [Unity ML-Agents](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Installation.md) and [NumPy](http://www.numpy.org/).

In [1]:
from mlagents_envs.environment import UnityEnvironment
import numpy as np
from mlagents_envs.base_env import ActionTuple, BaseEnv, DecisionSteps, TerminalSteps
from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel

from soccer_gym import TransUnity2Gym
engine_channel = EngineConfigurationChannel()

engine_channel.set_configuration_parameters(time_scale=5,quality_level=0)

#origin_env = UnityEnvironment(file_name="C://Users//raman//Documents//Pengsong//MIE1075_Soccer//buildmysoccer//SoccerTwos", seed=1, side_channels=[engine_channel])
origin_env = UnityEnvironment(file_name="C://Users//raman//Documents//Pengsong//MIE1075_Soccer//osoccer//UnityEnvironment", seed=1, no_graphics=False,side_channels=[engine_channel])
env = TransUnity2Gym(origin_env)

In [2]:
for i in range(20):
    obs_n, reward_n, done_n, _ = origin_env.step([np.array([0,0,0]),np.random.randint(0, 3, size=3),np.array([0,0,0]),np.array([0,0,0]),np.array([0,0,0]),np.array([0,0,0]),np.array([0,0,0]),np.array([0,0,0]),np.array([0,0,0]),np.array([0,0,0])])
    print(reward_n)

TypeError: step() takes 1 positional argument but 2 were given

In [1]:
class TransUnity2Gym():
    def __init__(self,env):
        env.reset()
        self.environment  = env
        self.calculate_agent_num()
        
    def calculate_agent_num(self):
        agent_sum = 0
        behavior_names = list(self.environment.behavior_specs)
        for behavior_name in behavior_names:
            decision_steps, terminal_steps = self.environment.get_steps(behavior_name)
            agent_sum += len(decision_steps)
        self.n = agent_sum
        print('self.agent_sum = ',self.n)
        
    def reset(self):
        
        env = self.environment
        env.reset()
        behavior_names = list(env.behavior_specs)
        obs_n=[0 for _ in range(self.n)]
        
        for behavior_name in behavior_names:
            decision_steps, terminal_steps = env.get_steps(behavior_name)
            
            if len(terminal_steps.agent_id) > 0:
                for agent_id_terminated in terminal_steps:
                    obs  = terminal_steps[agent_id_terminated].obs
                    obs_s = np.concatenate([obs[0],obs[1]])
                    obs_n[agent_id_terminated] = obs_s

                
            if len(decision_steps.agent_id) > 0:
                for agent_id_decision in decision_steps:
                     
                    obs  = decision_steps[agent_id_decision].obs
                    obs_s = np.concatenate([obs[0],obs[1]])
                    obs_n[agent_id_decision] = obs_s
                
                
        return obs_n
    
    def step(self,action_n):
        
        env = self.environment
        behavior_names = list(env.behavior_specs)   

        next_obs_n = [0 for _ in range(self.n)]
        reward_n = [0 for _ in range(self.n)]
        done_n = [False for _ in range(self.n)]
        info = [0 for _ in range(self.n)]        
        
        for behavior_name in behavior_names:
            
            decision_steps, terminal_steps = env.get_steps(behavior_name)
            print('pp_terminal_steps.agent_id',terminal_steps.agent_id)
            print('pp_decision_steps.agent_id',decision_steps.agent_id)
            
            if(len(terminal_steps.agent_id)==0):
                action = []
                for i_d in decision_steps.agent_id:
                    action.append(action_n[i_d])

                action = np.array(action)
                action_tuple = ActionTuple()
                action_tuple.add_discrete(action)
                env.set_actions(behavior_name,action_tuple)
                env.step()
            else:
                for agent_id_terminated in terminal_steps:
                    
                    done = terminal_steps[agent_id_terminated].interrupted
                    obs  = terminal_steps[agent_id_terminated].obs
                    reward = terminal_steps[agent_id_terminated].reward
                    obs_s = np.concatenate([obs[0],obs[1]])
                    
                    done_n[agent_id_terminated] = done
                    next_obs_n[agent_id_terminated] = obs_s
                    reward_n[agent_id_terminated] = reward
                return next_obs_n, reward_n, done_n, info


            
        for behavior_name in behavior_names:
            decision_steps, terminal_steps = env.get_steps(behavior_name)
            
            #print('decision_steps.agent_id',decision_steps.agent_id,len(decision_steps.agent_id))
            #print('terminal_steps.agent_id',terminal_steps.agent_id,len(terminal_steps.agent_id))

            #print('decision_steps.obs',decision_steps.obs[0].shape)        
            #print('terminal_steps.obs',terminal_steps.obs[0].shape)
            
            #print('decision_steps.reward',decision_steps.reward)        
            #print('terminal_steps.reward',terminal_steps.reward)
            
            obs_s = np.concatenate([decision_steps.obs[0],decision_steps.obs[1]],axis=1)
            #print(obs_s.shape)
            #print('decision_steps.reward',decision_steps.reward)
            
            #print('terminal_steps.reward',terminal_steps.reward)
            #print('terminal_steps.interrupted=',terminal_steps.interrupted) 
            
            local_done = False
            
            if len(terminal_steps.agent_id) > 0:
                for agent_id_terminated in terminal_steps:
                    
                    done = terminal_steps[agent_id_terminated].interrupted
                    obs  = terminal_steps[agent_id_terminated].obs
                    reward = terminal_steps[agent_id_terminated].reward
                    obs_s = np.concatenate([obs[0],obs[1]])
                    
                    done_n[agent_id_terminated] = done
                    next_obs_n[agent_id_terminated] = obs_s
                    reward_n[agent_id_terminated] = reward
                    
                    print('dddone=',done,'agent_id_terminated=',agent_id_terminated) 
                
            if len(decision_steps.agent_id) > 0:
                print('decision_steps.agent_id=',decision_steps.agent_id)
                for agent_id_decision in decision_steps:
                     
                    obs  = decision_steps[agent_id_decision].obs
                    reward = decision_steps[agent_id_decision].reward
                    obs_s = np.concatenate([obs[0],obs[1]])
                    
                    next_obs_n[agent_id_decision] = obs_s
                    reward_n[agent_id_decision] = reward
                    
                    #print('agent_id_decision=',agent_id_decision) 

                

                

        if done_n[0] == True:  
            print(len(next_obs_n),next_obs_n[0].shape,reward_n,done_n)
        return next_obs_n, reward_n, done_n, info
        

In [None]:
env = TransUnity2Gym(envunity)


for j in range(20):
    env.reset()
    print('epoch-----',j)
    for i in range(5000):
        actions = [np.random.randint(0, 3, size=3),np.random.randint(0, 3, size=3),np.random.randint(0, 3, size=3),np.random.randint(0, 3, size=3)]
        print('step-----',i)
        next_obs_n, reward_n, done_n, info = env.step(actions)
        if done_n[0] ==True:
            break
    

In [2]:
env = TransUnity2Gym(envunity)

NameError: name 'envunity' is not defined

In [12]:
env.reset()

[array([1.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.25487724, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 1.        ,
        1.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 1.        , 1.        , 0.        ,
        0.        , 0.        , 0.        , 1.        , 0.        ,
        0.        , 0.15981403, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 1.        , 1.        ,
        0.        , 0.        , 0.        , 0.        , 1.        ,
        0.        , 0.        , 0.1623513 , 0.        , 0.        ,
        0.        , 1.        , 0.        , 0.        , 0.        ,
        0.6123384 , 0.        , 0.        , 0.        , 1.        ,
        0.        , 0.        , 0.        , 0.49686074, 0.        ,
        0.        , 0.        , 1.        , 0.        , 0.        ,
        0.        , 0.4488938 , 0.        , 0.  

In [18]:
for i in range(10): 
    actions = [np.array([0,0,0]),np.random.randint(0, 3, size=3),np.array([0,0,0]),np.array([0,0,0])]
    env.step(actions)

pp_terminal_steps.agent_id []
pp_decision_steps.agent_id [0 2]
pp_terminal_steps.agent_id []
pp_decision_steps.agent_id [1 3]
decision_steps.agent_id= [0 2]
decision_steps.agent_id= [1 3]
pp_terminal_steps.agent_id []
pp_decision_steps.agent_id [0 2]
pp_terminal_steps.agent_id []
pp_decision_steps.agent_id [1 3]
decision_steps.agent_id= [0 2]
decision_steps.agent_id= [1 3]
pp_terminal_steps.agent_id []
pp_decision_steps.agent_id [0 2]
pp_terminal_steps.agent_id []
pp_decision_steps.agent_id [1 3]
decision_steps.agent_id= [0 2]
decision_steps.agent_id= [1 3]
pp_terminal_steps.agent_id []
pp_decision_steps.agent_id [0 2]
pp_terminal_steps.agent_id []
pp_decision_steps.agent_id [1 3]
decision_steps.agent_id= [0 2]
decision_steps.agent_id= [1 3]
pp_terminal_steps.agent_id []
pp_decision_steps.agent_id [0 2]
pp_terminal_steps.agent_id []
pp_decision_steps.agent_id [1 3]
decision_steps.agent_id= [0 2]
decision_steps.agent_id= [1 3]
pp_terminal_steps.agent_id []
pp_decision_steps.agent_id [0 

In [None]:
env = envunity
# We will only consider the first Behavior
behavior_names = list(env.behavior_specs)

for behavior_name in behavior_names:

    print(f"\nName of the behavior : {behavior_name}")
    spec = env.behavior_specs[behavior_name]



    # Examine the number of observations per Agent
    print("Number of observations : ", len(spec.observation_specs))
    print('\n')
    for spec1 in spec.observation_specs:
        print(spec1)
    print('\n')

    # Is there a visual observation ?
    # Visual observation have 3 dimensions: Height, Width and number of channels
    vis_obs = any(len(spec.shape) == 3 for spec in spec.observation_specs)
    print("Is there a visual observation ?", vis_obs)
    print('\n')


    # Is the Action continuous or multi-discrete ?
    if spec.action_spec.continuous_size > 0:
      print(f"There are {spec.action_spec.continuous_size} continuous actions")
    if spec.action_spec.is_discrete():
      print(f"There are {spec.action_spec.discrete_size} discrete actions")
    print(spec.action_spec)


    # How many actions are possible ?
    #print(f"There are {spec.action_size} action(s)")

    # For discrete actions only : How many different options does each action has ?
    if spec.action_spec.discrete_size > 0:
      for action, branch_size in enumerate(spec.action_spec.discrete_branches):
        print(f"Action number {action} has {branch_size} different options")
        
    decision_steps, terminal_steps = env.get_steps(behavior_name)
    print('\n')
    print('len decision_steps', len(decision_steps))
    print('decision_steps.obs', decision_steps.obs[0].shape,decision_steps.obs[1].shape)
    print(np.concatenate([decision_steps.obs[0],decision_steps.obs[1]],axis=1).shape)
    print('decision_steps.rewards, ', decision_steps.reward)
    print('terminal_steps.obs, ', terminal_steps.obs)
    
    print('\n')
    for agent_id_terminated in decision_steps:
            # Create its last experience (is last because the Agent terminated)

        print(decision_steps[agent_id_terminated].reward)
        #print(decision_steps[agent_id_terminated].obs[0].shape)



In [None]:
env.set_actions(behavior_name, spec.action_spec.empty_action(len(decision_steps)))

In [None]:
env.step()

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

for index, obs_spec in enumerate(spec.observation_specs):
    if len(obs_spec.shape) == 3:
        print("Here is the first visual observation")
        plt.imshow(decision_steps.obs[index][0,:,:,:])
        plt.show()

for index, obs_spec in enumerate(spec.observation_specs):
    if len(obs_spec.shape) == 1:
        print("First vector observations : ", decision_steps.obs[index][0,:])

In [None]:
from typing import Dict
for episode in range(1):
    env.reset()
    decision_steps, terminal_steps = env.get_steps(behavior_name)
    tracked_agent = -1 # -1 indicates not yet tracking
    done = False # For the tracked_agent
    episode_rewards = 0 # For the tracked_agent
    
    dict_last_action_from_agent: Dict[int, np.ndarray] = {}
    
    while not done:
        # Track the first agent we see if not tracking
        # Note : len(decision_steps) = [number of agents that requested a decision]
        if tracked_agent == -1 and len(decision_steps) >= 1:
            tracked_agent = decision_steps.agent_id[0]
            print(decision_steps.agent_id)

        # Generate an action for all agents
        action = spec.action_spec.random_action(len(decision_steps))
        #print('len(decision_steps)',len(decision_steps))
        #print('action',action.discrete)

        # Set the actions
        env.set_actions(behavior_name, action)

        # Move the simulation forward
        env.step()

        # Get the new simulation results
        decision_steps, terminal_steps = env.get_steps(behavior_name)
        print(decision_steps[1])
        if tracked_agent in decision_steps: # The agent requested a decision
            episode_rewards += decision_steps[tracked_agent].reward
        if tracked_agent in terminal_steps: # The agent terminated its episode
            episode_rewards += terminal_steps[tracked_agent].reward
            done = True
        print('episode_rewards = ',episode_rewards)
    print(f"Total rewards for episode {episode} is {episode_rewards}")


In [None]:
count = 0
while True:
    if count > 500000:
        break
    for name in behavior_names:
        states = env.get_steps(name)
        
        # 在此添加算法
        actions = ActionTuple()
        
        # 测试时让4个agent向四个方向随机移动 使用中应改为算法提供的action
        ac = np.random.randint(0, 5, size=48).reshape(-1, 3)
        actions.add_discrete(ac)

        env.set_actions(name, actions)
        
        DecisionSteps, TerminalSteps = env.get_steps(name)
        print(DecisionSteps,TerminalSteps)
        
    count += 1
    
#env.close()

In [None]:
from unityagents import UnityEnvironment
import numpy as np

Next, we will start the environment!  **_Before running the code cell below_**, change the `file_name` parameter to match the location of the Unity environment that you downloaded.

- **Mac**: `"path/to/Soccer.app"`
- **Windows** (x86): `"path/to/Soccer_Windows_x86/Soccer.exe"`
- **Windows** (x86_64): `"path/to/Soccer_Windows_x86_64/Soccer.exe"`
- **Linux** (x86): `"path/to/Soccer_Linux/Soccer.x86"`
- **Linux** (x86_64): `"path/to/Soccer_Linux/Soccer.x86_64"`
- **Linux** (x86, headless): `"path/to/Soccer_Linux_NoVis/Soccer.x86"`
- **Linux** (x86_64, headless): `"path/to/Soccer_Linux_NoVis/Soccer.x86_64"`

For instance, if you are using a Mac, then you downloaded `Soccer.app`.  If this file is in the same folder as the notebook, then the line below should appear as follows:
```
env = UnityEnvironment(file_name="Soccer.app")
```

In [None]:
env = UnityEnvironment(file_name="C://Users//raman//Documents//Pengsong//MIE1075_Soccer//buildmysoccer//SoccerTwos_BurstDebugInformation_DoNotShip//UnityEnvironment")
#C://Users//raman//Documents//Pengsong//MIE1075_Soccer//buildmysoccer//UnityEnvironment
#C://Users//raman//Downloads//Soccer_Windows_x86_64//Soccer_Windows_x86_64//Soccer

In [None]:
env = UnityEnvironment(file_name="C://Users//raman//Documents//Pengsong//MIE1075_Soccer//buildmysoccer//SoccerTwos")

In [None]:
# print the brain names
print(env.brain_names)

# set the goalie brain
g_brain_name = env.brain_names[0]
g_brain = env.brains[g_brain_name]

# set the striker brain
s_brain_name = env.brain_names[1]
s_brain = env.brains[s_brain_name]

print(env)

### 2. Examine the State and Action Spaces

Run the code cell below to print some information about the environment.

In [None]:
# reset the environment
env_info = env.reset(train_mode=True)

# number of agents 
num_g_agents = len(env_info[g_brain_name].agents)
print('Number of goalie agents:', num_g_agents)
num_s_agents = len(env_info[s_brain_name].agents)
print('Number of striker agents:', num_s_agents)

# number of actions
g_action_size = g_brain.vector_action_space_size
print('Number of goalie actions:', g_action_size)
s_action_size = s_brain.vector_action_space_size
print('Number of striker actions:', s_action_size)

# examine the state space 
g_states = env_info[g_brain_name].vector_observations
g_state_size = g_states.shape[1]
print('There are {} goalie agents. Each receives a state with length: {}'.format(g_states.shape[0], g_state_size))
s_states = env_info[s_brain_name].vector_observations
s_state_size = s_states.shape[1]
print('There are {} striker agents. Each receives a state with length: {}'.format(s_states.shape[0], s_state_size))

### 3. Take Random Actions in the Environment

In the next code cell, you will learn how to use the Python API to control the agents and receive feedback from the environment.

Once this cell is executed, you will watch the agents' performance, if they select actions at random with each time step.  A window should pop up that allows you to observe the agents.

Of course, as part of the project, you'll have to change the code so that the agents are able to use their experiences to gradually choose better actions when interacting with the environment!

In [None]:
for i in range(0):                                         # play game for 2 episodes
    env_info = env.reset(train_mode=False)                 # reset the environment    
    g_states = env_info[g_brain_name].vector_observations  # get initial state (goalies)
    s_states = env_info[s_brain_name].vector_observations  # get initial state (strikers)
    g_scores = np.zeros(num_g_agents)                      # initialize the score (goalies)
    s_scores = np.zeros(num_s_agents)                      # initialize the score (strikers)
    while True:
        # select actions and send to environment
        g_actions = np.random.randint(g_action_size, size=num_g_agents)
        s_actions = np.random.randint(s_action_size, size=num_s_agents)

        actions = dict(zip([g_brain_name, s_brain_name], 
                           [g_actions, s_actions]))
        print(actions)
        env_info = env.step(actions)                       
        
        # get next states
        g_next_states = env_info[g_brain_name].vector_observations         
        s_next_states = env_info[s_brain_name].vector_observations  
        
        # get reward and update scores
        g_rewards = env_info[g_brain_name].rewards  
        s_rewards = env_info[s_brain_name].rewards
        g_scores += g_rewards
        s_scores += s_rewards
        
        # check if episode finished
        done = np.any(env_info[g_brain_name].local_done)  
        
        # roll over states to next time step
        g_states = g_next_states
        s_states = s_next_states
        
        # exit loop if episode finished
        if done:                                           
            break
    print('Scores from episode {}: {} (goalies), {} (strikers)'.format(i+1, g_scores, s_scores))

When finished, you can close the environment.

In [None]:
# env.close()

### 4. It's Your Turn!

Now it's your turn to train your own agent to solve the environment!  When training the environment, set `train_mode=True`, so that the line for resetting the environment looks like the following:
```python
env_info = env.reset(train_mode=True)[brain_name]
```

In [None]:
%matplotlib inline

import torch

from maddpg import MADDPG
from ddpg_agent import Agent

from collections import deque
import matplotlib.pyplot as plt
import time, os

# from maddpg import MADDPG
# from buffer import ReplayBuffer
# from utilities import transpose_list, transpose_to_tensor
# from tensorboardX import SummaryWriter

In [None]:
g_maddpg = MADDPG(336, 4, 2, 1976)
g_agent = Agent(336,4,2,1976)

s_maddpg = MADDPG(336, 6, 2, 1976)
s_agent = Agent(336,6,2,1976)

In [None]:
g_scores_max_hist = []
g_scores_mean_hist = []

s_scores_max_hist = []
s_scores_mean_hist = []

def maddpg_train(n_episodes=3000):
    
    g_scores_deque = deque(maxlen=100)
    s_scores_deque = deque(maxlen=100)

    
    g_solved = False
    s_solved = False
    
    for i_episode in range(n_episodes):
        
        env_info = env.reset(train_mode=True)                 # reset the environment   
        g_states = env_info[g_brain_name].vector_observations
        s_states = env_info[s_brain_name].vector_observations

        
        g_scores = np.zeros(num_g_agents)
        s_scores = np.zeros(num_s_agents)
        
        g_maddpg.reset()
        s_maddpg.reset()
        
        step = 0
        
        while True:
            step += 1
            g_actions = g_maddpg.act(g_states, i_episode, add_noise=False)
            s_actions = s_maddpg.act(s_states, i_episode, add_noise=False)

            g_actions = np.argmax(g_actions,1)
            s_actions = np.argmax(s_actions,1)
                                 
            actions = dict(zip([g_brain_name, s_brain_name], 
                               [g_actions, s_actions]))
            
            env_info = env.step(actions) 
            
            # get next states
            g_next_states = env_info[g_brain_name].vector_observations         
            s_next_states = env_info[s_brain_name].vector_observations
            
            # get reward and update scores
            g_rewards = env_info[g_brain_name].rewards  
            s_rewards = env_info[s_brain_name].rewards
            g_scores += g_rewards
            s_scores += s_rewards

            done = env_info[g_brain_name].local_done
            
            
            g_maddpg.step(i_episode, g_states, g_actions, g_rewards, g_next_states, done)
            s_maddpg.step(i_episode, s_states, s_actions, s_rewards, s_next_states, done)
            
            if np.any(done):
                break
                
             # roll over states to next time step
            g_states = g_next_states
            s_states = s_next_states
            
        g_score_max = np.max(g_scores)
        g_scores_deque.append(g_score_max)
        g_score_mean = np.mean(g_scores_deque)
        g_scores_max_hist.append(g_score_max)
        g_scores_mean_hist.append(g_score_mean)
        
        s_score_max = np.max(s_scores)
        s_scores_deque.append(s_score_max)
        s_score_mean = np.mean(s_scores_deque)
        s_scores_max_hist.append(s_score_max)
        s_scores_mean_hist.append(s_score_mean)

        print('\r{} episode\tavg g_score {:.5f}\tmax score {:.5f}'.format(i_episode, np.mean(g_scores_deque), g_score_max), end='')
        if g_solved == False and g_score_mean >= 0.5:
            print('\nEnvironment g_solved after {} episodes with the average score {}\n'.format(i_episode, g_score_mean))
            g_maddpg.save("g_")
            g_solved = True
            
        print('\r{} episode\tavg s_score {:.5f}\tmax score {:.5f}'.format(i_episode, np.mean(s_scores_deque), s_score_max), end='')
        if s_solved == False and s_score_mean >= 0.5:
            print('\nEnvironment g_solved after {} episodes with the average score {}\n'.format(i_episode, s_score_mean))
            s_maddpg.save("s_")
            s_solved = True  
            
        if i_episode % 5 == 0:
            print()
    

In [None]:
maddpg_train()

In [None]:
agent.actor_local.load_state_dict(torch.load('checkpoint_agent0_actor.pth', map_location='cpu'))
agent.critic_local.load_state_dict(torch.load('checkpoint_critic0_critic.pth', map_location='cpu'))
agent.actor_local.load_state_dict(torch.load('checkpoint_agent1_actor.pth', map_location='cpu'))
agent.critic_local.load_state_dict(torch.load('checkpoint_critic1_critic.pth', map_location='cpu'))

for i in range(5):                                         # play game for 5 episodes
    env_info = env.reset(train_mode=False)[brain_name]     # reset the environment    
    states = env_info.vector_observations                  # get the current state (for each agent)
    scores = np.zeros(num_agents)                          # initialize the score (for each agent)
    while True:
        actions = agent.act(states,i, add_noise= False)                      # select actions from loaded model agent
        env_info = env.step(actions)[brain_name]           # send all actions to tne environment
        next_states = env_info.vector_observations         # get next state (for each agent)
        rewards = env_info.rewards                         # get reward (for each agent)
        dones = env_info.local_done                        # see if episode finished
        scores += env_info.rewards                         # update the score (for each agent)
        states = next_states                               # roll over states to next time step
        if np.any(dones):                                  # exit loop if episode finished
            break
    print('Total score (averaged over agents) this episode: {}'.format(np.mean(scores)))

In [None]:
for i in range(5):                                         # play game for 2 episodes
    env_info = env.reset(train_mode=False)                 # reset the environment    
    g_states = env_info[g_brain_name].vector_observations  # get initial state (goalies)
    s_states = env_info[s_brain_name].vector_observations  # get initial state (strikers)
    g_scores = np.zeros(num_g_agents)                      # initialize the score (goalies)
    s_scores = np.zeros(num_s_agents)                      # initialize the score (strikers)
    while True:
        # select actions and send to environment
        g_actions = np.random.randint(g_action_size, size=num_g_agents)
        s_actions = np.random.randint(s_action_size, size=num_s_agents)
        print(g_actions,s_actions)
        actions = dict(zip([g_brain_name, s_brain_name], 
                           [g_actions, s_actions]))
        print(actions)
        env_info = env.step(actions)                       
        
        # get next states
        g_next_states = env_info[g_brain_name].vector_observations         
        s_next_states = env_info[s_brain_name].vector_observations
        
        # get reward and update scores
        g_rewards = env_info[g_brain_name].rewards  
        s_rewards = env_info[s_brain_name].rewards
        g_scores += g_rewards
        s_scores += s_rewards
        
        # check if episode finished
        done = np.any(env_info[g_brain_name].local_done)  
        
        # roll over states to next time step
        g_states = g_next_states
        s_states = s_next_states
        
        # exit loop if episode finished
        if done:                                           
            break
    print('Scores from episode {}: {} (goalies), {} (strikers)'.format(i+1, g_scores, s_scores))