In [1]:
from __future__ import absolute_import, division, print_function

import os
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import argparse
import os

import warnings ; warnings.filterwarnings('ignore')
os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID'
os.environ['OMP_NUM_THREADS'] = '1'
import argparse
import collections
import os
import random
import tempfile
from argparse import RawTextHelpFormatter

import gfootball.env as football_env
import gym
import numpy as np
import ray
import ray.cloudpickle as cloudpickle
import torch as T
from gfootball import env as fe
from ray import tune
from ray.rllib.agents import ppo
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.utils.spaces.space_utils import flatten_to_single_ndarray
from ray.tune.registry import get_trainable_cls, register_env
from rldm.utils import football_tools as ft
from rldm.utils import gif_tools as gt
from rldm.utils import system_tools as st
from IPython.display import HTML
import matplotlib.pyplot as plt
import pandas as pd

from collections import deque
from torch.nn import functional as F

import time



In [5]:
env_name = '3_vs_3_auto_GK'
env = ft.RllibGFootball(env_name,write_video=False, render=False)

In [2]:
#Create a class Replay buffer with methods to store new experiences and randomly sample
!rm -rf /tmp/football/*

class Replay_Buffer():
    def __init__(self, buffer_size):
        self.buffer_size = buffer_size
        self.replay_buffer = deque(maxlen = self.buffer_size)

        
    def new_experience(self, player_0_obs, player_1_obs, player_0_r, player_1_r, player_0_action, \
                       player_1_action, player_0_next_obs, player_1_next_obs,terms):
        
        #Add the new experience to the deque list
        self.replay_buffer.append([player_0_obs, player_1_obs, player_0_r, player_1_r, player_0_action, \
                       player_1_action, player_0_next_obs, player_1_next_obs,terms])
        
    def sample_experiences(self, batch_size):
        self.batch_size = batch_size
        random_sample = random.sample(self.replay_buffer,self.batch_size)
        return random_sample
    
    def num_experiences(self):
        return len(self.replay_buffer)

    
#The MADDPG for this application will have 4 total neural networks. One for actor and one for critic. Each actor and critic /
#has a target and learning NN
    
#This is the actor NN class. Takes in local observations and returns what action should be taken
class ActorNN(nn.Module):
    def __init__(self, local_observation_length, action_space_length, hidden_layer_size):
        super(ActorNN,self).__init__()
        #Base case is 2 hidden layers
        self.L1 = nn.Linear(local_observation_length,hidden_layer_size)
        self.L23 = nn.Linear(hidden_layer_size,hidden_layer_size)
        self.L4 = nn.Linear(hidden_layer_size, action_space_length)
        self.relu = T.nn.ReLU()
        
        self.optimizer = optim.Adam(self.parameters(),lr = 0.01)
        
    def forward(self, local_observations):
        output = self.L1(local_observations)
        output = self.relu(output)
        output = self.L23(output)
        output = self.relu(output)
        
        #added to enable the gumbel_softmax
        self.out_fn = lambda x: x
        
        return self.out_fn(self.L4(output))
   
    #a different forward just for choosing action
#     def forward_action(self, local_observations):
#         output = self.L1(local_observations)
#         output = self.relu(output)
#         output = self.L23(output)
#         output = self.relu(output)
        
#         #added to enable the gumbel_softmax
#         self.out_fn = lambda x: x
        
#         return self.out_fn(self.L4(output))

#Define the critic class that for each agent takes in the entire observation space and the actions that both agents are doing /
#and outputs a single value for what the value is
class CriticNN(nn.Module):
    def __init__(self,full_observation_length,action_space_length, hidden_layer_size):
        super(CriticNN,self).__init__()
        
        self.l1 = nn.Linear(full_observation_length + action_space_length, hidden_layer_size)
        self.l23 = nn.Linear(hidden_layer_size,hidden_layer_size)
        self.l4 = nn.Linear(hidden_layer_size, 1)
        self.relu = T.nn.ReLU()

        self.optimizer = optim.Adam(self.parameters(),lr = 0.01)
        
    def forward(self, observations, actions):
        output = self.l1(T.cat([observations,actions],dim=1))
        output = self.relu(output)
        output = self.l23(output)
        output = self.relu(output)
        
        return self.l4(output)

     

In [3]:
#Define a class for a football player 

class Player():
    def __init__(self, gamma, tau, n_actions):
        
        self.gamma = gamma
        self.tau = tau
        self.n_actions = n_actions
        
        self.Actor_trainer = ActorNN(43,19,64)
        self.Actor_target = ActorNN(43,19,64)
        self.Critic_trainer = CriticNN(43,19*2,64)
        self.Critic_target = CriticNN(43,19*2,64)
        
        self.update_network_parameters(tau=1)
        
    def choose_action(self, local_observation): #takes in an np.array of the local observations for that agent
        
        local_observation = T.tensor([local_observation],dtype=T.float)
        action = self.Actor_trainer.forward(local_observation).detach()
#         exploration = T.rand(self.n_actions) #inject some noise into the action selection process 
#         action = action + exploration
        
        #added for gumbel soft_max
        action = F.gumbel_softmax(action, hard=True)
        action = action.detach().cpu().numpy()[0]
        
        return action
    
    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        target_actor_params = self.Actor_target.named_parameters()
        actor_params = self.Actor_trainer.named_parameters()

        target_actor_state_dict = dict(target_actor_params)
        actor_state_dict = dict(actor_params)
        for name in actor_state_dict:
            actor_state_dict[name] = tau*actor_state_dict[name].clone() + \
                    (1-tau)*target_actor_state_dict[name].clone()

        self.Actor_target.load_state_dict(actor_state_dict)

        target_critic_params = self.Critic_target.named_parameters()
        critic_params = self.Critic_trainer.named_parameters()

        target_critic_state_dict = dict(target_critic_params)
        critic_state_dict = dict(critic_params)
        for name in critic_state_dict:
            critic_state_dict[name] = tau*critic_state_dict[name].clone() + \
                    (1-tau)*target_critic_state_dict[name].clone()

        self.Critic_target.load_state_dict(critic_state_dict)
    
#define a class for the main MADDPG learning process

class MADDPG_learning():
    def __init__(self, num_players, num_samples, tau = 0.01, gamma = 0.95):
        
        self.num_players = num_players
        self.players = []
        self.batch_size = num_samples #how many samples to grab when getting memory
        
        #create a list of the two player class. one for player 0 and one for player 1
        for i in range(self.num_players):
            self.players.append(Player(gamma=gamma, tau=tau, n_actions = 19))
        
    def choosing_actions(self, total_observations):
        #total observations are the observations from both player's perspective. Just feed the total observation dictionary into it
        actions = []
        self.total_observations = total_observations
        self.player_0_observation = self.total_observations['player_0']
        self.player_1_observation = self.total_observations['player_1']

        for player_idx, player in enumerate(self.players):
            if player_idx==0: #player 0
                actions.append(player.choose_action(self.player_0_observation))

            if player_idx==1: #player 1
                actions.append(player.choose_action(self.player_1_observation))
        
        return actions #will be a list of integers

    #pass in the current replay_buffer as it stands.
    def MDDPG_learn(self, replay_buffer):
#         self.num_experiences = len(replay_buffer)

#             self.replay_buffer.append([player_0_obs, player_1_obs, player_0_r, player_1_r, player_0_action, \
#                        player_1_action, player_0_next_obs, player_1_next_obs,terms])

        training_sample = replay_buffer.sample_experiences(self.batch_size)

        player_0_obs = T.tensor([item[0] for item in training_sample], dtype=T.float)
        player_1_obs = T.tensor([item[1] for item in training_sample], dtype=T.float)
        player_0_r = T.tensor([item[2] for item in training_sample], dtype=T.float)
        player_1_r = T.tensor([item[3] for item in training_sample], dtype=T.float)
        player_0_action = T.tensor([item[4] for item in training_sample], dtype=T.float)
        player_1_action = T.tensor([item[5] for item in training_sample], dtype=T.float)
        player_0_next_obs = T.tensor([item[6] for item in training_sample], dtype=T.float)
        player_1_next_obs = T.tensor([item[7] for item in training_sample], dtype=T.float)
        terminals = T.tensor([item[8] for item in training_sample], dtype=T.float)

        #update the critic NN and Actor NN for each player

        #Get the new states the experiences arrived at, determine the action the critics would take for each agent
        for player_idx, player in enumerate(self.players):
            if player_idx==0: #player 0
                #ActorNN class takes in 43 and outputs 19
                new_action_0 = hot_shot(player.Actor_target.forward(player_0_next_obs).detach())
#                 pol_action_0 = player.Actor_trainer.forward(player_0_obs).detach()
                pol_action_0_g = gumbel_rumbel(player.Actor_trainer.forward(player_0_obs))
                pol_action_0_hs = hot_shot(player.Actor_trainer.forward(player_0_obs))

            if player_idx==1: #player 1
                new_action_1 = hot_shot(player.Actor_target.forward(player_1_next_obs).detach())
#                 pol_action_1 = player.Actor_trainer.forward(player_1_obs).detach()
                pol_action_1_g = gumbel_rumbel(player.Actor_trainer.forward(player_1_obs))
                pol_action_1_hs = hot_shot(player.Actor_trainer.forward(player_1_obs))
                
        #Concatenate together the new actions that each of the 
        new_actions = T.cat([new_action_0,new_action_1], dim=1)
        old_actions = T.cat([player_0_action,player_1_action], dim=1)
#         pol_actions_0 = T.cat([pol_action_0,pol_action_1], dim=1)
        pol_actions_0 = T.cat([pol_action_0_g,pol_action_1_hs], dim=1)
        pol_actions_1 = T.cat([pol_action_0_hs,pol_action_1_g], dim=1)

        #now update each players CriticNN
        for player_idx, player in enumerate(self.players):

            #Use player_0's observation as the "global" observation
            #Take the "global" observations and all actions as input for critic
            critic_value_ = player.Critic_target.forward(player_0_next_obs, new_actions).flatten()
            critic_value = player.Critic_trainer.forward(player_0_obs, old_actions).flatten()
#             critic_value_[terminals[:,0]] = 0.0

            if player_idx==0: #player 0
                target = player_0_r + player.gamma*critic_value_*(1-terminals)
                critic_loss = F.mse_loss(target,critic_value)
                player.Critic_trainer.optimizer.zero_grad()
                critic_loss.backward(retain_graph=True)
                player.Critic_trainer.optimizer.step()






            if player_idx==1: #player 1
                target = player_1_r + player.gamma*critic_value_*(1-terminals)
                critic_loss = F.mse_loss(target,critic_value)
                player.Critic_trainer.optimizer.zero_grad()
#                 T.autograd.set_detect_anomaly(True)
                critic_loss.backward(retain_graph=True)
                player.Critic_trainer.optimizer.step()
                #Finished updating the Critic NN

            #Now update the Actor NN
            if player_idx==0:
            
                actor_loss = player.Critic_trainer.forward(player_0_obs, pol_actions_0).flatten()
                actor_loss = -T.mean(actor_loss)
                player.Actor_trainer.optimizer.zero_grad()
                actor_loss.backward(retain_graph=True)
                player.Actor_trainer.optimizer.step()
                
            if player_idx==0:
                
                actor_loss = player.Critic_trainer.forward(player_0_obs, pol_actions_1).flatten()
                actor_loss = -T.mean(actor_loss)
                player.Actor_trainer.optimizer.zero_grad()
                actor_loss.backward(retain_graph=True)
                player.Actor_trainer.optimizer.step()
                


            #Now update the Actor and Critic target networks like in the MADDPG paper gradually using tau
            player.update_network_parameters()

#takes in the logit output tensor and outputs tensor with argmax
def hot_shot(logits):
    
    
    for i in range(1024):
        
        array = logits[i,:].detach().numpy()
        max_idx = np.argmax(array)
        array = np.zeros(19)
        array[max_idx] = 1.0

        if i==0:
            hot_shots = T.tensor(array, dtype=T.float)
        else:   
            hot_shots = T.vstack((hot_shots,T.tensor(array, dtype=T.float)))
    
    return hot_shots #return a tensor of the same size that was input but with a hot shot

def gumbel_rumbel(logits):
    #Takes as input a tensor with 1024 rows and 19 columns and returns a a tensor with same shape with gumbel max per row
    for i in range(1024):

        row = logits[i,:]
        row = F.gumbel_softmax(row, hard=True)
    

        if i==0:
            gumbels = row
        else:   
            gumbels = T.vstack((gumbels,row))

    return gumbels 


In [12]:
#Finally, with all classes and methods defined, create the script
#to run MADDPG learning

env_name = '3_vs_3_auto_GK'
env = ft.RllibGFootball(env_name,write_video=False, render=False)

#Initialize the players and replay buffer
players = MADDPG_learning(2,1024)
experience_replay = Replay_Buffer(1000000)

beginning_time = time.time()

n_training_episodes = 10000 #How many episodes to run the training for
# score_tracker = [] #Capture the end of episode score every couple episodes
total_steps = 0
wins = np.zeros(n_training_episodes)
avg_win_rate = []

for i in range(n_training_episodes):
    #initialize the GFootball environment
    score = 0
    obs = env.reset()
    
    done = False
    while not done:
        actions = players.choosing_actions(obs)
        player_0_act = int(np.where(actions[0]==1)[0])
        player_1_act = int(np.where(actions[1]==1)[0])
        
        actions = {'player_0':player_0_act,'player_1':player_1_act}
        
        next_obs, rewards, dones, infos = env.step(actions)
        player_0_obs = obs['player_0']
        player_1_obs = obs['player_1']
        player_0_r = rewards['player_0']
        player_1_r = rewards['player_1']
        player_0_action = np.zeros(19)
        player_1_action = np.zeros(19)
        player_0_action[player_0_act] = 1.0
        player_1_action[player_1_act] = 1.0
        player_0_next_obs = next_obs['player_0']
        player_1_next_obs = next_obs['player_1']
        terms = dones['__all__']
        
        
        
        experience_replay.new_experience(player_0_obs, player_1_obs, player_0_r, player_1_r, player_0_action, \
                       player_1_action, player_0_next_obs, player_1_next_obs,terms)
        

        if total_steps % 40 == 0:
            if len(experience_replay.replay_buffer) > 1024:
                players.MDDPG_learn(experience_replay)
                
        
        if i % 100 == 0: #Every 100 episodes, get the average win rate over the past 100 episodes
            if infos['player_1']['game_info']['steps_left'] == 500:
                avg_win_rate.append(wins[-99:-1].mean())
                print('episodes: {}'.format(i),'avg win rate: {}'.format(wins[-99:-1].mean()))
            
        
        obs = next_obs
#         score_tracker.append()
        total_steps+=1
        
    
        if dones['__all__']==True:
            
            game_result = 0 if infos['player_0']['score_reward'] == -1 else \
            1 if infos['player_0']['score_reward'] == 1 else 0
            
            wins[i] = game_result
            
            done=True

        
    
    
ending_time = time.time() 
    
print('total duration (mins): {}'.format((ending_time-beginning_time)/60))
    




episodes: 0 avg win rate: 0.0
episodes: 100 avg win rate: 0.0
episodes: 200 avg win rate: 0.0
episodes: 300 avg win rate: 0.0
episodes: 400 avg win rate: 0.0
episodes: 500 avg win rate: 0.0
episodes: 600 avg win rate: 0.0
episodes: 700 avg win rate: 0.0
episodes: 800 avg win rate: 0.0
episodes: 900 avg win rate: 0.0
episodes: 1000 avg win rate: 0.0
episodes: 1100 avg win rate: 0.0
episodes: 1200 avg win rate: 0.0
episodes: 1300 avg win rate: 0.0
episodes: 1400 avg win rate: 0.0
episodes: 1500 avg win rate: 0.0
episodes: 1600 avg win rate: 0.0
episodes: 1700 avg win rate: 0.0
episodes: 1800 avg win rate: 0.0
episodes: 1900 avg win rate: 0.0
episodes: 2000 avg win rate: 0.0
episodes: 2100 avg win rate: 0.0
episodes: 2200 avg win rate: 0.0
episodes: 2300 avg win rate: 0.0
episodes: 2400 avg win rate: 0.0
episodes: 2500 avg win rate: 0.0
episodes: 2600 avg win rate: 0.0
episodes: 2700 avg win rate: 0.0
episodes: 2800 avg win rate: 0.0
episodes: 2900 avg win rate: 0.0
episodes: 3000 avg win

In [4]:
# save the models
for player_idx, player in enumerate(players.players):
    T.save(player.Actor_trainer.state_dict(), '/mnt/rldm/notebooks/model_{}_v4.pth'.format(player_idx))

# # load the model for player 0
player_0 = Player(0.95,0.01,19)
player_0.Actor_trainer.load_state_dict(T.load('/mnt/rldm/notebooks/model_{}_v4.pth'.format(0)))

# #load the model for player 1
player_1 = Player(0.95,0.01,19)
player_1.Actor_trainer.load_state_dict(T.load('/mnt/rldm/notebooks/model_{}_v4.pth'.format(1)))

<All keys matched successfully>

In [8]:
#Test the trained agents using just the Actor_trainer for each player

#Finally, with all classes and methods defined, create the script
#to run MADDPG learning

env_name = '3_vs_3_auto_GK'
env = ft.RllibGFootball(env_name,write_video=False, render=False)


beginning_time = time.time()

n_training_episodes = 1 #How many episodes to run the training for
# score_tracker = [] #Capture the end of episode score every couple episodes
total_steps = 0
wins = np.zeros(n_training_episodes)
avg_win_rate = []

#Metrics for activity and distance from each other
player_0_active_cache = np.zeros((n_training_episodes,1000))
player_1_active_cache = np.zeros((n_training_episodes,1000))
distance = np.zeros((n_training_episodes,1000))
ball_owner = np.zeros((n_training_episodes,1000))

for i in range(n_training_episodes):
    #initialize the GFootball environment
    score = 0
    obs = env.reset()
    
    done = False
    counter = 0
    while not done:
        action_0 = player_0.choose_action(obs['player_0'])
        action_1 = player_1.choose_action(obs['player_1'])
        player_0_act = int(np.where(action_0==1)[0])
        player_1_act = int(np.where(action_1==1)[0])
        
        actions = {'player_0':player_0_act,'player_1':player_1_act}
        
        next_obs, rewards, dones, infos = env.step(actions)
        player_0_obs = obs['player_0']
        player_1_obs = obs['player_1']
        player_0_r = rewards['player_0']
        player_1_r = rewards['player_1']
        player_0_action = np.zeros(19)
        player_1_action = np.zeros(19)
        player_0_action[player_0_act] = 1.0
        player_1_action[player_1_act] = 1.0
        player_0_next_obs = next_obs['player_0']
        player_1_next_obs = next_obs['player_1']
        terms = dones['__all__']
        

        player_0_active_cache[i,counter] = obs['player_0'][34]
        player_1_active_cache[i,counter] = obs['player_0'][35]
        distance[i,counter] = np.linalg.norm(obs['player_0'][2:4] - obs['player_0'][4:6])
        ball_owner[i,counter] = infos['player_0']['game_info']['ball_owned_player']
        
        
#         experience_replay.new_experience(player_0_obs, player_1_obs, player_0_r, player_1_r, player_0_action, \
#                        player_1_action, player_0_next_obs, player_1_next_obs,terms)
        

#         if total_steps % 100 == 0:
#             if len(experience_replay.replay_buffer) > 1024:
#                 players.MDDPG_learn(experience_replay)
                
        
#         if i % 100 == 0: #Every 100 episodes, get the average win rate over the past 100 episodes
#             if infos['player_1']['game_info']['steps_left'] == 500:
#                 avg_win_rate.append(wins[-99:-1].mean())
#                 print('episodes: {}'.format(i),'avg win rate: {}'.format(wins[-99:-1].mean()))
            
        
        obs = next_obs
#         score_tracker.append()
        total_steps += 1
        counter += 1
    
        if dones['__all__']==True:
            
            game_result = 0 if infos['player_0']['score_reward'] == -1 else \
            1 if infos['player_0']['score_reward'] == 1 else 0
            
            wins[i] = game_result
            
            done=True

        
    
    
ending_time = time.time() 
    
# print('total duration (mins): {}'.format((ending_time-beginning_time)/60))
    

# data = gt.get_gif_html(videos_path="/tmp/football/episode_done_*.avi", 
#                        title="Full episodes")
# HTML(data=data)






In [63]:
#Run this block of code to get the metrics that were created from the previous testing cell

# # #Data from the test and save in csv
distance_n0_v1 = np.delete(distance, np.argwhere(np.all(distance[..., :] == 0, axis=0)), axis=1)


# # #Aggregation for player 0 and 1 distance from each other 
avg_distance_in_episode_v1 = np.mean(distance_n0_v1,axis=1)

# #Aggregation data for owns the ball
ball_owner = pd.DataFrame(ball_owner)
ball_owner_agg =pd.DataFrame()
ball_owner_agg['player_0'] = (ball_owner==1).sum(axis=1)
ball_owner_agg['player_1'] = (ball_owner==2).sum(axis=1)


## Sent it to CSV
pd.DataFrame(avg_distance_in_episode_v1).to_csv('avg_distance_in_episode_v1.csv')
ball_owner_agg.to_csv('ball_owner_v1.csv')