In [None]:
# Colab only: mount Google Drive under /content/drive for this runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Colab only: list the devices TensorFlow can see on this runtime
# (presumably to confirm a GPU is attached; the result is only displayed).
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

In [None]:
!pip install gdown
import gdown

url = #Google drive saved model link
output = 'ppo_3.6M_steps_scenario_12.zip'
gdown.download(url, output, quiet=False)

In [None]:
%%capture
%%bash
# Installs system libraries, pinned Python packages, Google Research Football
# (using a prebuilt engine binary), kaggle-environments and a stable-baselines3
# fork. All output of this cell is hidden by %%capture.

# dependencies
apt-get -y update > /dev/null
apt-get -y install libsdl2-gfx-dev libsdl2-ttf-dev > /dev/null

# cloudpickle, pytorch, gym (pinned versions)
pip3 install "cloudpickle==1.3.0"
pip3 install "torch==1.5.1"
pip3 install "gym==0.17.2"

# gfootball: clone the tagged release and place the prebuilt engine library
# where the build expects it, then install with GFOOTBALL_USE_PREBUILT_SO=1.
GRF_VER=v2.8
GRF_PATH=football/third_party/gfootball_engine/lib
GRF_URL="https://storage.googleapis.com/gfootball/prebuilt_gameplayfootball_${GRF_VER}.so"
git clone -b "${GRF_VER}" https://github.com/google-research/football.git
mkdir -p "${GRF_PATH}"
wget -q "${GRF_URL}" -O "${GRF_PATH}/prebuilt_gameplayfootball.so"
# Every install runs in a subshell so the working directory of this script
# never changes. A failed install can therefore no longer leave the following
# steps (including the final rm -rf) running inside the wrong directory.
(cd football && GFOOTBALL_USE_PREBUILT_SO=1 pip3 install .)

# kaggle-environments
git clone https://github.com/Kaggle/kaggle-environments.git
(cd kaggle-environments && pip3 install .)

# stable-baselines3 (fork)
git clone https://github.com/swapnamoy17/stable-baselines3.git
(cd stable-baselines3 && pip3 install .)

# housekeeping: remove the cloned source trees once installed
rm -rf football kaggle-environments stable-baselines3

In [None]:
# NOTE(review): sb3-contrib is installed unpinned and nothing in the visible
# notebook imports it. pip may also pull a stable-baselines3 / torch release
# different from the ones installed by the setup cell above — TODO confirm this
# install is still needed, and pin a version if it is.
!pip install sb3-contrib

In [None]:
import os
from collections import OrderedDict
import base64
import pickle
import zlib
import gym
import numpy as np
import pandas as pd
import torch as th
from torch import nn, tensor
from collections import deque
from gym.spaces import Box, Discrete
from kaggle_environments import make
from kaggle_environments.envs.football.helpers import *
from gfootball.env import create_environment, observation_preprocessing
from stable_baselines3 import PPO
from stable_baselines3.ppo import CnnPolicy
from stable_baselines3.common import results_plotter
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.vec_env.dummy_vec_env import DummyVecEnv
from stable_baselines3.common.vec_env.subproc_vec_env import SubprocVecEnv
from stable_baselines3.common.vec_env import VecTransposeImage
from IPython.display import HTML
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
#Google-Football
class FootballGym(gym.Env):
    """Single-agent Gym wrapper around a Google Research Football environment.

    The wrapped environment is created with the "raw" representation and one
    controlled left-side player. Each raw observation is converted with
    ``observation_preprocessing.generate_smm`` and the ``STACK_SIZE`` most
    recent converted frames are concatenated along the last (channel) axis.

    Args:
        config: optional dict. Recognised keys are ``"env_name"`` (scenario
            name, default ``"11_vs_11_easy_stochastic"``) and ``"rewards"``
            (default ``"scoring,checkpoints"``).
    """
    spec = None
    metadata = None

    # Number of most-recent frames concatenated into a single observation.
    STACK_SIZE = 4

    def __init__(self, config=None):
        super(FootballGym, self).__init__()
        env_name = "11_vs_11_easy_stochastic"
        rewards = "scoring,checkpoints"
        if config is not None:
            env_name = config.get("env_name", env_name)
            rewards = config.get("rewards", rewards)
        self.env = create_environment(
            env_name=env_name,
            stacked=False,
            representation="raw",
            rewards=rewards,
            write_goal_dumps=False,
            write_full_episode_dumps=False,
            render=False,
            write_video=False,
            dump_frequency=1,
            logdir=".",
            extra_players=None,
            number_of_left_players_agent_controls=1,
            number_of_right_players_agent_controls=0)
        self.action_space = Discrete(19)
        # 72 x 96 frames, 4 channels per frame, STACK_SIZE frames deep
        # (4 * 4 = 16 channels with the default stack size).
        self.observation_space = Box(low=0, high=255,
                                     shape=(72, 96, 4 * self.STACK_SIZE),
                                     dtype=np.uint8)
        self.reward_range = (-1, 1)
        self.obs_stack = deque([], maxlen=self.STACK_SIZE)

    def transform_obs(self, raw_obs):
        """Convert a raw observation into the stacked image observation.

        Only the first entry of ``raw_obs`` is used. It is converted with
        ``generate_smm`` and pushed onto the frame stack; the stack is then
        concatenated along the last axis and singleton dimensions are removed.
        """
        frame = observation_preprocessing.generate_smm([raw_obs[0]])
        if self.obs_stack:
            self.obs_stack.append(frame)
        else:
            # First frame after a reset: fill the whole stack with it so the
            # observation has its full depth from the very first step.
            self.obs_stack.extend([frame] * self.STACK_SIZE)
        stacked = np.concatenate(list(self.obs_stack), axis=-1)
        return np.squeeze(stacked)

    def reset(self):
        """Reset the wrapped environment and return the first stacked observation."""
        # Drop frames from the previous episode so they cannot leak into this one.
        self.obs_stack.clear()
        return self.transform_obs(self.env.reset())

    def step(self, action):
        """Apply ``action`` and return ``(obs, reward, done, info)``.

        The wrapped environment is stepped with a one-element action list and
        the reward is converted to a Python float.
        """
        raw_obs, reward, done, info = self.env.step([action])
        return self.transform_obs(raw_obs), float(reward), done, info

    def close(self):
        """Close the wrapped football environment; safe to call more than once.

        ``gym.Env.close`` does nothing by default, so without this override
        closing a vectorised or monitored wrapper never reached the wrapped
        environment.
        """
        wrapped = getattr(self, "env", None)
        if wrapped is not None:
            wrapped.close()
            self.env = None
    
# Sanity-check a fresh FootballGym instance with the stable-baselines3
# environment checker; warn=True asks it to report its additional warnings too.
check_env(FootballGym(), warn=True)

In [None]:
#CNN model
class FootballCNN(BaseFeaturesExtractor):
    """Depthwise-separable CNN feature extractor for stacked football frames.

    Three stages of (depthwise 3x3 stride-2 conv -> pointwise 1x1 conv -> ReLU),
    with a max-pool after the first stage, followed by a flatten and one linear
    layer with ReLU that outputs ``features_dim`` features.

    Args:
        observation_space: channel-first image space (channels x height x width).
        features_dim: size of the feature vector returned by ``forward``.
    """

    def __init__(self, observation_space, features_dim=256):
        super(FootballCNN, self).__init__(observation_space, features_dim)
        in_channels = observation_space.shape[0]  # channels x height x width
        # Layer names are kept exactly as before so that state dicts saved by
        # earlier runs still load into this module.
        self.cnn = nn.Sequential(OrderedDict([
            ('conv1_depthwise', nn.Conv2d(in_channels, in_channels, 3, stride=2,
                                          padding=1, groups=in_channels)),
            ('conv1_pointwise', nn.Conv2d(in_channels, 32, 1)),
            ('Relu1', nn.ReLU()),
            ('Pooling layer', nn.MaxPool2d(kernel_size=3, stride=2, dilation=1, ceil_mode=False)),
            ('conv2_depthwise', nn.Conv2d(32, 32, 3, stride=2, padding=1, groups=32)),
            ('conv2_pointwise', nn.Conv2d(32, 64, 1)),
            ('Relu2', nn.ReLU()),
            ('conv3_depthwise', nn.Conv2d(64, 64, 3, stride=2, padding=1, groups=64)),
            ('conv3_pointwise', nn.Conv2d(64, 128, 1)),
            ('Relu3', nn.ReLU()),
            ('Flatten', nn.Flatten()),
        ]))

        # Infer the flattened size from one dummy forward pass instead of
        # hard-coding it; a 16 x 72 x 96 input gives 128 * 5 * 6 = 3840.
        with th.no_grad():
            dummy_input = th.zeros((1,) + tuple(observation_space.shape))
            n_flatten = self.cnn(dummy_input).shape[1]

        self.linear = nn.Sequential(OrderedDict([
            ('Linear', nn.Linear(in_features=n_flatten, out_features=features_dim, bias=True)),
            ('Relu4', nn.ReLU()),
        ]))

    def forward(self, obs):
        """Return the ``features_dim``-sized features for a batch of observations."""
        return self.linear(self.cnn(obs))


In [None]:
# Scenario catalogue: position in the list is the scenario index.
scenarios = dict(enumerate([
    "academy_empty_goal_close",            # academy_difficulty = 0.6
    "academy_empty_goal",
    "academy_run_to_score",
    "academy_run_to_score_with_keeper",
    "academy_pass_and_shoot_with_keeper",
    "academy_run_pass_and_shoot_with_keeper",
    "academy_3_vs_1_with_keeper",
    "academy_corner",
    "academy_counterattack_easy",
    "academy_counterattack_hard",
    "academy_single_goal_versus_lazy",
    "11_vs_11_easy_stochastic",            # difficulty: 0.05
    "11_vs_11_stochastic",                 # difficulty: 0.6
    "11_vs_11_hard_stochastic",            # difficulty: 0.95
    "11_vs_11_kaggle",                     # difficulty: 1
]))

# Scenario selected for this run.
scenario_index = 12
# Indices 0-9 get the short length (401); every other index gets 3001.
scenario_length = 401 if 0 <= scenario_index <= 9 else 3001

scenario_name = scenarios[scenario_index]
print(scenario_name)
# Reward specification used for the training environment.
rewards = "scoring,checkpoints"

In [None]:
def make_env(config=None, rank=0):
    """Return a zero-argument factory that builds one monitored FootballGym.

    Args:
        config: passed unchanged to ``FootballGym``.
        rank: converted with ``str`` and used as the Monitor log path
            component under the current directory.

    Returns:
        A callable suitable for ``DummyVecEnv`` / ``SubprocVecEnv``; nothing is
        constructed until that callable is invoked.
    """
    def _init():
        monitor_path = os.path.join(".", str(rank))
        return Monitor(FootballGym(config), monitor_path, allow_early_resets=True)
    return _init

n_envs = 1      # number of training environments in the vectorised env
n_steps = 512   # passed to PPO as n_steps

# Training uses the reward specification selected above; evaluation uses the
# plain "scoring" reward only.
train_config = {"env_name": scenario_name, "rewards": rewards}
eval_config = {"env_name": scenario_name, "rewards": "scoring"}

# make_env derives the Monitor log path from `rank`, so every environment needs
# its own rank. Training envs take 0..n_envs-1 and the evaluation env takes
# n_envs, which keeps the evaluation log out of the training log ("./0").
train_env = DummyVecEnv([make_env(train_config, rank=i) for i in range(n_envs)])
eval_env = VecTransposeImage(DummyVecEnv([make_env(eval_config, rank=n_envs)]))

# Custom feature extractor handed to the PPO policy.
policy_kwargs = dict(features_extractor_class=FootballCNN,
                     features_extractor_kwargs=dict(features_dim=256))

In [None]:
# Start training from scratch.
# PPO expects the policy *class* (it builds the policy itself from the
# environment's spaces); the custom feature extractor is configured through
# policy_kwargs. The result is bound to `model`, the name the training cells
# use, so the PPO class itself stays available for PPO.load.
model = PPO(CnnPolicy, train_env,
            policy_kwargs=policy_kwargs,
            learning_rate=0.0003,
            n_steps=n_steps,
            batch_size=64,
            n_epochs=2,
            gamma=0.99,
            gae_lambda=0.95,
            clip_range=0.1,
            ent_coef=0.001,
            vf_coef=0.5,
            max_grad_norm=0.64,
            verbose=1,
            seed=0,
            tensorboard_log="./tensorboard/")

In [None]:
# Continue training: load a previously saved model and attach the training env.
# NOTE(review): the path looks like a Kaggle input-dataset location — adjust it
# when running elsewhere.
obs=train_env.reset()  # the returned observation is not used in this cell
model = PPO.load("../input/ppo-xception/models/best_model.zip",env=train_env,verbose=1)
#model.set_random_seed(seed=0)


In [None]:
%%time
from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback

eval_freq = scenario_length*20
eval_callback = EvalCallback(eval_env=eval_env, best_model_save_path='./models/',
                             log_path='./logs/', eval_freq=eval_freq, n_eval_episodes = 5,
                             deterministic=True, render=False, verbose=1)

checkpoint_callback = CheckpointCallback(save_freq=500000, save_path='./',
                                         name_prefix='ppo')

total_timesteps = scenario_length*15000

In [None]:
# Run training with the evaluation and checkpoint callbacks attached.
# reset_num_timesteps=False asks learn() not to reset the model's timestep
# counter, which matters when `model` was loaded from an earlier run.
model.learn(total_timesteps=total_timesteps, callback=[eval_callback,checkpoint_callback], reset_num_timesteps=False)

In [None]:
# Save the current model to disk.
# NOTE(review): the step count in the file name is hard-coded — update it (or
# derive it from the model) so it matches the run actually being saved.
model.save("./ppo_5219757_steps")

In [None]:
# Kaggle only: pack the current working directory into one gzip-compressed tar
# archive and display a link to it.
!tar -zcvf ppo_academy_3_2.tar.gz ./

from IPython.display import FileLink
FileLink(r'ppo_academy_3_2.tar.gz')

In [None]:
# Evaluation-only path: load the model stored at ./models/best_model (the
# directory EvalCallback is configured to save its best model in) and attach
# the evaluation environment.
model = PPO.load("./models/best_model",env=eval_env,verbose=1)

In [None]:
# Evaluate the model for 5 episodes on eval_env with deterministic actions and
# print the mean and standard deviation of the episode rewards.
from stable_baselines3.common.evaluation import evaluate_policy
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=5, deterministic=True)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

In [None]:
# Same 5-episode evaluation, but with deterministic=False (non-deterministic
# action selection). Relies on evaluate_policy imported in the previous cell.
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=5, deterministic=False)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

In [None]:
# (Re)load the TensorBoard notebook extension and open it on ./tensorboard/,
# the directory given to PPO as tensorboard_log.
%reload_ext tensorboard
%tensorboard --logdir "./tensorboard/"