In [1]:
# Connect google drive to Colab
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install the environment
!pip install gym==0.21.0 gym-retro
!pip install opencv-python
!pip install torch==1.10.2+cu113 torchvision==0.11.3+cu113 torchaudio===0.10.2+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
!pip install stable-baselines3[extra] optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gym==0.21.0
  Downloading gym-0.21.0.tar.gz (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 14.9 MB/s 
[?25hCollecting gym-retro
  Downloading gym_retro-0.8.0-cp38-cp38-manylinux1_x86_64.whl (161.9 MB)
[K     |████████████████████████████████| 161.9 MB 65 kB/s 
Collecting pyglet==1.*,>=1.3.2
  Downloading pyglet-1.5.27-py3-none-any.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 62.1 MB/s 
[?25hBuilding wheels for collected packages: gym
  Building wheel for gym (setup.py) ... [?25l[?25hdone
  Created wheel for gym: filename=gym-0.21.0-py3-none-any.whl size=1616824 sha256=3e017f9a90ee5b153b667ca8d91a0d58eb11fcc2ffa0ef241b7f0d3e8d8d76d0
  Stored in directory: /root/.cache/pip/wheels/27/6d/b3/a3a6e10704795c9b9000f1ab2dc480dfe7bed42f5972806e73
Successfully built gym
Installing collected packages: pyglet, gym, gym-retro
  Attempting uninstall: gym
   

In [3]:
# Import the externel module
import numpy as np
import cv2
import os
import retro
import time
import optuna
from gym import Env 
from gym.spaces import MultiBinary, Box
# PPO -> Reinforcement Learning Model we will use
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
# from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy, CnnLnLstmPolicy, CnnPolicy, CnnLstmPolicy
# from stable_baselines.common.vec_env import SubprocVecEnv, DummyVecEnv
# from stable_baselines import PPO2, A2C
from matplotlib import pyplot as plt

In [4]:
class StreetFighter(Env): 
    def __init__(self):
        super().__init__()
        # Observation_space size reduction : 200 x 256 x 3 -> 84 x 84 x 1
        self.observation_space = Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)
        # Stop(1) / Move(5) : Left, Right, Block, Jump, Crouch / Attack(6) : Punch level x3, Kick level x3
        self.action_space = MultiBinary(12)
        self.game = retro.make(game='StreetFighterIISpecialChampionEdition-Genesis', use_restricted_actions=retro.Actions.FILTERED)
    
    def preprocess(self, obs): 
        obs_gray = cv2.cvtColor(obs, cv2.COLOR_BGR2GRAY)
        obs_channels = np.reshape(cv2.resize(obs_gray, (84,84), interpolation=cv2.INTER_CUBIC), (84,84,1))
        return obs_channels 

    def reset(self):
        obs = self.game.reset()
        obs = self.preprocess(obs) 
        self.previous_frame = obs 
        self.score = 0
        return obs
    
    def step(self, action): 
        obs, reward, done, info = self.game.step(action)
        obs = self.preprocess(obs) 
        frame_delta = obs - self.previous_frame
        self.previous_frame = obs
        reward = info['score'] - self.score 
        self.score = info['score'] 
        return frame_delta, reward, done, info
    
    def render(self, *args, **kwargs):
        self.game.render()
        
    def close(self):
        self.game.close()

In [5]:
class Callback(BaseCallback):

    def __init__(self, period, save_dir, verbose=1):
        super(Callback, self).__init__(verbose)
        self.period = period
        self.save_dir = save_dir

    def _init_callback(self):
        if os.path.isdir(self.save_dir) == False:
            os.makedirs(self.save_dir)
        # if self.save_dir is not None:
        #     os.makedirs(self.save_dir, exist_ok=True)

    def _on_step(self):
        if self.n_calls % self.period == 0:
            self.model.save(os.path.join(self.save_dir, 'best_model_{}'.format(self.n_calls)))

        return True

In [6]:
def HypParam_PPO(trial): 
    return {
        'n_steps':trial.suggest_int('n_steps', 2048, 8192),
        'gamma':trial.suggest_loguniform('gamma', 0.8, 0.9999),
        'learning_rate':trial.suggest_loguniform('learning_rate', 1e-5, 1e-4),
        'clip_range':trial.suggest_uniform('clip_range', 0.1, 0.4),
        'gae_lambda':trial.suggest_uniform('gae_lambda', 0.8, 0.99)
    }

In [17]:
# Create the Model to optization from Optuna
def PPO_agent(trial):
    model_params = HypParam_PPO(trial) 

    env = StreetFighter()
    env = Monitor(env, LOG_DIR)
    env = DummyVecEnv([lambda: env])
    env = VecFrameStack(env, 4, channels_order='last')

    model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=0, **model_params)
    model.learn(total_timesteps=30000)

    mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=5)
    env.close()

    model.save(os.path.join(OPT_DIR, 'No.{}_best_model'.format(trial.number)))

    return mean_reward

In [18]:
# Retro import the Street Fighter ROM file
!python -m retro.import "/content/drive/MyDrive/RL"

Importing StreetFighterIISpecialChampionEdition-Genesis
Imported 1 games


In [19]:
# Define the Directories
LOG_DIR = '/content/drive/MyDrive/RL/logs/'
OPT_DIR = '/content/drive/MyDrive/RL/opt/'
SAVE_PATH = os.path.join(OPT_DIR, 'No.{}_best_model'.format(1))
CHECKPOINT_DIR = './train/'

In [22]:
# Creating the experiment and Execute the optimization
study = optuna.create_study(direction='maximize')
study.optimize(PPO_agent, n_trials=10, n_jobs=1)

[32m[I 2022-12-23 12:06:18,416][0m A new study created in memory with name: no-name-e319bbed-a07d-4553-9d37-811e095d7c64[0m
  'gamma':trial.suggest_loguniform('gamma', 0.8, 0.9999),
  'learning_rate':trial.suggest_loguniform('learning_rate', 1e-5, 1e-4),
  'clip_range':trial.suggest_uniform('clip_range', 0.1, 0.4),
  'gae_lambda':trial.suggest_uniform('gae_lambda', 0.8, 0.99)
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=4761 and n_envs=1)
[32m[I 2022-12-23 12:10:06,873][0m Trial 0 finished with value: 0.0 and parameters: {'n_steps': 4761, 'gamma': 0.9380136088650449, 'learning_rate': 9.911324242139609e-05, 'clip_range': 0.3196227652016298, 'gae_lambda': 0.9619233930716062}. Best is trial 0 with value: 0.0.[0m
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=2127 and n_envs=1)
[32m[I 2022-12-23 12:14:03,862][0m Trial 1 finished with value: 0.0 and parameters: {'n_steps': 2127, 'gamma': 0.82510112

In [28]:
# Tunning the hyperparameters
print(study.best_params)
model = PPO.load(os.path.join(OPT_DIR, 'No.8_best_model'))
callback = Callback(period=10000, save_dir=CHECKPOINT_DIR)

{'n_steps': 7613, 'gamma': 0.906305450231301, 'learning_rate': 1.590696426359432e-05, 'clip_range': 0.3653015121517822, 'gae_lambda': 0.9047694783029425}


In [29]:
env = StreetFighter()

RuntimeError: ignored

In [25]:
#Training the model with Best hyperparameters
model_params = study.best_params
model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=1, **model_params)
model.load(os.path.join(OPT_DIR, 'No.8_best_model'))
model.learn(total_timesteps=100000, callback=callback)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
Logging to /content/drive/MyDrive/RL/logs/PPO_12
-----------------------------
| time/              |      |
|    fps             | 399  |
|    iterations      | 1    |
|    time_elapsed    | 19   |
|    total_timesteps | 7613 |
-----------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.11e+04    |
|    ep_rew_mean          | 3.51e+04    |
| time/                   |             |
|    fps                  | 319         |
|    iterations           | 2           |
|    time_elapsed         | 47          |
|    total_timesteps      | 15226       |
| train/                  |             |
|    approx_kl            | 0.017843114 |
|    clip_fraction        | 0.0207      |
|    clip_range           | 0.365       |
|    entropy_loss         | -8.31       |
|    explained_va

<stable_baselines3.ppo.ppo.PPO at 0x7f78d813f1f0>