In [1]:
import numpy as np
import pandas as pd
import retro
from gym import Env
from gym.spaces import MultiBinary, Box
import optuna
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
import cv2
import matplotlib.pyplot as plt
import os
import time
import retrowrapper

! rm -rf logs opt train train_logs
! mkdir logs opt train train_logs

In [2]:
! ls -R

.:
environment.ipynb  logs  opt  train  train_logs

./logs:

./opt:

./train:

./train_logs:


In [3]:
# env.observation_space.shape

In [4]:
# env.action_space

In [5]:
# info

In [6]:
class StreetFighter(Env):
    def __init__(self):
        super().__init__()
        self.observation_space = Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)
        self.action_space = MultiBinary(12)
        self.game = retro.make(game='StreetFighterIISpecialChampionEdition-Genesis',
                               record='./logs/',
#                                state=sts[0],
                               use_restricted_actions=retro.Actions.FILTERED,
#                               scenario='scenario'
                              )

    def get_reward(self, info, reward=None): return info['score'] - self.score

    def step(self, action):
        obs, reward, done, info = self.game.step(action)
        obs = self.preprocess(obs)
        frame_delta = obs - self.previous_frame
        self.previous_frame = obs
        reward = self.get_reward(info)
        self.score = info['score']
        return frame_delta, reward, done, info

    def render(self, *args, **kwargs):
        self.game.render(*args, **kwargs)

    def reset(self):
        obs = self.game.reset()
        obs = self.preprocess(obs)
        self.previous_frame = obs
        self.score = 0
        return obs

    def close(self):
        self.game.close()

    def preprocess(self, obs):
        gray = cv2.cvtColor(obs, cv2.COLOR_BGR2GRAY)
        resized = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_CUBIC)
        return np.reshape(resized, (84, 84, 1))

In [7]:
# env = StreetFighter()

In [8]:
# obs = env.reset()
# done = False
# for game in range(1):
#     # one game
#     while not done:
#         action = env.action_space.sample()
#         # env.render()
#         obs, reward, done, info = env.step(action)
#         # time.sleep(0.01)
#         if reward: print(reward)
# env.close()

In [9]:
# obs = env.reset()


In [10]:
# obs, reward, done, info = env.step(env.action_space.sample())
# plt.imshow(obs)

In [11]:
LOG_DIR = './logs/'
OPT_DIR = './opt/'

In [12]:
def optimize_ppo(trial):
    return {
        'n_steps':trial.suggest_int('n_steps', 2048, 8192),
        'gamma':trial.suggest_loguniform('gamma', 0.8, 0.9999),
        'learning_rate':trial.suggest_loguniform('learning_rate', 1e-5, 1e-4),
        'clip_range':trial.suggest_uniform('clip_range', 0.1, 0.4),
        'gae_lambda':trial.suggest_uniform('gae_lambda', 0.8, 0.99)
    }

In [13]:
SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(1))

In [14]:

def optimize_agent(trial):
    # try:
    model_params = optimize_ppo(trial)

    # Create environment
    env = StreetFighter()
    env = Monitor(env, LOG_DIR)
    env = DummyVecEnv([lambda: env])
    env = VecFrameStack(env, 4, channels_order='last')

    # Create algo
    model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=0, **model_params)
    model.learn(total_timesteps=30000)
    #model.learn(total_timesteps=100000)

    # Evaluate model
    mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=5)
    env.close()

    SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
    model.save(SAVE_PATH)

    return mean_reward

    # except Exception as e:
    #     return -1000

In [15]:
study = optuna.create_study(direction='maximize')
study.optimize(optimize_agent, n_trials=1, n_jobs=1)

[32m[I 2022-04-09 02:08:29,258][0m A new study created in memory with name: no-name-856e7a56-ae1f-4911-bcd8-9852a9b17b89[0m
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=2866 and n_envs=1)
[32m[I 2022-04-09 02:10:23,713][0m Trial 0 finished with value: 2200.0 and parameters: {'n_steps': 2866, 'gamma': 0.8128228093092051, 'learning_rate': 3.537559798600158e-05, 'clip_range': 0.331356631668137, 'gae_lambda': 0.852846746915222}. Best is trial 0 with value: 2200.0.[0m


In [16]:
study.best_params

{'n_steps': 2866,
 'gamma': 0.8128228093092051,
 'learning_rate': 3.537559798600158e-05,
 'clip_range': 0.331356631668137,
 'gae_lambda': 0.852846746915222}

In [17]:
study.best_trial

FrozenTrial(number=0, values=[2200.0], datetime_start=datetime.datetime(2022, 4, 9, 2, 8, 29, 260304), datetime_complete=datetime.datetime(2022, 4, 9, 2, 10, 23, 713144), params={'n_steps': 2866, 'gamma': 0.8128228093092051, 'learning_rate': 3.537559798600158e-05, 'clip_range': 0.331356631668137, 'gae_lambda': 0.852846746915222}, distributions={'n_steps': IntUniformDistribution(high=8192, low=2048, step=1), 'gamma': LogUniformDistribution(high=0.9999, low=0.8), 'learning_rate': LogUniformDistribution(high=0.0001, low=1e-05), 'clip_range': UniformDistribution(high=0.4, low=0.1), 'gae_lambda': UniformDistribution(high=0.99, low=0.8)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=0, state=TrialState.COMPLETE, value=None)

In [18]:
study.best_trial.number

0

In [19]:
path = os.path.join(OPT_DIR, 'trial_{}_best_model.zip'.format(study.best_trial.number))
print(path)
model = PPO.load(path)

./opt/trial_0_best_model.zip


In [20]:
from stable_baselines3.common.callbacks import BaseCallback

In [21]:
class TrainAndLoggingCallback(BaseCallback):

    def __init__(self, check_freq, save_path, verbose=1):
        super(TrainAndLoggingCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        return True

In [22]:
CHECKPOINT_DIR = './train/'
TRAIN_LOG_DIR = './train_logs/'
callback = TrainAndLoggingCallback(check_freq=10000, save_path=CHECKPOINT_DIR)

In [23]:

env = StreetFighter()
env = Monitor(env, LOG_DIR)
env = DummyVecEnv([lambda: env])
env = VecFrameStack(env, 4, channels_order='last')

In [24]:
model_params = study.best_params
model_params['n_steps'] = 7488  # set n_steps to 7488 or a factor of 64
# model_params['learning_rate'] = 5e-7
model_params

{'n_steps': 7488,
 'gamma': 0.8128228093092051,
 'learning_rate': 3.537559798600158e-05,
 'clip_range': 0.331356631668137,
 'gae_lambda': 0.852846746915222}

In [25]:
model = PPO('CnnPolicy', env, tensorboard_log=TRAIN_LOG_DIR, verbose=1, **model_params)

Using cuda device
Wrapping the env in a VecTransposeImage.


In [26]:
model.load(path)

<stable_baselines3.ppo.ppo.PPO at 0x7eff43a927f0>

In [27]:
model.learn(total_timesteps=30000, callback=callback)

Logging to ./train_logs/PPO_1
-----------------------------
| time/              |      |
|    fps             | 792  |
|    iterations      | 1    |
|    time_elapsed    | 9    |
|    total_timesteps | 7488 |
-----------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.16e+04    |
|    ep_rew_mean          | 3.17e+04    |
| time/                   |             |
|    fps                  | 589         |
|    iterations           | 2           |
|    time_elapsed         | 25          |
|    total_timesteps      | 14976       |
| train/                  |             |
|    approx_kl            | 0.013321469 |
|    clip_fraction        | 0.0613      |
|    clip_range           | 0.331       |
|    entropy_loss         | -8.3        |
|    explained_variance   | 0.000121    |
|    learning_rate        | 3.54e-05    |
|    loss                 | 1.56e+05    |
|    n_updates            | 10          |
|   

<stable_baselines3.ppo.ppo.PPO at 0x7eff43aa9c70>

In [28]:
mean_reward, _ = evaluate_policy(model, env, render=False, n_eval_episodes=1)

In [29]:
mean_reward

2100.0

In [30]:
obs = env.reset()

In [31]:
obs.shape

(1, 84, 84, 4)

In [32]:
env.step(model.predict(obs)[0])

(array([[[[  0,   0,  36,   0],
          [  0,   0,  36,   0],
          [  0,   0,  33,   0],
          ...,
          [  0,   0,  37,   0],
          [  0,   0,  34,   0],
          [  0,   0,  36,   0]],
 
         [[  0,   0,  36,   0],
          [  0,   0,  36,   0],
          [  0,   0,  35,   0],
          ...,
          [  0,   0,  36,   0],
          [  0,   0,  40,   0],
          [  0,   0,  40,   0]],
 
         [[  0,   0,  36,   0],
          [  0,   0,  36,   0],
          [  0,   0,  36,   0],
          ...,
          [  0,   0,  36,   0],
          [  0,   0,  36,   0],
          [  0,   0,  36,   0]],
 
         ...,
 
         [[  0,   0, 162,   0],
          [  0,   0, 159,   0],
          [  0,   0, 159,   0],
          ...,
          [  0,   0, 159,   0],
          [  0,   0, 159,   0],
          [  0,   0, 159,   0]],
 
         [[  0,   0, 162,   0],
          [  0,   0, 162,   0],
          [  0,   0, 162,   0],
          ...,
          [  0,   0, 162,   0],
 

In [33]:
# Reset game to starting state
obs = env.reset()
# Set flag to flase
done = False
for game in range(1):
    while not done:
        if done:
            obs = env.reset()
#         env.render()
        action = model.predict(obs)[0]
        obs, reward, done, info = env.step(action)
        time.sleep(0.01)
        if reward: print(reward)

[100.]
[1000.]
[500.]
[1000.]
[500.]
[100.]
[1000.]
[100.]
[500.]
[100.]
[1000.]
[100.]
[300.]
[500.]
[1000.]
[500.]
[100.]
[1000.]
[1000.]
[1000.]
[1000.]
[500.]
[1000.]
[500.]
[100.]
[100.]
[100.]
[1000.]
