In [None]:
import os
import shutil
import time

from environment import StreetFighterEnv
from stable_baselines3 import PPO, A2C
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
from stable_baselines3.common.callbacks import BaseCallback

import optuna
from model import policy_kwargs
import warnings

warnings.simplefilter("ignore")

In [None]:
def optimize_ppo(trial):
    """Sample one set of PPO hyper-parameters from an Optuna trial.

    Args:
        trial: object exposing Optuna's ``suggest_int`` / ``suggest_float``
            API (normally an ``optuna.trial.Trial``).

    Returns:
        dict with the keys ``n_steps``, ``gamma``, ``learning_rate``,
        ``clip_range`` and ``gae_lambda``; the caller unpacks it as keyword
        arguments for ``PPO``.
    """
    # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform;
    # log=True keeps the log-uniform sampling the original code used for
    # gamma and learning_rate.
    return {
        'n_steps': trial.suggest_int('n_steps', 2048, 8192),
        'gamma': trial.suggest_float('gamma', 0.8, 0.9999, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-4, log=True),
        'clip_range': trial.suggest_float('clip_range', 0.1, 0.4),
        'gae_lambda': trial.suggest_float('gae_lambda', 0.8, 0.99),
    }

In [None]:
# Output locations for this experiment; every path lives under BASE_DIR.
BASE_DIR = './PPO_without_attention_frame_4/'
LOG_DIR = '{}logs/'.format(BASE_DIR)               # Monitor files and TensorBoard logs
OPT_DIR = '{}opt/'.format(BASE_DIR)                # one saved model per Optuna trial
CHECKPOINT_DIR = '{}train/'.format(BASE_DIR)       # checkpoints written during final training
TRAIN_LOG_DIR = '{}train_logs/'.format(BASE_DIR)
RECORD_DIR = '{}record/'.format(BASE_DIR)          # handed to StreetFighterEnv

# WARNING: running this cell deletes everything produced by earlier runs
# (logs, trial models, checkpoints) and recreates empty directories.
for directory in [BASE_DIR, LOG_DIR, OPT_DIR, CHECKPOINT_DIR, TRAIN_LOG_DIR, RECORD_DIR]:
    shutil.rmtree(directory, ignore_errors=True)
    # rmtree ignores errors, so the directory may still exist afterwards;
    # makedirs(exist_ok=True) avoids the FileExistsError os.mkdir would raise.
    os.makedirs(directory, exist_ok=True)
    
    
def optimize_agent(trial):
    """Optuna objective: train PPO with sampled hyper-parameters and score it.

    Samples hyper-parameters with ``optimize_ppo``, builds a StreetFighterEnv
    wrapped in Monitor, DummyVecEnv and a 4-frame VecFrameStack, trains PPO
    for 100,000 timesteps, evaluates it over 5 episodes and saves the model
    to ``OPT_DIR`` as ``trial_<number>_best_model``.

    Args:
        trial: the Optuna trial; passed to ``optimize_ppo`` and read for
            ``trial.number``.

    Returns:
        float: mean evaluation reward, or NaN if any step of the trial failed.
        NaN (instead of the previous 0.0) lets Optuna record the trial as
        failed rather than as a real score, so a trial without a saved model
        can never be selected as ``study.best_trial``.
    """
    mean_reward = float('nan')
    env = None
    try:
        model_params = optimize_ppo(trial)
        print(model_params)

        # `env` always points at the outermost wrapper built so far, so the
        # `finally` clause can close whatever exists when something fails.
        env = StreetFighterEnv(RECORD_DIR)
        env = monitored_env = Monitor(env, LOG_DIR)
        # Bind the lambda to a dedicated name instead of the rebound `env`.
        env = DummyVecEnv([lambda: monitored_env])
        env = VecFrameStack(env, 4, channels_order='last')
        model = PPO("CnnPolicy", env, verbose=0, tensorboard_log=LOG_DIR, policy_kwargs=policy_kwargs, **model_params)

        model.learn(total_timesteps=100000)

        trial_reward, _ = evaluate_policy(model, env, n_eval_episodes=5)

        SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
        model.save(SAVE_PATH)

        # Only report a score once the model is on disk.
        mean_reward = trial_reward
    except Exception as exc:
        # Best effort: one broken trial must not abort the whole study, but
        # the failure is reported. KeyboardInterrupt / SystemExit are not
        # caught, so the study can still be interrupted by hand.
        print('Trial failed: {!r}'.format(exc))
    finally:
        if env is not None:
            try:
                env.close()
            except Exception as exc:
                print('Could not close environment: {!r}'.format(exc))
    print(mean_reward)
    return mean_reward

In [None]:
# Hyper-parameter search: maximise the mean evaluation reward returned by
# optimize_agent, running the trials one after another in this process.
N_TRIALS = 20
study = optuna.create_study(direction='maximize')
study.optimize(optimize_agent, n_trials=N_TRIALS, n_jobs=1)


In [None]:
# Parallel-coordinate plot of the study's trials (one line per trial).
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
# Contour plots restricted to the gamma, learning_rate and n_steps parameters.
optuna.visualization.plot_contour(study, params=['gamma', 'learning_rate', 'n_steps'])

In [None]:
# Slice plot of the study's parameters.
optuna.visualization.plot_slice(study)

In [None]:
# Hyper-parameter importances estimated from the study.
optuna.visualization.plot_param_importances(study)

In [None]:
# Empirical distribution function of the study's objective values.
optuna.visualization.plot_edf(study)

In [None]:
# Objective value per trial over the course of the study.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Hyper-parameters of the best trial; reused below to build the final model.
study.best_params

In [None]:
# Locate the model saved by the best trial (optimize_agent writes one
# 'trial_<number>_best_model' archive per trial into OPT_DIR) and load it.
best_trial_number = study.best_trial.number
best_model_file = 'trial_{}_best_model.zip'.format(best_trial_number)
path = os.path.join(OPT_DIR, best_model_file)
print(path)
model = PPO.load(path)

In [None]:
class TrainAndLoggingCallback(BaseCallback):
    """Callback that saves the model being trained at a fixed call interval.

    Every ``check_freq`` calls of ``_on_step`` the current model is written to
    ``<save_path>/best_model_<n_calls>``. Despite the file name, no comparison
    with earlier checkpoints is made: each file is a periodic snapshot.

    Args:
        check_freq: number of ``_on_step`` calls between two checkpoints.
        save_path: directory that receives the checkpoints, or ``None`` to
            disable checkpointing.
        verbose: verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, check_freq, save_path, verbose=1):
        super(TrainAndLoggingCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        """Create the checkpoint directory if a save path was given."""
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        """Save a checkpoint on every ``check_freq``-th call.

        Returns:
            Always ``True``.
        """
        # Honour save_path=None here as well: _init_callback already accepts
        # it, and os.path.join(None, ...) would otherwise raise a TypeError.
        # n_calls is not set in this class; it comes from BaseCallback.
        if self.save_path is not None and self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        return True

In [None]:

# Checkpoint callback for the final training run: a snapshot goes into
# CHECKPOINT_DIR every 10,000 callback steps.
callback = TrainAndLoggingCallback(
    check_freq=10000,
    save_path=CHECKPOINT_DIR,
)

In [None]:
# Optional reset of the output directories before the final run (disabled):
# for i in [LOG_DIR, CHECKPOINT_DIR, TRAIN_LOG_DIR, RECORD_DIR]:
#     shutil.rmtree(i)
#     os.mkdir(i)

# Start from the best trial's hyper-parameters but force n_steps to 7488.
# NOTE(review): 7488 = 117 * 64, presumably picked as a multiple of PPO's
# mini-batch size - confirm before changing.
model_params = {**study.best_params, 'n_steps': 7488}

# Same wrapper stack as in the hyper-parameter search:
# StreetFighterEnv -> Monitor -> DummyVecEnv -> 4-frame VecFrameStack.
env = VecFrameStack(
    DummyVecEnv([lambda: Monitor(StreetFighterEnv(RECORD_DIR), LOG_DIR)]),
    4,
    channels_order='last',
)
model = PPO(
    "CnnPolicy",
    env,
    verbose=1,
    tensorboard_log=LOG_DIR,
    policy_kwargs=policy_kwargs,
    **model_params
)



In [None]:
# Load the best trial's weights INTO the model built above.
# `model.load(path)` does not do that: `load` is a classmethod that returns a
# brand-new model and leaves `model` untouched, so training would restart from
# random weights. `set_parameters` loads the saved parameters in place and
# keeps this model's own settings (env, n_steps, logging).
model.set_parameters(path)

In [None]:
# Final training run with a budget of 30,000 timesteps; `callback` writes
# periodic checkpoints while training.
model.learn(total_timesteps=30000, callback=callback)

In [None]:
# Score the trained model over 5 evaluation episodes without rendering and
# display the mean episode reward as the cell output.
mean_reward, _ = evaluate_policy(
    model,
    env,
    n_eval_episodes=5,
    render=False,
)
mean_reward

In [None]:
# Persist the fully trained model next to the other experiment outputs.
# BASE_DIR already ends with '/', so the resulting path is unchanged.
model.save(os.path.join(BASE_DIR, 'final_model'))

In [None]:
# obs = env.reset()
# obs.shape

In [None]:
# env.step(model.predict(obs)[0])

In [None]:
# # Reset game to starting state
# obs = env.reset()
# # Set flag to false
# done = False
# for game in range(1):
#     while not done:
#         if done:
#             obs = env.reset()
#         env.render()
#         action = model.predict(obs)[0]
#         obs, reward, done, info = env.step(action)
#         time.sleep(0.01)
#         if reward: print(reward)