In [1]:
import StreetFighter as sf

In [2]:
!python3 -m retro.import ./roms # Run from the directory that contains the roms folder, or change ./roms to wherever your game ROMs are stored

Imported 0 games


In [3]:
import optuna
import json
# Stable-Baselines3 and SB3-Contrib algorithms, plus the evaluation helper
from stable_baselines3 import A2C, PPO, DQN
from sb3_contrib import RecurrentPPO, TRPO, QRDQN
from stable_baselines3.common.evaluation import evaluate_policy

In [4]:
# Training budget passed as total_timesteps to model.learn() in every optimize_*_agent objective.
kLearnTimesteps = 150_000

In [5]:
def optimize_a2c_agent(trial):
    """Optuna objective: train an A2C agent with sampled hyperparameters and score it.

    Reads the notebook globals ``kLogDir`` (log directory) and
    ``kLearnTimesteps`` (training budget) at call time, so both must be
    defined before the study is run.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The mean reward returned by ``evaluate_policy`` for the trained model.
    """
    model_params = {
        "n_steps": trial.suggest_int("n_steps", 16, 128),
        "gamma": trial.suggest_float("gamma", 0.8, 0.9999),
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-3),
        "ent_coef": trial.suggest_float("ent_coef", 1e-4, 1e-1),
        "vf_coef": trial.suggest_float("vf_coef", 0.01, 0.5),
        "max_grad_norm": trial.suggest_float("max_grad_norm", 0.1, 1),
    }
    # Argument meanings are defined by StreetFighter.CreateEnv (not visible here).
    env = sf.CreateEnv('L4_Ryu_Guile', 1, kLogDir, 6)
    try:
        model = A2C('MlpPolicy', env, tensorboard_log=kLogDir, verbose=0, **model_params)
        model.learn(total_timesteps=kLearnTimesteps)
        mean_reward, _ = evaluate_policy(model, env)
    finally:
        # Release the environment even when training or evaluation raises,
        # so a failed trial does not leave it open for the next one.
        env.close()
    return mean_reward

In [6]:

def optimize_ppo_agent(trial):
    """Optuna objective: train a PPO agent with sampled hyperparameters and score it.

    Reads the notebook globals ``kLogDir`` (log directory) and
    ``kLearnTimesteps`` (training budget) at call time.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The mean reward returned by ``evaluate_policy`` for the trained model.
    """
    model_params = {
        "batch_size": trial.suggest_categorical("batch_size", [8, 16, 32, 64, 128, 256, 512]),
        "n_steps": trial.suggest_categorical("n_steps", [64, 128, 256, 512, 1024, 2048, 4096, 8192]),
        'gamma': trial.suggest_float('gamma', 0.8, 0.9999),
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-3),
        'clip_range': trial.suggest_float('clip_range', 0.1, 0.4),
        'gae_lambda': trial.suggest_float('gae_lambda', 0.8, .99),
    }
    # Argument meanings are defined by StreetFighter.CreateEnv (not visible here).
    env = sf.CreateEnv('L4_Ryu_Guile', 1, kLogDir, 6)
    try:
        model = PPO('MlpPolicy', env, tensorboard_log=kLogDir, verbose=0, **model_params)
        model.learn(total_timesteps=kLearnTimesteps)
        mean_reward, _ = evaluate_policy(model, env)
    finally:
        # Release the environment even when training or evaluation raises,
        # so a failed trial does not leave it open for the next one.
        env.close()
    return mean_reward

In [7]:

def optimize_dqn_agent(trial):
    """Optuna objective: train a DQN agent with sampled hyperparameters and score it.

    Reads the notebook globals ``kLogDir`` (log directory) and
    ``kLearnTimesteps`` (training budget) at call time.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The mean reward returned by ``evaluate_policy`` for the trained model.
    """
    model_params = {
        "batch_size": trial.suggest_categorical("batch_size", [32, 64, 128, 256, 512, 1024]),
        'gamma': trial.suggest_float('gamma', 0.8, 0.9999),
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-3),
        "learning_starts": trial.suggest_categorical("learning_starts", [1000, 10000, 20000, 30000, 40000, 50000]),
        "train_freq": trial.suggest_categorical("train_freq", [128, 256]),  # Train time
        "gradient_steps": trial.suggest_int("gradient_steps", 1, 256),
        "exploration_final_eps": trial.suggest_float("exploration_final_eps", 0.01, 0.09),
        "exploration_fraction": trial.suggest_float("exploration_fraction", 0.1, 0.5),
    }
    # Argument meanings are defined by StreetFighter.CreateEnv (not visible here).
    env = sf.CreateEnv('L4_Ryu_Guile', 1, kLogDir, 6)
    try:
        model = DQN('MlpPolicy', env, tensorboard_log=kLogDir, verbose=0, **model_params)
        model.learn(total_timesteps=kLearnTimesteps)
        mean_reward, _ = evaluate_policy(model, env)
    finally:
        # Release the environment even when training or evaluation raises,
        # so a failed trial does not leave it open for the next one.
        env.close()
    return mean_reward

In [8]:
def optimize_rppo_agent(trial):
    """Optuna objective: train a RecurrentPPO (LSTM policy) agent and score it.

    Reads the notebook globals ``kLogDir`` (log directory) and
    ``kLearnTimesteps`` (training budget) at call time.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The mean reward returned by ``evaluate_policy`` for the trained model.
    """
    model_params = {
        "batch_size": trial.suggest_categorical("batch_size", [128, 256]),
        "n_steps": trial.suggest_categorical("n_steps", [128, 256]),
        'gamma': trial.suggest_float('gamma', 0.8, 0.9999),
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-3),
        'clip_range': trial.suggest_float('clip_range', 0.1, 0.4),
        'gae_lambda': trial.suggest_float('gae_lambda', 0.8, .99),
        "ent_coef": trial.suggest_float("ent_coef", 1e-4, 1e-1),
        "vf_coef": trial.suggest_float("vf_coef", 0.01, 0.5),
        "max_grad_norm": trial.suggest_float("max_grad_norm", 0.1, 1),
    }
    # Argument meanings are defined by StreetFighter.CreateEnv (not visible here).
    env = sf.CreateEnv('L4_Ryu_Guile', 1, kLogDir, 6)
    try:
        model = RecurrentPPO('MlpLstmPolicy', env, tensorboard_log=kLogDir, verbose=0, **model_params)
        model.learn(total_timesteps=kLearnTimesteps)
        mean_reward, _ = evaluate_policy(model, env)
    finally:
        # Release the environment even when training or evaluation raises,
        # so a failed trial does not leave it open for the next one.
        env.close()
    return mean_reward

In [9]:
def optimize_trpo_agent(trial):
    """Optuna objective: train a TRPO agent with sampled hyperparameters and score it.

    Reads the notebook globals ``kLogDir`` (log directory) and
    ``kLearnTimesteps`` (training budget) at call time.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The mean reward returned by ``evaluate_policy`` for the trained model.
    """
    model_params = {
        "batch_size": trial.suggest_categorical("batch_size", [8, 16, 32, 64, 128, 256, 512]),
        "n_steps": trial.suggest_categorical("n_steps", [64, 128, 256, 512, 1024, 2048, 4096, 8192]),
        'gamma': trial.suggest_float('gamma', 0.8, 0.9999),
        # suggest_loguniform is deprecated; suggest_float(log=True) samples
        # the same log-uniform distribution without the warning.
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True),
        'gae_lambda': trial.suggest_float('gae_lambda', 0.8, .99),
        "n_critic_updates": trial.suggest_categorical("n_critic_updates", [5, 10, 20, 25, 30]),
        "cg_max_steps": trial.suggest_categorical("cg_max_steps", [5, 10, 20, 25, 30]),
        "target_kl": trial.suggest_categorical("target_kl", [0.1, 0.05, 0.03, 0.02, 0.01, 0.005, 0.001]),
    }
    # Argument meanings are defined by StreetFighter.CreateEnv (not visible here).
    env = sf.CreateEnv('L4_Ryu_Guile', 1, kLogDir, 6)
    try:
        model = TRPO("MlpPolicy", env, verbose=0, tensorboard_log=kLogDir, **model_params)
        model.learn(total_timesteps=kLearnTimesteps)
        mean_reward, _ = evaluate_policy(model, env)
    finally:
        # Release the environment even when training or evaluation raises,
        # so a failed trial does not leave it open for the next one.
        env.close()
    return mean_reward

In [10]:
def optimize_qrdqn_agent(trial):
    """Optuna objective: train a QR-DQN agent with sampled hyperparameters and score it.

    Reads the notebook globals ``kLogDir`` (log directory) and
    ``kLearnTimesteps`` (training budget) at call time.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The mean reward returned by ``evaluate_policy`` for the trained model.
    """
    model_params = {
        # suggest_loguniform / suggest_uniform are deprecated; suggest_float
        # (with log=True for the log-uniform case) samples the same
        # distributions without the warnings.
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True),
        "learning_starts": trial.suggest_categorical("learning_starts", [1000, 10000, 20000, 30000, 40000, 50000]),
        "batch_size": trial.suggest_categorical("batch_size", [8, 16, 32, 64, 128, 256, 512]),
        'gamma': trial.suggest_float('gamma', 0.8, 0.9999),
        "train_freq": trial.suggest_categorical("train_freq", [4, 8, 16]),
        "gradient_steps": trial.suggest_int("gradient_steps", 1, 8),
        "exploration_final_eps": trial.suggest_float("exploration_final_eps", 0.01, 0.09),
        "exploration_fraction": trial.suggest_float("exploration_fraction", 0.1, 0.5),
    }
    # Argument meanings are defined by StreetFighter.CreateEnv (not visible here).
    env = sf.CreateEnv('L4_Ryu_Guile', 1, kLogDir, 6)
    try:
        model = QRDQN("MlpPolicy", env, verbose=0, tensorboard_log=kLogDir, **model_params)
        model.learn(total_timesteps=kLearnTimesteps)
        mean_reward, _ = evaluate_policy(model, env)
    finally:
        # Release the environment even when training or evaluation raises,
        # so a failed trial does not leave it open for the next one.
        env.close()
    return mean_reward

In [7]:
# Log directory read as a global by optimize_a2c_agent (TensorBoard log and env creation).
kLogDir = './logs_A2C_OP'
# New study that maximizes the objective's return value (mean evaluation reward).
study = optuna.create_study(direction='maximize')
# Each of the 20 trials trains a fresh A2C model for kLearnTimesteps steps.
study.optimize(optimize_a2c_agent, n_trials=20)
# Last expression: shown as the cell output (hyperparameters of the best trial).
study.best_params

[32m[I 2023-02-05 16:55:31,917][0m A new study created in memory with name: no-name-fa7637e3-3a96-4e5d-81f9-f2e727dbaa09[0m
2023-02-05 16:55:34.627069: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-05 16:55:34.713129: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-02-05 16:55:34.735391: E tensorflow/tsl/lib/monitoring/collection_registry.cc:81] Cannot register 2 metrics with the same name: /tensorflow/core/bfc_allocator_delay
[32m[I 2023-02-05 17:06:45,881][0m Trial 0 finished with

{'n_steps': 32,
 'gamma': 0.9219513462075898,
 'learning_rate': 0.0008247862823389501,
 'ent_coef': 0.05764738718774189,
 'vf_coef': 0.24613796230823268,
 'max_grad_norm': 0.8379923618752754}

In [11]:
# Persist the best hyperparameters of the current study as JSON.
with open('A2C.json', 'w') as params_file:
    params_file.write(json.dumps(study.best_params))
    

In [12]:
# Log directory read as a global by optimize_ppo_agent (TensorBoard log and env creation).
kLogDir = './logs_PPO_OP'
# New study that maximizes the objective's return value; rebinds `study` from the previous run.
study = optuna.create_study(direction='maximize')
# Each of the 20 trials trains a fresh PPO model for kLearnTimesteps steps.
study.optimize(optimize_ppo_agent, n_trials=20)
# Last expression: shown as the cell output (hyperparameters of the best trial).
study.best_params

[32m[I 2023-02-05 20:38:50,595][0m A new study created in memory with name: no-name-a7d80cb5-5610-4758-a178-a2ae6062d4e7[0m
[32m[I 2023-02-05 20:50:33,258][0m Trial 0 finished with value: -194.0 and parameters: {'batch_size': 128, 'n_steps': 4096, 'gamma': 0.8988542729977321, 'learning_rate': 3.9329135162204456e-05, 'clip_range': 0.24155717161264847, 'gae_lambda': 0.8439854700634416}. Best is trial 0 with value: -194.0.[0m
[32m[I 2023-02-05 21:03:37,564][0m Trial 1 finished with value: -107.88 and parameters: {'batch_size': 64, 'n_steps': 128, 'gamma': 0.9609610335365972, 'learning_rate': 0.0004045201186876571, 'clip_range': 0.3525024480214578, 'gae_lambda': 0.9553970353438237}. Best is trial 1 with value: -107.88.[0m
[32m[I 2023-02-05 21:53:23,480][0m Trial 2 finished with value: -254.0 and parameters: {'batch_size': 8, 'n_steps': 1024, 'gamma': 0.8020970062275904, 'learning_rate': 0.00011946742810732661, 'clip_range': 0.18165373674237656, 'gae_lambda': 0.8497358896158252}.

{'batch_size': 64,
 'n_steps': 128,
 'gamma': 0.9609610335365972,
 'learning_rate': 0.0004045201186876571,
 'clip_range': 0.3525024480214578,
 'gae_lambda': 0.9553970353438237}

In [14]:
# Persist the best hyperparameters of the current study as JSON.
with open('PPO.json', 'w') as params_file:
    params_file.write(json.dumps(study.best_params))

In [15]:
# Log directory read as a global by optimize_qrdqn_agent (TensorBoard log and env creation).
kLogDir = './logs_QRDQN_OP'
# New study that maximizes the objective's return value; rebinds `study` from the previous run.
study = optuna.create_study(direction='maximize')
# Each of the 20 trials trains a fresh QR-DQN model for kLearnTimesteps steps.
study.optimize(optimize_qrdqn_agent, n_trials=20)
# Last expression: shown as the cell output (hyperparameters of the best trial).
study.best_params

[32m[I 2023-02-07 18:01:24,200][0m A new study created in memory with name: no-name-fdb8bbe9-4b96-4285-9dd5-33fb2ff93bdf[0m
2023-02-07 18:01:25.480777: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-07 18:01:25.556269: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-02-07 18:01:25.578741: E tensorflow/tsl/lib/monitoring/collection_registry.cc:81] Cannot register 2 metrics with the same name: /tensorflow/core/bfc_allocator_delay
[32m[I 2023-02-07 18:10:51,796][0m Trial 0 finished with

{'learning_rate': 5.8778983576095745e-05,
 'learning_starts': 30000,
 'batch_size': 512,
 'gamma': 0.8725607763842587,
 'train_freq': 8,
 'gradient_steps': 1,
 'exploration_final_eps': 0.06999189354271829,
 'exploration_fraction': 0.34412108119160434}

In [17]:
# Persist the best hyperparameters of the current study as JSON.
with open('QRDQN.json', 'w') as params_file:
    params_file.write(json.dumps(study.best_params))

In [18]:
# Log directory read as a global by optimize_rppo_agent (TensorBoard log and env creation).
kLogDir = './logs_RPPO_OP'
# New study that maximizes the objective's return value; rebinds `study` from the previous run.
study = optuna.create_study(direction='maximize')
# Each of the 20 trials trains a fresh RecurrentPPO model for kLearnTimesteps steps.
study.optimize(optimize_rppo_agent, n_trials=20)
# Last expression: shown as the cell output (hyperparameters of the best trial).
study.best_params

[32m[I 2023-02-08 00:04:48,969][0m A new study created in memory with name: no-name-c5c8f0d9-882a-4854-9da7-70f104abb6f2[0m
2023-02-08 00:04:51.071391: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-08 00:04:51.237972: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-02-08 00:04:51.283024: E tensorflow/tsl/lib/monitoring/collection_registry.cc:81] Cannot register 2 metrics with the same name: /tensorflow/core/bfc_allocator_delay
[32m[I 2023-02-08 00:39:08,403][0m Trial 0 finished with

{'batch_size': 128,
 'n_steps': 256,
 'gamma': 0.9560821301100058,
 'learning_rate': 0.0009697975901111305,
 'clip_range': 0.37281645364831995,
 'gae_lambda': 0.9043651572782224,
 'ent_coef': 0.00727905612251794,
 'vf_coef': 0.49282068512883936,
 'max_grad_norm': 0.7994437708612409}

In [20]:
# Persist the best hyperparameters of the current study as JSON.
with open('RPPO.json', 'w') as params_file:
    params_file.write(json.dumps(study.best_params))

In [21]:
# Log directory read as a global by optimize_trpo_agent (TensorBoard log and env creation).
kLogDir = './logs_TRPO_OP'
# New study that maximizes the objective's return value; rebinds `study` from the previous run.
study = optuna.create_study(direction='maximize')
# Each of the 20 trials trains a fresh TRPO model for kLearnTimesteps steps.
study.optimize(optimize_trpo_agent, n_trials=20)
# Last expression: shown as the cell output (hyperparameters of the best trial).
study.best_params

[32m[I 2023-02-12 13:44:05,172][0m A new study created in memory with name: no-name-641490f8-88cb-45e0-83a4-3e8bcfc3afa7[0m
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-3),
2023-02-12 13:44:06.200701: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-12 13:44:06.287236: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-02-12 13:44:06.313427: E tensorflow/tsl/lib/monitoring/collection_registry.cc:81] Cannot register 2 metrics with the same name: /tensorflow/core/bfc_

{'batch_size': 256,
 'n_steps': 256,
 'gamma': 0.9203938773776468,
 'learning_rate': 3.498635674066087e-05,
 'gae_lambda': 0.8277127959450231,
 'n_critic_updates': 30,
 'cg_max_steps': 20,
 'target_kl': 0.03}

In [23]:
# Persist the best hyperparameters of the current study as JSON.
with open('TRPO.json', 'w') as params_file:
    params_file.write(json.dumps(study.best_params))

In [24]:
# Log directory read as a global by optimize_dqn_agent (TensorBoard log and env creation).
kLogDir = './logs_DQN_OP'
# New study that maximizes the objective's return value; rebinds `study` from the previous run.
study = optuna.create_study(direction='maximize')
# Each of the 20 trials trains a fresh DQN model for kLearnTimesteps steps.
study.optimize(optimize_dqn_agent, n_trials=20)
# Last expression: shown as the cell output (hyperparameters of the best trial).
study.best_params

[32m[I 2023-02-12 21:55:35,941][0m A new study created in memory with name: no-name-e4bc4e00-b1b2-4240-8915-d444e9c8f9bb[0m
[32m[I 2023-02-12 22:07:04,162][0m Trial 0 finished with value: -242.0 and parameters: {'batch_size': 128, 'gamma': 0.9451145877858746, 'learning_rate': 0.0004383452677909998, 'learning_starts': 40000, 'train_freq': 256, 'gradient_steps': 163, 'exploration_final_eps': 0.013566620533055912, 'exploration_fraction': 0.1910647685409276}. Best is trial 0 with value: -242.0.[0m
[32m[I 2023-02-12 22:20:54,373][0m Trial 1 finished with value: -224.0 and parameters: {'batch_size': 1024, 'gamma': 0.9773401278300756, 'learning_rate': 0.00021919592900047344, 'learning_starts': 40000, 'train_freq': 256, 'gradient_steps': 46, 'exploration_final_eps': 0.06386208813128622, 'exploration_fraction': 0.21014339423128578}. Best is trial 1 with value: -224.0.[0m
[32m[I 2023-02-12 22:37:11,321][0m Trial 2 finished with value: 118.59999999999998 and parameters: {'batch_size': 

{'batch_size': 64,
 'gamma': 0.8227895970029786,
 'learning_rate': 0.00018734229803741371,
 'learning_starts': 1000,
 'train_freq': 128,
 'gradient_steps': 146,
 'exploration_final_eps': 0.03317328152571017,
 'exploration_fraction': 0.14405883869380742}

In [26]:
# Persist the best hyperparameters of the current study as JSON.
with open('DQN.json', 'w') as params_file:
    params_file.write(json.dumps(study.best_params))