# Street Fighter No Delta
This notebook shows how to create the environment without the delta transformation. 

# Setup StreetFighter Dependencies

In [None]:
%pip install gym==0.21.0 gym-retro==0.8.0 stable_baselines3==2.3.2 opencv-python==4.10.0.82 numpy==1.24.3

In [None]:
# Pin setuptools and cap wheel below 0.40.0.
# NOTE(review): presumably required for the gym==0.21.0 install — if so, these pins must be in place before gym is installed; confirm.
%pip install setuptools==65.5.0 "wheel<0.40.0"

In [None]:
# The command below imports the downloaded ROMs into the retro library.
# !python -m retro.import .
# Run it from a command prompt, and make sure you are in the directory where the ROMs are stored.

In [None]:
# Link for downloading the Street Fighter ROM:
# https://wowroms.com/en/roms/sega-genesis-megadrive/download-street-fighter-ii-special-champion-edition-europe/26496.html

# Setup Environment

In [1]:
import os
import time

import cv2
import gym
import numpy as np
import retro
from gym import Env
from gym.spaces import Box, MultiBinary

In [2]:
class StreetFighter(Env):
    """Street Fighter II environment that returns preprocessed frames (no frame delta).

    Observations are 84x84x1 uint8 grayscale frames; actions are a 12-element
    MultiBinary vector that is passed straight to the underlying retro game.

    The shaped reward returned by ``step`` is::

        score gained since the previous step
        - 100  * health lost by the agent
        + 2000 * matches newly won by the agent
        - 4000 * matches newly won by the enemy

    Args:
        render_mode: when set to ``'human'``, ``render()`` draws the game window;
            any other value makes ``render()`` a no-op.
    """

    # Health value the reward tracker starts from at the beginning of an episode.
    START_HEALTH = 176

    def __init__(self, render_mode=None):
        super().__init__()
        self.observation_space = Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)
        self.action_space = MultiBinary(12)
        self.game = retro.make(game='StreetFighterIISpecialChampionEdition-Genesis', use_restricted_actions=retro.Actions.FILTERED)
        self.render_mode = render_mode
        # Initialise the reward-tracking state here as well, so that calling step()
        # before the first reset() does not fail with AttributeError.
        self._reset_tracking()

    def _reset_tracking(self):
        """Reset the values the reward shaping compares each step against."""
        self.score = 0
        self.agent_health = self.START_HEALTH
        self.enemy_matches_won = 0
        self.agent_matches_won = 0

    def step(self, action):
        """Advance the game by one step.

        Args:
            action: action forwarded unchanged to the retro game.

        Returns:
            Tuple ``(obs, reward, done, info)`` where ``obs`` is the preprocessed
            frame, ``reward`` is the shaped reward described on the class (the
            game's own reward is discarded), and ``done``/``info`` come from the game.
        """
        frame, _, done, info = self.game.step(action)
        obs = self.preprocess(frame)

        # Score gained since the previous step.
        reward = info['score'] - self.score
        self.score = info['score']

        # Penalise health lost since the previous step.
        # NOTE(review): a health increase (e.g. if health is restored between rounds)
        # produces a positive reward with this formula — confirm this is intended.
        reward = reward - (self.agent_health - info['health']) * 100
        self.agent_health = info['health']

        # Bonus for every match newly won by the agent (adds 0 when unchanged).
        matches_gained = info['matches_won'] - self.agent_matches_won
        reward = reward + matches_gained * 2000
        self.agent_matches_won = info['matches_won']

        # Penalty for every match newly won by the enemy (subtracts 0 when unchanged).
        enemy_matches_gained = info['enemy_matches_won'] - self.enemy_matches_won
        reward = reward - enemy_matches_gained * 4000
        self.enemy_matches_won = info['enemy_matches_won']

        return obs, reward, done, info

    def render(self, *args, **kwargs):
        """Render the game window when the environment was created with render_mode='human'."""
        if self.render_mode == 'human':
            self.game.render(mode='human')

    def reset(self):
        """Reset the game and the reward-tracking state; return the first preprocessed frame."""
        obs = self.preprocess(self.game.reset())
        self._reset_tracking()
        return obs

    def preprocess(self, observation):
        """Convert a game frame to an 84x84x1 grayscale array.

        NOTE(review): the conversion uses COLOR_BGR2GRAY; if the game returns RGB
        frames the red/blue weights are swapped. Left unchanged because models
        trained on this preprocessing depend on it — confirm before changing.
        """
        gray = cv2.cvtColor(observation, cv2.COLOR_BGR2GRAY)
        resized = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_CUBIC)
        return np.reshape(resized, (84, 84, 1))

    def close(self):
        """Close the underlying retro game."""
        self.game.close()

In [3]:
env = StreetFighter(render_mode='human')

In [5]:
obs = env.reset()

In [6]:
obs,reward,done,info=env.step(env.action_space.sample())

In [7]:
info.items()

dict_items([('enemy_matches_won', 0), ('score', 0), ('matches_won', 0), ('continuetimer', 0), ('enemy_health', 176), ('health', 176)])

In [8]:
env.close()

In [None]:
# Play five episodes with random actions, rendering every frame.
# NOTE: this needs an open `env`; recreate it first if an earlier cell closed it.
for game in range(5):
    # Start each episode from a fresh state. Previously `done` stayed True after the
    # first episode, so the remaining four iterations never entered the inner loop.
    obs = env.reset()
    done = False
    while not done:
        env.render()
        time.sleep(0.01)
        obs, reward, done, info = env.step(env.action_space.sample())
        print(reward)

In [11]:
env.close()

In [55]:
env.observation_space.shape

(84, 84, 1)

In [3]:
# Import optuna for HPO
import optuna
# Import PPO for algos
from stable_baselines3 import PPO
# Evaluate Policy
from stable_baselines3.common.evaluation import evaluate_policy
# Import wrappers
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack

  from .autonotebook import tqdm as notebook_tqdm


# Hyperparameter tune

In [8]:
# %pip install torch==1.10.1+cu113 torchvision==0.11.2+cu113 torchaudio===0.10.1+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html

In [9]:
# Install torch, torchvision and torchaudio from the PyTorch cu121 wheel index.
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting typing-extensions>=4.8.0 (from torch)
  Downloading https://download.pytorch.org/whl/typing_extensions-4.9.0-py3-none-any.whl (32 kB)
Installing collected packages: typing-extensions
  Attempting uninstall: typing-extensions
    Found existing installation: typing_extensions 4.5.0
    Uninstalling typing_extensions-4.5.0:
      Successfully uninstalled typing_extensions-4.5.0
Successfully installed typing-extensions-4.9.0
Note: you may need to restart the kernel to use updated packages.


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-intel 2.13.0 requires typing-extensions<4.6.0,>=3.6.6, but you have typing-extensions 4.9.0 which is incompatible.


In [10]:
%pip install stable-baselines3[extra] optuna

^C
Note: you may need to restart the kernel to use updated packages.


Collecting ale-py~=0.8.1 (from shimmy[atari]~=1.3.0; extra == "extra"->stable-baselines3[extra])
  Downloading ale_py-0.8.1-cp38-cp38-win_amd64.whl.metadata (8.3 kB)
Downloading ale_py-0.8.1-cp38-cp38-win_amd64.whl (952 kB)
   ---------------------------------------- 0.0/952.0 kB ? eta -:--:--
   -------- ------------------------------- 204.8/952.0 kB 4.1 MB/s eta 0:00:01
   ---------------- ----------------------- 389.1/952.0 kB 4.9 MB/s eta 0:00:01
   -------------------------- ------------- 634.9/952.0 kB 4.5 MB/s eta 0:00:01
   -------------------------- ------------- 634.9/952.0 kB 4.5 MB/s eta 0:00:01
   -------------------------------- ------- 778.2/952.0 kB 3.5 MB/s eta 0:00:01
   ---------------------------------------- 952.0/952.0 kB 3.6 MB/s eta 0:00:00
Installing collected packages: ale-py
  Attempting uninstall: ale-py
    Found existing installation: ale-py 0.7.5
    Uninstalling ale-py-0.7.5:
      Successfully uninstalled ale-py-0.7.5
Successfully installed ale-py-0.8.1

  You can safely remove it manually.


In [4]:
# Import optuna for HPO
import optuna
# Import PPO for algos
from stable_baselines3 import PPO
# Evaluate Policy
from stable_baselines3.common.evaluation import evaluate_policy
# Import wrappers
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack

In [None]:
# Directory passed to Monitor and used as PPO's tensorboard log directory.
LOG_DIR = './logs/'
# Directory where the model produced by each Optuna trial is saved.
OPT_DIR = './opt_nodelta/'

In [None]:
# #https://github.com/araffin/rl-baselines-zoo/issues/29
def optimize_ppo(trial):
    """Sample the PPO hyperparameters to optimise for one Optuna trial.

    Args:
        trial: Optuna trial object used to draw the values.

    Returns:
        dict of keyword arguments for ``PPO``: ``n_steps``, ``gamma``,
        ``learning_rate``, ``clip_range`` and ``gae_lambda``.
    """
    return {
        # Sample multiples of 64 only, matching the `40*64` value used for training
        # later in this notebook, so rollouts split evenly into mini-batches of 64.
        'n_steps': trial.suggest_int('n_steps', 2048, 8192, step=64),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'gamma': trial.suggest_float('gamma', 0.8, 0.9999, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-4, log=True),
        # suggest_float without log replaces the deprecated suggest_uniform.
        'clip_range': trial.suggest_float('clip_range', 0.1, 0.4),
        'gae_lambda': trial.suggest_float('gae_lambda', 0.8, .99),
    }

In [None]:
def optimize_agent(trial):
    """Optuna objective: train PPO with sampled hyperparameters and score it.

    Builds a monitored, frame-stacked StreetFighter environment, trains a PPO
    agent for 100000 timesteps, evaluates it over 20 episodes and saves the
    model under OPT_DIR.

    Args:
        trial: Optuna trial supplying the hyperparameters and the trial number.

    Returns:
        Mean evaluation reward, or -1000 when anything in the trial fails.
    """
    env = None
    try:
        model_params = optimize_ppo(trial)
        env = StreetFighter()
        env = Monitor(env, LOG_DIR)
        # The lambda is invoked inside the DummyVecEnv constructor, i.e. while
        # `env` still refers to the Monitor-wrapped environment.
        env = DummyVecEnv([lambda: env])
        env = VecFrameStack(env, 4, channels_order='last')
        model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=0, **model_params)
        model.learn(total_timesteps=100000)
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=20)

        os.makedirs(OPT_DIR, exist_ok=True)
        save_path = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
        model.save(save_path)
        return mean_reward
    except Exception as e:
        # Keep the study running, but report the failure instead of hiding it:
        # a silent -1000 is indistinguishable from a genuinely bad trial.
        print('Trial {} failed: {!r}'.format(trial.number, e))
        return -1000
    finally:
        # Always release the environment, including after a failure, so the next
        # trial can create its own.
        if env is not None:
            try:
                env.close()
            except Exception as close_error:
                print('Trial {}: closing the environment failed: {!r}'.format(trial.number, close_error))

In [None]:
# Create a study that maximises the value returned by optimize_agent,
# then run 100 trials one at a time (n_jobs=1).
study = optuna.create_study(direction='maximize')
study.optimize(optimize_agent, n_trials=100, n_jobs=1)

# Setup Callback

In [5]:
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import CheckpointCallback,BaseCallback
from stable_baselines3.common.env_util import make_vec_env
import cv2
import numpy as np
import time
import os

In [5]:
class TrainAndLoggingCallback(BaseCallback):
    """Callback that saves the model every ``check_freq`` calls.

    Args:
        check_freq: save whenever the callback's call count is a multiple of this value.
        save_path: directory for the saved models; ``None`` disables saving.
        verbose: verbosity level forwarded to ``BaseCallback``.
        step_offset: number added to the call count in the saved file name.
            Defaults to 4000000, the offset that was previously hard-coded
            (presumably because training resumes from the 4M-step checkpoint — confirm).
    """

    def __init__(self, check_freq, save_path, verbose=1, step_offset=4000000):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path
        self.step_offset = step_offset

    def _init_callback(self):
        """Create the save directory if a save path was given."""
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        """Save the model on every ``check_freq``-th call; always continue training."""
        # Skip saving when no path was given, consistent with _init_callback;
        # os.path.join(None, ...) would otherwise raise.
        if self.save_path is not None and self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls + self.step_offset))
            self.model.save(model_path)

        return True

In [6]:
# Directory handed to the training callback as the location for saved models.
CHECKPOINT_DIR = './train_nodelta/'

In [7]:
LOG_DIR = './logs/'

In [9]:
# Save the model into CHECKPOINT_DIR every 1,000,000 callback calls.
callback = TrainAndLoggingCallback(check_freq=1000000, save_path=CHECKPOINT_DIR)

# Train Model

In [12]:
# Training environment: a single StreetFighter instance inside a DummyVecEnv,
# with 4 frames stacked along the last (channel) axis.
# Monitor wrapping is currently disabled (was: env = Monitor(env, LOG_DIR)).
env = VecFrameStack(DummyVecEnv([StreetFighter]), 4, channels_order='last')



In [11]:
env.close()

In [13]:
env.reset().shape

(1, 84, 84, 4)

In [14]:
# PPO hyperparameters for the training run.
# NOTE: the non-integer 'n_steps' here is overwritten with 40*64 in the next cell
# before the model is built.
model_params = {'n_steps': 2570.949, 'gamma': 0.906, 'learning_rate': 2e-07, 'clip_range': 0.369, 'gae_lambda': 0.891}
#model_params = {'n_steps': 8960, 'gamma': 0.906, 'learning_rate': 2e-03, 'clip_range': 0.369, 'gae_lambda': 0.891}
# model_params = study.best_params

In [15]:
model_params['n_steps'] = 40*64

In [16]:
model_params

{'n_steps': 2560,
 'gamma': 0.906,
 'learning_rate': 2e-07,
 'clip_range': 0.369,
 'gae_lambda': 0.891}

In [23]:
# Build a new PPO agent with a CNN policy on the frame-stacked env, logging to LOG_DIR.
model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=1, **model_params)
#model.load('./train_nodelta_backup/best_model_5460000.zip')

Using cpu device
Wrapping the env in a VecTransposeImage.


In [None]:
# Train for 10,000,000 timesteps; `callback` saves the model periodically.
model.learn(total_timesteps=10000000, callback=callback)

In [19]:
# Resume training from the 4M-step checkpoint for another 6,000,000 timesteps.
# The path is built with os.path.join so it resolves on every OS; the previous
# hard-coded backslash path only worked on Windows.
pretrained_model = PPO.load(os.path.join(CHECKPOINT_DIR, 'best_model_4000000.zip'))
pretrained_model.set_env(env)
pretrained_model.learn(total_timesteps=6000000,callback=callback)

Wrapping the env in a VecTransposeImage.
Logging to ./logs/PPO_9
-----------------------------
| time/              |      |
|    fps             | 374  |
|    iterations      | 1    |
|    time_elapsed    | 6    |
|    total_timesteps | 2560 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 153         |
|    iterations           | 2           |
|    time_elapsed         | 33          |
|    total_timesteps      | 5120        |
| train/                  |             |
|    approx_kl            | 0.001401764 |
|    clip_fraction        | 0           |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.76       |
|    explained_variance   | 0.018       |
|    learning_rate        | 2e-07       |
|    loss                 | 8.58e+04    |
|    n_updates            | 40800       |
|    policy_gradient_loss | -0.000461   |
|    value_loss           | 3.76e+05    |
-----------

KeyboardInterrupt: 

In [136]:
env.close()

# Evaluate the Model

In [6]:
# Load the 6M-step checkpoint; os.path.join keeps the path portable
# (the previous backslash path only resolved on Windows).
test_model = PPO.load(os.path.join(CHECKPOINT_DIR, 'best_model_6000000.zip'))

In [22]:
# Load the saved 5.46M-step model; os.path.join keeps the path portable
# (the previous backslash path only resolved on Windows).
model = PPO.load(os.path.join('saved_models', 'best_model_5460000.zip'))

Exception: an integer is required (got type bytes)
Exception: an integer is required (got type bytes)
	Missing key(s) in state_dict: "pi_features_extractor.cnn.0.weight", "pi_features_extractor.cnn.0.bias", "pi_features_extractor.cnn.2.weight", "pi_features_extractor.cnn.2.bias", "pi_features_extractor.cnn.4.weight", "pi_features_extractor.cnn.4.bias", "pi_features_extractor.linear.0.weight", "pi_features_extractor.linear.0.bias", "vf_features_extractor.cnn.0.weight", "vf_features_extractor.cnn.0.bias", "vf_features_extractor.cnn.2.weight", "vf_features_extractor.cnn.2.bias", "vf_features_extractor.cnn.4.weight", "vf_features_extractor.cnn.4.bias", "vf_features_extractor.linear.0.weight", "vf_features_extractor.linear.0.bias".  


In [7]:
# Evaluation environment: a single human-rendered StreetFighter instance inside a
# DummyVecEnv, with 4 frames stacked along the last (channel) axis.
# Monitor wrapping is currently disabled (was: env = Monitor(env, LOG_DIR)).
env = VecFrameStack(
    DummyVecEnv([lambda: StreetFighter(render_mode='human')]),
    4,
    channels_order='last',
)



In [18]:
env.close()

In [36]:
# Use multiple vectorized environments and frame stack for better performance
# env = StreetFighter()
# env = make_vec_env(lambda: env, n_envs=4)
# env = VecFrameStack(env,n_stack = 4,channels_order = 'last')

In [18]:
env.close()

In [None]:
# Evaluate over 10 rendered episodes; only the mean reward is kept.
# NOTE(review): this evaluates `model` (loaded from saved_models), not `test_model` — confirm that is intended.
mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=10, render=True)

In [None]:
mean_reward

# Test out the Model

In [33]:
test_model.predict(obs)

(array([[1., 0., 0., 0., 0., 1., 0., 1., 0., 0., 1., 0.]], dtype=float32),
 None)

In [8]:
# Play one episode with the loaded agent and print the accumulated reward.
for episode in range(1): 
    obs = env.reset()
    done = False
    total_reward = 0
    while not done: 
        action, _ = test_model.predict(obs)
        obs, reward, done, info = env.step(action)
        # `env` is the VecFrameStack wrapper, so render through the first
        # environment held by the vectorised env it wraps.
        env.venv.envs[0].render()
        time.sleep(0.000001)
        total_reward += reward
    print('Total Reward for episode {} is {}'.format(episode , total_reward))
    time.sleep(2)

Total Reward for episode 0 is [7400.]


In [50]:
env.close()