# sample code

In [None]:
import gym
import numpy as np
import cv2
import retro
import torch
import torch.nn as nn
import torch.nn.functional as F
from stable_baselines3 import PPO
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import CheckpointCallback
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.env_util import make_vec_env

class StreetFighterEnv(gym.Env):
    def __init__(self):
        super(StreetFighterEnv, self).__init__()
        self.env = retro.make(game='StreetFighterIISpecialChampionEdition-Genesis')
        self.action_space = self.env.action_space
        self.observation_space = gym.spaces.Box(low=0, high=255, shape=(84, 84, 4), dtype=np.uint8)
        self.current_state = None

    def preprocess(self, frame):
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        frame = cv2.resize(frame, (84, 84), interpolation=cv2.INTER_AREA)
        return frame[:, :, None]

    def reset(self):
        self.current_state = self.preprocess(self.env.reset())
        return np.concatenate([self.current_state] * 4, axis=-1)

    def step(self, action):
        total_reward = 0
        obs, reward, done, _ = self.env.step(action)
        total_reward += reward
        self.current_state = self.preprocess(obs)
        next_state = np.concatenate([self.current_state] * 4, axis=-1)
        return next_state, total_reward, done, {}

    def render(self, mode='human'):
        return self.env.render(mode)

    def close(self):
        self.env.close()

class CustomCNN(BaseFeaturesExtractor):
    def __init__(self, observation_space: gym.spaces.Box, features_dim: int = 512):
        super(CustomCNN, self).__init__(observation_space, features_dim)
        n_input_channels = observation_space.shape[0]
        self.cnn = nn.Sequential(
            nn.Conv2d(n_input_channels, 32, kernel_size=8, stride=4, padding=0),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2, padding=0),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=0),
            nn.ReLU(),
            nn.Flatten(),
        )

        # Compute shape by doing one forward pass
        with torch.no_grad():
            n_flatten = self.cnn(torch.as_tensor(observation_space.sample()[None]).float()).shape[1]

        self.linear = nn.Sequential(nn.Linear(n_flatten, features_dim), nn.ReLU())

    def forward(self, observations: torch.Tensor) -> torch.Tensor:
        return self.linear(self.cnn(observations))

def train_ppo():
    env = DummyVecEnv([lambda: StreetFighterEnv()])

    policy_kwargs = dict(
        features_extractor_class=CustomCNN,
        features_extractor_kwargs=dict(features_dim=512),
    )

    model = PPO('CnnPolicy', env, policy_kwargs=policy_kwargs, verbose=1)

    checkpoint_callback = CheckpointCallback(save_freq=10000, save_path='./models/', name_prefix='ppo_model')
    eval_callback = EvalCallback(env, best_model_save_path='./logs/', log_path='./logs/', eval_freq=500, deterministic=True, render=False)

    model.learn(total_timesteps=int(1e6), callback=[checkpoint_callback, eval_callback])

def test_ppo():
    env = StreetFighterEnv()
    model = PPO.load('models/ppo_model')

    obs = env.reset()
    done = False
    while not done:
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        env.render()

    env.close()

if __name__ == "__main__":
    train_ppo()
    test_ppo()


# experiments

In [35]:
env = StreetFighterEnv()

In [36]:
obs = env.reset()

In [37]:
obs.shape

(3, 100, 128)

# Environment Setup

In [1]:
import gym
import numpy as np
import cv2
import time
import retro
import torch
import torch.nn as nn
import torch.nn.functional as F
from stable_baselines3 import PPO
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.vec_env import DummyVecEnv,VecFrameStack
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback

class StreetFighterEnv(gym.Env):
    def __init__(self, render_mode=None):
        super(StreetFighterEnv, self).__init__()
        self.env = retro.make(game='StreetFighterIISpecialChampionEdition-Genesis',use_restricted_actions=retro.Actions.FILTERED)
        self.action_space = self.env.action_space
        self.observation_space = gym.spaces.Box(low=0, high=255, shape=(1,100,128), dtype=np.uint8)
        self.current_state = None
        self.render_mode = render_mode
        self.score = 0;
        self.health = 176;

    def preprocess(self, frame):
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        frame = cv2.resize(frame, (128,100), interpolation=cv2.INTER_AREA)
        frame = np.reshape(frame,(1,100,128))
        return frame

    def reset(self):
        obs = self.env.reset()
        self.health = 176
        self.score = 0
        self.current_state = self.preprocess(obs)
        # return np.concatenate([self.current_state] * 3, axis=0)
        return self.current_state

    def step(self, action):
        total_reward = 0
        obs, reward, done, info = self.env.step(action)
        score = info['score']-self.score
        health = self.health-info['health']
        self.score = info['score']
        self.health = info['health']
        total_reward += score
        total_reward -= health*10
        
        self.current_state = self.preprocess(obs)
        # next_state = np.concatenate([self.current_state] * 3, axis=0)
        return self.current_state, total_reward, done, info

    def render(self,*args,**kwargs):
        if self.render_mode == 'human':
            self.env.render(mode='human')

    def close(self):
        self.env.close()

In [38]:
env.close()

# Training

In [None]:
env = DummyVecEnv([lambda: StreetFighterEnv(render_mode='human')])
env = VecFrameStack(env,n_stack = 3,channels_order = 'last')
model = PPO('CnnPolicy', env, verbose=1)

checkpoint_callback = CheckpointCallback(save_freq=10000, save_path='./models/', name_prefix='ppo_model')
eval_callback = EvalCallback(env, best_model_save_path='./logs/', log_path='./logs/', eval_freq=500, deterministic=True, render=False)

model.learn(total_timesteps=int(1e6), callback=[checkpoint_callback, eval_callback])

# Testing

In [2]:
env = StreetFighterEnv(render_mode = 'human')
env = DummyVecEnv([lambda: env])
env = VecFrameStack(env, n_stack = 3,channels_order = 'first')



In [3]:
env.reset().shape

(1, 3, 100, 128)

In [10]:
env.close()

In [42]:
env = StreetFighterEnv(render_mode='human')

In [4]:
model2=PPO.load('saved_models\ppo_ryu_2000000_steps_updated.zip')

Exception: Can't get attribute '_make_function' on <module 'cloudpickle.cloudpickle' from 'c:\\Users\\mvswa\\reinforcement\\.venv\\lib\\site-packages\\cloudpickle\\cloudpickle.py'>
Exception: Can't get attribute '_make_function' on <module 'cloudpickle.cloudpickle' from 'c:\\Users\\mvswa\\reinforcement\\.venv\\lib\\site-packages\\cloudpickle\\cloudpickle.py'>
Exception: Can't get attribute '_make_function' on <module 'cloudpickle.cloudpickle' from 'c:\\Users\\mvswa\\reinforcement\\.venv\\lib\\site-packages\\cloudpickle\\cloudpickle.py'>


In [5]:
model = PPO.load('saved_models\\best_model_5460000.zip')

Exception: an integer is required (got type bytes)
Exception: an integer is required (got type bytes)
	Missing key(s) in state_dict: "pi_features_extractor.cnn.0.weight", "pi_features_extractor.cnn.0.bias", "pi_features_extractor.cnn.2.weight", "pi_features_extractor.cnn.2.bias", "pi_features_extractor.cnn.4.weight", "pi_features_extractor.cnn.4.bias", "pi_features_extractor.linear.0.weight", "pi_features_extractor.linear.0.bias", "vf_features_extractor.cnn.0.weight", "vf_features_extractor.cnn.0.bias", "vf_features_extractor.cnn.2.weight", "vf_features_extractor.cnn.2.bias", "vf_features_extractor.cnn.4.weight", "vf_features_extractor.cnn.4.bias", "vf_features_extractor.linear.0.weight", "vf_features_extractor.linear.0.bias".  


In [41]:
env.close()

In [7]:
for episode in range(1): 
    obs = env.reset()
    done = False
    total_reward = 0
    while not done: 
        action, _ = model2.predict(obs)
        obs, reward, done, info = env.step(action)
        env.venv.envs[0].render()
        time.sleep(0.001)
        total_reward += reward
    print('Total Reward for episode {} is {}'.format(episode , total_reward))
    time.sleep(2)

Total Reward for episode 0 is [20540.]


: 