# Soft Actor-Critic on an environment with image observations
Version without a VAE

In [5]:
from PIL import Image

def display_frames_as_gif(frames, filename='CarRacing_SAC.gif'):
    """Save a sequence of RGB frames as a looping animated GIF in ``./result/``.

    Args:
        frames: Iterable of arrays that ``PIL.Image.fromarray`` accepts in
            ``'RGB'`` mode (presumably H x W x 3 uint8 arrays -- TODO confirm
            against the renderer that produces them).
        filename: Name of the GIF file written inside ``./result/``.

    Raises:
        ValueError: If ``frames`` contains no frame.
    """
    import os  # local import so this cell does not rely on another cell's imports

    frs = [Image.fromarray(f, mode='RGB') for f in frames]
    if not frs:
        # Previously this surfaced as an opaque IndexError on frs[0].
        raise ValueError('frames is empty; there is nothing to save as a GIF')

    # Image.save() does not create missing directories, so make sure the
    # output folder exists before writing.
    os.makedirs('./result/', exist_ok=True)

    # The first image is the base; the remaining frames are appended to it.
    # duration=40 is the per-frame display time in milliseconds, loop=0 means
    # the animation repeats forever.
    frs[0].save('./result/'+filename, save_all=True, append_images=frs[1:], optimize=False, duration=40, loop=0)

In [1]:
import gym
from gym import spaces
import numpy as np
import torch
from torch import distributions, nn
import pfrl
import cv2
import matplotlib.pyplot as plt

In [2]:
class MaxAndSkipEnv(gym.Wrapper):
    """Repeat every action ``skip`` times and return a max-pooled observation.

    The reward of the repeated steps is summed. The returned observation is
    the element-wise maximum of the last two frames produced during the
    current call to :meth:`step`.

    Args:
        env: Environment to wrap. Its observations are stored in a
            ``np.uint8`` buffer, so they are assumed to be uint8 images --
            TODO confirm for environments other than the one used here.
        skip: Number of times each action is repeated (must be >= 1).

    Raises:
        ValueError: If ``skip`` is smaller than 1.
    """

    def __init__(self, env, skip=4):
        gym.Wrapper.__init__(self, env)
        if skip < 1:
            # With skip < 1 the loop in step() would never run and the method
            # would fail on unbound locals.
            raise ValueError('skip must be >= 1, got {}'.format(skip))
        # Holds the two most recent frames of the current step.
        self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8)
        self._skip = skip

    def step(self, action):
        """Apply ``action`` up to ``skip`` times, stopping early when done.

        Returns:
            Tuple ``(max_frame, total_reward, done, info)`` where ``done`` and
            ``info`` come from the last inner step that was executed.
        """
        total_reward = 0
        done = None
        prev_obs = last_obs = None
        for _ in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            # Track the two newest frames on every iteration so that an early
            # termination never leaves frames from an earlier step() call
            # (or the initial zeros) in the buffer.
            prev_obs, last_obs = last_obs, obs
            total_reward += reward
            if done:
                break

        if prev_obs is None:
            # Only one frame was produced (skip == 1 or done on first repeat).
            prev_obs = last_obs
        self._obs_buffer[0] = prev_obs
        self._obs_buffer[1] = last_obs

        max_frame = self._obs_buffer.max(axis=0)
        return max_frame, total_reward, done, info

    def reset(self, **kwargs):
        """Reset the wrapped environment and return its initial observation."""
        return self.env.reset(**kwargs)

class WrapPyTorch(gym.ObservationWrapper):
    """Observation wrapper that moves the last observation axis to the front.

    Observations are transposed with axes ``(2, 0, 1)``, and the declared
    observation space is rebuilt with the correspondingly permuted shape.
    The new space uses the ``[0, 0, 0]`` entry of the wrapped space's
    ``low``/``high`` arrays as scalar bounds and a float32 dtype.
    """

    def __init__(self, env=None):
        super().__init__(env)
        src_space = self.observation_space
        src_shape = src_space.shape
        permuted_shape = [src_shape[2], src_shape[0], src_shape[1]]
        self.observation_space = spaces.Box(
            low=src_space.low[0, 0, 0],
            high=src_space.high[0, 0, 0],
            shape=permuted_shape,
            dtype=np.float32,
        )

    def observation(self, observation):
        """Return ``observation`` with its third axis moved to the front."""
        return observation.transpose((2, 0, 1))

class WrapFrame(gym.ObservationWrapper):
    """Observation wrapper that converts RGB frames to single-channel grayscale.

    The declared observation space keeps the first two dimensions of the
    wrapped space and uses a trailing channel dimension of size 1, with
    uint8 values in ``[0, 255]``.
    """

    def __init__(self, env):
        super().__init__(env)
        rows = self.observation_space.shape[0]
        cols = self.observation_space.shape[1]
        self.observation_space = spaces.Box(
            low=0, high=255, shape=(rows, cols, 1), dtype=np.uint8
        )

    def observation(self, frame):
        """Convert ``frame`` from RGB to grayscale and append a channel axis."""
        gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        # cvtColor drops the channel axis; add it back as the last axis.
        return gray[..., np.newaxis]

In [3]:
# Build the environment pipeline in one expression, innermost layer first:
#   CarRacing-v0 -> repeat each action 5 times -> float32 cast -> axis permutation.
env = WrapPyTorch(
    pfrl.wrappers.CastObservationToFloat32(
        MaxAndSkipEnv(gym.make('CarRacing-v0'), skip=5)
    )
)
# Optional wrappers that are currently disabled:
#   WrapFrame(env)                           (would go right after MaxAndSkipEnv)
#   pfrl.wrappers.NormalizeActionSpace(env)  (would go last)



In [4]:
# Collect the quantities the later cells read from the environment.
timestep_limit = env.spec.max_episode_steps
obs_space, action_space = env.observation_space, env.action_space
# Number of scalar entries in one observation / one action.
obs_size, action_size = obs_space.low.size, action_space.low.size

# Report what was found (same text as before, one print per line group).
print(f'timelimit: {timestep_limit}')
print(f'obs_space: {obs_space}, \naction_space: {action_space}')
print(f'obs_size: {obs_size}, \naction_size: {action_size}')
print(obs_space.shape)

timelimit: 1000
obs_space: Box(0.0, 255.0, (3, 96, 96), float32), 
action_space: Box(-1.0, 1.0, (3,), float32)
obs_size: 27648, 
action_size: 3
(3, 96, 96)


In [6]:
def conv2d_size_out(size, kernel_size=5, stride=2):
    """Return the output length along one axis of a convolution.

    The formula corresponds to a convolution without padding or dilation:
    ``floor((size - kernel_size) / stride) + 1``.

    Args:
        size: Input length along the axis.
        kernel_size: Kernel length along the axis.
        stride: Step between consecutive kernel positions.
    """
    # Room left for sliding the kernel once its first placement is counted.
    slack = size - kernel_size
    return slack // stride + 1
        
def make_conv2d_layer(width, height, in_channels=3):
    """Build the convolutional feature extractor and report its output size.

    The stack is three ``Conv2d`` + ``ReLU`` pairs followed by ``Flatten``.
    The layer hyper-parameters live in a single table that drives both the
    module construction and the output-size arithmetic, so the two cannot
    drift apart.

    Args:
        width: Input image size along one spatial axis.
        height: Input image size along the other spatial axis.
        in_channels: Number of channels of the input image tensor
            (defaults to 3, i.e. RGB).

    Returns:
        Tuple ``(module, linear_input_size)`` where ``linear_input_size`` is
        the number of features per sample after flattening.

    Raises:
        ValueError: If the input is too small for the kernels, i.e. a spatial
            output size would drop below 1.
    """
    # (out_channels, kernel_size, stride) for each convolution, in order.
    conv_specs = ((32, 5, 2), (64, 5, 2), (64, 3, 1))

    convW, convH = width, height
    channels = in_channels
    layers = []
    for out_channels, kernel_size, stride in conv_specs:
        convW = conv2d_size_out(convW, kernel_size, stride)
        convH = conv2d_size_out(convH, kernel_size, stride)
        layers.append(nn.Conv2d(channels, out_channels, kernel_size=kernel_size, stride=stride))
        layers.append(nn.ReLU())
        channels = out_channels

    # Sizes never grow from layer to layer, so checking the final values is
    # enough. Without this, two negative sizes would multiply into a
    # plausible-looking positive feature count.
    if convW < 1 or convH < 1:
        raise ValueError(
            'input of size {}x{} is too small for the convolution stack'.format(width, height)
        )

    linear_input_size = convW * convH * channels
    print(linear_input_size)

    layers.append(nn.Flatten())
    return nn.Sequential(*layers), linear_input_size

def make_linear_layer(linear_input_size, out_size):
    """Build a three-layer MLP head with two hidden layers of 256 units.

    Layout: ``Linear -> ReLU -> Linear -> ReLU -> Linear``; the final layer
    has no activation.

    Args:
        linear_input_size: Number of input features.
        out_size: Number of output features.
    """
    hidden = 256
    widths = [linear_input_size, hidden, hidden, out_size]

    modules = []
    for n_in, n_out in zip(widths[:-1], widths[1:]):
        modules.append(nn.Linear(n_in, n_out))
        modules.append(nn.ReLU())

    # The loop adds an activation after every Linear; the output layer must
    # stay linear, so the trailing ReLU is dropped.
    return nn.Sequential(*modules[:-1])

In [7]:
def squashed_diagonal_gaussian_head(x):
    """Turn network outputs into a tanh-squashed diagonal Gaussian distribution.

    The last axis of ``x`` is split into two halves: the mean and the log
    standard deviation of a diagonal Normal. The log standard deviation is
    clamped to ``[-20, 2]`` before exponentiation. The Normal is wrapped so
    that its last axis is treated as the event dimension and then passed
    through a tanh transform.

    Args:
        x: Tensor whose last axis has size ``action_size * 2``.

    Returns:
        A ``TransformedDistribution`` over tanh-squashed actions.
    """
    assert x.shape[-1] == action_size * 2
    # Split along the same (last) axis that the assertion checks, so the
    # head also works for inputs that are not exactly 2-D.
    mean, log_scale = torch.chunk(x, 2, dim=-1)
    log_scale = torch.clamp(log_scale, -20.0, 2.0)
    var = torch.exp(log_scale * 2)
    base_distribution = distributions.Independent(
        distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1
    )
    # cache_size=1 is required for numerical stability
    return distributions.transformed_distribution.TransformedDistribution(
        base_distribution, [distributions.transforms.TanhTransform(cache_size=1)]
    )

In [9]:
class PolicyFunction(nn.Module):
    """Policy network: convolutional encoder followed by an MLP head.

    The head emits ``action_size * 2`` values per sample, which are turned
    into an action distribution by ``squashed_diagonal_gaussian_head``.
    """

    def __init__(self, width, height, action_size):
        super().__init__()

        # Encoder for the image tensor plus the flattened feature count it yields.
        conv_stack, n_features = make_conv2d_layer(width, height)
        self.selectTrackFeatures = conv_stack
        self.linear_input_size = n_features
        self.fc1 = make_linear_layer(n_features, action_size*2)

    def forward(self, state):
        """Return the action distribution for a batch of image states."""
        head_out = self.fc1(self.selectTrackFeatures(state))
        return squashed_diagonal_gaussian_head(head_out)

# Instantiate the policy for the observation's two trailing dimensions and
# give it its own Adam optimizer.
policy = PolicyFunction(
    width=obs_space.shape[1],
    height=obs_space.shape[2],
    action_size=action_size,
)
policy_optimizer = torch.optim.Adam(policy.parameters(), lr=3e-4)

23104


In [10]:
# print(obs_space.sample().shape)
# policy(torch.from_numpy(obs_space.sample()).unsqueeze(0))

In [11]:
class QFunction(nn.Module):
    """Q-network: encodes the image state, appends the action, outputs one value.

    The MLP head receives the flattened convolutional features concatenated
    with the action along the last axis.
    """

    def __init__(self, width, height, action_size):
        super().__init__()

        # Encoder for the image tensor plus the flattened feature count it yields.
        conv_stack, n_features = make_conv2d_layer(width, height)
        self.selectTrackFeatures = conv_stack
        self.linear_input_size = n_features
        self.fc1 = make_linear_layer(n_features+action_size, 1)

    def forward(self, state_and_action):
        """Return the Q-value for ``state_and_action = (state, action)``."""
        image_state = state_and_action[0]
        action = state_and_action[1]
        features = self.selectTrackFeatures(image_state)
        return self.fc1(torch.cat((features, action), dim=-1))

# Two independently initialised Q-networks, created in order (q_func1 first).
q_func1, q_func2 = (
    QFunction(obs_space.shape[1], obs_space.shape[2], action_size)
    for _ in range(2)
)
# One Adam optimizer per Q-network.
q_func1_optimizer, q_func2_optimizer = (
    torch.optim.Adam(q_net.parameters(), lr=3e-4)
    for q_net in (q_func1, q_func2)
)


23104
23104


In [12]:
# obs = torch.from_numpy(obs_space.sample()).unsqueeze(0)
# print(obs.shape)
# action = torch.from_numpy(action_space.sample()).unsqueeze(0)
# print(action.shape)
# q_func1((obs, action))

In [13]:
# Replay buffer holding up to one million transitions.
# NOTE(review): each stored observation is a float32 image, so a full buffer
# may need far more RAM than is available -- confirm the capacity is intended.
rbuf = pfrl.replay_buffers.ReplayBuffer(capacity=10 ** 6)

In [15]:
def burnin_action_func():
    """Draw a uniformly random float32 action within the action-space bounds.

    Passed to the agent as its burn-in action function, i.e. the source of
    actions before the model has been updated.
    """
    lower, upper = action_space.low, action_space.high
    sample = np.random.uniform(low=lower, high=upper)
    return sample.astype(np.float32)

In [16]:
# --- Soft Actor-Critic hyper-parameters -------------------------------------
gamma = 0.99                 # discount factor
# Number of stored transitions required before updates begin.
# NOTE(review): confirm the training run collects more transitions than this,
# otherwise the networks are never updated.
replay_start_size = 10000
# Use GPU 0 when CUDA is available; otherwise fall back to the CPU (PFRL treats
# a negative device id as "no GPU") instead of failing at agent construction.
gpu = 0 if torch.cuda.is_available() else -1
batch_size = 256             # minibatch size for each update
entropy_target = -action_size
temperature_optimizer_lr = 3e-4

# --- Agent -------------------------------------------------------------------
agent = pfrl.agents.SoftActorCritic(
    policy,
    q_func1,
    q_func2,
    policy_optimizer,
    q_func1_optimizer,
    q_func2_optimizer,
    rbuf,
    gamma=gamma,
    replay_start_size=replay_start_size,
    gpu=gpu,
    minibatch_size=batch_size,
    burnin_action_func=burnin_action_func,
    entropy_target=entropy_target,
    temperature_optimizer_lr=temperature_optimizer_lr,
)

In [20]:
n_episodes = 50
max_episode_len = 1000

# Train for a fixed number of episodes; the agent learns inside observe().
for i in range(1, n_episodes + 1):
    obs = env.reset()
    R, t = 0, 0  # episode return (sum of rewards) and step counter
    done = reset = False
    while not (done or reset):
        # Uncomment to watch the behavior in a GUI window
        # env.render()
        action = agent.act(obs)
        obs, reward, done, _ = env.step(action)
        R += reward
        t += 1
        # Signal a forced reset once the step budget of the episode is used up.
        reset = t == max_episode_len
        agent.observe(obs, reward, done, reset)
        # print(f"action: {action}, reward: {reward}")
    # Report progress every tenth episode.
    if i % 10 == 0:
        print('episode:', i, 'R:', R, '\nstatistics:', agent.get_statistics())

print('Finished.')

Track generation: 946..1189 -> 243-tiles track
retry to generate track (normal if there are not manyinstances of this message)
Track generation: 995..1254 -> 259-tiles track
Track generation: 1105..1385 -> 280-tiles track
Track generation: 1147..1437 -> 290-tiles track
Track generation: 1155..1448 -> 293-tiles track
Track generation: 1076..1349 -> 273-tiles track
Track generation: 1091..1371 -> 280-tiles track
retry to generate track (normal if there are not manyinstances of this message)
Track generation: 1014..1279 -> 265-tiles track
Track generation: 1250..1574 -> 324-tiles track
Track generation: 1139..1428 -> 289-tiles track
Track generation: 1239..1557 -> 318-tiles track
retry to generate track (normal if there are not manyinstances of this message)
Track generation: 1202..1507 -> 305-tiles track
Track generation: 1278..1604 -> 326-tiles track
retry to generate track (normal if there are not manyinstances of this message)
Track generation: 1171..1468 -> 297-tiles track
episode: 1

### Random

In [16]:
# Roll out one episode with uniformly random actions as a baseline and
# record every rendered frame.
done = False
frames = []
obs = env.reset()
total_r = 0

while not done:
    # Sample from the environment's own action space so the action always has
    # the right size, dtype and per-dimension bounds. The previous
    # `-1 + np.random.rand(3) * 2` hard-coded three dimensions and drew every
    # component from [-1, 1) regardless of the declared bounds.
    action = env.action_space.sample()
    obs, r, done, info = env.step(action)
    total_r += r
    frames.append(env.render(mode='rgb_array'))

print('R:', total_r)
display_frames_as_gif(frames, 'CarRacing_Random.gif')

Track generation: 1171..1468 -> 297-tiles track
R: -83.10810810810761


### Trained

In [21]:
# Roll out one episode with the trained agent in evaluation mode and record
# every rendered frame.
done = False
frames = []
obs = env.reset()
total_r = 0

with agent.eval_mode():
    while not done:
        action = agent.act(obs)
        obs, r, done, info = env.step(action)
        total_r += r
        # This loop never forces a reset, so pass an explicit False instead of
        # reading a `reset` variable left behind by the training cell (which
        # breaks on a fresh kernel and silently reuses a stale value otherwise).
        agent.observe(obs, r, done, False)
        frames.append(env.render(mode='rgb_array'))
print('R:', total_r)
display_frames_as_gif(frames)

Track generation: 1105..1393 -> 288-tiles track
R: 35.888501742161026
