In [1]:
pip install gym

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install gym_super_mario_bros

Note: you may need to restart the kernel to use updated packages.


In [None]:
import copy
import os
import random
from collections import deque
import gym
import gym_super_mario_bros
import matplotlib.pyplot as plt
import numpy as np
import torch
from gym.spaces import Box
from gym.wrappers import FrameStack
from nes_py.wrappers import JoypadSpace
from torch import nn
from torchvision import transforms


class SkipFrame(gym.Wrapper):
    def __init__(self, env, skip):
        super().__init__(env)
        self._skip = skip

    def step(self, action):
        total_reward = 0.0
        done = False
        for i in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            total_reward += reward
            if done:
                break
        return obs, total_reward, done, info


class GrayScaleObservation(gym.ObservationWrapper):
    def __init__(self, env):
        super().__init__(env)
        self.observation_space = Box(low=0, high=255, shape=self.observation_space.shape[:2], dtype=np.uint8)

    def observation(self, observation):
        transform = transforms.Grayscale()
        return transform(torch.tensor(np.transpose(observation, (2, 0, 1)).copy(), dtype=torch.float))


class ResizeObservation(gym.ObservationWrapper):
    def __init__(self, env, shape):
        super().__init__(env)
        self.shape = (shape, shape)
        obs_shape = self.shape + self.observation_space.shape[2:]
        self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)

    def observation(self, observation):
        transformations = transforms.Compose([transforms.Resize(self.shape), transforms.Normalize(0, 255)])
        return transformations(observation).squeeze(0)


env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
env = JoypadSpace(env, [["right"], ["right", "A"]])
env = FrameStack(ResizeObservation(GrayScaleObservation(SkipFrame(env, skip=4)), shape=84), num_stack=4)



class DDQNSolver(nn.Module):
    def __init__(self, output_dim):
        super().__init__()
        self.online = nn.Sequential(
            nn.Conv2d(in_channels=4, out_channels=32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(3136, 512),
            nn.ReLU(),
            nn.Linear(512, output_dim),
        )
        self.target = copy.deepcopy(self.online)
        for p in self.target.parameters():
            p.requires_grad = False

    def forward(self, input, model):
        if model == "online":
            return self.online(input)
        elif model == "target":
            return self.target(input)


class DDQNAgent:
    def __init__(self, action_dim, save_directory):
        self.action_dim = action_dim
        self.save_directory = save_directory
        self.net = DDQNSolver(self.action_dim).cuda()
        self.exploration_rate = 1.0
        self.exploration_rate_decay = 0.99
        self.exploration_rate_min = 0.01
        self.current_step = 0
        self.memory = deque(maxlen=100000)
        self.batch_size = 32
        self.gamma = 0.95
        self.sync_period = 1e4
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=0.00025, eps=1e-4)
        self.loss = torch.nn.SmoothL1Loss()
        self.episode_rewards = []
        self.moving_average_episode_rewards = []
        self.current_episode_reward = 0.0

    def log_episode(self):
        self.episode_rewards.append(self.current_episode_reward)
        self.current_episode_reward = 0.0

    def log_period(self, episode, epsilon, step):
        self.moving_average_episode_rewards.append(np.round(np.mean(self.episode_rewards[-checkpoint_period:]), 3))
        print(f"Episode {episode} - Step {step} - Epsilon {epsilon} - Mean Reward {self.moving_average_episode_rewards[-1]}")
        plt.plot(self.moving_average_episode_rewards)
        plt.savefig(os.path.join(self.save_directory, f"episode_rewards_plot_{episode}.png"))
        plt.clf()

    def remember(self, state, next_state, action, reward, done):
        self.memory.append((torch.tensor(state.__array__()), torch.tensor(next_state.__array__()),
                            torch.tensor([action]), torch.tensor([reward]), torch.tensor([done])))

    def experience_replay(self, step_reward):
        self.current_episode_reward += step_reward
        if self.current_step % self.sync_period == 0:
            self.net.target.load_state_dict(self.net.online.state_dict())
        if self.batch_size > len(self.memory):
            return
        state, next_state, action, reward, done = self.recall()
        q_estimate = self.net(state.cuda(), model="online")[np.arange(0, self.batch_size), action.cuda()]
        with torch.no_grad():
            best_action = torch.argmax(self.net(next_state.cuda(), model="online"), dim=1)
            next_q = self.net(next_state.cuda(), model="target")[np.arange(0, self.batch_size), best_action]
            q_target = (reward.cuda() + (1 - done.cuda().float()) * self.gamma * next_q).float()
        loss = self.loss(q_estimate, q_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def recall(self):
        state, next_state, action, reward, done = map(torch.stack, zip(*random.sample(self.memory, self.batch_size)))
        return state, next_state, action.squeeze(), reward.squeeze(), done.squeeze()

    def act(self, state):
        if np.random.rand() < self.exploration_rate:
            action = np.random.randint(self.action_dim)
        else:
            action_values = self.net(torch.tensor(state.__array__()).cuda().unsqueeze(0), model="online")
            action = torch.argmax(action_values, dim=1).item()
        self.exploration_rate *= self.exploration_rate_decay
        self.exploration_rate = max(self.exploration_rate_min, self.exploration_rate)
        self.current_step += 1
        return action

    def load_checkpoint(self, path):
        checkpoint = torch.load(path)
        self.net.load_state_dict(checkpoint['model'])
        self.exploration_rate = checkpoint['exploration_rate']

    def save_checkpoint(self):
        filename = os.path.join(self.save_directory, 'checkpoint_{}.pth'.format(episode))
        torch.save(dict(model=self.net.state_dict(), exploration_rate=self.exploration_rate), f=filename)
        print('Checkpoint saved to \'{}\''.format(filename))


checkpoint_period = 10
save_directory = "mario_ql"
load_checkpoint ="checkpoint_9710.pth"
agent = DDQNAgent(action_dim=env.action_space.n, save_directory=save_directory)
if load_checkpoint is not None:
    agent.load_checkpoint(save_directory + "/" + load_checkpoint)
episode = 0
while True:
    state = env.reset()
    while True:
        action = agent.act(state)
        env.render()
        next_state, reward, done, info = env.step(action)
        agent.remember(state, next_state, action, reward, done)
        agent.experience_replay(reward)
        state = next_state
        if done:
            episode += 1
            agent.log_episode()
            if episode % checkpoint_period == 0:
                agent.log_period(episode=episode, epsilon=agent.exploration_rate, step=agent.current_step)
                agent.save_checkpoint()
            break

  logger.warn(
  torch.tensor([action]), torch.tensor([reward]), torch.tensor([done])))


Episode 10 - Step 2635 - Epsilon 0.01 - Mean Reward 1717.7
Checkpoint saved to 'mario_ql\checkpoint_10.pth'
Episode 20 - Step 5576 - Epsilon 0.01 - Mean Reward 1359.4
Checkpoint saved to 'mario_ql\checkpoint_20.pth'
Episode 30 - Step 7412 - Epsilon 0.01 - Mean Reward 1022.2
Checkpoint saved to 'mario_ql\checkpoint_30.pth'
Episode 40 - Step 9975 - Epsilon 0.01 - Mean Reward 1493.5
Checkpoint saved to 'mario_ql\checkpoint_40.pth'
Episode 50 - Step 13028 - Epsilon 0.01 - Mean Reward 1540.8
Checkpoint saved to 'mario_ql\checkpoint_50.pth'
Episode 60 - Step 16615 - Epsilon 0.01 - Mean Reward 1729.9
Checkpoint saved to 'mario_ql\checkpoint_60.pth'
Episode 70 - Step 19887 - Epsilon 0.01 - Mean Reward 1695.5
Checkpoint saved to 'mario_ql\checkpoint_70.pth'
Episode 80 - Step 23759 - Epsilon 0.01 - Mean Reward 1615.6
Checkpoint saved to 'mario_ql\checkpoint_80.pth'
Episode 90 - Step 26755 - Epsilon 0.01 - Mean Reward 1843.0
Checkpoint saved to 'mario_ql\checkpoint_90.pth'
Episode 100 - Step 2994

Episode 750 - Step 256239 - Epsilon 0.01 - Mean Reward 1882.6
Checkpoint saved to 'mario_ql\checkpoint_750.pth'
Episode 760 - Step 259649 - Epsilon 0.01 - Mean Reward 1969.0
Checkpoint saved to 'mario_ql\checkpoint_760.pth'
Episode 770 - Step 262589 - Epsilon 0.01 - Mean Reward 1576.5
Checkpoint saved to 'mario_ql\checkpoint_770.pth'
Episode 780 - Step 267416 - Epsilon 0.01 - Mean Reward 1884.9
Checkpoint saved to 'mario_ql\checkpoint_780.pth'
Episode 790 - Step 270768 - Epsilon 0.01 - Mean Reward 2056.4
Checkpoint saved to 'mario_ql\checkpoint_790.pth'
Episode 800 - Step 276207 - Epsilon 0.01 - Mean Reward 2492.6
Checkpoint saved to 'mario_ql\checkpoint_800.pth'
Episode 810 - Step 279985 - Epsilon 0.01 - Mean Reward 1512.2
Checkpoint saved to 'mario_ql\checkpoint_810.pth'
Episode 820 - Step 286090 - Epsilon 0.01 - Mean Reward 2721.4
Checkpoint saved to 'mario_ql\checkpoint_820.pth'
Episode 830 - Step 290370 - Epsilon 0.01 - Mean Reward 2540.7
Checkpoint saved to 'mario_ql\checkpoint_8

In [4]:
import os
from gym.wrappers import FrameStack
from torchvision import transforms
import gym_super_mario_bros
import numpy as np
import torch
import torch.nn as nn
from nes_py.wrappers import JoypadSpace
from torch.distributions import Categorical
from gym.spaces import Box
import matplotlib.pyplot as plt
import gym


class SkipFrame(gym.Wrapper):
    def __init__(self, env, skip):
        super().__init__(env)
        self._skip = skip

    def step(self, action):
        total_reward = 0.0
        done = False
        for i in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            total_reward += reward
            if done:
                break
        return obs, total_reward, done, info


class GrayScaleObservation(gym.ObservationWrapper):
    def __init__(self, env):
        super().__init__(env)
        self.observation_space = Box(low=0, high=255, shape=self.observation_space.shape[:2], dtype=np.uint8)

    def observation(self, observation):
        transform = transforms.Grayscale()
        return transform(torch.tensor(np.transpose(observation, (2, 0, 1)).copy(), dtype=torch.float))


class ResizeObservation(gym.ObservationWrapper):
    def __init__(self, env, shape):
        super().__init__(env)
        self.shape = (shape, shape)
        obs_shape = self.shape + self.observation_space.shape[2:]
        self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)

    def observation(self, observation):
        transformations = transforms.Compose([transforms.Resize(self.shape), transforms.Normalize(0, 255)])
        return transformations(observation).squeeze(0)


env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
env = JoypadSpace(env, [["right"], ["right", "A"]])
env = FrameStack(ResizeObservation(GrayScaleObservation(SkipFrame(env, skip=4)), shape=84), num_stack=4)
env.seed(42)
env.action_space.seed(42)
torch.manual_seed(42)
torch.random.manual_seed(42)
np.random.seed(42)


class MarioSolver:
    def __init__(self, learning_rate):
        self.model = nn.Sequential(
            nn.Conv2d(in_channels=4, out_channels=32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(3136, 512),
            nn.ReLU(),
            nn.Linear(512, env.action_space.n),
            nn.Softmax(dim=-1)
        ).cuda()
        
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=learning_rate, eps=1e-4)
        self.reset()

    def forward(self, x):
        return self.model(x)

    def reset(self):
        self.episode_actions = torch.tensor([], requires_grad=True).cuda()
        self.episode_rewards = []

    def save_checkpoint(self, directory, episode):
        if not os.path.exists(directory):
            os.makedirs(directory)
        filename = os.path.join(directory, 'checkpoint_{}.pth'.format(episode))
        torch.save(self.model.state_dict(), f=filename)
        print('Checkpoint saved to \'{}\''.format(filename))

    def load_checkpoint(self, directory, filename):
        self.model.load_state_dict(torch.load(os.path.join(directory, filename)))
        print('Resuming training from checkpoint \'{}\'.'.format(filename))
        return int(filename[11:-4])

    def backward(self):
        future_reward = 0
        rewards = []
        for r in self.episode_rewards[::-1]:
            future_reward = r + gamma * future_reward
            rewards.append(future_reward)
        rewards = torch.tensor(rewards[::-1], dtype=torch.float32).cuda()
        rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)
        loss = torch.sum(torch.mul(self.episode_actions, rewards).mul(-1))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.reset()


batch_size = 10
gamma = 0.95
load_filename = None
save_directory = "./MARIO_DQN"
batch_rewards = []
episode = 0

model = MarioSolver(learning_rate=0.00025)
if load_filename is not None:
    episode = model.load_checkpoint(save_directory, load_filename)
all_episode_rewards = []
all_mean_rewards = []
while True:
    observation = env.reset()
    done = False
    while not done:
        env.render()
        observation = torch.tensor(observation.__array__()).cuda().unsqueeze(0)
        distribution = Categorical(model.forward(observation))
        action = distribution.sample()
        observation, reward, done, _ = env.step(action.item())
        model.episode_actions = torch.cat([model.episode_actions, distribution.log_prob(action).reshape(1)])
        model.episode_rewards.append(reward)
        if done:
            all_episode_rewards.append(np.sum(model.episode_rewards))
            batch_rewards.append(np.sum(model.episode_rewards))
            model.backward()
            episode += 1
            if episode % batch_size == 0:
                print('Batch: {}, average reward: {}'.format(episode // batch_size, np.array(batch_rewards).mean()))
                batch_rewards = []
                all_mean_rewards.append(np.mean(all_episode_rewards[-batch_size:]))
                plt.plot(all_mean_rewards)
                plt.savefig("{}/mean_reward_{}.png".format(save_directory, episode))
                plt.clf()
            if episode % 50 == 0 and save_directory is not None:
                model.save_checkpoint(save_directory, episode)

Batch: 1, average reward: 766.1
Batch: 2, average reward: 438.3
Batch: 3, average reward: 748.3
Batch: 4, average reward: 787.1
Batch: 5, average reward: 725.7
Checkpoint saved to './MARIO_DQN\checkpoint_50.pth'
Batch: 6, average reward: 730.3
Batch: 7, average reward: 556.0
Batch: 8, average reward: 711.6
Batch: 9, average reward: 591.9
Batch: 10, average reward: 648.4
Checkpoint saved to './MARIO_DQN\checkpoint_100.pth'
Batch: 11, average reward: 903.8
Batch: 12, average reward: 600.7
Batch: 13, average reward: 705.5
Batch: 14, average reward: 570.4
Batch: 15, average reward: 596.2
Checkpoint saved to './MARIO_DQN\checkpoint_150.pth'
Batch: 16, average reward: 559.9
Batch: 17, average reward: 904.7
Batch: 18, average reward: 754.7
Batch: 19, average reward: 828.6
Batch: 20, average reward: 430.6
Checkpoint saved to './MARIO_DQN\checkpoint_200.pth'
Batch: 21, average reward: 508.3
Batch: 22, average reward: 626.3
Batch: 23, average reward: 528.3
Batch: 24, average reward: 670.4
Batch:

KeyboardInterrupt: 

<Figure size 432x288 with 0 Axes>

In [1]:
import os

from torchvision import transforms
import gym
from gym.spaces import Box
import gym_super_mario_bros
import numpy as np
import torch
from gym.wrappers import FrameStack
from nes_py.wrappers import JoypadSpace
from torch import nn
from torch.distributions import Categorical
import matplotlib.pyplot as plt
device = torch.device("cuda")


class SkipFrame(gym.Wrapper):
    def __init__(self, env, skip):
        super().__init__(env)
        self._skip = skip

    def step(self, action):
        total_reward = 0.0
        done = False
        for i in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            total_reward += reward
            if done:
                break
        return obs, total_reward, done, info


class GrayScaleObservation(gym.ObservationWrapper):
    def __init__(self, env):
        super().__init__(env)
        self.observation_space = Box(low=0, high=255, shape=self.observation_space.shape[:2], dtype=np.uint8)

    def observation(self, observation):
        transform = transforms.Grayscale()
        return transform(torch.tensor(np.transpose(observation, (2, 0, 1)).copy(), dtype=torch.float))


class ResizeObservation(gym.ObservationWrapper):
    def __init__(self, env, shape):
        super().__init__(env)
        self.shape = (shape, shape)
        obs_shape = self.shape + self.observation_space.shape[2:]
        self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)

    def observation(self, observation):
        transformations = transforms.Compose([transforms.Resize(self.shape), transforms.Normalize(0, 255)])
        return transformations(observation).squeeze(0)


env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
env = JoypadSpace(env, [["right"], ["right", "A"]])
env = FrameStack(ResizeObservation(GrayScaleObservation(SkipFrame(env, skip=4)), shape=84), num_stack=4)
env.seed(42)
env.action_space.seed(42)
torch.manual_seed(42)
torch.random.manual_seed(42)
np.random.seed(42)


class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.actor = nn.Sequential(
            nn.Conv2d(in_channels=4, out_channels=32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(3136, 512),
            nn.ReLU(),
            nn.Linear(512, env.action_space.n)
        )
        self.critic = nn.Sequential(
            nn.Conv2d(in_channels=4, out_channels=32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(3136, 512),
            nn.ReLU(),
            nn.Linear(512, 1)
        )

    def forward(self, obs):
        return Categorical(logits=self.actor(obs)), self.critic(obs).reshape(-1)


class PPOSolver:
    def __init__(self):
        self.rewards = []
        self.gamma = 0.95
        self.lamda = 0.95
        self.worker_steps = 4096
        self.n_mini_batch = 4
        self.epochs = 30
        self.save_directory = "./mario_ppo"
        self.batch_size = self.worker_steps
        self.mini_batch_size = self.batch_size // self.n_mini_batch
        self.obs = env.reset().__array__()
        self.policy = Model().to(device)
        self.mse_loss = nn.MSELoss()
        self.optimizer = torch.optim.Adam([
            {'params': self.policy.actor.parameters(), 'lr': 0.00025},
            {'params': self.policy.critic.parameters(), 'lr': 0.001}
        ], eps=1e-4)
        self.policy_old = Model().to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())
        self.all_episode_rewards = []
        self.all_mean_rewards = []
        self.episode = 0

    def save_checkpoint(self):
        filename = os.path.join(self.save_directory, 'checkpoint_{}.pth'.format(self.episode))
        torch.save(self.policy_old.state_dict(), f=filename)
        print('Checkpoint saved to \'{}\''.format(filename))

    def load_checkpoint(self, filename):
        self.policy.load_state_dict(torch.load(os.path.join(self.save_directory, filename)))
        self.policy_old.load_state_dict(torch.load(os.path.join(self.save_directory, filename)))
        print('Resuming training from checkpoint \'{}\'.'.format(filename))

    def sample(self):
        rewards = np.zeros(self.worker_steps, dtype=np.float32)
        actions = np.zeros(self.worker_steps, dtype=np.int32)
        done = np.zeros(self.worker_steps, dtype=bool)
        obs = np.zeros((self.worker_steps, 4, 84, 84), dtype=np.float32)
        log_pis = np.zeros(self.worker_steps, dtype=np.float32)
        values = np.zeros(self.worker_steps, dtype=np.float32)
        for t in range(self.worker_steps):
            with torch.no_grad():
                obs[t] = self.obs
                pi, v = self.policy_old(torch.tensor(self.obs, dtype=torch.float32, device=device).unsqueeze(0))
                values[t] = v.cpu().numpy()
                a = pi.sample()
                actions[t] = a.cpu().numpy()
                log_pis[t] = pi.log_prob(a).cpu().numpy()
            self.obs, rewards[t], done[t], _ = env.step(actions[t])
            self.obs = self.obs.__array__()
            env.render()
            self.rewards.append(rewards[t])
            if done[t]:
                self.episode += 1
                self.all_episode_rewards.append(np.sum(self.rewards))
                self.rewards = []
                env.reset()
                if self.episode % 10 == 0:
                    print('Episode: {}, average reward: {}'.format(self.episode, np.mean(self.all_episode_rewards[-10:])))
                    self.all_mean_rewards.append(np.mean(self.all_episode_rewards[-10:]))
                    plt.plot(self.all_mean_rewards)
                    plt.savefig("{}/mean_reward_{}.png".format(self.save_directory, self.episode))
                    plt.clf()
                    self.save_checkpoint()
        returns, advantages = self.calculate_advantages(done, rewards, values)
        return {
            'obs': torch.tensor(obs.reshape(obs.shape[0], *obs.shape[1:]), dtype=torch.float32, device=device),
            'actions': torch.tensor(actions, device=device),
            'values': torch.tensor(values, device=device),
            'log_pis': torch.tensor(log_pis, device=device),
            'advantages': torch.tensor(advantages, device=device, dtype=torch.float32),
            'returns': torch.tensor(returns, device=device, dtype=torch.float32)
        }

    def calculate_advantages(self, done, rewards, values):
        _, last_value = self.policy_old(torch.tensor(self.obs, dtype=torch.float32, device=device).unsqueeze(0))
        last_value = last_value.cpu().data.numpy()
        values = np.append(values, last_value)
        returns = []
        gae = 0
        for i in reversed(range(len(rewards))):
            mask = 1.0 - done[i]
            delta = rewards[i] + self.gamma * values[i + 1] * mask - values[i]
            gae = delta + self.gamma * self.lamda * mask * gae
            returns.insert(0, gae + values[i])
        adv = np.array(returns) - values[:-1]
        return returns, (adv - np.mean(adv)) / (np.std(adv) + 1e-8)

    def train(self, samples, clip_range):
        indexes = torch.randperm(self.batch_size)
        for start in range(0, self.batch_size, self.mini_batch_size):
            end = start + self.mini_batch_size
            mini_batch_indexes = indexes[start: end]
            mini_batch = {}
            for k, v in samples.items():
                mini_batch[k] = v[mini_batch_indexes]
            for _ in range(self.epochs):
                loss = self.calculate_loss(clip_range=clip_range, samples=mini_batch)
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
            self.policy_old.load_state_dict(self.policy.state_dict())

    def calculate_loss(self, samples, clip_range):
        sampled_returns = samples['returns']
        sampled_advantages = samples['advantages']
        pi, value = self.policy(samples['obs'])
        ratio = torch.exp(pi.log_prob(samples['actions']) - samples['log_pis'])
        clipped_ratio = ratio.clamp(min=1.0 - clip_range, max=1.0 + clip_range)
        policy_reward = torch.min(ratio * sampled_advantages, clipped_ratio * sampled_advantages)
        entropy_bonus = pi.entropy()
        vf_loss = self.mse_loss(value, sampled_returns)
        loss = -policy_reward + 0.5 * vf_loss - 0.01 * entropy_bonus
        return loss.mean()


solver = PPOSolver()
while True:
    solver.train(solver.sample(), 0.2)

Episode: 10, average reward: 667.5
Checkpoint saved to './mario_ppo\checkpoint_10.pth'
Episode: 20, average reward: 788.9000244140625
Checkpoint saved to './mario_ppo\checkpoint_20.pth'
Episode: 30, average reward: 585.5
Checkpoint saved to './mario_ppo\checkpoint_30.pth'
Episode: 40, average reward: 560.0999755859375
Checkpoint saved to './mario_ppo\checkpoint_40.pth'
Episode: 50, average reward: 517.2000122070312
Checkpoint saved to './mario_ppo\checkpoint_50.pth'
Episode: 60, average reward: 863.5
Checkpoint saved to './mario_ppo\checkpoint_60.pth'
Episode: 70, average reward: 682.5999755859375
Checkpoint saved to './mario_ppo\checkpoint_70.pth'
Episode: 80, average reward: 479.3999938964844
Checkpoint saved to './mario_ppo\checkpoint_80.pth'
Episode: 90, average reward: 743.4000244140625
Checkpoint saved to './mario_ppo\checkpoint_90.pth'
Episode: 100, average reward: 634.7999877929688
Checkpoint saved to './mario_ppo\checkpoint_100.pth'
Episode: 110, average reward: 694.7000122070

Episode: 850, average reward: 862.9000244140625
Checkpoint saved to './mario_ppo\checkpoint_850.pth'
Episode: 860, average reward: 594.9000244140625
Checkpoint saved to './mario_ppo\checkpoint_860.pth'
Episode: 870, average reward: 605.2000122070312
Checkpoint saved to './mario_ppo\checkpoint_870.pth'
Episode: 880, average reward: 711.5999755859375
Checkpoint saved to './mario_ppo\checkpoint_880.pth'
Episode: 890, average reward: 811.0999755859375
Checkpoint saved to './mario_ppo\checkpoint_890.pth'
Episode: 900, average reward: 1017.0
Checkpoint saved to './mario_ppo\checkpoint_900.pth'
Episode: 910, average reward: 841.0
Checkpoint saved to './mario_ppo\checkpoint_910.pth'
Episode: 920, average reward: 876.2000122070312
Checkpoint saved to './mario_ppo\checkpoint_920.pth'
Episode: 930, average reward: 976.7999877929688
Checkpoint saved to './mario_ppo\checkpoint_930.pth'
Episode: 940, average reward: 967.2999877929688
Checkpoint saved to './mario_ppo\checkpoint_940.pth'
Episode: 950,

KeyboardInterrupt: 

<Figure size 432x288 with 0 Axes>