In [None]:
# Mount Google Drive at /content/drive. Later cells save evaluation plots and
# model checkpoints under this mount point, so this cell has to run first.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#imports

import matplotlib.pyplot as plt
import numpy as np
import numpy
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import namedtuple
import math
import argparse
%pip install -U gym>=0.21.0
%pip install -U gym[atari,accept-rom-license]
import gym
from IPython import display
from gym import wrappers

In [None]:
#config.py

# Hyperparameters for the Pong experiment. The remaining cells read these
# entries by key, so the key names are part of the notebook's interface.
Pong = dict(
    obs_stack_size=4,              # copies of the first frame used to build the initial stack
    memory_size=50000,             # capacity handed to ReplayMemory
    n_episodes=10000,              # upper bound of the training loop
    batch_size=32,                 # transitions sampled per optimize() call
    target_update_frequency=1000,  # steps between target-network syncs
    train_frequency=4,             # steps between optimize() calls
    gamma=0.99,                    # discount applied to the bootstrapped next-state value
    lr=1e-4,                       # Adam learning rate
    eps_start=1.0,                 # initial exploration rate
    eps_end=0.01,                  # floor of the exploration rate
    anneal_length=10**6,           # stored on the DQN; not consulted by DQN.act()
    n_actions=2,  # 2 if we do action mapping, otherwise 6
)

In [None]:
#util

# Compute device shared by the whole notebook: CUDA when available, CPU otherwise.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)


def preprocess(obs, env):
    """Convert a raw observation into a float32 tensor on ``device``.

    Args:
        obs: Observation as returned by the environment (anything
            ``torch.tensor`` accepts).
        env: Environment id string; only ``'Pong-v0'`` is supported.

    Returns:
        A float32 tensor holding the observation, placed on ``device``.

    Raises:
        ValueError: If ``env`` is not a supported environment id.
    """
    supported_envs = ['Pong-v0']
    if env not in supported_envs:
        raise ValueError('Please add necessary observation preprocessing instructions to preprocess() in utils.py.')

    as_tensor = torch.tensor(obs, device=device)
    return as_tensor.to(torch.float32)

In [None]:
#dqn

#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class ReplayMemory:
    """Fixed-capacity ring buffer of ``(obs, action, next_obs, reward)`` tuples.

    Once ``capacity`` transitions are stored, each new transition overwrites
    the oldest one.
    """

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        # Index of the slot the next push() writes to.
        self.position = 0

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

    def push(self, obs, action, next_obs, reward):
        """Store one transition, evicting the oldest when the buffer is full."""
        entry = (obs, action, next_obs, reward)

        # While still filling up, grow the list by one placeholder slot so the
        # write below always targets an existing index.
        if len(self) < self.capacity:
            self.memory.append(None)

        self.memory[self.position] = entry
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """
        Draws batch_size distinct transitions uniformly at random and returns
        them transposed into a tuple
            (obs, action, next_obs, reward)
        where each element is itself a tuple of length batch_size.
        """
        drawn = random.sample(self.memory, batch_size)
        return tuple(zip(*drawn))


class DQN(nn.Module):
    """Convolutional Q-network with epsilon-greedy action selection.

    The network maps a stack of 4 frames to ``n_actions`` Q-values. Output
    index ``k`` corresponds to environment action ``k + 2`` (see ``act``).

    Args:
        env_config: Dict providing ``batch_size``, ``gamma``, ``eps_start``,
            ``eps_end``, ``anneal_length``, ``n_actions`` and ``n_episodes``.

    Note:
        ``eps_start`` doubles as the *current* exploration rate: ``act``
        decays it in place on every exploring call.
    """

    # Multiplicative per-call decay applied to the exploration rate in act().
    EPS_DECAY = 0.9999999

    def __init__(self, env_config):
        super(DQN, self).__init__()

        # Save hyperparameters needed in the DQN class.
        self.batch_size = env_config["batch_size"]
        self.gamma = env_config["gamma"]
        self.eps_start = env_config["eps_start"]
        self.eps_threshold = self.eps_start
        self.eps_end = env_config["eps_end"]
        self.anneal_length = env_config["anneal_length"]
        self.n_actions = env_config["n_actions"]

        # Episode bookkeeping read and updated by the training loop.
        self.total_episodes = env_config["n_episodes"]
        self.episodes_done = 0

        self.conv1 = nn.Conv2d(4, 32, kernel_size=8, stride=4, padding=0)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2, padding=0)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=0)
        # 3136 = 64 * 7 * 7, the flattened conv3 output for 84x84 inputs.
        self.fc1 = nn.Linear(3136, 512)
        self.fc2 = nn.Linear(512, self.n_actions)

        self.relu = nn.ReLU()
        self.flatten = nn.Flatten()

    def forward(self, x):
        """Compute Q-values for a batch of frame stacks.

        Args:
            x: Float tensor of shape ``[batch, 4, 84, 84]``.

        Returns:
            Tensor of shape ``[batch, n_actions]``.
        """
        x = self.relu(self.conv1(x))
        x = self.relu(self.conv2(x))
        x = self.relu(self.conv3(x))
        x = self.flatten(x)
        # The hidden layer needs its own non-linearity; without it fc1 and fc2
        # collapse into a single affine map. Checkpoints trained without this
        # activation will not reproduce their old outputs and must be retrained.
        x = self.relu(self.fc1(x))
        return self.fc2(x)

    def act(self, observation, exploit=False):
        """Select an action with an epsilon-greedy exploration strategy.

        Args:
            observation: Frame-stack tensor accepted by ``forward``.
            exploit: When True, always act greedily and leave the exploration
                schedule (and the ``random`` stream) untouched.

        Returns:
            An integer tensor of environment action ids (2 or 3): shape ``[1]``
            for a random action, shape ``[batch]`` for greedy actions.
        """
        if not exploit:
            # Decay only while exploring so evaluation rollouts do not consume
            # the schedule; never drop below the configured floor.
            self.eps_start = max(self.eps_start * self.EPS_DECAY, self.eps_end)
            if random.random() < self.eps_start:
                return torch.tensor(random.randint(2, 3)).unsqueeze(0).to(device)

        # Action selection is pure inference; no autograd graph is needed.
        with torch.no_grad():
            q_values = self(observation)
        # Shift network output indices {0, 1} to environment action ids {2, 3}.
        return torch.argmax(q_values, 1) + 2

def optimize(dqn, target_dqn, memory, optimizer):
    """Sample a batch from the replay buffer and take one Q-learning step.

    Args:
        dqn: Online Q-network; provides ``batch_size``, ``gamma`` and the
            Q-values being trained.
        target_dqn: Target Q-network used for bootstrapped next-state values.
        memory: Replay buffer supporting ``len()`` and ``sample(batch_size)``,
            which returns ``(obs, action, next_obs, reward)`` tuples of tensors.
        optimizer: Optimizer over ``dqn``'s parameters.

    Returns:
        The scalar MSE loss as a float, or ``None`` when the buffer holds
        fewer than ``dqn.batch_size`` transitions and no step is taken.
    """
    # If we don't have enough transitions stored yet, we don't train.
    if len(memory) < dqn.batch_size:
        return

    obs_sample, action_sample, next_obs_sample, reward_sample = memory.sample(dqn.batch_size)
    obs_batch = torch.cat(obs_sample).to(device)
    action_batch = torch.cat(action_sample).to(device)
    # Cast so the targets share the network's float32 dtype whatever the
    # dtype of the stored rewards.
    reward_batch = torch.cat(reward_sample).to(device).float()

    # A stored next observation with a channel count other than 4 marks a
    # terminal transition; only full stacks are bootstrapped from.
    non_terminal_next_obs = [s for s in next_obs_sample if s.size(1) == 4]
    is_non_terminal = torch.tensor([s.size(1) == 4 for s in next_obs_sample],
                                   device=device, dtype=torch.bool)

    # Q(s, a) for the actions actually taken. Stored actions are environment
    # ids (2 or 3); subtracting 2 maps them back to network output indices.
    action_indices = (action_batch - 2).unsqueeze(1)
    q_values = dqn(obs_batch).gather(1, action_indices).squeeze(1)

    # max_a' Q_target(s', a'), left at zero for terminal transitions. The
    # targets are constants for the loss, so no graph is built through the
    # target network. The guard covers a batch made only of terminal
    # transitions, where there is nothing to concatenate.
    best_next_state_values = torch.zeros(dqn.batch_size, device=device)
    if non_terminal_next_obs:
        with torch.no_grad():
            next_obs_batch = torch.cat(non_terminal_next_obs).to(device)
            best_next_state_values[is_non_terminal] = target_dqn(next_obs_batch).max(1)[0]
    q_value_targets = dqn.gamma * best_next_state_values + reward_batch

    # Compute loss.
    loss = F.mse_loss(q_values, q_value_targets)

    # Perform gradient descent.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss.item()

In [None]:
#evaluate

"""
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

parser = argparse.ArgumentParser()
parser.add_argument('--env', choices=['Pong'])
parser.add_argument('--path', type=str, help='Path to stored DQN model.')
parser.add_argument('--n_eval_episodes', type=int, default=1, help='Number of evaluation episodes.', nargs='?')
parser.add_argument('--render', dest='render', action='store_true', help='Render the environment.')
parser.add_argument('--save_video', dest='save_video', action='store_true', help='Save the episodes as video.')
parser.set_defaults(render=False)
parser.set_defaults(save_video=False)

# Hyperparameter configurations for different environments. See config.py.
ENV_CONFIGS = {
    'Pong-v0': Pong,
}
"""

def evaluate_policy(dqn, env, env_config, args, n_episodes, render=False, verbose=False):
    """Run ``n_episodes`` greedy episodes and return the mean episode return.

    Args:
        dqn: Agent exposing ``act(obs_stack, exploit=True)``.
        env: Environment with the 4-tuple ``step`` API
            ``(obs, reward, done, info)`` and ``reset()`` returning the
            first observation.
        env_config: Dict providing ``obs_stack_size``.
        args: Dict providing the environment id under ``"env"``.
        n_episodes: Number of episodes to average over; must be positive,
            otherwise the final division raises ``ZeroDivisionError``.
        render: When True, call ``env.render()`` before every step.
        verbose: Accepted for interface compatibility; currently unused.

    Returns:
        The sum of all episode returns divided by ``n_episodes``.
    """
    total_return = 0

    # Evaluation is pure inference, so skip autograd bookkeeping entirely.
    with torch.no_grad():
        for _ in range(n_episodes):
            # Start every episode from a stack made of copies of the first frame.
            obs = preprocess(env.reset(), env=args["env"]).unsqueeze(0)
            obs_stack = torch.cat(env_config['obs_stack_size'] * [obs]).unsqueeze(0).to(device)

            done = False
            episode_return = 0

            while not done:
                if render:
                    env.render()

                action = dqn.act(obs_stack, exploit=True).item()
                obs, reward, done, info = env.step(action)
                episode_return += reward

                # The terminal observation is never fed to the agent, so it is
                # only preprocessed while the episode is still running.
                if not done:
                    obs = preprocess(obs, env=args["env"]).unsqueeze(0)
                    # Drop the oldest frame and append the newest one.
                    obs_stack = torch.cat((obs_stack[:, 1:, ...], obs.unsqueeze(1)), dim=1).to(device)

            total_return += episode_return

    return total_return / n_episodes

In [None]:
#train

#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

"""
parser = argparse.ArgumentParser()
parser.add_argument('--env', choices=['Pong-v0'])
parser.add_argument('--evaluate_freq', type=int, default=25, help='How often to run evaluation.', nargs='?')
parser.add_argument('--evaluation_episodes', type=int, default=5, help='Number of evaluation episodes.', nargs='?')
"""

# Run-level settings (stand-in for the command-line arguments of the script version).
args = { "evaluate_freq": 25, "evaluation_episodes":5, "env":'Pong-v0' }

# Hyperparameter configurations for different environments. See config.py.
ENV_CONFIGS = {
    'Pong-v0': Pong
}

# Initialize environment and config.
env = gym.make(args["env"])
env_config = ENV_CONFIGS[args["env"]]

# 84x84 grayscale frames scaled to floats; frame stacking is done manually below.
env = gym.wrappers.AtariPreprocessing(env, screen_size=84, grayscale_obs=True, frame_skip=1, noop_max=30, scale_obs=True)

# Initialize deep Q-networks.
dqn = DQN(env_config=env_config).to(device)
# dqn = torch.load('drive/MyDrive/Pong/Pong-v0_best.pt', map_location=torch.device(device))

# The target network must start as an exact copy of the online network;
# a second randomly initialized network would supply unrelated bootstrap
# targets until the first sync.
target_dqn = DQN(env_config=env_config).to(device)
target_dqn.load_state_dict(dqn.state_dict())

# Create replay memory.
memory = ReplayMemory(env_config['memory_size'])

# Initialize optimizer used for training the DQN. We use Adam rather than RMSProp.
optimizer = torch.optim.Adam(dqn.parameters(), lr=env_config['lr'])

# Keep track of best evaluation mean return achieved so far.
best_mean_return = -float("Inf")
mean_return_history = []

# Environment steps taken across all episodes. Training and target syncs are
# scheduled on this global counter so their cadence does not depend on
# episode length (a per-episode counter never reaches the sync period in
# short episodes).
total_steps = 0

for episode in range(dqn.episodes_done, dqn.total_episodes):

    done = False

    # Diagnostic only: DQN.act() decays and reads dqn.eps_start itself and
    # never consults eps_threshold.
    dqn.eps_threshold = dqn.eps_end + (dqn.eps_start - dqn.eps_end) * math.exp(-10 * dqn.episodes_done / dqn.total_episodes)
    print(dqn.eps_threshold)

    # Start the episode from a stack made of copies of the first frame.
    obs = preprocess(env.reset(), env=args["env"]).unsqueeze(0)
    obs_stack = torch.cat(env_config['obs_stack_size'] * [obs]).unsqueeze(0).to(device)

    while not done:
        action = dqn.act(obs_stack)

        # Act in the true environment. The environment gets a plain int, as in
        # evaluate_policy(); the tensor form is kept for the replay memory.
        obs, reward, done, info = env.step(action.item())
        total_steps += 1

        # Preprocess incoming observation.
        old_obs_stack = obs_stack
        if not done:
            obs = preprocess(obs, env=args["env"]).unsqueeze(0)
            obs_stack = torch.cat((obs_stack[:, 1:, ...], obs.unsqueeze(1)), dim=1).to(device)
        else:
            # A 3-frame stack marks the transition as terminal; optimize()
            # recognises it by its channel count and skips bootstrapping.
            obs_stack = obs_stack[:, 1:, ...]

        # Add the transition to the replay memory.
        reward = torch.tensor(reward).unsqueeze(0)
        memory.push(old_obs_stack, action, obs_stack, reward)

        # Train every env_config["train_frequency"] environment steps.
        if total_steps % env_config["train_frequency"] == 0:
            optimize(dqn, target_dqn, memory, optimizer)

        # Sync the target network every env_config["target_update_frequency"] steps.
        if total_steps % env_config["target_update_frequency"] == 0:
            target_dqn.load_state_dict(dqn.state_dict())

    # Evaluate the current agent.
    if episode % args["evaluate_freq"] == 0:
        print(dqn.eps_start)
        mean_return = evaluate_policy(dqn, env, env_config, args, n_episodes=args["evaluation_episodes"], verbose=True)
        mean_return_history.append(mean_return)

        f = plt.figure()
        plt.plot(range(0, len(mean_return_history) * args["evaluate_freq"], args["evaluate_freq"]), mean_return_history)
        plt.xlabel("Episode")
        plt.ylabel("Mean return")
        plt.title("Mean return over episodes")
        f.savefig(f'/content/drive/My Drive/Reinforcement_Learning_Atari/models_Pong/{args["env"]}_model_tuf_{env_config["target_update_frequency"]}.png')
        # One figure is created per evaluation; close it once saved so open
        # figures do not pile up over the run. The plot lives in the saved PNG.
        plt.close(f)

        print(f'Episode {episode}/{env_config["n_episodes"]}: {mean_return}')

        # Save current agent if it has the best performance so far.
        if mean_return >= best_mean_return:
            best_mean_return = mean_return

            print('Best performance so far! Saving model.')
            torch.save(dqn, f'temp_best.pt')
            torch.save(dqn, f'/content/drive/My Drive/Reinforcement_Learning_Atari/models_Pong/{args["env"]}_best.pt')

    dqn.episodes_done += 1

# Close environment after training is completed.
env.close()

In [None]:

# Play one greedy episode with the saved agent, rendering every frame inline.
args = { "evaluate_freq": 25, "evaluation_episodes":5, "env":'Pong-v0' }

# Hyperparameter configurations for different environments. See config.py.
ENV_CONFIGS = {
    'Pong-v0': Pong
}
env_config = ENV_CONFIGS["Pong-v0"]

# NOTE: the checkpoint is a pickled module written by torch.save(dqn, ...);
# unpickling executes code, so only load files you produced yourself.
dqn = torch.load('drive/MyDrive/Reinforcement_Learning_Atari/models_Pong/Pong-v0_best.pt', map_location=torch.device(device))

# Same preprocessing as in training so the network sees identical inputs.
env = gym.make('Pong-v0')
env = gym.wrappers.AtariPreprocessing(env, screen_size=84, grayscale_obs=True, frame_skip=1, noop_max=30, scale_obs=True)

done = False
# Start from a stack made of copies of the first frame.
obs = preprocess(env.reset(), env=args["env"]).unsqueeze(0)
obs_stack = torch.cat(env_config['obs_stack_size'] * [obs]).unsqueeze(0).to(device)

# Playback is pure inference; no autograd graph is needed.
with torch.no_grad():
    while not done:
        frame = env.render(mode="rgb_array")
        plt.imshow(frame)
        plt.show()
        display.clear_output(wait=True)

        action = dqn.act(obs_stack, exploit=True)
        print(action)

        # The environment gets a plain int, as in evaluate_policy().
        obs, reward, done, info = env.step(action.item())
        if not done:
            obs = preprocess(obs, env=args["env"]).unsqueeze(0)
            # Drop the oldest frame and append the newest one.
            obs_stack = torch.cat((obs_stack[:, 1:, ...], obs.unsqueeze(1)), dim=1).to(device)

env.close()