In [3]:
# Mount Google Drive into this Colab runtime at /content/drive.
# This only works inside Google Colab, where the google.colab package is available.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
#imports
from IPython.display import clear_output
import matplotlib.pyplot as plt
import numpy as np
import numpy
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import namedtuple
import math
import argparse
%pip install -U gym>=0.21.0
%pip install -U gym[atari,accept-rom-license]
import gym


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting autorom[accept-rom-license]~=0.4.2
  Downloading AutoROM-0.4.2-py3-none-any.whl (16 kB)
Collecting ale-py~=0.7.5
  Downloading ale_py-0.7.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 33.3 MB/s 
Collecting AutoROM.accept-rom-license
  Downloading AutoROM.accept-rom-license-0.4.2.tar.gz (9.8 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Building wheels for collected packages: AutoROM.accept-rom-license
  Building wheel for AutoROM.accept-rom-license (PEP 517) ... [?25l[?25hdone
  Created wheel for AutoROM.accept-rom-license: filename=AutoROM.accept_rom_license-0.4.2-py3-none-any.whl size=441027 sha256=a62e851ec92bf582d932b17cff65194987db08c4b58d575a1f1506997de629fb
  Stored in directory:

  f"Custom namespace `{spec.namespace}` is being overridden "


In [4]:
#config.py

"""
In this file, you may edit the hyperparameters used for different environments.
memory_size: Maximum size of the replay memory.
n_episodes: Number of episodes to train for.
batch_size: Batch size used for training DQN.
target_update_frequency: How often to update the target network.
train_frequency: How often to train the DQN.
gamma: Discount factor.
lr: Learning rate used for optimizer.
eps_start: Starting value for epsilon (linear annealing).
eps_end: Final value for epsilon (linear annealing).
anneal_length: How many steps to anneal epsilon for.
n_actions: The number of actions can easily be accessed with env.action_space.n, but we do
    some manual engineering to account for the fact that Pong has duplicate actions.
"""

# Hyperparameters for CartPole-v0
CartPole = {
    'memory_size': 50000,
    'n_episodes': 10000,
    'batch_size': 32,
    'target_update_frequency': 100,
    'train_frequency': 1,
    'gamma': 0.95,
    'lr': 1e-4,
    'eps_start': 1.0,
    'eps_end': 0.05,
    'anneal_length': 10**4,
    'n_actions': 2,
}

# Hyperparameters for Pong-v0.
# Trailing comments on individual entries record the previously used value.
Pong = {
    # Number of frames kept in the observation stack (Pong only); preprocess() repeats
    # the first frame of an episode this many times to build the initial stack.
    'obs_stack_size': 4,
    'memory_size': 10000,
    'n_episodes': 200,  # 10000,
    'batch_size': 128, # 32,
    'target_update_frequency': 5000, # 1000,
    'train_frequency': 4,
    'gamma': 0.99,
    'lr': 1e-4,
    'eps_start': 1.0,
    'eps_end': 0.01,
    'anneal_length': 10**6,
    'n_actions': 2,  # 2 if we do action mapping, otherwise 6
}

In [5]:
#util.py

# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def preprocess(obs, env):
    """Turn the first raw observation of an episode into the tensor the agent consumes.

    Args:
        obs: Observation returned by ``env.reset()``.
        env: Environment id string selecting the preprocessing recipe.

    Returns:
        'CartPole-v0': ``obs`` as a float tensor on ``device``.
        'Pong-v0': the frame with a new leading axis, repeated
        ``Pong["obs_stack_size"]`` times along that axis (the initial frame stack).

    Raises:
        ValueError: if no preprocessing is defined for ``env``.
    """
    if env == 'CartPole-v0':
        return torch.tensor(obs, device=device).float() #.unsqueeze(0)

    if env == 'Pong-v0':
        # Add a leading "frame" axis so copies of the frame can be concatenated along it.
        first_frame = torch.tensor(obs, device=device).float().unsqueeze(0)
        stack_size = Pong["obs_stack_size"]
        # An episode starts with the same frame in every slot of the stack.
        return torch.cat([first_frame] * stack_size).to(device)

    raise ValueError('Please add necessary observation preprocessing instructions to preprocess() in utils.py.')

def add_new_obs_to_stack(obs, obs_stack, env):
    """Slide the frame stack forward by one frame.

    The oldest frame (index 0) is dropped and ``obs`` is appended as the newest frame,
    so the returned stack has the same shape as ``obs_stack``. ``obs_stack`` itself is
    not modified.

    Args:
        obs: New frame (array-like) with the shape of a single frame of ``obs_stack``.
        obs_stack: Tensor of stacked frames; the leading axis indexes frames, oldest first.
        env: Environment id string; only 'Pong-v0' uses frame stacking.

    Returns:
        A new float tensor on the same device as ``obs_stack``.

    Raises:
        ValueError: if frame stacking is not defined for ``env`` (previously this case
            silently returned ``None``, which only failed later in the caller).
    """
    if env not in ['Pong-v0']:
        raise ValueError(f'Frame stacking is not defined for environment {env!r} in add_new_obs_to_stack().')

    # Create the new frame directly on the stack's device so torch.cat never has to
    # mix devices, whatever the module-level default device is.
    new_frame = torch.as_tensor(obs, device=obs_stack.device).float().unsqueeze(0)
    # Drop the oldest frame and append the newest one.
    return torch.cat((obs_stack[1:], new_frame), dim=0)

In [6]:
#dqn.py

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Store Transitions
class ReplayMemory:
    """Fixed-capacity ring buffer of (obs, action, next_obs, reward) transitions.

    Once ``capacity`` transitions are stored, each new transition overwrites the
    oldest one.
    """

    def __init__(self, capacity):
        # Maximum number of transitions kept at any time.
        self.capacity = capacity
        # Backing list; grows until it holds `capacity` entries.
        self.memory = []
        # Index of the slot the next push() writes to.
        self.position = 0

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

    def push(self, obs, action, next_obs, reward):
        """Store one transition, overwriting the oldest entry when the buffer is full."""
        transition = (obs, action, next_obs, reward)
        slot = self.position

        # While the buffer is still filling up, grow it by one placeholder slot first.
        if len(self.memory) < self.capacity:
            self.memory.append(None)

        self.memory[slot] = transition
        # Advance the write cursor, wrapping around at capacity.
        self.position = (slot + 1) % self.capacity

    def sample(self, batch_size):
        """
        Draws batch_size distinct stored transitions uniformly at random and returns
        them transposed into a tuple of per-field tuples
            (obs, action, next_obs, reward)
        """
        return tuple(zip(*random.sample(self.memory, batch_size)))


#steps_done = 0

class DQN(nn.Module):
    """Convolutional deep Q-network for stacked 84x84 frames (Pong).

    ``forward`` returns one row of Q-values per observation, i.e. a
    ``[batch, n_actions]`` tensor. Network output index ``i`` corresponds to the
    environment action ``i + action_offset`` (for Pong with ``n_actions == 2`` the
    outputs map to env actions 2 and 3).

    Config keys read from ``env_config``: ``batch_size``, ``gamma``, ``eps_start``,
    ``eps_end``, ``anneal_length``, ``n_actions`` and, optionally,
    ``obs_stack_size`` (default 4) and ``action_offset`` (default 2).
    """

    # Side length of the square input frames; the size of fc1 (3136 = 64 * 7 * 7)
    # follows from it.
    FRAME_SIZE = 84
    # Class-level fallbacks so that model objects pickled before these attributes
    # existed keep working after being reloaded with torch.load().
    obs_stack_size = 4
    action_offset = 2

    def __init__(self, env_config):
        super(DQN, self).__init__()

        # Save hyperparameters needed in the DQN class.
        self.batch_size = env_config["batch_size"]
        self.gamma = env_config["gamma"]
        self.eps_start = env_config["eps_start"]
        self.eps_end = env_config["eps_end"]
        self.anneal_length = env_config["anneal_length"]
        self.n_actions = env_config["n_actions"]
        self.obs_stack_size = env_config.get("obs_stack_size", 4)
        self.action_offset = env_config.get("action_offset", 2)
        # Number of exploratory act() calls so far; drives the epsilon schedule.
        self.steps_done = 0

        # Only the layers used by forward() are created. (An unused duplicate copy of
        # the network used to be registered as well, which doubled the parameter count.)
        self.conv1 = nn.Conv2d(self.obs_stack_size, 32, kernel_size=8, stride=4, padding=0)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2, padding=0)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=0)
        self.fc1 = nn.Linear(3136, 512)
        self.fc2 = nn.Linear(512, self.n_actions)

        self.relu = nn.ReLU()
        self.flatten = nn.Flatten()

    def forward(self, x):
        """Compute Q-values.

        Args:
            x: A single frame stack ``[obs_stack_size, 84, 84]`` or any tensor that
                reshapes to ``[batch, obs_stack_size, 84, 84]``.

        Returns:
            Tensor of shape ``[batch, n_actions]``; row ``i`` holds the Q-values of
            observation ``i``.
        """
        x = x.reshape(-1, self.obs_stack_size, self.FRAME_SIZE, self.FRAME_SIZE)
        x = self.relu(self.conv1(x))
        x = self.relu(self.conv2(x))
        x = self.relu(self.conv3(x))
        x = self.flatten(x)
        # The hidden layer needs its non-linearity; without it fc1 and fc2 collapse
        # into a single linear map.
        x = self.relu(self.fc1(x))
        return self.fc2(x)

    def epsilon(self):
        """Current exploration rate.

        Linearly annealed from ``eps_start`` to ``eps_end`` over ``anneal_length``
        exploratory steps and constant at ``eps_end`` afterwards.
        """
        if self.anneal_length > 0:
            progress = min(1.0, self.steps_done / self.anneal_length)
        else:
            progress = 1.0
        return self.eps_start + progress * (self.eps_end - self.eps_start)

    def act(self, observation, exploit=False):
        """Selects an action with an epsilon-greedy exploration strategy.

        Args:
            observation: A frame stack (or batch of frame stacks) accepted by forward().
            exploit: If True, always act greedily and leave the epsilon schedule
                untouched (used for evaluation).

        Returns:
            1-D int64 CPU tensor with one *environment* action id per observation
            (network output index + ``action_offset``). For a single frame stack the
            shape is ``[1]``, so ``.item()`` works.
        """
        n_obs = observation.reshape(
            -1, self.obs_stack_size, self.FRAME_SIZE, self.FRAME_SIZE).shape[0]

        explore = False
        if not exploit:
            explore = random.random() <= self.epsilon()
            self.steps_done += 1

        if explore:
            indices = torch.tensor(
                [random.randrange(self.n_actions) for _ in range(n_obs)], dtype=torch.long)
        else:
            # Action selection never needs gradients.
            with torch.no_grad():
                indices = self.forward(observation).argmax(dim=1).cpu()

        return indices + self.action_offset


def optimize(dqn, target_dqn, memory, optimizer):
    """This function samples a batch from the replay buffer and optimizes the Q-network.

    Each sampled transition is ``(obs, action, next_obs, reward)`` where ``obs`` is a
    frame-stack tensor, ``action`` is the 1-element tensor returned by ``DQN.act``
    (an environment action id) and ``reward`` is a 1-element tensor. A transition is
    treated as terminal when its ``next_obs`` is not a ``torch.Tensor`` (for example
    ``None`` or a raw numpy frame); terminal transitions get a bootstrap value of 0.

    Returns:
        The scalar MSE loss as a float, or ``None`` if the memory does not yet hold
        ``dqn.batch_size`` transitions (no update is performed in that case).
    """
    # If we don't have enough transitions stored yet, we don't train.
    if len(memory) < dqn.batch_size:
        return

    # Run everything on whatever device the online network lives on.
    model_device = next(dqn.parameters()).device

    Transition = namedtuple('Transition',
                            ('obs', 'action', 'next_obs', 'reward'))
    batch = Transition(*memory.sample(dqn.batch_size))

    state_batch = torch.stack(batch.obs).to(model_device)
    action_batch = torch.cat(batch.action).to(model_device)
    reward_batch = torch.cat(batch.reward).to(device=model_device, dtype=torch.float32)

    # Stored actions are environment action ids; map them back to network output indices.
    action_index = (action_batch - dqn.action_offset).long().view(-1, 1)

    is_non_final = [isinstance(s, torch.Tensor) for s in batch.next_obs]
    non_final_mask = torch.tensor(is_non_final, device=model_device, dtype=torch.bool)

    # Q(s, a) of the actions that were actually taken: pick one column per row.
    q_values = dqn(state_batch).gather(1, action_index).squeeze(1)

    # max_a' Q_target(s', a') for non-terminal next states, 0 for terminal ones.
    next_state_values = torch.zeros(dqn.batch_size, device=model_device)
    if any(is_non_final):
        non_final_next_states = torch.stack(
            [s for s, keep in zip(batch.next_obs, is_non_final) if keep]).to(model_device)
        with torch.no_grad():
            next_state_values[non_final_mask] = target_dqn(non_final_next_states).max(1)[0]

    # Compute the expected Q values
    q_value_targets = reward_batch + dqn.gamma * next_state_values
    # Compute loss.
    loss = F.mse_loss(q_values, q_value_targets)
    # Perform gradient descent.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()

In [9]:
#train.py

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE: this cell calls preprocess(), add_new_obs_to_stack(), optimize() and
# evaluate_policy(); the cells defining them must have been executed first
# (evaluate_policy lives in the "#evaluate.py" cell).

# Run settings. The script version of this code exposed them as the command-line
# options --env, --evaluate_freq and --evaluation_episodes; in the notebook a plain
# dict is used instead.
args = { "evaluate_freq": 25, "evaluation_episodes":5, "env":'Pong-v0' }

# Hyperparameter configurations for different environments. See config.py.
ENV_CONFIGS = {
    'CartPole-v0': CartPole,
    'Pong-v0' : Pong,
}

# Initialize environment and config (the config follows the selected environment).
env = gym.make(args["env"]).unwrapped
env_config = ENV_CONFIGS[args["env"]]

#Preprocessing the environment and scaling observations to [0,1]
env  =  gym.wrappers.AtariPreprocessing (env, screen_size=84 , grayscale_obs=True , frame_skip=1 , noop_max=30, scale_obs=True )

# Initialize deep Q-networks.
dqn = DQN(env_config=env_config).to(device)
target_dqn = DQN(env_config=env_config).to(device)
target_dqn.load_state_dict(dqn.state_dict())
target_dqn.eval()

# Create replay memory.
memory = ReplayMemory(env_config['memory_size'])

# Initialize optimizer used for training the DQN. We use Adam rather than RMSProp.
optimizer = torch.optim.Adam(dqn.parameters(), lr=env_config['lr'])

# Keep track of best evaluation mean return achieved so far.
best_mean_return = -float("Inf")

mean_return_history = []

# Environment steps taken across ALL episodes. The training and target-update
# schedules are driven by this counter: a per-episode counter rarely (or never)
# reaches target_update_frequency, so the target network would not get refreshed.
total_steps = 0

try:
    for episode in range(env_config['n_episodes']):
        print("Episode ",episode)
        done = False
        obs_stack = preprocess(env.reset(), env=args["env"])

        while not done:
            total_steps += 1
            action = dqn.act(obs_stack)

            # Act in the true environment.
            obs_stack_old = obs_stack
            obs, reward, done, info = env.step(action.item())
            if (reward > 0):
              print("reward: ", reward)

            # Preprocess incoming observation.
            if not done:
                obs_stack = add_new_obs_to_stack(obs, obs_stack, env=args["env"])
                next_obs = obs_stack
            else:
                # Terminal transition: store the raw numpy frame instead of a tensor.
                # optimize() recognises a non-tensor next_obs as "no next state" and
                # does not bootstrap from it.
                next_obs = obs

            # Add the transition to the replay memory.
            reward = torch.tensor([reward], device=device, dtype=torch.float32)
            memory.push(obs_stack_old, action, next_obs, reward)

            # Run optimize() every env_config["train_frequency"] steps.
            if total_steps % env_config["train_frequency"] == 0:
                optimize(dqn, target_dqn, memory, optimizer)

            # Update the target network every env_config["target_update_frequency"] steps.
            if total_steps % env_config["target_update_frequency"] == 0:
                target_dqn.load_state_dict(dqn.state_dict())

        # Evaluate the current agent.
        if episode % args["evaluate_freq"] == 0:
            print("saving before eval...............................................")
            # The whole module object is saved (not just a state_dict) because the
            # replay cell at the end of the notebook loads it back with torch.load().
            torch.save(dqn, f'{args["env"]}_preval.pt')
            mean_return = evaluate_policy(dqn, env, env_config, args, n_episodes=args["evaluation_episodes"],verbose = True)
            mean_return_history.append(mean_return)

            print(f'Episode {episode}/{env_config["n_episodes"]}: {mean_return}')

            # Save current agent if it has the best performance so far.
            if mean_return >= best_mean_return:
                best_mean_return = mean_return

                print('Best performance so far! Saving model.')
                torch.save(dqn, f'{args["env"]}_best.pt')
finally:
    # Close the environment when training completes, fails, or is interrupted.
    env.close()

  f"The environment {id} is out of date. You should consider "


Episode  0
saving before eval...............................................
This is episode number 0
Evaluated episode 1 with a total return of -21.0
This is episode number 1
Evaluated episode 2 with a total return of -21.0
This is episode number 2
Evaluated episode 3 with a total return of -21.0
This is episode number 3
Evaluated episode 4 with a total return of -21.0
This is episode number 4
Evaluated episode 5 with a total return of -21.0
Episode 0/200: -21.0
Best performance so far! Saving model.
Episode  1
Episode  2
Episode  3
Episode  4
Episode  5
Episode  6
Episode  7
Episode  8
Episode  9
Episode  10
Episode  11
Episode  12
Episode  13
Episode  14
Episode  15
Episode  16
Episode  17
Episode  18
Episode  19
Episode  20
Episode  21
Episode  22
Episode  23
Episode  24
Episode  25
saving before eval...............................................
This is episode number 0
Evaluated episode 1 with a total return of -21.0
This is episode number 1
Evaluated episode 2 with a total retu

In [8]:
#evaluate.py

def evaluate_policy(dqn, env, env_config, args, n_episodes, render=False, verbose=False):
    """Runs {n_episodes} episodes to evaluate current policy.

    Args:
        dqn: Agent exposing ``act(obs_stack, exploit=True)``; actions are taken greedily.
        env: Environment using the classic Gym API (``reset()`` returns the observation,
            ``step()`` returns ``(obs, reward, done, info)``).
        env_config: Hyperparameter dict of the environment (currently not read here).
        args: Run settings; ``args["env"]`` selects the observation preprocessing.
        n_episodes: Number of evaluation episodes to run.
        render: If True, draw every frame inline with matplotlib.
        verbose: If True, print the return of every episode.

    Returns:
        The mean undiscounted return over the evaluated episodes.
    """
    if render:
        # Imported here so rendering does not depend on a later notebook cell having
        # imported `display` (the old `display.clear_output` raised NameError otherwise).
        from IPython.display import clear_output

    total_return = 0

    # Evaluation never back-propagates, so skip building autograd graphs.
    with torch.no_grad():
        for i in range(n_episodes):
            obs_stack = preprocess(env.reset(), env=args["env"])
            done = False
            episode_return = 0

            while not done:
                if render:
                    frame = env.render(mode="rgb_array")
                    plt.imshow(frame)
                    plt.show()
                    clear_output(wait=True)
                action = dqn.act(obs_stack, exploit=True)

                obs, reward, done, info = env.step(action.item())

                if reward>0:
                    print("reward: ", reward)
                # The stack is only advanced while the episode is still running.
                if not done:
                    obs_stack = add_new_obs_to_stack(obs, obs_stack, env=args["env"])
                episode_return += reward

            total_return += episode_return

            if verbose:
                print(f'Evaluated episode {i+1} with a total return of {episode_return}')

    return total_return / n_episodes

In [None]:
import matplotlib.pyplot as plt
from IPython import display
from gym import wrappers

In [None]:
# Replay one rendered evaluation episode with the checkpoint saved before evaluation.
# NOTE: torch.load unpickles a complete model object here; only load checkpoint files
# you created yourself, since unpickling can execute arbitrary code.
dqn = torch.load(
    'Pong-v0_preval.pt',
    map_location=torch.device(device),
)

# Same observation pipeline as in training: 84x84 grayscale frames scaled to [0, 1].
env = gym.wrappers.AtariPreprocessing(
    gym.make('Pong-v0'),
    screen_size=84,
    grayscale_obs=True,
    frame_skip=1,
    noop_max=30,
    scale_obs=True,
)
env.reset()

env_config = ENV_CONFIGS["Pong-v0"]
args = dict(evaluate_freq=25, evaluation_episodes=5, env='Pong-v0')
evaluate_policy(dqn, env, env_config, args, 1, render=True, verbose=True)

KeyboardInterrupt: ignored