<a href="https://colab.research.google.com/github/yeye-cyber/DL-project/blob/master/problem_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip3 install gym[box2d]




Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:

from torch.nn.modules.linear import Linear
from torch.nn.modules.activation import ReLU
import numpy as np
import gym
import torch
import matplotlib.pyplot as plt
from tqdm import trange
from collections import deque, namedtuple
import torch
import torch.nn as nn
import torch.optim as optim
import random



class Agent(object):
    ''' Base agent class, used as a parent class

        Args:
            n_actions (int): number of actions

        Attributes:
            n_actions (int): where we store the number of actions
            last_action (int): last action taken by the agent
    '''
    def __init__(self, n_actions: int):
        self.n_actions = n_actions
        self.last_action = None

    def forward(self, state: np.ndarray):
        ''' Performs a forward computation '''
        pass

    def backward(self):
        ''' Performs a backward pass on the network '''
        pass


class RandomAgent(Agent):
    ''' Agent taking actions uniformly at random, child of the class Agent'''
    def __init__(self, n_actions: int):
        super(RandomAgent, self).__init__(n_actions)

    def forward(self, state: np.ndarray) -> int:
        ''' Compute an action uniformly at random across n_actions possible
            choices

            Returns:
                action (int): the random action
        '''
        self.last_action = np.random.randint(0, self.n_actions)
        return self.last_action


class DQN(nn.Module):
      def __init__(self, input_size, output_size):
        super().__init__()
        self.network = nn.Sequential(
            nn.Linear(input_size, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, output_size)
        )
      def forward(self, state):
        ''' Performs a forward computation '''
        return self.network(state)
        
   
class DQNAgent(object):
    ''' Base agent class, used as a parent class

        Args:
            n_actions (int): number of actions

        Attributes:
            n_actions (int): where we store the number of actions
            last_action (int): last action taken by the agent
    '''
    def __init__(self, n_actions: int):
        self.n_actions = n_actions
        self.last_action = None

    def forward(self, state: np.ndarray):
        ''' Performs a forward computation '''
        pass

    def backward(self):
        ''' Performs a backward pass on the network '''
        pass
        
        


Experience = namedtuple('Experience',
                        ['state', 'action', 'reward', 'next_state', 'done'])

class ExperienceReplayBuffer(object):
    """ Class used to store a buffer containing experiences of the RL agent.
    """
    def __init__(self, maximum_length):
        # Create buffer of maximum length
        self.buffer = deque(maxlen=maximum_length)

    def append(self, experience):
        # Append experience to the buffer
        self.buffer.append(experience)

    def __len__(self):
        # overload len operator
        return len(self.buffer)

    def sample_batch(self, n):
        """ Function used to sample experiences from the buffer.
            returns 5 lists, each of size n. Returns a list of state, actions,
            rewards, next states and done variables.
        """
        # If we try to sample more elements that what are available from the
        # buffer we raise an error
        if n > len(self.buffer):
            raise IndexError('Tried to sample too many elements from the buffer!')

        # Sample without replacement the indices of the experiences
        # np.random.choice takes 3 parameters: number of elements of the buffer,
        # number of elements to sample and replacement.
        indices = np.random.choice(
            len(self.buffer),
            size=n,
            replace=False
        )

        # Using the indices that we just sampled build a list of chosen experiences
        batch = [self.buffer[i] for i in indices]

        # batch is a list of size n, where each element is an Experience tuple
        # of 5 elements. To convert a list of tuples into
        # a tuple of list we do zip(*batch). In this case this will return a
        # tuple of 5 elements where each element is a list of n elements.
        return zip(*batch)

def running_average(x, N):
    ''' Function used to compute the running average
        of the last N elements of a vector x
    '''
    if len(x) >= N:
        y = np.copy(x)
        y[N-1:] = np.convolve(x, np.ones((N, )) / N, mode='valid')
    else:
        y = np.zeros_like(x)
    return y

def linear_decay(eps_min, eps_max, k, N_episodes):
  Z = N_episodes*0.9
  return max(eps_min,eps_max - ((eps_max-eps_min)*(k-1))/(Z-1))


# Import and initialize the discrete Lunar Laner Environment
env = gym.make('LunarLander-v2')
env.reset()

# Parameters
N_episodes = 1000                         # Number of episodes
discount_factor = 0.99                    # Value of the discount factor
n_ep_running_average = 50                    # Running average of 50 episodes
n_actions = env.action_space.n               # Number of available actions
dim_state = len(env.observation_space.high)  # State dimensionality
eps_max = 0.99
eps_min = 0.05
N = 64
lr=0.001
buffer_size = 20000
target_update = round(buffer_size/N)
# We will use these variables to compute the average episodic reward and
# the average number of steps per episode
episode_reward_list = []       # this list contains the total reward per episode
episode_number_of_steps = []   # this list contains the number of steps per episode

buffer = ExperienceReplayBuffer(maximum_length=buffer_size)
#print(dim_state)
#print(n_actions)
# Random agent initialization
agent_random = RandomAgent(n_actions)
agent_DQN = DQNAgent(n_actions)

main_network = DQN(dim_state, n_actions)
target_network = DQN(dim_state, n_actions)
target_network.load_state_dict(main_network.state_dict())
optimizer = optim.Adam(main_network.parameters(), lr)

### Training process

# trange is an alternative to range in python, from the tqdm library
# It shows a nice progression bar that you can update with useful information
EPISODES = trange(N_episodes, desc='Episode: ', leave=True)

for i in EPISODES:
  
    # Reset enviroment data and initialize variables
    done = False
    state = env.reset()
    total_episode_reward = 0
    t = 0
    eps = linear_decay(eps_min, eps_max, i+1, N_episodes)
    while not done:
        
        # Take epsilon-greedy action
        if random.random() < eps:
          action = agent_random.forward(state)
        else:
           state_tensor = torch.tensor([state],
                                    requires_grad=False,
                                    dtype=torch.float32)
           values = main_network(state_tensor)
           action = values.max(1)[1].item()

        
        # Get next state and reward.  The done variable
        # will be True if you reached the goal position,
        # False otherwise
        next_state, reward, done, _ = env.step(action)
        exp = Experience(state, action, reward, next_state, done)
        buffer.append(exp)
        if len(buffer) >= buffer_size *0.3:

          states, actions, rewards, next_states, dones = buffer.sample_batch(
                N)
          actions = torch.tensor([actions],
                            requires_grad=False,
                            dtype=torch.int64)
          values = main_network(torch.tensor(states,
                            requires_grad=True,
                            dtype=torch.float32)).gather(1, actions)
          next_values = target_network(torch.tensor(next_states,
                            requires_grad=False,
                            dtype=torch.float32))
          rewards = torch.tensor([rewards],
                            requires_grad=False,
                            dtype=torch.float32)
          target_values = rewards + discount_factor * next_values.max(1)[0]
          
          for j in range(len(next_values)):
            if dones[j] == True:
              target_values[0][j] = rewards[0][j]
          
          
          
          
          

        # Compute loss function
          loss = nn.functional.mse_loss(
                            values,target_values)

        # Compute gradient
          optimizer.zero_grad()
          loss.backward()

        # Clip gradient norm to 1
          nn.utils.clip_grad_norm_(main_network.parameters(), max_norm=1.)

        # Perform backward pass (backpropagation)
          optimizer.step()
      

        # Update episode reward
        total_episode_reward += reward

        # Update state for next iteration
        state = next_state
        t+= 1
        if  t % target_update == 0:
            print(t)
            target_network.load_state_dict(main_network.state_dict())

    # Append episode reward and total number of steps
    episode_reward_list.append(total_episode_reward)
    episode_number_of_steps.append(t)

    # Close environment
    env.close()

    # Updates the tqdm update bar with fresh information
    # (episode number, total reward of the last episode, total number of Steps
    # of the last episode, average reward, average number of steps)
    EPISODES.set_description(
        "Episode {} - Reward/Steps: {:.1f}/{} - Avg. Reward/Steps: {:.1f}/{}".format(
        i, total_episode_reward, t,
        running_average(episode_reward_list, n_ep_running_average)[-1],
        running_average(episode_number_of_steps, n_ep_running_average)[-1]))




# Plot Rewards and steps
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 9))
ax[0].plot([i for i in range(1, N_episodes+1)], episode_reward_list, label='Episode reward')
ax[0].plot([i for i in range(1, N_episodes+1)], running_average(
    episode_reward_list, n_ep_running_average), label='Avg. episode reward')
ax[0].set_xlabel('Episodes')
ax[0].set_ylabel('Total reward')
ax[0].set_title('Total Reward vs Episodes')
ax[0].legend()
ax[0].grid(alpha=0.3)

ax[1].plot([i for i in range(1, N_episodes+1)], episode_number_of_steps, label='Steps per episode')
ax[1].plot([i for i in range(1, N_episodes+1)], running_average(
    episode_number_of_steps, n_ep_running_average), label='Avg. number of steps per episode')
ax[1].set_xlabel('Episodes')
ax[1].set_ylabel('Total number of steps')
ax[1].set_title('Total number of steps vs Episodes')
ax[1].legend()
ax[1].grid(alpha=0.3)
plt.show()





  deprecation(
  deprecation(
  state_tensor = torch.tensor([state],
Episode 125 - Reward/Steps: -426.0/110 - Avg. Reward/Steps: -235.8/100:  13%|█▎        | 126/1000 [00:19<02:17,  6.34it/s]


KeyboardInterrupt: ignored