In [150]:
from abc import ABC, abstractmethod
import gymnasium as gym
import random
import numpy as np
import torch
import torch.nn as nn
from collections import OrderedDict
import math
import random

In [151]:
### MLP
class Multi_Layer_Perceptron(nn.Sequential):
    def __init__(self, input_dim, intern_dim, output_dim, depth = 2, isBiased = False):
        
        dict = OrderedDict([("input",nn.Linear(input_dim,intern_dim, bias=isBiased))])
        for i in range(depth):
            dict.update({str(i) : nn.Linear(intern_dim,intern_dim,bias=isBiased)})
        dict.update({"output" : nn.Linear(intern_dim,output_dim,bias=isBiased)})

        super().__init__(dict)

        self.reset_init_weights_biases(0) # so that we do not use a default initialization

    def reset_init_weights_biases(self, norm = None):
        for layer in self.children():
            if norm == None:
                stdv = 1. / math.sqrt(layer.weight.size(1))
            else :
                stdv = norm
            
            layer.weight.data.uniform_(-stdv, stdv)
            if layer.bias is not None:
                layer.bias.data.uniform_(-stdv, stdv)

In [152]:
### CustomLoss DQN
class DQN_Loss(nn.Module):
    def __init__(self):
        super(DQN_Loss, self).__init__()

    def forward(self, input, target):
        loss = (target - input)**2
        return loss

In [153]:
class Agent(ABC):

    @abstractmethod
    def observe(self, state, action, next_state, reward):
        pass

    @abstractmethod
    def select_action(self, state):
        pass
    
    @abstractmethod
    def update(self):
        pass

    @abstractmethod
    def train(self, episodes, debug_mode=False):
        pass

    def __init__(self, id, env):
        self.id = id
        self.env = env
                

In [154]:
class RandomAgent(Agent):
    def observe(self, state, action, next_state, reward):
        pass
        
    def select_action(self, state):
        return random.randint(0,1)
        
        
    def update(self):
        pass

    def train(self, episodes, debug_mode=False):
        episodesHistory = np.zeros((episodes))
        
        for i in range(episodes):
            if(debug_mode) : print("Episode: "+str(i+1)+" starts")
            newSeed = random.randint(0,100000)
            state,_ = self.env.reset(seed = newSeed)
            done = False
            episode_reward = 0
            
            while not done:
                                        
                action = self.select_action(state)
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                self.observe(state,action,next_state,reward)
                self.update()

                episode_reward += reward
                state = next_state
                done = terminated or truncated

            episodesHistory[i] = episode_reward
            if(debug_mode) : print("Episode "+str(i+1)+ " , Reward: "+str(episode_reward))
        print(episodesHistory[:])

In [155]:
class DQNAgent(Agent):

    def __init__(self, id, epsilon, Q = Multi_Layer_Perceptron(input_dim = 3,intern_dim = 64, output_dim = 1, depth = 2, isBiased = True), env = gym.make('MountainCar-v0'), arrayNewReward = None, gamma = 0.99, replay_buffer_SIZE = 10000, batch_size = 64, observation_SIZE = 6, optimizer = torch.optim.AdamW):
        Agent.__init__(self,id,env)
        self.Q = Q
        self.gamma = gamma
        self.epsilon = epsilon
        self.replay_buffer = np.zeros((replay_buffer_SIZE,observation_SIZE))
        self.batch_size = batch_size
        self.optimizer = optimizer(self.Q.parameters())
        self.replay_buffer_SIZE = replay_buffer_SIZE
        self.observation_SIZE = observation_SIZE
        self.arrayNewReward = arrayNewReward
        
    def observe(self, state, action, next_state, reward):
        state = np.array(state)
        action = np.array([action])
        next_state = np.array(next_state)
        reward = np.array([reward])

        concatenatation = np.concatenate((state, action, next_state, reward))
        return concatenatation
        
    def select_action(self, state):
        P = random.uniform(0,1)
        a=0
        if P < 1-self.epsilon :
            print(self.env.action_space.shape)
            A=np.zeros(3)
            print(A)
            for k in range(A.shape[0]):
                A[k] = self.Q(torch.from_numpy(np.concatenate((np.array(state), np.array([k])))).to(torch.float32))
            a = np.argmax(A)
        else:
            a = random.randint(0,3)
            
        return a
        
        
    def update(self):

        LossFct = DQN_Loss()
        
        batch = np.random.choice(self.replay_buffer.shape[0], self.batch_size)

        for i in range(self.batch_size):

            A0 = torch.from_numpy(np.concatenate((self.replay_buffer[batch[i]][3:5], np.array([0])))).to(torch.float32)
            A1 = torch.from_numpy(np.concatenate((self.replay_buffer[batch[i]][3:5], np.array([1])))).to(torch.float32)
            target = self.replay_buffer[batch[i]][5] + self.gamma*max(self.Q(A0),self.Q(A1))
            input = self.Q(torch.from_numpy(self.replay_buffer[batch[i]][:3]).to(torch.float32))

            loss = LossFct(input, target)
            
            self.optimizer.zero_grad()
            loss.backward()
            #print(loss.item())
            self.optimizer.step()

    def customReward(currentReward):
        reward = currentReward
        if self.arrayNewReward == None:
            return reward
            
        for i in self.arrayNewReward[:][0]:
            if i == next_state[0]:
                reward = self.arrayNewReward[i][1]
                break
        return reward
        
    def train(self, episodes, debug_mode=False):
        episodesHistory = np.zeros((episodes))
        
        for i in range(episodes):
            if debug_mode: print("Episode: "+str(i+1)+" starts")
            newSeed = random.randint(0,100000)
            state,_ = self.env.reset(seed = newSeed)
            done = False
            episode_reward = 0
            self.replay_buffer = np.zeros((self.replay_buffer_SIZE, self.observation_SIZE))
            k=0
            
            while done == False:
                                        
                action = self.select_action(state)
                if debug_mode: print("Action "+str(k)+" selected: "+str(action))
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                reward = self.customReward(currentReward = reward)
                observe = self.observe(state,action,next_state,reward)
                self.replay_buffer[k] = observe
                k+=1
                episode_reward += reward
                state = next_state
                done = terminated or truncated

            if debug_mode: print("Episode "+str(i+1)+ " , Reward: "+str(episode_reward))
            self.update()
            episodesHistory[i] = episode_reward
        return episodesHistory[:]

In [156]:
class CustomEnv(gym.Env):
    def __init__(self, arrayNewReward, render = None, **kwargs):
        super(CustomEnv, self).__init__(**kwargs)
        self.basicEnv = gym.make('MountainCar-v0', render_mode=render)
        self.arrayNewReward = arrayNewReward

    def reset(self,**kwargs):
        return self.env.reset(**kwargs)

    def step(self, action):
        next_state, reward, terminated, truncated, info = self.env.step(action)
        for i in self.arrayNewReward[:][0]:
            if i == next_state[0]:
                reward = self.arrayNewReward[i][1]
                break
        return next_state, reward, terminated, truncated, info 

    def close(self):
        self.env.close()



In [157]:
DQN = DQNAgent(0,0.1, env= CustomEnv(arrayNewReward=np.array([(-0.3,10),(0.3,5)])))
print(DQN.train(1000, debug_mode = False))

AttributeError: 'CustomEnv' object has no attribute 'env'

In [None]:
NewReward=np.array([(-0.3,10),(0.3,5),(0.4,20),(0.6,200)])
DQN = DQNAgent(0,0.1, arrayNewReward = NewReward)
DQN.train(1, debug_mode = True)
DQN.env.close()

In [158]:
DQN.env.close()

AttributeError: 'CustomEnv' object has no attribute 'env'

In [159]:
help(np.zeros)

Help on built-in function zeros in module numpy:

zeros(...)
    zeros(shape, dtype=float, order='C', *, like=None)

    Return a new array of given shape and type, filled with zeros.

    Parameters
    ----------
    shape : int or tuple of ints
        Shape of the new array, e.g., ``(2, 3)`` or ``2``.
    dtype : data-type, optional
        The desired data-type for the array, e.g., `numpy.int8`.  Default is
        `numpy.float64`.
    order : {'C', 'F'}, optional, default: 'C'
        Whether to store multi-dimensional data in row-major
        (C-style) or column-major (Fortran-style) order in
        memory.
    like : array_like, optional
        Reference object to allow the creation of arrays which are not
        NumPy arrays. If an array-like passed in as ``like`` supports
        the ``__array_function__`` protocol, the result will be defined
        by it. In this case, it ensures the creation of an array object
        compatible with that passed in via this argument.



In [160]:
help(np.ceil)

Help on ufunc:

ceil = <ufunc 'ceil'>
    ceil(x, /, out=None, *, where=True, casting='same_kind', order='K', dtype=None, subok=True[, signature, extobj])

    Return the ceiling of the input, element-wise.

    The ceil of the scalar `x` is the smallest integer `i`, such that
    ``i >= x``.  It is often denoted as :math:`\lceil x \rceil`.

    Parameters
    ----------
    x : array_like
        Input data.
    out : ndarray, None, or tuple of ndarray and None, optional
        A location into which the result is stored. If provided, it must have
        a shape that the inputs broadcast to. If not provided or None,
        a freshly-allocated array is returned. A tuple (possible only as a
        keyword argument) must have length equal to the number of outputs.
    where : array_like, optional
        This condition is broadcast over the input. At locations where the
        condition is True, the `out` array will be set to the ufunc result.
        Elsewhere, the `out` array will 

In [161]:
discr_step = np.array([0.025, 0.005])
env = gym.make('MountainCar-v0')
(env.observation_space.high - env.observation_space.low)//discr_step

array([72., 28.])

In [162]:
np.multiply.reduce(((env.observation_space.high-np.array([0.1, 0])-env.observation_space.low)//discr_step)).astype(np.int32) + 1

1905

In [163]:

# cf code cours, après avoir obtenu un modèle de l'environnement, résoudre le porblème d'optimisation avec le dynamic programming

class DynaAgent(Agent):

    # comment choisir k ? 
    
    def __init__(self, id, env = gym.make('MountainCar-v0'), discr_step = np.array([0.025, 0.005]), gamma = 0.99, epsilon = 0.9, k=10):
        Agent.__init__(self,id,env)
        self.n_states = np.multiply.reduce(((env.observation_space.high - env.observation_space.low)//self.discr_step)).astype(np.int32)
        self.n_actions = 3
        self.gamma = gamma
        self.epsilon = epsilon
        self.k = k

    # On obtient les s du (s, a, s') en discrétisant (cf plus haut), puis pour les s' on utilise une loi uniforme pour chaque paire (s, a)
    
    @property
    def P(self):
        shape = (self.n_states, self.n_actions, self.n_states) 
        P = np.zeros(shape=shape)
        for i in range(self.n_states):
            for j in range(self.n_actions):
                random = torch.rand(size=(self.n_states,))
                P[i, j, :] = random/random.sum()
        return P

    @property
    def N(self):
        shape = (self.n_states, self.n_actions, self.n_states) 
        return np.zeros(shape=shape)


    @property
    def R(self):
        shape = (self.n_states, self.n_actions) 
        return np.zeros(shape=shape)

                     
        
    @property
    def W(self):
        shape = (self.n_states, self.n_actions) 
        W = - np.ones(shape=shape)
        first_terminal_state = np.multiply.reduce(((env.observation_space.high-np.array([0.1, 0])-env.observation_space.low)//discr_step)).astype(np.int32) + 1

        for state in range(first_terminal_state, self.n_states):
            W[state, :] = 0

        return W
                                                 

    @property
    def Q(self):
        shape = (self.n_states, self.n_actions) 
        return np.zeros(shape=shape)





    def discretize(self, state):
        bin_state = np.multiply.reduce(np.round(((state - env.observation_space.low)//self.discr_step))).astype(np.int32)
        return bin_state

    
    def update(self, state, action, next_state, reward):
        discr_state, discr_next_state = self.discretize(state), self.discretize(next_state)
        self.N[discr_state, action, discr_next_state] += 1
        self.P[discr_state, action, discr_next_state] = self.N[discr_state, action, discr_next_state]/(self.N[discr_state, action, :].sum())
        self.W[discr_state, action] += reward
        self.R[discr_state, action] = self.W[discr_state, action]/(self.N[discr_state, action, :].sum())

        self.Q[discr_state, action] = self.R[discr_state, action] + (self.gamma)*np.array([(self.P[discr_state, action, discr_next_local])*max(self.Q[discr_next_local, :]) for discr_next_local in range(self.n_states)]).sum()

        nb=1
        while nb<self.k:
            random_state = torch.randint(0, (self.n_states - 1), size=(1,))
            random_action = torch.randint(0, (self.n_actions - 1), size=(1,))
            if self.N[random_state, random_action, :].sum() > 0:
                self.Q[random_state, random_action] = self.R[random_state, random_action] + (self.gamma)*np.array([(self.P[random_state, random_action, discr_next_local])*max(self.Q[discr_next_local, :]) for discr_next_local in range(self.n_states)]).sum()
                nb+=1

    def observe(self):
        pass
    
    def select_action(self, state):
        state_bin = self.discretize(state)
        P = random.uniform(0,1)
        a=0
        if P < 1-self.epsilon :
            a = np.argmax(Q[state_bin, :])
        else:
            a = random.randint(0,3)
            
        return a

        
    def train(self, episodes, debug_mode=True):
        episodesHistory = np.zeros((episodes))
        
        for i in range(episodes):
            if debug_mode: print("Episode: "+str(i+1)+" starts")
            newSeed = random.randint(0,100000)
            state,_ = self.env.reset(seed = newSeed)
            
            done = False
            episode_reward = 0
            
            while not done:

                k=0
                action = self.select_action(state)
                if debug_mode: print("Action "+str(k)+" selected: "+str(action))
                next_state, reward, terminated, truncated, _ = self.env.step(action)

                self.update(state, action, next_state, reward)
                
                episode_reward += reward
                state = next_state
                done = terminated or truncated

            if debug_mode: print("Episode "+str(i+1)+ " , Reward: "+str(episode_reward))
           
            episodesHistory[i] = episode_reward
        return episodesHistory[:]

In [164]:
A = DynaAgent(1)

AttributeError: 'DynaAgent' object has no attribute 'discr_step'

In [None]:
A.train(2)

Episode: 1 starts
Action 0 selected: 2


  self.P[discr_state, action, discr_next_state] = self.N[discr_state, action, discr_next_state]/(self.N[discr_state, action, :].sum())
  self.R[discr_state, action] = self.W[discr_state, action]/(self.N[discr_state, action, :].sum())
