In [1]:
import copy
import math
import random
from abc import ABC, abstractmethod
from collections import OrderedDict

import gymnasium as gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
from tqdm import tqdm

In [2]:
### MLP
class Multi_Layer_Perceptron(nn.Sequential):
    """Sequential stack of ``nn.Linear`` layers: input -> ``depth`` hidden -> output.

    The children are named ``"input"``, ``"0"`` ... ``str(depth - 1)`` and
    ``"output"``. No activation modules are inserted between them, so every
    child exposes ``.weight`` (and ``.bias`` when ``isBiased`` is true).

    Args:
        input_dim: number of input features.
        intern_dim: width of every hidden layer.
        output_dim: number of output features.
        depth: number of ``intern_dim -> intern_dim`` hidden layers.
        isBiased: whether each linear layer carries a bias term.
    """

    def __init__(self, input_dim, intern_dim, output_dim, depth = 2, isBiased = False):
        named_layers = [("input", nn.Linear(input_dim, intern_dim, bias=isBiased))]
        named_layers.extend(
            (str(index), nn.Linear(intern_dim, intern_dim, bias=isBiased))
            for index in range(depth)
        )
        named_layers.append(("output", nn.Linear(intern_dim, output_dim, bias=isBiased)))

        super().__init__(OrderedDict(named_layers))

        # Overwrite PyTorch's default initialisation with a constant fill of 0.
        # NOTE(review): with every weight at 0 the forward output is 0 for any
        # input; callers can re-initialise through reset_init_weights_biases.
        self.reset_init_weights_biases(0)

    def reset_init_weights_biases(self, norm = -1):
        """Fill every layer's weights (and bias, if present) with one constant.

        Args:
            norm: the fill value. The sentinel ``-1`` selects, per layer,
                ``1 / sqrt(fan_in)`` where ``fan_in`` is ``weight.size(1)``.
        """
        for layer in self.children():
            fill_value = 1. / math.sqrt(layer.weight.size(1)) if norm == -1 else norm
            layer.weight.data.fill_(fill_value)
            bias = layer.bias
            if bias is not None:
                bias.data.fill_(fill_value)

In [3]:
class Agent(ABC):
    """Abstract base class for the agents defined in this notebook.

    Concrete agents must implement ``observe``, ``select_action``, ``update``
    and ``train``; the base class only stores the identifier and environment.
    """

    def __init__(self, id, env):
        # Identifier chosen by the caller and the environment the agent acts in.
        self.id = id
        self.env = env

    @abstractmethod
    def observe(self, state, action, next_state, reward):
        """Receive one transition ``(state, action, next_state, reward)``."""

    @abstractmethod
    def select_action(self, state):
        """Return the action to take in ``state``."""

    @abstractmethod
    def update(self):
        """Perform one learning step."""

    @abstractmethod
    def train(self, episodes, debug_mode=False):
        """Run ``episodes`` training episodes; ``debug_mode`` enables verbose output."""
                

In [4]:
class RandomAgent(Agent):
    """Baseline agent that ignores what it observes and acts uniformly at random."""

    def observe(self, state, action, next_state, reward):
        """No-op: the random agent keeps no memory of transitions."""
        pass

    def select_action(self, state):
        """Return a uniformly random action index valid for ``self.env``.

        The range is taken from the environment's discrete action space, so
        every available action can be selected (the previous hard-coded
        ``randint(0, 1)`` could never pick action 2).
        """
        return random.randrange(int(self.env.action_space.n))

    def update(self):
        """No-op: there is nothing to learn."""
        pass

    def train(self, episodes, debug_mode=False):
        """Run ``episodes`` random episodes and report the total reward of each.

        Args:
            episodes: number of episodes to play.
            debug_mode: when true, print a line at the start and end of each episode.

        Returns:
            A NumPy array of length ``episodes`` with the summed reward per
            episode (the array is also printed, as before).
        """
        episodesHistory = np.zeros((episodes))

        for i in range(episodes):
            if(debug_mode) : print("Episode: "+str(i+1)+" starts")
            # Each episode starts from a freshly drawn environment seed.
            newSeed = random.randint(0,100000)
            state,_ = self.env.reset(seed = newSeed)
            done = False
            episode_reward = 0

            while not done:
                action = self.select_action(state)
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                self.observe(state,action,next_state,reward)
                self.update()

                episode_reward += reward
                state = next_state
                done = terminated or truncated

            episodesHistory[i] = episode_reward
            if(debug_mode) : print("Episode "+str(i+1)+ " , Reward: "+str(episode_reward))
        print(episodesHistory[:])
        return episodesHistory

In [5]:
class DQNAgent(Agent):
    """Deep Q-learning agent with a replay buffer, target network and shaped rewards.

    The Q-network receives the 3-vector ``(state[0], state[1], action)`` and
    must return a single value. Replay-buffer rows are laid out as
    ``[state (2), action (1), next_state (2), reward (1)]``.
    """

    N_ACTIONS = 3        # number of discrete actions the Q-network is queried over
    GOAL_POSITION = 0.5  # a next-state position at/above this is treated as terminal

    def __init__(self, id, epsilonMax, epsilonMin = 0.05, Q = None, env = None, arrayNewPosReward = None, arrayNewVelReward = None, contReward = False, gamma = 0.99, replay_buffer_SIZE = 10000, batch_size = 64, observation_SIZE = 6, optimizer = torch.optim.AdamW):
        """Build the agent.

        Args:
            id: identifier stored on the agent.
            epsilonMax: initial (maximum) exploration rate.
            epsilonMin: floor of the exploration rate.
            Q: online Q-network; a fresh ``Multi_Layer_Perceptron(3, 64, 1)``
                is created when omitted (one per agent, never shared).
            env: environment; ``gym.make('MountainCar-v0')`` when omitted.
            arrayNewPosReward: optional 2-D array whose rows are
                ``[position threshold, reward, already-used flag]``.
            arrayNewVelReward: optional 2-D array whose rows start with
                ``[velocity threshold, bonus]``.
            contReward: replace the base reward by a position-based one.
            gamma: discount factor.
            replay_buffer_SIZE: number of rows of the replay buffer.
            batch_size: number of transitions sampled per update.
            observation_SIZE: number of columns of one stored transition.
            optimizer: optimizer class, instantiated on ``Q.parameters()``.
        """
        # Defaults are created here rather than in the signature so that they
        # are not built once at definition time and shared by every instance.
        if Q is None:
            Q = Multi_Layer_Perceptron(input_dim = 3,intern_dim = 64, output_dim = 1, depth = 2, isBiased = False)
        if env is None:
            env = gym.make('MountainCar-v0')
        Agent.__init__(self,id,env)
        self.Q = Q
        # The target network must be a separate copy; aliasing Q would make
        # the "frozen" targets move at every optimizer step.
        self.QTarget = copy.deepcopy(Q)
        self.gamma = gamma
        self.epsilon = epsilonMax
        self.epsilonMax = epsilonMax
        self.epsilonMin = epsilonMin
        self.replay_buffer = np.zeros((replay_buffer_SIZE,observation_SIZE))
        self.batch_size = batch_size
        self.optimizer = optimizer(self.Q.parameters())
        self.replay_buffer_SIZE = replay_buffer_SIZE
        self.observation_SIZE = observation_SIZE
        self.arrayNewPosReward = arrayNewPosReward
        self.arrayNewVelReward = arrayNewVelReward
        self.contReward = contReward

    def _sync_target(self):
        """Replace the target network by a snapshot of the online network."""
        self.QTarget = copy.deepcopy(self.Q)

    def observe(self, state, action, next_state, reward):
        """Pack one transition into a flat array ``[state, action, next_state, reward]``.

        The packed row is returned; storing it in the replay buffer is done by
        ``train``.
        """
        return np.concatenate((
            np.array(state),
            np.array([action]),
            np.array(next_state),
            np.array([reward]),
        ))

    def select_action(self, state):
        """Epsilon-greedy action selection.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the largest Q-value, ties broken at random.
        """
        if random.uniform(0,1) > 1-self.epsilon:
            return random.randint(0, self.N_ACTIONS - 1)

        observation = torch.as_tensor(np.asarray(state), dtype=torch.float32)
        # Pure inference: no autograd graph is needed to rank the actions.
        with torch.no_grad():
            q_values = [
                self.Q(torch.cat((observation, torch.tensor([float(k)])))).item()
                for k in range(self.N_ACTIONS)
            ]
        best = max(q_values)
        return random.choice([k for k, q in enumerate(q_values) if q == best])

    def update(self,j):
        """Run one gradient step on a batch sampled from the replay buffer.

        Args:
            j: number of environment steps taken so far; sampling is limited
                to the first ``min(j, buffer size)`` rows.

        Returns:
            ``(loss, grad)``: the batch MSE loss as a float and the absolute
            value of the summed per-layer mean weight gradients.
        """
        n_stored = min(j, self.replay_buffer.shape[0])
        indices = np.random.choice(n_stored, self.batch_size)
        transitions = self.replay_buffer[indices]

        state_action = torch.from_numpy(transitions[:, :3]).to(torch.float32)
        next_state = torch.from_numpy(transitions[:, 3:5]).to(torch.float32)
        reward = torch.from_numpy(transitions[:, 5]).to(torch.float32)
        # Bootstrapping is only applied when the next position is below the goal.
        non_terminal = torch.from_numpy(transitions[:, 3] < self.GOAL_POSITION)

        # Targets are constants for the regression: no gradient flows through them.
        with torch.no_grad():
            next_q = torch.stack([
                self.QTarget(
                    torch.cat((next_state, torch.full((self.batch_size, 1), float(a))), dim=1)
                ).reshape(-1)
                for a in range(self.N_ACTIONS)
            ], dim=1)
            target = reward + self.gamma * next_q.max(dim=1).values * non_terminal

        prediction = self.Q(state_action).reshape(-1)
        loss = torch.nn.MSELoss(reduction='mean')(prediction, target)

        for param in self.Q.parameters():
            param.grad = None
        loss.backward()

        # Diagnostic only: sum of the mean weight gradient of each layer.
        grad = 0
        for layer in self.Q.children():
            grad += layer.weight.grad.mean()

        self.optimizer.step()
        return loss.item(), abs(grad)

    def customReward(self, state, action, currentReward, next_state, uniqueReward, excludePassivity):
        """Compute the shaped reward of one transition.

        Args:
            state: state before the action (its velocity ``state[1]`` is used).
            action: action taken (1 is penalised when ``excludePassivity``).
            currentReward: reward returned by the environment (not used).
            next_state: state after the action (its position is used).
            uniqueReward: when true, a position reward can be granted only once
                until its flag column is reset.
            excludePassivity: subtract 2 from the base reward for action 1.

        Returns:
            The shaped reward.
        """
        reward = -1

        if excludePassivity and action == 1:
            reward -= 2

        # The continuous reward replaces the base reward (penalty included).
        if self.contReward:
            reward = (abs(next_state[0] + 0.5)**2)/200

        if self.arrayNewPosReward is not None:
            for k in range(self.arrayNewPosReward.shape[0]):
                i = self.arrayNewPosReward[k,0]
                # Threshold reached on the side of -0.5 it lies on.
                reached = (i <= next_state[0] and i +0.5 >=0) or (i >= next_state[0] and i +0.5 <=0)
                if reached and self.arrayNewPosReward[k,2] == 0 and self.arrayNewPosReward[k,1] > reward:
                    reward = self.arrayNewPosReward[k,1]
                    if uniqueReward: self.arrayNewPosReward[k,2] = 1

        if self.arrayNewVelReward is not None:
            for k in range(self.arrayNewVelReward.shape[0]):
                i = self.arrayNewVelReward[k,0]
                # Bonus for pushing (action 2 / action 0) in the direction of a
                # velocity that already exceeds the threshold.
                if (i < state[1] and i>0 and action == 2) or (i > state[1] and i<0 and action == 0):
                    reward += self.arrayNewVelReward[k,1]

        return reward

    def play(self):
        """Play one greedy episode (epsilon forced to 0), then close the environment.

        The exploration rate is restored afterwards so that a later call to
        ``select_action`` or ``train`` is not silently stuck at epsilon 0.
        """
        previous_epsilon = self.epsilon
        self.epsilon = 0
        try:
            state,_ = self.env.reset(seed = random.randint(0,100000))
            done = False
            while not done:
                action = self.select_action(state)
                # The observation must advance, otherwise every action would be
                # chosen from the initial state.
                state, reward, terminated, truncated, _ = self.env.step(action)
                done = terminated or truncated
        finally:
            self.epsilon = previous_epsilon
            self.env.close()

    def train(self, episodes,lossLim = 0, limitStep = 200, refreshTarget = 200, refreshQ = 1, buffer_fill = True, epsilonDecreasing =100, debug_mode=False, recap_mode=False, reset_init = False, epsilon_decrease = True, uniqueReward = False, excludePassivity = True):
        """Train the agent.

        Args:
            episodes: maximum number of episodes.
            lossLim: training stops after an episode whose last loss is <= this.
            limitStep: number of steps after which an episode is cut.
            refreshTarget: period (in steps) of the target-network refresh.
            refreshQ: period (in steps) of the gradient updates.
            buffer_fill: when true the replay buffer is overwritten cyclically
                once full; otherwise new transitions are dropped once full.
            epsilonDecreasing: time constant (in episodes) of the epsilon decay.
            debug_mode: print per-step information.
            recap_mode: print a one-line summary per episode.
            reset_init: when different from ``False``, passed to
                ``Q.reset_init_weights_biases`` before training.
            epsilon_decrease: decay epsilon exponentially (else keep epsilonMax).
            uniqueReward, excludePassivity: forwarded to ``customReward``.

        Returns:
            ``(episodesHistory, rewardHistory, lossHistory, gradHistory,
            cumulativeHistory)`` as NumPy arrays.
        """
        episodesHistory = np.zeros((episodes))
        rewardHistory = np.zeros((episodes*limitStep))
        lossHistory = np.zeros((int(episodes*limitStep/refreshQ)))
        gradHistory = np.zeros((int(episodes*limitStep/refreshQ)))
        cumulativeHistory = np.zeros((episodes))

        if reset_init != False: self.Q.reset_init_weights_biases(reset_init)
        self._sync_target()
        self.replay_buffer = np.zeros((self.replay_buffer_SIZE, self.observation_SIZE))

        j = 0  # total number of environment steps
        k = 0  # write cursor in the replay buffer
        for e in range(episodes):
            last_loss = 0
            updated = False
            terminated = False
            if debug_mode: print("Episode: "+str(e+1)+" starts")

            if epsilon_decrease:
                # Clamp so the schedule never goes (and freezes) below the floor.
                self.epsilon = max(self.epsilonMin, self.epsilonMax*math.exp(-e/epsilonDecreasing))
            else:
                self.epsilon = self.epsilonMax

            state,_ = self.env.reset(seed = random.randint(0,100000))
            done = False
            episode_reward = 0
            if self.arrayNewPosReward is not None:
                # Re-arm the one-shot position rewards for the new episode.
                self.arrayNewPosReward[:, 2] = 0

            s = 1
            while not done:
                j += 1
                action = self.select_action(state)
                next_state, reward, terminated, truncated, _ = self.env.step(action)

                # The episode length is governed by limitStep, not by the
                # environment's own truncation flag.
                truncated = s >= limitStep
                s += 1

                reward = self.customReward(state, action,reward, next_state, uniqueReward, excludePassivity)
                if debug_mode: print("Action "+str(k)+" selected: "+str(action)+" Reward: "+ str(reward))
                transition = self.observe(state,action,next_state,reward)

                # Ring buffer: wrap first, then write, so no transition is lost.
                if k >= self.replay_buffer_SIZE and buffer_fill:
                    k = 0
                if k < self.replay_buffer_SIZE:
                    self.replay_buffer[k] = transition
                    k += 1

                episode_reward += reward
                state = next_state
                done = terminated or truncated

                if j%refreshQ == 0:
                    last_loss, g = self.update(j)
                    updated = True
                    lossHistory[j//refreshQ - 1] = last_loss
                    gradHistory[j//refreshQ - 1] = g
                if j%refreshTarget == 0:
                    print("Loss: "+str(last_loss))
                    self._sync_target()

                rewardHistory[j-1] = reward

            # Running count of episodes that ended by reaching a terminal state.
            previous_count = cumulativeHistory[e-1] if e > 0 else 0
            cumulativeHistory[e] = previous_count + (1 if terminated else 0)

            if debug_mode or recap_mode: print("Episode "+str(e+1)+ " , Reward: "+str(episode_reward)+" Epsilon: "+str(self.epsilon))
            episodesHistory[e] = episode_reward

            # Only a loss that was actually computed can trigger the early stop.
            if updated and last_loss <= lossLim:
                print("Loss reaches limit")
                break
        return episodesHistory, rewardHistory, lossHistory, gradHistory, cumulativeHistory

In [6]:
# Scratch check: how many discretization steps fit in each observation
# dimension of MountainCar-v0 (floor division of the range by the step).
discr_step = np.array([0.025, 0.005])
env = gym.make('MountainCar-v0')
np.floor_divide(env.observation_space.high - env.observation_space.low, discr_step)

array([72., 28.])

In [7]:
# Scratch check: draw 2 distinct elements from a 3-element list.
random.sample([1,2,5], 2)

[2, 1]

In [51]:
import scipy.sparse as sp

In [138]:

# See the course code: once a model of the environment has been obtained, solve the optimization problem with dynamic programming.

class DynaAgent(Agent):
    """Tabular model-based (Dyna-style) agent on a discretized state space.

    The 2-D observation is mapped to a flat state index
    ``x_bin * n_vbins + v_bin``. The agent learns transition counts ``N``,
    transition probabilities ``P``, expected rewards ``R`` and action values
    ``Q``; after each real step it backs up the visited pair and then replays
    backups on previously visited ``(state, action)`` pairs.

    Attributes:
        N: list (one entry per state) of sparse ``(n_actions, n_states)`` counts.
        P: dense array ``P[s, a, s']``; rows start as random distributions and
            are replaced by empirical frequencies once ``(s, a)`` is visited.
        R: dense array ``R[s, a]``, -1 everywhere except 0 from the flat index
            of position 0.5 onwards.
        Q: dense array ``Q[s, a]`` initialised to 0.
    """

    def __init__(self, id, env=None, epsilonMax = 0.9, epsilonMin = 0.05, discr_step = None, gamma = 0.99, k=0, alpha=0.2):
        """Build the agent.

        Args:
            id: identifier stored on the agent.
            env: environment; ``gym.make('MountainCar-v0')`` when omitted.
            epsilonMax: initial (maximum) exploration rate.
            epsilonMin: floor of the exploration rate.
            discr_step: bin widths ``[position, velocity]``; defaults to
                ``[0.025, 0.005]``.
            gamma: discount factor.
            k: number of planning backups per real step. With the default 0,
                one tenth of the visited pairs is replayed.
            alpha: step size of the running reward estimate.
        """
        # Defaults are created here so they are not built at definition time
        # and shared (and possibly mutated) across instances.
        if env is None:
            env = gym.make('MountainCar-v0')
        if discr_step is None:
            discr_step = np.array([0.025, 0.005])
        Agent.__init__(self,id,env)

        self.discr_step = np.asarray(discr_step, dtype=float)
        span = self.env.observation_space.high - self.env.observation_space.low
        n_bins = np.round(span/self.discr_step).astype(np.int32)
        self.n_xbins = int(n_bins[0])
        self.n_vbins = int(n_bins[1])
        self.n_states = self.n_xbins*self.n_vbins
        self.n_actions = 3
        self.gamma = gamma
        self.epsilon = epsilonMax
        self.epsilonMax = epsilonMax
        self.epsilonMin = epsilonMin
        self.k = k
        self.alpha = alpha
        self.visited_state_action = set()

        self.N = [sp.lil_array((self.n_actions, self.n_states)) for _ in range(self.n_states)]

        # Random prior: each P[s, a, :] is a normalised vector of uniform draws.
        self.P = np.random.rand(self.n_states, self.n_actions, self.n_states)
        self.P /= self.P.sum(axis=2, keepdims=True)

        # Reward -1 everywhere, 0 for every flat index at/after position 0.5.
        self.R = - np.ones(shape=(self.n_states, self.n_actions))
        self.R[self.discretize_x(0.5):, :] = 0

        self.Q = np.zeros(shape=(self.n_states, self.n_actions))

    def discretize_x(self, x):
        """Return the flat-index offset ``x_bin * n_vbins`` of position ``x``.

        The bin is clipped to ``[0, n_xbins - 1]`` so that a position on the
        upper bound cannot produce an out-of-range state index.
        """
        low = self.env.observation_space.low[0]
        x_bin = int(np.floor((x - low)/self.discr_step[0]))
        x_bin = min(max(x_bin, 0), self.n_xbins - 1)
        return x_bin*self.n_vbins

    def discretize_v(self, v):
        """Return the velocity bin of ``v``, clipped to ``[0, n_vbins - 1]``."""
        low = self.env.observation_space.low[1]
        v_bin = int(np.floor((v - low)/self.discr_step[1]))
        return min(max(v_bin, 0), self.n_vbins - 1)

    def discretize(self, state):
        """Map a continuous ``(position, velocity)`` state to its flat index."""
        return self.discretize_x(state[0]) + self.discretize_v(state[1])

    def _backup(self, s, a, state_values):
        """One Bellman backup of ``Q[s, a]``; keeps ``state_values[s]`` in sync."""
        self.Q[s, a] = self.R[s, a] + self.gamma*float(self.P[s, a] @ state_values)
        state_values[s] = self.Q[s].max()

    def update(self, state, action, next_state, reward):
        """Update the model with one real transition, then plan.

        Args:
            state, next_state: continuous observations before/after the step.
            action: action index taken.
            reward: reward received.
        """
        action = int(action)
        discr_state, discr_next_state = self.discretize(state), self.discretize(next_state)
        # Remember the pair so that planning can replay it later.
        self.visited_state_action.add((discr_state, action))

        self.N[discr_state][action, discr_next_state] += 1

        # Replace the whole row by the empirical frequencies so that it stays a
        # probability distribution (the count was just incremented, so > 0).
        counts = self.N[discr_state][[action], :].toarray().ravel()
        self.P[discr_state, action, :] = counts / counts.sum()

        self.R[discr_state, action] = (1-self.alpha)*self.R[discr_state, action] + self.alpha*reward

        # max_a Q(s', a) for every s', updated incrementally by each backup.
        state_values = self.Q.max(axis=1)
        self._backup(discr_state, action, state_values)

        n_planning = self.k if self.k > 0 else len(self.visited_state_action) // 10
        if n_planning >= 1:
            visited = list(self.visited_state_action)
            for random_state, random_action in random.sample(visited, min(n_planning, len(visited))):
                self._backup(random_state, random_action, state_values)

    def observe(self, state=None, action=None, next_state=None, reward=None):
        """No-op; accepts the ``Agent.observe`` arguments for interface parity."""
        pass

    def select_action(self, state):
        """Epsilon-greedy selection on the tabular Q-values, ties broken at random."""
        if random.uniform(0,1) >= 1-self.epsilon:
            return random.randint(0, self.n_actions - 1)
        q_row = self.Q[self.discretize(state)]
        best_actions = np.flatnonzero(q_row == q_row.max())
        return int(random.choice(best_actions.tolist()))

    def train(self, episodes, debug_mode=True, epsilon_decrease=True, epsilonDecreasing=100):
        """Train for ``episodes`` episodes.

        Args:
            episodes: number of episodes.
            debug_mode: print a line at the start and end of each episode.
            epsilon_decrease: decay epsilon exponentially with the episode
                index (else keep ``epsilonMax``).
            epsilonDecreasing: time constant (in episodes) of the decay.

        Returns:
            NumPy array with the summed environment reward of each episode.
        """
        episodesHistory = np.zeros((episodes))

        for i in range(episodes):
            if debug_mode: print("Episode: "+str(i+1)+" starts")

            if epsilon_decrease:
                # Decay with the episode index, floored at epsilonMin.
                self.epsilon = max(self.epsilonMin, self.epsilonMax*math.exp(-i/epsilonDecreasing))
            else:
                self.epsilon = self.epsilonMax

            state,_ = self.env.reset(seed = random.randint(0,100000))
            done = False
            episode_reward = 0

            while not done:
                action = self.select_action(state)
                next_state, reward, terminated, truncated, _ = self.env.step(action)

                self.update(state, action, next_state, reward)

                episode_reward += reward
                state = next_state
                done = terminated or truncated

            if debug_mode: print("Episode "+str(i+1)+ " , Reward: "+str(episode_reward))
            episodesHistory[i] = episode_reward
        return episodesHistory

In [139]:
# Instantiate a DynaAgent with id 1 and every other constructor argument left at its default.
A = DynaAgent(1)

In [136]:
# Scratch check: maximum of the first row of a CSR matrix built from an all-ones array.
N = np.ones((10, 5))

print(sp.csr_matrix(N)[0, :].max())


1.0


In [93]:
# Scratch check: print row 0 of N converted to a sparse LIL array (uses N from the cell above).
# NOTE(review): getrow() on sparse *arrays* appears to be deprecated in recent SciPy releases — confirm before upgrading.
print(sp.lil_array(N).getrow(0))


  (0, 0)	1.0
  (0, 1)	1.0
  (0, 2)	1.0
  (0, 3)	1.0
  (0, 4)	1.0


In [None]:
# Train the DynaAgent for 2000 episodes; debug_mode defaults to True, so two lines are printed per episode.
A.train(2000)

Episode: 1 starts
Episode 1 , Reward: -200.0
Episode: 2 starts
Episode 2 , Reward: -200.0
Episode: 3 starts
Episode 3 , Reward: -200.0
Episode: 4 starts
Episode 4 , Reward: -200.0
Episode: 5 starts
Episode 5 , Reward: -200.0
Episode: 6 starts
Episode 6 , Reward: -200.0
Episode: 7 starts
Episode 7 , Reward: -200.0
Episode: 8 starts
Episode 8 , Reward: -200.0
Episode: 9 starts
Episode 9 , Reward: -200.0
Episode: 10 starts
Episode 10 , Reward: -200.0
Episode: 11 starts
Episode 11 , Reward: -200.0
Episode: 12 starts
Episode 12 , Reward: -200.0
Episode: 13 starts
Episode 13 , Reward: -200.0
Episode: 14 starts
Episode 14 , Reward: -200.0
Episode: 15 starts
Episode 15 , Reward: -200.0
Episode: 16 starts
Episode 16 , Reward: -200.0
Episode: 17 starts
Episode 17 , Reward: -200.0
Episode: 18 starts
Episode 18 , Reward: -200.0
Episode: 19 starts
Episode 19 , Reward: -200.0
Episode: 20 starts
Episode 20 , Reward: -200.0
Episode: 21 starts
Episode 21 , Reward: -200.0
Episode: 22 starts
Episode 22 ,

In [None]:
# Instructor's reference code ("Code Prof")

def value_iteration(self, theta=0.001):
    """Solve the agent's learned MDP by (in-place) value iteration.

    Reads from ``self``:
        P : 3D array of transition probabilities, P[s, a, s'] is the
            probability of moving from s to s' under action a.
        R : 2D array (dense or scipy-sparse) of rewards per state-action pair.
        gamma : discount factor.
        n_states, n_actions : sizes of the state and action sets.

    Args:
        theta: stopping criterion; sweeps stop once the largest change of the
            value function in a sweep is below it.

    Returns:
        ``(policy, V)``: the greedy action index per state (int array) and the
        state-value array.
    """
    # Accept a sparse R as well as a dense one.
    rewards = self.R.toarray() if hasattr(self.R, "toarray") else self.R
    rewards = np.asarray(rewards, dtype=float)

    V = np.zeros(self.n_states)

    def action_values(s):
        # sum_{s'} P[s, a, s'] * (R[s, a] + gamma * V[s']) for every action a.
        # The successor values V[s'] must be used here, not the old V[s].
        return np.sum(self.P[s] * (rewards[s][:, None] + self.gamma * V[None, :]), axis=1)

    while True:
        delta = 0
        for s in range(self.n_states):
            previous = V[s]
            V[s] = np.max(action_values(s))
            delta = max(delta, np.abs(previous - V[s]))

        # If the change in value function is smaller than theta, stop
        if delta < theta:
            break

    policy = np.zeros(self.n_states, dtype=int)
    for s in range(self.n_states):
        # Choose the action with the maximum value
        policy[s] = np.argmax(action_values(s))

    return policy, V