In [1]:
from abc import ABC, abstractmethod
import gymnasium as gym
import random
import numpy as np
import torch
import torch.nn as nn
from collections import OrderedDict
import math
import random
import matplotlib.pyplot as plt
from tqdm import tqdm
import time
import scipy.sparse as sp

In [2]:
class Agent(ABC):

    @abstractmethod
    def observe(self, state, action, next_state, reward):
        pass

    @abstractmethod
    def select_action(self, state):
        pass
    
    @abstractmethod
    def update(self):
        pass

    @abstractmethod
    def train(self, episodes, debug_mode=False):
        pass

    def __init__(self, id, env):
        self.id = id
        self.env = env
                

In [3]:
discr_step = np.array([0.025, 0.005])
env = gym.make('MountainCar-v0')
(env.observation_space.high - env.observation_space.low)//discr_step

array([72., 28.])

In [30]:

# cf code cours, après avoir obtenu un modèle de l'environnement, résoudre le porblème d'optimisation avec le dynamic programming

class DynaAgent(Agent):
    
    def __init__(self, id, env=gym.make('MountainCar-v0'), epsilonMax = 0.9, epsilonMin = 0.05, discr_step = np.array([0.025, 0.02]), gamma = 0.99, k=0, k_fixed = True, alpha=0.2, observation_SIZE = 6, replay_buffer_SIZE = 10000):
        Agent.__init__(self,id,env)
        self.discr_step = discr_step
        self.n_xbins = np.round(((self.env.observation_space.high - self.env.observation_space.low)/self.discr_step)[0]).astype(np.int32)
        print(self.n_xbins)
        print(self.env.observation_space.high)
        print(self.env.observation_space.low)
        self.n_vbins = np.round(((self.env.observation_space.high - self.env.observation_space.low)/self.discr_step)[1]).astype(np.int32)
        print(self.n_vbins)
        self.n_states = self.n_xbins*self.n_vbins
        self.n_actions = 3
        self.gamma = gamma
        self.epsilon = epsilonMax
        self.epsilonMax = epsilonMax
        self.epsilonMin = epsilonMin
        self.k = k
        self.k_fixed = k_fixed
        self.alpha = alpha
        '''
        Definition of the replay buffer
        '''
        self.replay_buffer_SIZE = replay_buffer_SIZE
        self.observation_SIZE = observation_SIZE
        self.visited_state_action_Array = np.zeros((replay_buffer_SIZE,observation_SIZE))
        self.visited_state_action = set()
        
        self.N = np.zeros((self.n_states, self.n_actions, self.n_states))
        
        self.P = np.zeros(shape=(self.n_states, self.n_actions, self.n_states))
        for i in range(self.n_states):
            for j in range(self.n_actions):
                random = np.random.rand(self.n_states)
                self.P[i, j, :] = random/random.sum()
        
        self.R = - np.ones(shape=(self.n_states, self.n_actions))
        self.terminal_x_bin = self.discretize_x(0.5)*self.n_vbins
        print(self.terminal_x_bin)
        
        self.Q = np.zeros(shape=(self.n_states, self.n_actions))

        self.lossHistory = 0
        self.up_count = 0
    

    # On obtient les s du (s, a, s') en discrétisant (cf plus haut), puis pour les s' on utilise une loi uniforme pour chaque paire (s, a)

    def discretize_x(self, x):
        x_bin = np.round(((x - self.env.observation_space.low)/self.discr_step)[0]).astype(np.int32)
        return x_bin

    def discretize_v(self, v):
        v_bin = np.round(((v - self.env.observation_space.low)/self.discr_step)[1]).astype(np.int32)
        return v_bin 

    def discretize(self, state):
        x_bin = self.discretize_x(state[0])
        v_bin = self.discretize_v(state[1])
        return x_bin*self.n_vbins + v_bin

    
   
    def update(self, state, action, next_state, reward):
        discr_state, discr_next_state = self.discretize(state), self.discretize(next_state)

        self.visited_state_action.add((discr_state,action))
        self.N[discr_state,action, discr_next_state] += 1

        
        total_visited = self.N[discr_state,action,:].sum()

        if total_visited > 0:
            self.P[discr_state, action, :] = self.N[discr_state, action,  :] / total_visited
            self.R[discr_state, action] = (self.R[discr_state, action]*(total_visited-1) + reward) / total_visited
            
        start = time.time()

        if discr_state < self.terminal_x_bin:
            self.Q[discr_state, action] = reward + (self.gamma)*(self.P[discr_state, action,:]*np.max(self.Q, axis = 1)[:]).sum()
        else:
            self.Q[discr_state, action] = reward

        self.lossHistory[self.up_count]= self.Q[discr_state, action]
        self.up_count += 1
        
        if not self.k_fixed:
            self.k = len(self.visited_state_action) // 10
            print("K changes")
            
        sampled_states = []
        if self.k >= 1:
            sampled_states = random.choices(list(self.visited_state_action), k = self.k)

            for (random_state, random_action) in sampled_states:
                if random_state < self.terminal_x_bin:
                    self.Q[random_state, random_action] = self.R[random_state, random_action] + (self.gamma)*(self.P[random_state, random_action,:]*np.max(self.Q, axis = 1)[:]).sum()
                else:
                    self.Q[random_state, random_action] = self.R[random_state, random_action]

                self.lossHistory[self.up_count]= self.Q[random_state, random_action]
                self.up_count += 1
        #print(time.time() - start)

    def observe(self):
        pass
    
    def select_action(self, state):
        state_bin = self.discretize(state)
        p = random.uniform(0,1)
        a=0
        if p < 1-self.epsilon :
            a = np.argmax(self.Q[state_bin,:])
        else:
            a = random.randint(0,2)
            
        return a
    '''
    Select actions without exploration (for the tests)
    '''  
    def select_best_action(self, state):
        state_bin = self.discretize(state)
        return np.argmax(self.Q[state_bin,:])
    '''
    Test the agent on a seed (random or not) after the training
    '''  
    def play(self, seed = False):
        newSeed = random.randint(0,100000)
        
        if seed != False:
            newSeed = seed
            
        state,_ = self.env.reset(seed = newSeed)
        done = False
                    
        while done == False:
                                        
            action = self.select_best_action(state)
            next_state, reward, terminated, truncated, _ = self.env.step(action)
            state = next_state
            done = terminated or truncated
        self.env.close()

    """
    # Reinitialisation de R nécessaire ?
    def reset_for_episode(self):
        self.N_episode = np.zeros(shape=(self.n_states, self.n_actions, self.n_states))
        
        self.W_episode = - np.ones(shape=(self.n_states, self.n_actions))
        terminal_x_bin = self.discretize_x(0.5)
        for state in range(terminal_x_bin, self.n_states):
            self.W_episode[state, :] = 0

    """  

    
    def train(self, episodes, debug_mode=True, epsilon_decrease=True, epsilonDecreasing=100):
        episodesHistory = np.zeros((episodes))
        self.lossHistory = np.zeros((int(episodes*200*(self.k+1))))
        self.up_count = 0
        for i in range(episodes):

            if epsilon_decrease: 
                if self.epsilon > self.epsilonMin:
                    self.epsilon = self.epsilonMax*math.exp(-i/epsilonDecreasing)
            else:
                self.epsilon = self.epsilonMax
                
            newSeed = random.randint(0,100000)
            state,_ = self.env.reset(seed = newSeed)
            
            done = False
            episode_reward = 0
            
            while not done:
            
                action = self.select_action(state)
                next_state, reward, terminated, truncated, _ = self.env.step(action)

                self.update(state, action, next_state, reward)
                
                episode_reward += reward
                state = next_state
                done = terminated or truncated
                if terminated: print("Terminated")

            #self.reset_for_episode()

             
            if debug_mode: print("Episode "+str(i+1)+ " , Reward: "+str(episode_reward)+" Epsilon: "+str(self.epsilon))
            episodesHistory[i] = episode_reward
        return episodesHistory

In [31]:
A = DynaAgent("id0", k = 200)
A.train(1000, debug_mode=True)

72
[0.6  0.07]
[-1.2  -0.07]
7
476
Episode 1 , Reward: -200.0 Epsilon: 0.9
Episode 2 , Reward: -200.0 Epsilon: 0.8910448503742513
Episode 3 , Reward: -200.0 Epsilon: 0.8821788059760798
Episode 4 , Reward: -200.0 Epsilon: 0.8734009801936573
Episode 5 , Reward: -200.0 Epsilon: 0.8647104952370909
Episode 6 , Reward: -200.0 Epsilon: 0.8561064820506427
Episode 7 , Reward: -200.0 Epsilon: 0.8475880802258239
Episode 8 , Reward: -200.0 Epsilon: 0.8391544379153535
Episode 9 , Reward: -200.0 Epsilon: 0.8308047117479722
Episode 10 , Reward: -200.0 Epsilon: 0.8225380667441053
Episode 11 , Reward: -200.0 Epsilon: 0.8143536762323635
Episode 12 , Reward: -200.0 Epsilon: 0.8062507217668754
Episode 13 , Reward: -200.0 Epsilon: 0.7982283930454418
Episode 14 , Reward: -200.0 Epsilon: 0.7902858878285052
Episode 15 , Reward: -200.0 Epsilon: 0.7824224118589252
Episode 16 , Reward: -200.0 Epsilon: 0.774637178782552
Episode 17 , Reward: -200.0 Epsilon: 0.7669294100695903
Episode 18 , Reward: -200.0 Epsilon: 0

(array([-200., -200., -200., -200., -200., -200., -200., -200., -200.,
        -200., -200., -200., -200., -200., -200., -200., -200., -200.,
        -200., -200., -200., -200., -200., -200., -200., -200., -200.,
        -200., -200., -200., -200., -200., -200., -200., -200., -200.,
        -200., -200., -200., -200., -200., -200., -200., -200., -200.,
        -200., -200., -200., -200., -200., -200., -200., -200., -200.,
        -200., -200., -200., -200., -200., -200., -200., -200., -200.,
        -200., -200., -200., -200., -200., -200., -200., -200., -200.,
        -200., -200., -200., -200., -200., -200., -200., -200., -200.,
        -200., -200., -200., -200., -200., -200., -200., -200., -200.,
        -200., -200., -200., -200., -200., -200., -200., -200., -200.,
        -200., -200., -200., -200., -200., -200., -200., -200., -200.,
        -200., -200., -200., -200., -200., -200., -200., -200., -200.,
        -200., -200., -200., -200., -200., -200., -200., -200., -200.,
      

In [35]:
Dyna_Test = DynaAgent("idTest", env=gym.make('MountainCar-v0', render_mode='human'))
Dyna_Test.Q = A.Q
episodesHistory = Dyna_Test.play(seed = False)

72
[0.6  0.07]
[-1.2  -0.07]
7
476
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
2
2
2
2
2
2
2
2
2
2
2
2
2
2
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
2
2
2
2
2
2
2
2
0
0
0
0
0
1
1
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
0
0
0
0
0
0
0
0
0
0
0
0
1
1
0
0
0
0
1
0
1
0
0
0
0
0
0
0
0
0
0
0
1
1
1
2
2
1
1
1
1
2
2
2
2
2
2
2
2
2
2
0
2
1
2
1
2
0
2
2
2
2
2
2
2
2
2
1
2
2
1
2
1
2
2
2
2
2
2
2
2
2
2


In [None]:
# Code Prof

def value_iteration(self, theta=0.001):
    """
    P : 3D array representing transition probabilities, P[s,a,s'] is the probability of transitioning from s to s' under action a.
    R : 2D array representing rewards for each state-action pair
    gamma : discount factor
    theta : stopping criterion
    """
    V = np.zeros(self.n_states)

    while True:
        delta = 0
        for s in range(self.n_states):
            v = V[s]
            # Calculate the value of each action in the current state
            action_values = np.zeros(self.n_actions)
            for a in range(self.n_actions):
                action_values[a] = np.sum(self.P[s, a] * (self.R[s, a] + self.gamma * v))
            # Update the value function
            V[s] = np.max(action_values)
            # Update the change in value function
            delta = max(delta, np.abs(v - V[s]))

        # If the change in value function is smaller than theta, stop
        if delta < theta:
            break
    
    policy = np.zeros(self.n_states, dtype=int)
    for s in range(self.n_states):
        # Calculate the value of each action in the current state
        action_values = np.zeros(self.n_actions)
        for a in range(self.n_actions):
            action_values[a] = np.sum(self.P[s, a] * (self.R[s, a] + self.gamma * V))
        # Choose the action with the maximum value
        policy[s] = np.argmax(action_values)
    
    return policy, V