In [1]:
from abc import ABC, abstractmethod
import gymnasium as gym
import random
import numpy as np
import torch
import torch.nn as nn
from collections import OrderedDict
import math
import random
import matplotlib.pyplot as plt
from tqdm import tqdm
import time
import scipy.sparse as sp

In [2]:
class Agent(ABC):
    """Abstract interface shared by every agent in this notebook.

    Concrete agents must implement ``observe``, ``select_action``, ``update``
    and ``train``; the base class only stores the identifier and environment.
    """

    def __init__(self, id, env):
        """Store the agent identifier and the environment it acts in."""
        self.env = env
        self.id = id

    @abstractmethod
    def observe(self, state, action, next_state, reward):
        """Receive one transition (state, action, next_state, reward)."""

    @abstractmethod
    def select_action(self, state):
        """Return the action to take in ``state``."""

    @abstractmethod
    def update(self):
        """Run one learning update."""

    @abstractmethod
    def train(self, episodes, debug_mode=False):
        """Train for ``episodes`` episodes; ``debug_mode`` toggles logging."""
                

In [3]:
# Sanity check of the grid size for a candidate discretisation:
# one step per observation dimension (index 0 and index 1 of the observation).
discr_step = np.array([0.025, 0.005])
env = gym.make('MountainCar-v0')
# Cell output: how many whole steps fit into each dimension's [low, high] range.
(env.observation_space.high - env.observation_space.low)//discr_step

array([72., 28.])

In [28]:

# See the course code: once a model of the environment has been obtained, solve the optimisation problem with dynamic programming.

class DynaAgent(Agent):
    """Tabular Dyna agent on a discretised (position, velocity) grid.

    The agent learns an empirical model of the environment from visit counts
    (``N`` -> transition estimate ``P`` and mean reward ``R``) and keeps a
    Q-table that is refreshed after every real step, plus ``k`` extra
    model-based backups on previously visited (state, action) pairs.

    Attributes:
        discr_step: bin width for each observation dimension.
        n_xbins, n_vbins: number of bins along position / velocity.
        n_states, n_actions: sizes of the tabular state and action spaces.
        N: visit counts, shape (n_states, n_actions, n_states).
        P: estimated transition probabilities, same shape as ``N``.
        R: estimated mean reward, shape (n_states, n_actions).
        Q: action values, shape (n_states, n_actions).
        terminal_x_bin: first flat state index whose position bin is at or
            beyond the terminal position.
        lossHistory: log of the Q-value written by every backup (despite the
            name these are Q-values, not losses); ``up_count`` is the number
            of valid entries.
    """

    # Position used to mark terminal bins (goal position of MountainCar-v0).
    _TERMINAL_X = 0.5
    # Step budget per episode used to pre-size the Q-value log in train().
    _STEPS_PER_EPISODE = 200

    def __init__(self, id, env=None, epsilonMax = 0.9, epsilonMin = 0.05, discr_step = np.array([0.025, 0.02]), gamma = 0.99, k=0, k_fixed = True, alpha=0.2, observation_SIZE = 6, replay_buffer_SIZE = 10000):
        """Build the tables for the given environment.

        Args:
            id: agent identifier.
            env: environment to act in; when ``None`` a fresh
                ``MountainCar-v0`` is created for this instance (a default
                built in the signature would be shared by all instances).
            epsilonMax, epsilonMin: bounds of the exploration rate.
            discr_step: bin width per observation dimension.
            gamma: discount factor.
            k: number of model-based backups per real step.
            k_fixed: when False, ``k`` follows the number of visited pairs.
            alpha: stored but not used by this class.
            observation_SIZE, replay_buffer_SIZE: size of the (unused)
                ``visited_state_action_Array`` buffer.
        """
        if env is None:
            env = gym.make('MountainCar-v0')
        Agent.__init__(self,id,env)

        # Copy so that the shared default array can never be mutated via self.
        self.discr_step = np.array(discr_step, dtype=float)
        low = self.env.observation_space.low
        high = self.env.observation_space.high
        bins_per_dim = np.round((high - low)/self.discr_step).astype(np.int32)

        self.n_xbins = int(bins_per_dim[0])
        print(self.n_xbins)
        print(high)
        print(low)
        self.n_vbins = int(bins_per_dim[1])
        print(self.n_vbins)
        self.n_states = self.n_xbins*self.n_vbins
        self.n_actions = 3
        self.gamma = gamma
        self.epsilon = epsilonMax
        self.epsilonMax = epsilonMax
        self.epsilonMin = epsilonMin
        self.k = k
        self.k_fixed = k_fixed
        self.alpha = alpha

        # Replay-buffer placeholders (the array is allocated but never filled
        # by this class; the set holds every visited (state, action) pair).
        self.replay_buffer_SIZE = replay_buffer_SIZE
        self.observation_SIZE = observation_SIZE
        self.visited_state_action_Array = np.zeros((replay_buffer_SIZE,observation_SIZE))
        self.visited_state_action = set()

        self.N = np.zeros((self.n_states, self.n_actions, self.n_states))

        # Start from a random transition model whose rows sum to one.
        self.P = np.random.rand(self.n_states, self.n_actions, self.n_states)
        self.P /= self.P.sum(axis=2, keepdims=True)

        self.R = - np.ones(shape=(self.n_states, self.n_actions))
        self.terminal_x_bin = self.discretize_x(self._TERMINAL_X)*self.n_vbins
        print(self.terminal_x_bin)

        self.Q = np.zeros(shape=(self.n_states, self.n_actions))

        # Empty log that grows on demand, so update() works before train().
        self.lossHistory = np.zeros(0)
        self.up_count = 0

    def _bin_index(self, value, dim, n_bins):
        """Map a continuous value to a bin index in ``[0, n_bins - 1]``.

        Flooring and clipping yields exactly ``n_bins`` distinct indices, so
        the flat index built in ``discretize`` is unique per cell and always
        smaller than ``n_states``.
        """
        low = self.env.observation_space.low[dim]
        raw = np.floor((value - low)/self.discr_step[dim])
        return int(np.clip(raw, 0, n_bins - 1))

    def discretize_x(self, x):
        """Return the position bin of ``x``."""
        return self._bin_index(x, 0, self.n_xbins)

    def discretize_v(self, v):
        """Return the velocity bin of ``v``."""
        return self._bin_index(v, 1, self.n_vbins)

    def discretize(self, state):
        """Return the flat state index (position-major) of an observation."""
        x_bin = self.discretize_x(state[0])
        v_bin = self.discretize_v(state[1])
        return x_bin*self.n_vbins + v_bin

    def _record_q(self, value):
        """Append ``value`` to ``lossHistory``, doubling its capacity if full."""
        capacity = len(self.lossHistory)
        if self.up_count >= capacity:
            grown = np.zeros(max(1024, 2*capacity))
            grown[:self.up_count] = self.lossHistory[:self.up_count]
            self.lossHistory = grown
        self.lossHistory[self.up_count] = value
        self.up_count += 1

    def _backup(self, state_idx, action, immediate_reward):
        """Bellman backup of ``Q[state_idx, action]`` under the learned model.

        States at or beyond ``terminal_x_bin`` are treated as terminal and do
        not bootstrap from successor values.
        """
        if state_idx < self.terminal_x_bin:
            expected_next = (self.P[state_idx, action,:]*np.max(self.Q, axis = 1)[:]).sum()
            self.Q[state_idx, action] = immediate_reward + (self.gamma)*expected_next
        else:
            self.Q[state_idx, action] = immediate_reward
        self._record_q(self.Q[state_idx, action])

    def update(self, state, action, next_state, reward):
        """Learn from one real transition, then run ``k`` planning backups.

        The model (``N``, ``P``, ``R``) is updated first; the visited pair is
        then backed up with the observed reward, and ``k`` pairs sampled with
        replacement from the visited set are backed up with the model reward.
        """
        discr_state, discr_next_state = self.discretize(state), self.discretize(next_state)

        self.visited_state_action.add((discr_state,action))
        self.N[discr_state,action, discr_next_state] += 1

        # At least 1 because the count was just incremented.
        total_visited = self.N[discr_state,action,:].sum()
        self.P[discr_state, action, :] = self.N[discr_state, action,  :] / total_visited
        # Running mean of the observed rewards for this pair.
        self.R[discr_state, action] = (self.R[discr_state, action]*(total_visited-1) + reward) / total_visited

        self._backup(discr_state, action, reward)

        if not self.k_fixed:
            new_k = len(self.visited_state_action) // 10
            # Only announce actual changes instead of printing on every step.
            if new_k != self.k:
                self.k = new_k
                print("K changes")

        if self.k >= 1:
            sampled_pairs = random.choices(list(self.visited_state_action), k = self.k)
            for (random_state, random_action) in sampled_pairs:
                self._backup(random_state, random_action, self.R[random_state, random_action])

    def observe(self, state=None, action=None, next_state=None, reward=None):
        """No-op: learning happens in ``update``.

        Accepts the arguments of the abstract ``Agent.observe`` so the agent
        can be driven through the base-class interface; a bare ``observe()``
        call keeps working.
        """
        pass

    def select_action(self, state):
        """Epsilon-greedy action: greedy with probability ``1 - epsilon``."""
        state_bin = self.discretize(state)
        if random.uniform(0,1) < 1-self.epsilon:
            return np.argmax(self.Q[state_bin,:])
        return random.randint(0, self.n_actions - 1)

    def select_best_action(self, state):
        """Greedy action without exploration (used for evaluation)."""
        state_bin = self.discretize(state)
        return np.argmax(self.Q[state_bin,:])

    def play(self, seed = False):
        """Run one greedy episode after training, printing each action.

        Args:
            seed: environment seed; ``False`` or ``None`` draws a random one.
                ``0`` is honoured as a real seed.

        The environment is closed when the episode ends.
        """
        newSeed = random.randint(0,100000)

        # Identity test: `seed != False` would also discard seed=0.
        if seed is not False and seed is not None:
            newSeed = seed

        state,_ = self.env.reset(seed = newSeed)
        done = False

        while not done:
            action = self.select_best_action(state)
            print(action)
            next_state, reward, terminated, truncated, _ = self.env.step(action)
            state = next_state
            done = terminated or truncated
        self.env.close()

    def train(self, episodes, debug_mode=True, epsilon_decrease=True, epsilonDecreasing=100):
        """Train for ``episodes`` episodes with epsilon-greedy exploration.

        Args:
            episodes: number of episodes to run.
            debug_mode: print a summary line after every episode.
            epsilon_decrease: decay epsilon exponentially with the episode
                index (never below ``epsilonMin``); otherwise keep
                ``epsilonMax``.
            epsilonDecreasing: time constant (in episodes) of the decay.

        Returns:
            Tuple ``(episode returns, mean of Q, mean of R, mean over (s, a)
            of the largest transition probability)``.
        """
        episodesHistory = np.zeros((episodes))
        # Initial capacity only; _record_q grows the log if more is needed.
        self.lossHistory = np.zeros((int(episodes*self._STEPS_PER_EPISODE*(self.k+1))))
        self.up_count = 0
        for i in range(episodes):

            if epsilon_decrease:
                decayed = self.epsilonMax*math.exp(-i/epsilonDecreasing)
                self.epsilon = max(self.epsilonMin, decayed)
            else:
                self.epsilon = self.epsilonMax

            newSeed = random.randint(0,100000)
            state,_ = self.env.reset(seed = newSeed)

            done = False
            episode_reward = 0

            while not done:

                action = self.select_action(state)
                next_state, reward, terminated, truncated, _ = self.env.step(action)

                self.update(state, action, next_state, reward)

                episode_reward += reward
                state = next_state
                done = terminated or truncated
                if terminated: print("Terminated")

            if debug_mode: print("Episode "+str(i+1)+ " , Reward: "+str(episode_reward)+" Epsilon: "+str(self.epsilon))
            episodesHistory[i] = episode_reward
        # Use this agent's own model, not a notebook-level global.
        return episodesHistory, self.Q.mean(), self.R.mean(), np.max(self.P, axis=2).mean()

In [29]:
# Train a Dyna agent that performs k=200 model-based backups after every real step.
A = DynaAgent("id0", k = 200)
# 50 training episodes; the returned tuple (episode returns, mean Q, mean R,
# mean of the max transition probability) is displayed as the cell output.
A.train(50, debug_mode=True)

72
[0.6  0.07]
[-1.2  -0.07]
7
476


  2%|▏         | 1/50 [00:00<00:44,  1.11it/s]

Episode 1 , Reward: -200.0 Epsilon: 0.9


  4%|▍         | 2/50 [00:01<00:41,  1.15it/s]

Episode 2 , Reward: -200.0 Epsilon: 0.8910448503742513


  6%|▌         | 3/50 [00:02<00:41,  1.13it/s]

Episode 3 , Reward: -200.0 Epsilon: 0.8821788059760798


  8%|▊         | 4/50 [00:03<00:38,  1.20it/s]

Episode 4 , Reward: -200.0 Epsilon: 0.8734009801936573


 10%|█         | 5/50 [00:04<00:35,  1.26it/s]

Episode 5 , Reward: -200.0 Epsilon: 0.8647104952370909


 12%|█▏        | 6/50 [00:04<00:34,  1.29it/s]

Episode 6 , Reward: -200.0 Epsilon: 0.8561064820506427


 14%|█▍        | 7/50 [00:05<00:32,  1.31it/s]

Episode 7 , Reward: -200.0 Epsilon: 0.8475880802258239


 16%|█▌        | 8/50 [00:06<00:31,  1.31it/s]

Episode 8 , Reward: -200.0 Epsilon: 0.8391544379153535


 18%|█▊        | 9/50 [00:07<00:30,  1.34it/s]

Episode 9 , Reward: -200.0 Epsilon: 0.8308047117479722


 20%|██        | 10/50 [00:07<00:29,  1.35it/s]

Episode 10 , Reward: -200.0 Epsilon: 0.8225380667441053


 22%|██▏       | 11/50 [00:08<00:28,  1.36it/s]

Episode 11 , Reward: -200.0 Epsilon: 0.8143536762323635


 24%|██▍       | 12/50 [00:09<00:27,  1.36it/s]

Episode 12 , Reward: -200.0 Epsilon: 0.8062507217668754


 26%|██▌       | 13/50 [00:10<00:27,  1.36it/s]

Episode 13 , Reward: -200.0 Epsilon: 0.7982283930454418


 28%|██▊       | 14/50 [00:10<00:26,  1.36it/s]

Episode 14 , Reward: -200.0 Epsilon: 0.7902858878285052


 30%|███       | 15/50 [00:11<00:26,  1.33it/s]

Episode 15 , Reward: -200.0 Epsilon: 0.7824224118589252


 32%|███▏      | 16/50 [00:12<00:25,  1.35it/s]

Episode 16 , Reward: -200.0 Epsilon: 0.774637178782552


 34%|███▍      | 17/50 [00:12<00:24,  1.35it/s]

Episode 17 , Reward: -200.0 Epsilon: 0.7669294100695903


 36%|███▌      | 18/50 [00:13<00:23,  1.34it/s]

Episode 18 , Reward: -200.0 Epsilon: 0.7592983349367454


 38%|███▊      | 19/50 [00:14<00:22,  1.35it/s]

Episode 19 , Reward: -200.0 Epsilon: 0.7517431902701448


 40%|████      | 20/50 [00:15<00:22,  1.34it/s]

Episode 20 , Reward: -200.0 Epsilon: 0.744263220549026


 42%|████▏     | 21/50 [00:15<00:21,  1.35it/s]

Episode 21 , Reward: -200.0 Epsilon: 0.7368576777701836


 44%|████▍     | 22/50 [00:16<00:22,  1.22it/s]

Episode 22 , Reward: -200.0 Epsilon: 0.7295258213731683


 46%|████▌     | 23/50 [00:17<00:22,  1.21it/s]

Episode 23 , Reward: -200.0 Epsilon: 0.7222669181662307


 48%|████▊     | 24/50 [00:18<00:21,  1.22it/s]

Episode 24 , Reward: -200.0 Epsilon: 0.7150802422530007


 50%|█████     | 25/50 [00:19<00:19,  1.26it/s]

Episode 25 , Reward: -200.0 Epsilon: 0.7079650749598981


 52%|█████▏    | 26/50 [00:20<00:18,  1.30it/s]

Episode 26 , Reward: -200.0 Epsilon: 0.7009207047642644


 54%|█████▍    | 27/50 [00:20<00:17,  1.32it/s]

Episode 27 , Reward: -200.0 Epsilon: 0.6939464272232096


 56%|█████▌    | 28/50 [00:21<00:16,  1.31it/s]

Episode 28 , Reward: -200.0 Epsilon: 0.6870415449031678


 58%|█████▊    | 29/50 [00:22<00:15,  1.34it/s]

Episode 29 , Reward: -200.0 Epsilon: 0.6802053673101529


 60%|██████    | 30/50 [00:22<00:14,  1.36it/s]

Episode 30 , Reward: -200.0 Epsilon: 0.6734372108207087


 62%|██████▏   | 31/50 [00:23<00:13,  1.36it/s]

Episode 31 , Reward: -200.0 Epsilon: 0.666736398613546


 64%|██████▍   | 32/50 [00:24<00:13,  1.36it/s]

Episode 32 , Reward: -200.0 Epsilon: 0.6601022606018603


 66%|██████▌   | 33/50 [00:25<00:12,  1.36it/s]

Episode 33 , Reward: -200.0 Epsilon: 0.6535341333663218


 68%|██████▊   | 34/50 [00:25<00:11,  1.35it/s]

Episode 34 , Reward: -200.0 Epsilon: 0.6470313600887335


 70%|███████   | 35/50 [00:26<00:11,  1.35it/s]

Episode 35 , Reward: -200.0 Epsilon: 0.6405932904863487


 72%|███████▏  | 36/50 [00:27<00:10,  1.34it/s]

Episode 36 , Reward: -200.0 Epsilon: 0.6342192807468421


 74%|███████▍  | 37/50 [00:28<00:09,  1.35it/s]

Episode 37 , Reward: -200.0 Epsilon: 0.627908693463928


 76%|███████▌  | 38/50 [00:28<00:09,  1.30it/s]

Episode 38 , Reward: -200.0 Epsilon: 0.6216608975736192


 78%|███████▊  | 39/50 [00:29<00:08,  1.31it/s]

Episode 39 , Reward: -200.0 Epsilon: 0.6154752682911203


 80%|████████  | 40/50 [00:30<00:07,  1.30it/s]

Episode 40 , Reward: -200.0 Epsilon: 0.6093511870483482


 82%|████████▏ | 41/50 [00:31<00:06,  1.30it/s]

Episode 41 , Reward: -200.0 Epsilon: 0.6032880414320754


 84%|████████▍ | 42/50 [00:32<00:06,  1.29it/s]

Episode 42 , Reward: -200.0 Epsilon: 0.5972852251226874


 86%|████████▌ | 43/50 [00:33<00:05,  1.18it/s]

Episode 43 , Reward: -200.0 Epsilon: 0.591342137833551


 88%|████████▊ | 44/50 [00:34<00:05,  1.15it/s]

Episode 44 , Reward: -200.0 Epsilon: 0.5854581852509849


 90%|█████████ | 45/50 [00:34<00:04,  1.18it/s]

Episode 45 , Reward: -200.0 Epsilon: 0.5796327789748272


 92%|█████████▏| 46/50 [00:35<00:03,  1.22it/s]

Episode 46 , Reward: -200.0 Epsilon: 0.573865336459596


 94%|█████████▍| 47/50 [00:36<00:02,  1.25it/s]

Episode 47 , Reward: -200.0 Epsilon: 0.5681552809562334


 96%|█████████▌| 48/50 [00:37<00:01,  1.27it/s]

Episode 48 , Reward: -200.0 Epsilon: 0.5625020414544307


 98%|█████████▊| 49/50 [00:37<00:00,  1.27it/s]

Episode 49 , Reward: -200.0 Epsilon: 0.5569050526255268


100%|██████████| 50/50 [00:38<00:00,  1.29it/s]

Episode 50 , Reward: -200.0 Epsilon: 0.5513637547659745





(array([-200., -200., -200., -200., -200., -200., -200., -200., -200.,
        -200., -200., -200., -200., -200., -200., -200., -200., -200.,
        -200., -200., -200., -200., -200., -200., -200., -200., -200.,
        -200., -200., -200., -200., -200., -200., -200., -200., -200.,
        -200., -200., -200., -200., -200., -200., -200., -200., -200.,
        -200., -200., -200., -200., -200.]),
 -8.38010683879843,
 -1.0,
 0.2340591788909888)

In [9]:
# Replay the greedy policy of the trained agent `A` in a rendered environment.
# NOTE(review): this cell needs `A` from the training cell, yet its execution
# count (9) is lower than that cell's (29) - re-run top to bottom to confirm.
Dyna_Test = DynaAgent("idTest", env=gym.make('MountainCar-v0', render_mode='human'))
# The Q-table is shared by reference with `A`; no copy is made.
Dyna_Test.Q = A.Q
# seed=False lets play() draw a random seed for the episode.
Dyna_Test.play(seed = False)

72
7
476
2
2
2
2
2
2
2
2
2
2
2
2
2
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
1
1
1
2
2
1
0
0
0
0
2
1
2
2
2
1
2
1
2
2
2
2
2
2
2
2
1
2
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
0
0
0
1
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
0
0
0
0
2
1
2
0
0
0
1
2
0
2
2
2
2
2
1
1
2
1
0
0
0
2
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
1
1
0


In [None]:
# Code provided by the professor

def value_iteration(self, theta=0.001):
    """Solve the tabular MDP stored on ``self`` by value iteration.

    Reads from ``self``:
        n_states, n_actions: sizes of the state and action spaces.
        P: 3D array, ``P[s, a, s']`` is the probability of moving from ``s``
            to ``s'`` under action ``a``.
        R: 2D array of rewards for each state-action pair.
        gamma: discount factor.

    Args:
        theta: stopping criterion; sweeps stop once the largest change of
            the value function in a sweep is below ``theta``.

    Returns:
        ``(policy, V)``: the greedy action per state (int array) and the
        state-value estimates.
    """
    V = np.zeros(self.n_states)

    def action_values(s):
        # sum_{s'} P[s, a, s'] * (R[s, a] + gamma * V[s']) for every action a.
        # The successor values V[s'] must be used here, not the old value of
        # s itself, otherwise the transition model has no effect on V.
        return np.sum(self.P[s] * (self.R[s][:, None] + self.gamma * V[None, :]), axis=1)

    while True:
        delta = 0
        for s in range(self.n_states):
            previous = V[s]
            # In-place sweep: later states already see this update.
            V[s] = np.max(action_values(s))
            delta = max(delta, np.abs(previous - V[s]))

        # If the change in value function is smaller than theta, stop
        if delta < theta:
            break

    # Greedy policy with respect to the converged values.
    policy = np.zeros(self.n_states, dtype=int)
    for s in range(self.n_states):
        policy[s] = np.argmax(action_values(s))

    return policy, V