In [2]:
from abc import ABC, abstractmethod
from collections import OrderedDict
import copy
import math
import os
import random

import gymnasium as gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
from tqdm import tqdm

In [3]:
### MLP
class Multi_Layer_Perceptron(nn.Sequential):
    """Sequential stack of ``nn.Linear`` layers with constant-valued initialisation.

    The layers are registered, in order, under the names ``"input"``,
    ``"0"`` .. ``str(depth - 1)`` and ``"output"``. No activation modules are
    placed between them, so the stack only contains linear layers.

    Args:
        input_dim: number of input features of the first layer.
        intern_dim: width of every hidden layer.
        output_dim: number of output features of the last layer.
        depth: number of hidden ``intern_dim -> intern_dim`` layers.
        isBiased: whether each linear layer owns a bias term.
    """

    def __init__(self, input_dim, intern_dim, output_dim, depth = 2, isBiased = False):
        layers = OrderedDict()
        layers["input"] = nn.Linear(input_dim, intern_dim, bias=isBiased)
        for hidden_index in range(depth):
            layers[str(hidden_index)] = nn.Linear(intern_dim, intern_dim, bias=isBiased)
        layers["output"] = nn.Linear(intern_dim, output_dim, bias=isBiased)

        super().__init__(layers)

        # Overwrite PyTorch's default initialisation: every parameter starts at 0.
        self.reset_init_weights_biases(0)

    def reset_init_weights_biases(self, norm = -1):
        """Fill every weight (and bias, when present) of each layer with one constant.

        Args:
            norm: the constant to write. The sentinel ``-1`` selects
                ``1 / sqrt(fan_in)`` instead, computed separately for each layer.
        """
        use_fan_in = norm == -1
        for layer in self.children():
            fill_value = 1. / math.sqrt(layer.weight.size(1)) if use_fan_in else norm
            layer.weight.data.fill_(fill_value)
            if layer.bias is not None:
                layer.bias.data.fill_(fill_value)

In [4]:
class Agent(ABC):
    """Abstract interface shared by every agent defined in this notebook.

    A concrete agent must implement ``observe``, ``select_action``, ``update``
    and ``train``; the base class only stores an identifier and the environment.
    """

    def __init__(self, id, env):
        """Remember the agent identifier and the environment it interacts with."""
        self.id, self.env = id, env

    @abstractmethod
    def observe(self, state, action, next_state, reward):
        """Receive one transition ``(state, action, next_state, reward)``."""

    @abstractmethod
    def select_action(self, state):
        """Return the action to take in ``state``."""

    @abstractmethod
    def update(self):
        """Perform one learning step."""

    @abstractmethod
    def train(self, episodes, debug_mode=False):
        """Run ``episodes`` training episodes; ``debug_mode`` enables verbose output."""
                

In [5]:
class RandomAgent(Agent):
    """Baseline agent that ignores what it observes and acts uniformly at random."""

    def observe(self, state, action, next_state, reward):
        """Nothing to record: the random policy does not learn."""
        pass

    def select_action(self, state):
        """Return a uniformly random action.

        The number of actions is read from ``self.env.action_space.n`` so that
        every action of the environment can be drawn (the previous hard-coded
        ``randint(0, 1)`` never selected action 2 of a three-action
        environment). When the environment exposes no such attribute, the
        historical two-action behaviour is used.
        """
        n_actions = getattr(getattr(self.env, "action_space", None), "n", None)
        if n_actions is None:
            return random.randint(0, 1)
        return random.randrange(int(n_actions))

    def update(self):
        """Nothing to update: the random policy has no parameters."""
        pass

    def train(self, episodes, debug_mode=False):
        """Roll out ``episodes`` episodes with the random policy.

        Args:
            episodes: number of episodes to play.
            debug_mode: when true, print a line at the start and end of each episode.

        Returns:
            A NumPy array of length ``episodes`` holding the total reward of
            each episode. The array is also printed, as before.
        """
        episodesHistory = np.zeros((episodes))

        for i in range(episodes):
            if debug_mode:
                print("Episode: "+str(i+1)+" starts")
            state, _ = self.env.reset(seed=random.randint(0, 100000))
            done = False
            episode_reward = 0

            while not done:
                action = self.select_action(state)
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                self.observe(state, action, next_state, reward)
                self.update()

                episode_reward += reward
                state = next_state
                done = terminated or truncated

            episodesHistory[i] = episode_reward
            if debug_mode:
                print("Episode "+str(i+1)+ " , Reward: "+str(episode_reward))

        print(episodesHistory[:])
        return episodesHistory

In [6]:
class DQNAgent(Agent):
    """Deep Q-learning agent whose network scores one (state, action) pair at a time.

    The Q-network takes ``[position, velocity, action]`` and returns a single
    value. A separate copy of the network (``QTarget``) is used to build the
    bootstrap targets and is synchronised with ``Q`` every ``refreshTarget``
    steps during training.

    One replay-buffer row is laid out as
    ``[state (2) | action (1) | next_state (2) | reward (1)]``.
    """

    STATE_DIM = 2        # columns 0..1 of a buffer row hold the state
    N_ACTIONS = 3        # actions are the integers 0, 1, 2
    GOAL_POSITION = 0.5  # a stored next position >= this value is treated as terminal

    def __init__(self, id, epsilonMax, epsilonMin = 0.05, Q = None, env = None, arrayNewPosReward = None, arrayNewVelReward = None, contReward = False, gamma = 0.99, replay_buffer_SIZE = 10000, batch_size = 64, observation_SIZE = 6, optimizer = torch.optim.AdamW):
        """Build the agent.

        Args:
            id: identifier stored on the agent.
            epsilonMax: initial (and maximal) exploration rate.
            epsilonMin: lower bound of the exploration rate.
            Q: Q-network; when ``None`` a fresh
                ``Multi_Layer_Perceptron(3, 64, 1, depth=2)`` is created for
                this instance (a default built in the signature would be
                shared by every agent).
            env: environment; when ``None`` a fresh ``MountainCar-v0`` is created.
            arrayNewPosReward: optional 2-D array with rows
                ``(position threshold, reward, used flag)``; ``None`` or a
                non-2-D / object array disables it. The array is copied, so the
                caller's data is never modified.
            arrayNewVelReward: optional 2-D array with rows
                ``(velocity threshold, bonus)``; same disabling rule.
            contReward: when true, the base reward becomes a function of the
                next position (see ``customReward``).
            gamma: discount factor.
            replay_buffer_SIZE: number of transitions kept in the replay buffer.
            batch_size: number of transitions sampled per update.
            observation_SIZE: width of one replay-buffer row.
            optimizer: optimizer class, instantiated on the parameters of ``Q``.
        """
        if Q is None:
            Q = Multi_Layer_Perceptron(input_dim = 3, intern_dim = 64, output_dim = 1, depth = 2, isBiased = False)
        if env is None:
            env = gym.make('MountainCar-v0')
        Agent.__init__(self, id, env)
        self.Q = Q
        # Independent copy: assigning Q itself would make the "target" track
        # every gradient step and defeat the purpose of a target network.
        self.QTarget = copy.deepcopy(Q)
        self.gamma = gamma
        self.epsilon = epsilonMax
        self.epsilonMax = epsilonMax
        self.epsilonMin = epsilonMin
        self.replay_buffer = np.zeros((replay_buffer_SIZE, observation_SIZE))
        self.batch_size = batch_size
        self.optimizer = optimizer(self.Q.parameters())
        self.replay_buffer_SIZE = replay_buffer_SIZE
        self.observation_SIZE = observation_SIZE
        self.arrayNewPosReward = self._as_reward_table(arrayNewPosReward)
        self.arrayNewVelReward = self._as_reward_table(arrayNewVelReward)
        self.contReward = contReward

    @staticmethod
    def _as_reward_table(table):
        """Return a private float copy of a 2-D reward table, or ``None`` when disabled."""
        if table is None:
            return None
        table = np.asarray(table)
        # ``np.array([None])`` was the historical way of disabling a table.
        if table.ndim != 2 or table.dtype == object:
            return None
        return table.astype(float, copy=True)

    def _sync_target(self):
        """Copy the current weights of ``Q`` into ``QTarget``."""
        self.QTarget.load_state_dict(self.Q.state_dict())

    @staticmethod
    def _with_action(states, action):
        """Append a constant ``action`` column to a ``(batch, state_dim)`` tensor."""
        column = torch.full((states.shape[0], 1), float(action))
        return torch.cat((states, column), dim=1)

    def observe(self, state, action, next_state, reward):
        """Pack one transition into a flat row ``[state, action, next_state, reward]``."""
        return np.concatenate((
            np.array(state),
            np.array([action]),
            np.array(next_state),
            np.array([reward]),
        ))

    def select_action(self, state):
        """Epsilon-greedy action selection.

        With probability ``1 - epsilon`` the action with the highest Q-value is
        returned (ties are broken uniformly at random); otherwise a uniformly
        random action is returned.
        """
        if random.uniform(0, 1) <= 1 - self.epsilon:
            state_row = np.asarray(state, dtype=np.float32)
            candidates = np.column_stack((
                np.tile(state_row, (self.N_ACTIONS, 1)),
                np.arange(self.N_ACTIONS, dtype=np.float32),
            ))
            # No gradient is needed to act, and a grad-tracking tensor cannot
            # be converted to NumPy.
            with torch.no_grad():
                q_values = self.Q(torch.from_numpy(candidates)).squeeze(-1).numpy()
            best_actions = np.flatnonzero(q_values == q_values.max())
            return int(random.choice(best_actions.tolist()))
        return random.randint(0, self.N_ACTIONS - 1)

    def update(self, j):
        """Run one gradient step on a mini-batch sampled from the replay buffer.

        Args:
            j: number of environment steps taken so far; only the first
                ``min(j, buffer size)`` rows of the buffer are sampled.

        Returns:
            ``(loss, grad)`` where ``loss`` is the MSE loss as a float and
            ``grad`` is the absolute value of the sum, over the layers of
            ``Q``, of the mean weight gradient.
        """
        n_stored = min(j, self.replay_buffer.shape[0])
        rows = np.random.choice(n_stored, self.batch_size)
        batch = torch.from_numpy(self.replay_buffer[rows]).to(torch.float32)

        sd = self.STATE_DIM
        state_action = batch[:, :sd + 1]
        next_state = batch[:, sd + 1:2 * sd + 1]
        reward = batch[:, 2 * sd + 1]
        # NOTE(review): terminal transitions are recognised from the stored
        # next position only — confirm this matches the environment's rule.
        not_terminal = next_state[:, 0] < self.GOAL_POSITION

        # Targets are constants for the loss: no gradient through QTarget.
        with torch.no_grad():
            next_q = torch.stack(
                [self.QTarget(self._with_action(next_state, a)).squeeze(-1)
                 for a in range(self.N_ACTIONS)],
                dim=1,
            )
            best_next = next_q.max(dim=1).values
            target = reward + torch.where(not_terminal, self.gamma * best_next, torch.zeros_like(best_next))

        prediction = self.Q(state_action).squeeze(-1)
        loss = torch.nn.functional.mse_loss(prediction, target, reduction='mean')

        self.optimizer.zero_grad(set_to_none=True)
        loss.backward()

        grad = 0
        for layer in self.Q.children():
            weight = getattr(layer, "weight", None)
            if weight is not None and weight.grad is not None:
                grad += weight.grad.mean()

        self.optimizer.step()
        return loss.item(), abs(grad)

    def customReward(self, state, action, currentReward, next_state, uniqueReward, excludePassivity):
        """Compute the shaped reward of one transition.

        The base reward is -1 (minus 2 more for action 1 when
        ``excludePassivity`` is set), or ``(|x' + 0.5| ** 2) / 200`` when
        ``contReward`` is set. A position-table row whose threshold has been
        reached by the next position, and whose "used" flag is 0, replaces the
        reward if its value is larger; with ``uniqueReward`` the row is then
        flagged as used. Every matching velocity-table row adds its bonus.
        ``currentReward`` (the environment reward) is not used.
        """
        reward = -1

        if excludePassivity and action == 1:
            reward -= 2

        if self.contReward:
            reward = (abs(next_state[0] + 0.5)**2)/200

        if self.arrayNewPosReward is not None:
            for k in range(self.arrayNewPosReward.shape[0]):
                threshold = self.arrayNewPosReward[k, 0]
                # Thresholds right of -0.5 are reached from below, those left of it from above.
                reached = (threshold <= next_state[0] and threshold + 0.5 >= 0) or (threshold >= next_state[0] and threshold + 0.5 <= 0)
                if reached and self.arrayNewPosReward[k, 2] == 0 and self.arrayNewPosReward[k, 1] > reward:
                    reward = self.arrayNewPosReward[k, 1]
                    if uniqueReward:
                        self.arrayNewPosReward[k, 2] = 1

        if self.arrayNewVelReward is not None:
            for k in range(self.arrayNewVelReward.shape[0]):
                threshold = self.arrayNewVelReward[k, 0]
                if (threshold < state[1] and threshold > 0 and action == 2) or (threshold > state[1] and threshold < 0 and action == 0):
                    reward += self.arrayNewVelReward[k, 1]

        return reward

    def play(self):
        """Play one greedy episode (epsilon = 0) and close the environment.

        The exploration rate is restored afterwards, and the environment is
        closed even when the rollout is interrupted.

        Returns:
            The sum of the environment rewards collected during the episode.
        """
        saved_epsilon = self.epsilon
        self.epsilon = 0
        total_reward = 0
        try:
            state, _ = self.env.reset(seed=random.randint(0, 100000))
            done = False
            while not done:
                action = self.select_action(state)
                # The state must advance, otherwise every action is chosen
                # from the initial observation.
                state, reward, terminated, truncated, _ = self.env.step(action)
                total_reward += reward
                done = terminated or truncated
        finally:
            self.epsilon = saved_epsilon
            self.env.close()
        return total_reward

    def train(self, episodes,lossLim = 0, limitStep = 200, refreshTarget = 200, refreshQ = 1, buffer_fill = True, epsilonDecreasing =100, debug_mode=False, recap_mode=False, reset_init = False, epsilon_decrease = True, uniqueReward = False, excludePassivity = True):
        """Train the agent.

        Args:
            episodes: maximum number of episodes.
            lossLim: training stops after an episode whose last update produced
                a loss ``<= lossLim`` (pass a negative value to disable).
            limitStep: maximum number of steps per episode.
            refreshTarget: the target network is synchronised every this many steps.
            refreshQ: a gradient update is performed every this many steps.
            buffer_fill: when true the replay buffer is circular; when false
                storing stops once the buffer is full.
            epsilonDecreasing: time constant (in episodes) of the exponential
                decay of epsilon.
            debug_mode: print every step and episode.
            recap_mode: print a one-line summary per episode.
            reset_init: when it compares different from ``False``, the weights
                of ``Q`` are re-filled with this value before training (note
                that ``0`` compares equal to ``False`` and therefore does nothing).
            epsilon_decrease: decay epsilon from ``epsilonMax`` down to
                ``epsilonMin``; otherwise keep it at ``epsilonMax``.
            uniqueReward, excludePassivity: forwarded to ``customReward``.

        Returns:
            ``(episodesHistory, rewardHistory, lossHistory, gradHistory,
            cumulativeHistory)``: total shaped reward per episode, shaped
            reward per step, loss and gradient metric per update, and running
            count of episodes that ended with ``terminated``.
        """
        episodesHistory = np.zeros((episodes))
        rewardHistory = np.zeros((episodes*limitStep))
        n_updates = int(episodes*limitStep/refreshQ)
        lossHistory = np.zeros((n_updates))
        gradHistory = np.zeros((n_updates))
        cumulativeHistory = np.zeros((episodes))

        if reset_init != False:
            self.Q.reset_init_weights_biases(reset_init)
        self._sync_target()
        self.replay_buffer = np.zeros((self.replay_buffer_SIZE, self.observation_SIZE))

        j = 0            # global step counter
        write_pos = 0    # next replay-buffer row to write
        successes = 0    # episodes that ended with terminated=True
        last_loss = 0

        for e in range(episodes):
            updated_this_episode = False
            terminated = False
            if debug_mode: print("Episode: "+str(e+1)+" starts")

            if epsilon_decrease:
                # Clamp so epsilon never drops below its floor, whatever its previous value.
                self.epsilon = max(self.epsilonMin, self.epsilonMax*math.exp(-e/epsilonDecreasing))
            else:
                self.epsilon = self.epsilonMax

            state, _ = self.env.reset(seed=random.randint(0, 100000))
            done = False
            episode_reward = 0
            if self.arrayNewPosReward is not None:
                self.arrayNewPosReward[:, 2] = 0   # every position reward is available again

            s = 1
            while not done:
                j += 1
                action = self.select_action(state)
                next_state, reward, terminated, truncated, _ = self.env.step(action)

                # Keep the environment's own truncation and add the local step limit.
                truncated = truncated or s >= limitStep
                s += 1

                reward = self.customReward(state, action, reward, next_state, uniqueReward, excludePassivity)
                if debug_mode: print("Action "+str(j-1)+" selected: "+str(action)+" Reward: "+ str(reward))
                transition = self.observe(state, action, next_state, reward)

                # Circular buffer: wrap first, then store, so no transition is lost.
                if write_pos >= self.replay_buffer_SIZE and buffer_fill:
                    write_pos = 0
                if write_pos < self.replay_buffer_SIZE:
                    self.replay_buffer[write_pos] = transition
                    write_pos += 1

                episode_reward += reward
                state = next_state
                done = terminated or truncated

                if j % refreshQ == 0:
                    last_loss, grad = self.update(j)
                    updated_this_episode = True
                    lossHistory[j // refreshQ - 1] = last_loss
                    gradHistory[j // refreshQ - 1] = grad
                if j % refreshTarget == 0:
                    print("Loss: "+str(last_loss))
                    self._sync_target()

                rewardHistory[j-1] = reward

            if terminated:
                successes += 1
            cumulativeHistory[e] = successes

            if debug_mode or recap_mode: print("Episode "+str(e+1)+ " , Reward: "+str(episode_reward)+" Epsilon: "+str(self.epsilon))
            episodesHistory[e] = episode_reward

            # Only a loss that was actually computed this episode may stop the training.
            if updated_this_episode and last_loss <= lossLim:
                print("Loss reaches limit")
                break
        return episodesHistory, rewardHistory, lossHistory, gradHistory, cumulativeHistory

In [6]:
# --- Hyper-parameters of the DQN run (also used to name the saved figures) ---
u = 0.25                      # unit of the position-based rewards
episodes = 1000
lossLimTrain = -1             # negative: the loss-based early stop never triggers
batch_size = 500
refreshQ = 100                # steps between two gradient updates
refreshTarget = 1000          # steps between two target-network refreshes
reset_init = 0.1              # constant written into every weight before training
replay_buffer_SIZE = 20000
epsilonDecreasing = 200

# Rows: (position threshold, reward, "already used" flag).
NewPosReward = np.array([
    (-1.1, u*7, 0),
    (-1, u*6, 0),
    (-0.9, u*5, 0),
    (-0.8, u*4, 0),
    (-0.7, u*3, 0),
    (-0.6, u*2, 0),
    (-0.4, u*2, 0),
    (-0.3, u*3, 0),
    (-0.2, u*4, 0),
    (-0.1, u*5, 0),
    (0, u*6, 0),
    (0.1, u*7, 0),
    (0.2, u*8, 0),
    (0.3, u*9, 0),
    (0.4, u*10, 0),
    (0.5, 20*u, 0),
])
# Rows: (velocity threshold, bonus).
NewVelReward = np.array([
    (0.001, 1),
    (-0.001, 1),
])
No = np.array([None])

DQN = DQNAgent(
    "id0",
    epsilonMax = 1,
    epsilonMin = 0.05,
    batch_size = batch_size,
    contReward = False,
    arrayNewPosReward = NewPosReward,
    arrayNewVelReward = NewVelReward,
    replay_buffer_SIZE = replay_buffer_SIZE,
)
episodesHistory, rewardHistory, lossHistory, gradHistory, cumulativeHistory = DQN.train(
    episodes,
    lossLim = lossLimTrain,
    limitStep = 200,
    buffer_fill = True,
    refreshTarget = refreshTarget,
    refreshQ = refreshQ,
    debug_mode = False,
    recap_mode = True,
    reset_init = reset_init,
    uniqueReward = True,
    excludePassivity = False,
    epsilonDecreasing = epsilonDecreasing,
)


Episode 1 , Reward: -154.5 Epsilon: 1.0


KeyboardInterrupt: 

In [None]:
def plot_history(values, xlabel, ylabel, file_prefix, run_tag, yscale='linear', out_dir='figures'):
    """Plot one training history against its index and save it as a PNG.

    Args:
        values: 1-D array to plot.
        xlabel, ylabel: axis labels.
        file_prefix: first part of the file name.
        run_tag: hyper-parameter suffix shared by every figure of a run.
        yscale: matplotlib scale of the y axis (``'linear'`` or ``'log'``).
        out_dir: output directory; created when missing so ``savefig`` cannot
            fail on a fresh checkout.

    Returns:
        The ``(figure, axes)`` pair.
    """
    os.makedirs(out_dir, exist_ok=True)
    fig, ax = plt.subplots(1, 1)
    ax.plot(range(values.shape[0]), values, marker='.')
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_xscale('linear')
    ax.set_yscale(yscale)
    ax.grid(color='black', which="both", linestyle='-', linewidth=0.2)
    fig.savefig(f'{out_dir}/{file_prefix}_{run_tag}.png', format='png')
    return fig, ax


# Suffix encoding the hyper-parameters of the run, identical for every figure.
run_tag = f'epochs{episodes}_u{u}_bS{batch_size}_rQ{refreshQ}_rT{refreshTarget}_rI{reset_init}_rB{replay_buffer_SIZE}_ep{epsilonDecreasing}'

fig1, ax1 = plot_history(episodesHistory, r'episodes $t$', r'$Total Reward$', 'TotalReward', run_tag)
fig2, ax2 = plot_history(rewardHistory, r'Step $t$', r'$Reward$', 'Reward', run_tag)
fig3, ax3 = plot_history(lossHistory, r'Update $t$', r'$Loss$', 'Loss', run_tag, yscale='log')
fig4, ax4 = plot_history(gradHistory, r'Update $t$', r'$Grad$', 'Grad', run_tag, yscale='log')

# The cumulative-success curve starts at the first successful episode. When no
# episode succeeded the whole history is shown and the label says so, instead
# of wrongly reporting episode 0 as the first success.
success_episodes = np.flatnonzero(cumulativeHistory >= 1)
first_success = int(success_episodes[0]) if success_episodes.size else 0
first_success_label = str(first_success) if success_episodes.size else 'none'
fig5, ax5 = plot_history(
    cumulativeHistory[first_success:],
    r'episodes: first_success: ep ' + first_success_label,
    r'$CumulativeSucess$',
    'CumulativeSucess',
    run_tag,
)

In [7]:
# Replace the agent's environment with one created in 'human' render mode.
DQN.env = gym.make('MountainCar-v0', render_mode='human')
# Run one episode with epsilon forced to 0; play() closes the environment when the episode ends.
DQN.play()

KeyboardInterrupt: 

In [7]:
# Bin widths for the two observation components (index 0, then index 1).
discr_step = np.array([0.025, 0.005])
env = gym.make('MountainCar-v0')
# Number of whole bins per component: observation range floor-divided by the bin width.
(env.observation_space.high - env.observation_space.low)//discr_step

array([72., 28.])

In [None]:

# See the course code: once a model of the environment has been learned, the
# optimisation problem is solved with dynamic programming.

class DynaAgent(Agent):
    """Tabular Dyna-style agent on a discretised observation space.

    The agent keeps transition counts ``N``, an estimated transition model
    ``P``, accumulated rewards ``W``, estimated rewards ``R`` and action
    values ``Q``. After every real step it refreshes the model for the visited
    ``(state, action)`` pair, backs up its Q-value, and then backs up a random
    sample of previously visited pairs.
    """

    def __init__(self, id, epsilonMax = 0.9, epsilonMin = 0.05,env = None, discr_step = np.array([0.025, 0.005]), gamma = 0.99, k=0, visited_state_action = None):
        """Build the tables.

        Args:
            id: identifier stored on the agent.
            epsilonMax, epsilonMin: bounds of the exploration rate.
            env: environment; when ``None`` a fresh ``MountainCar-v0`` is
                created for this instance.
            discr_step: bin widths ``[position, velocity]``.
            gamma: discount factor.
            k: initial number of planning backups per step (recomputed on
                every update).
            visited_state_action: optional iterable of already visited
                ``(state index, action)`` pairs; when ``None`` the agent
                starts with no visited pair (a mutable default would be
                shared between agents).
        """
        if env is None:
            env = gym.make('MountainCar-v0')
        Agent.__init__(self,id,env)
        self.discr_step = np.asarray(discr_step, dtype=float)
        # Bounds are cached on the instance so discretisation never depends on notebook globals.
        self.obs_low = np.asarray(env.observation_space.low, dtype=float)
        obs_span = np.asarray(env.observation_space.high, dtype=float) - self.obs_low
        n_bins = np.round(obs_span/self.discr_step).astype(np.int32)
        self.n_xbins = int(n_bins[0])
        self.n_vbins = int(n_bins[1])
        self.n_states = self.n_xbins*self.n_vbins
        self.n_actions = 3
        self.gamma = gamma
        self.epsilon = epsilonMax
        self.epsilonMax = epsilonMax
        self.epsilonMin = epsilonMin
        self.k = k
        # Unique visited pairs, kept as a list (for sampling) plus a set (for O(1) membership).
        self.visited_state_action = list(dict.fromkeys(visited_state_action or ()))
        self._visited_lookup = set(self.visited_state_action)

        self.N = np.zeros(shape=(self.n_states, self.n_actions, self.n_states))

        # Initial model: an arbitrary random distribution over next states for every (s, a).
        self.P = torch.rand(size=(self.n_states, self.n_actions, self.n_states)).numpy().astype(np.float64)
        self.P /= self.P.sum(axis=2, keepdims=True)

        self.R = np.zeros(shape=(self.n_states, self.n_actions))

        # NOTE(review): W starts at -1 (0 for indices at or beyond the bin of
        # x = 0.5), so R = W / count is offset by that initial value — confirm
        # this is intended.
        self.W = - np.ones(shape=(self.n_states, self.n_actions))
        terminal_offset = self.discretize_x(0.5)
        self.W[terminal_offset:, :] = 0

        self.Q = np.zeros(shape=(self.n_states, self.n_actions))

    # The states s of (s, a, s') are obtained by discretisation; the initial
    # distribution over s' is drawn at random for every (s, a) pair.

    def discretize_x(self, x):
        """Return the flat-index offset of the position bin of ``x``.

        The bin index is clipped to ``[0, n_xbins - 1]`` so the result always
        addresses a valid row block of the tables.
        """
        x_bin = int(np.round((x - self.obs_low[0])/self.discr_step[0]))
        x_bin = min(max(x_bin, 0), self.n_xbins - 1)
        return x_bin*self.n_vbins

    def discretize_v(self, v):
        """Return the velocity bin of ``v``, clipped to ``[0, n_vbins - 1]``."""
        v_bin = int(np.round((v - self.obs_low[1])/self.discr_step[1]))
        return min(max(v_bin, 0), self.n_vbins - 1)

    def discretize(self, state):
        """Map a ``[position, velocity]`` observation to its flat state index."""
        return self.discretize_x(state[0]) + self.discretize_v(state[1])

    def _backup(self, state_index, action):
        """One Bellman backup: ``Q[s, a] = R[s, a] + gamma * sum_s' P[s, a, s'] * max_a' Q[s', a']``."""
        self.Q[state_index, action] = self.R[state_index, action] + self.gamma*(self.P[state_index, action] @ self.Q.max(axis=1))

    def update(self, state, action, next_state, reward):
        """Update the model with one real transition, then plan.

        The counts, the whole transition row ``P[s, a, :]`` and the reward
        estimate of the visited pair are refreshed, its Q-value is backed up,
        and a tenth of the distinct visited pairs (sampled without
        replacement) are backed up as well.
        """
        discr_state, discr_next_state = self.discretize(state), self.discretize(next_state)
        action = int(action)

        self.N[discr_state, action, discr_next_state] += 1
        visited_pair = (discr_state, action)
        if visited_pair not in self._visited_lookup:
            self._visited_lookup.add(visited_pair)
            self.visited_state_action.append(visited_pair)

        counts = self.N[discr_state, action]
        n_visits = counts.sum()
        # Refresh the whole row so it stays a probability distribution;
        # updating a single entry leaves the others stale.
        self.P[discr_state, action, :] = counts/n_visits

        self.W[discr_state, action] += reward
        self.R[discr_state, action] = self.W[discr_state, action]/n_visits

        self._backup(discr_state, action)

        self.k = len(self.visited_state_action)//10
        for (random_state, random_action) in random.sample(self.visited_state_action, self.k):
            self._backup(random_state, random_action)

    def observe(self, state=None, action=None, next_state=None, reward=None):
        """Unused: the transition is consumed directly by ``update``."""
        pass

    def select_action(self, state):
        """Epsilon-greedy action; ties between greedy actions are broken at random."""
        state_bin = self.discretize(state)
        if random.uniform(0,1) < 1-self.epsilon:
            q_values = self.Q[state_bin, :]
            best_actions = np.flatnonzero(q_values == q_values.max())
            return int(random.choice(best_actions.tolist()))
        return random.randint(0, self.n_actions - 1)

    # TODO: should R and N be re-initialised at some point?

    def train(self, episodes, debug_mode=True, epsilon_decrease=True, epsilonDecreasing=100):
        """Train for ``episodes`` episodes.

        Args:
            episodes: number of episodes.
            debug_mode: print every selected action and every episode summary.
            epsilon_decrease: decay epsilon exponentially with the episode
                index, never below ``epsilonMin``; otherwise keep ``epsilonMax``.
            epsilonDecreasing: time constant (in episodes) of the decay.

        Returns:
            A NumPy array with the total environment reward of each episode.
        """
        episodesHistory = np.zeros((episodes))

        for i in range(episodes):
            if debug_mode: print("Episode: "+str(i+1)+" starts")

            if epsilon_decrease:
                # Decay with the episode index (the former exponent used the constant np.e).
                self.epsilon = max(self.epsilonMin, self.epsilonMax*math.exp(-i/epsilonDecreasing))
            else:
                self.epsilon = self.epsilonMax

            state, _ = self.env.reset(seed=random.randint(0,100000))

            done = False
            episode_reward = 0
            step = 0

            while not done:
                action = self.select_action(state)
                if debug_mode: print("Action :"+str(step)+" selected: "+str(action))
                step += 1
                next_state, reward, terminated, truncated, _ = self.env.step(action)

                self.update(state, action, next_state, reward)

                episode_reward += reward
                state = next_state
                done = terminated or truncated

            if debug_mode: print("Episode "+str(i+1)+ " , Reward: "+str(episode_reward))

            episodesHistory[i] = episode_reward
        return episodesHistory

In [14]:
# Create a Dyna agent with id 1; every other constructor argument keeps its default.
A = DynaAgent(1)

In [17]:
# Display the agent's transition-probability table (states x actions x next states).
A.P

array([[[4.30088694e-04, 9.56921067e-05, 9.18481674e-04, ...,
         4.14301205e-04, 3.00056185e-04, 6.76981814e-04],
        [7.78097310e-04, 2.89272375e-05, 5.00049617e-04, ...,
         7.00163364e-04, 2.72828853e-04, 4.14978742e-04],
        [5.47259173e-04, 7.81918061e-05, 3.97732249e-04, ...,
         1.33216163e-04, 9.01781314e-04, 8.26434931e-04]],

       [[1.94012091e-06, 4.68581187e-04, 6.93704875e-04, ...,
         6.02930668e-04, 1.86541933e-04, 1.63529912e-04],
        [6.02302025e-04, 5.48511336e-04, 1.69379404e-04, ...,
         5.71682285e-05, 6.87830732e-04, 9.94519214e-04],
        [6.56312739e-04, 7.77193287e-04, 8.66895716e-04, ...,
         1.04135383e-04, 1.18378601e-04, 4.84000018e-04]],

       [[1.18075390e-04, 7.30634929e-05, 8.15048697e-04, ...,
         3.85990890e-04, 3.59918195e-04, 7.94714782e-04],
        [3.06812552e-04, 5.06804325e-04, 5.12513689e-05, ...,
         8.07305798e-04, 3.44019616e-04, 3.80475074e-04],
        [8.68938514e-04, 8.84373963e

In [18]:
# Train for two episodes; debug_mode defaults to True, so every selected action is printed.
A.train(2)

Episode: 1 starts
Action :0 selected: 1
Action :1 selected: 2
Action :2 selected: 0
Action :3 selected: 1
Action :4 selected: 0
Action :5 selected: 0
Action :6 selected: 2
Action :7 selected: 0
Action :8 selected: 2
Action :9 selected: 2
Action :10 selected: 0
Action :11 selected: 1
Action :12 selected: 0
Action :13 selected: 2
Action :14 selected: 2
Action :15 selected: 2
Action :16 selected: 2
Action :17 selected: 1
Action :18 selected: 0
Action :19 selected: 0
Action :20 selected: 0
Action :21 selected: 0
Action :22 selected: 0
Action :23 selected: 0
Action :24 selected: 0
Action :25 selected: 0
Action :26 selected: 1
Action :27 selected: 0
Action :28 selected: 1
Action :29 selected: 0
Action :30 selected: 2
Action :31 selected: 2
Action :32 selected: 2
Action :33 selected: 1
Action :34 selected: 0
Action :35 selected: 1
Action :36 selected: 0
Action :37 selected: 1
Action :38 selected: 2
Action :39 selected: 0
Action :40 selected: 1
Action :41 selected: 1
Action :42 selected: 2
Act

array([-200., -200.])

In [12]:
# Display the agent's Q-table (one row per discretised state, one column per action).
A.Q

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       ...,
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [145]:
# Count the entries of W that are strictly greater than -1.
(A.W>-1).sum()

336

In [146]:
# Two hand-written two-component observations for exercising the agent by hand.
tab1 = np.array([0, -0.00069564])
tab2 = np.array([-0.54452115, -0.00069564])

In [147]:
# Feed one hand-made transition to the agent: state=tab1, action=2, next_state=tab2, reward=-1.
A.update(tab1, 2, tab2, -1)

In [None]:
# Reset the environment, then take action 1 followed by action 0;
# the return value of the last step is the cell's output.
env.reset()
env.step(1)
env.step(0)