In [1]:
from abc import ABC, abstractmethod
import gymnasium as gym
import random
import numpy as np
import torch
import torch.nn as nn
from collections import OrderedDict
import math
import random

In [2]:
### MLP
class Multi_Layer_Perceptron(nn.Sequential):
    def __init__(self, input_dim, intern_dim, output_dim, depth = 2, isBiased = False):
        
        dict = OrderedDict([("input",nn.Linear(input_dim,intern_dim, bias=isBiased))])
        for i in range(depth):
            dict.update({str(i) : nn.Linear(intern_dim,intern_dim,bias=isBiased)})
        dict.update({"output" : nn.Linear(intern_dim,output_dim,bias=isBiased)})

        super().__init__(dict)

        self.reset_init_weights_biases(0) # so that we do not use a default initialization

    def reset_init_weights_biases(self, norm = None):
        for layer in self.children():
            if norm == None:
                stdv = 1. / math.sqrt(layer.weight.size(1))
            else :
                stdv = norm
            
            layer.weight.data.uniform_(-stdv, stdv)
            if layer.bias is not None:
                layer.bias.data.uniform_(-stdv, stdv)

In [3]:
### CustomLoss DQN
class DQN_Loss(nn.Module):
    def __init__(self):
        super(DQN_Loss, self).__init__()

    def forward(self, input, target):
        loss = (target - input)**2
        return loss

In [4]:
class Agent(ABC):

    @abstractmethod
    def observe(self, state, action, next_state, reward):
        pass

    @abstractmethod
    def select_action(self, state):
        pass
    
    @abstractmethod
    def update(self):
        pass

    @abstractmethod
    def train(self, episodes, debug_mode=False):
        pass

    def __init__(self, id, env):
        self.id = id
        self.env = env
                

In [5]:
class RandomAgent(Agent):
    def observe(self, state, action, next_state, reward):
        pass
        
    def select_action(self, state):
        return random.randint(0,1)
        
        
    def update(self):
        pass

    def train(self, episodes, debug_mode=False):
        episodesHistory = np.zeros((episodes))
        
        for i in range(episodes):
            if(debug_mode) : print("Episode: "+str(i+1)+" starts")
            newSeed = random.randint(0,100000)
            state,_ = self.env.reset(seed = newSeed)
            done = False
            episode_reward = 0
            
            while not done:
                                        
                action = self.select_action(state)
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                self.observe(state,action,next_state,reward)
                self.update()

                episode_reward += reward
                state = next_state
                done = terminated or truncated

            episodesHistory[i] = episode_reward
            if(debug_mode) : print("Episode "+str(i+1)+ " , Reward: "+str(episode_reward))
        print(episodesHistory[:])

In [34]:
class DQNAgent(Agent):

    def __init__(self, id, epsilon, Q = Multi_Layer_Perceptron(input_dim = 3,intern_dim = 64, output_dim = 1, depth = 2, isBiased = True), env = gym.make('MountainCar-v0'), arrayNewReward = None, gamma = 0.99, replay_buffer_SIZE = 10000, batch_size = 64, observation_SIZE = 6, optimizer = torch.optim.AdamW):
        Agent.__init__(self,id,env)
        self.Q = Q
        self.gamma = gamma
        self.epsilon = epsilon
        self.replay_buffer = np.zeros((replay_buffer_SIZE,observation_SIZE))
        self.batch_size = batch_size
        self.optimizer = optimizer(self.Q.parameters())
        self.replay_buffer_SIZE = replay_buffer_SIZE
        self.observation_SIZE = observation_SIZE
        self.arrayNewReward = arrayNewReward
        
    def observe(self, state, action, next_state, reward):
        state = np.array(state)
        action = np.array([action])
        next_state = np.array(next_state)
        reward = np.array([reward])

        concatenatation = np.concatenate((state, action, next_state, reward))
        return concatenatation
        
    def select_action(self, state):
        P = random.uniform(0,1)
        a=0
        if P < 1-self.epsilon :
            print(self.env.action_space.shape)
            A=np.zeros(3)
            print(A)
            for k in range(A.shape[0]):
                A[k] = self.Q(torch.from_numpy(np.concatenate((np.array(state), np.array([k])))).to(torch.float32))
            a = np.argmax(A)
        else:
            a = random.randint(0,3)
            
        return a
        
        
    def update(self):

        LossFct = DQN_Loss()
        
        batch = np.random.choice(self.replay_buffer.shape[0], self.batch_size)

        for i in range(self.batch_size):

            A0 = torch.from_numpy(np.concatenate((self.replay_buffer[batch[i]][3:5], np.array([0])))).to(torch.float32)
            A1 = torch.from_numpy(np.concatenate((self.replay_buffer[batch[i]][3:5], np.array([1])))).to(torch.float32)
            target = self.replay_buffer[batch[i]][5] + self.gamma*max(self.Q(A0),self.Q(A1))
            input = self.Q(torch.from_numpy(self.replay_buffer[batch[i]][:3]).to(torch.float32))

            loss = LossFct(input, target)
            
            self.optimizer.zero_grad()
            loss.backward()
            #print(loss.item())
            self.optimizer.step()

    def customReward(currentReward):
        reward = currentReward
        if self.arrayNewReward == None:
            return reward
            
        for i in self.arrayNewReward[:][0]:
            if i == next_state[0]:
                reward = self.arrayNewReward[i][1]
                break
        return reward
        
    def train(self, episodes, debug_mode=False):
        episodesHistory = np.zeros((episodes))
        
        for i in range(episodes):
            if debug_mode: print("Episode: "+str(i+1)+" starts")
            newSeed = random.randint(0,100000)
            state,_ = self.env.reset(seed = newSeed)
            done = False
            episode_reward = 0
            self.replay_buffer = np.zeros((self.replay_buffer_SIZE, self.observation_SIZE))
            k=0
            
            while done == False:
                                        
                action = self.select_action(state)
                if debug_mode: print("Action "+str(k)+" selected: "+str(action))
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                reward = self.customReward(currentReward = reward)
                observe = self.observe(state,action,next_state,reward)
                self.replay_buffer[k] = observe
                k+=1
                episode_reward += reward
                state = next_state
                done = terminated or truncated

            if debug_mode: print("Episode "+str(i+1)+ " , Reward: "+str(episode_reward))
            self.update()
            episodesHistory[i] = episode_reward
        return episodesHistory[:]

In [13]:
class CustomEnv(gym.Env):
    def __init__(self, arrayNewReward, render = None, **kwargs):
        super(CustomEnv, self).__init__(**kwargs)
        self.basicEnv = gym.make('MountainCar-v0', render_mode=render)
        self.arrayNewReward = arrayNewReward

    def reset(self,**kwargs):
        return self.env.reset(**kwargs)

    def step(self, action):
        next_state, reward, terminated, truncated, info = self.env.step(action)
        for i in self.arrayNewReward[:][0]:
            if i == next_state[0]:
                reward = self.arrayNewReward[i][1]
                break
        return next_state, reward, terminated, truncated, info 

    def close(self):
        self.env.close()



In [32]:
DQN = DQNAgent(0,0.1, env= CustomEnv(arrayNewReward=np.array([(-0.3,10),(0.3,5)])))
print(DQN.train(1000, debug_mode = False))

[-200. -200. -200. -200. -200. -200. -200. -200. -200. -200. -200. -200.
 -200. -200. -200. -200. -200. -200. -200. -200. -200. -200. -200. -200.
 -200. -200. -200. -200. -200. -200. -200. -200. -200. -200. -200. -200.
 -200. -200. -200. -200. -200. -200. -200. -200. -200. -200. -200. -200.
 -200. -200. -200. -200. -200. -200. -200. -200. -200. -200. -200. -200.
 -200. -200. -200. -200. -200. -200. -200. -200. -200. -200. -200. -200.
 -200. -200. -200. -200. -200. -200. -200. -200. -200. -200. -200. -200.
 -200. -200. -200. -200. -200. -200. -200. -200. -200. -200. -200. -200.
 -200. -200. -200. -200. -200. -200. -200. -200. -200. -200. -200. -200.
 -200. -200. -200. -200. -200. -200. -200. -200. -200. -200. -200. -200.
 -200. -200. -200. -200. -200. -200. -200. -200. -200. -200. -200. -200.
 -200. -200. -200. -200. -200. -200. -200. -200. -200. -200. -200. -200.
 -200. -200. -200. -200. -200. -200. -200. -200. -200. -200. -200. -200.
 -200. -200. -200. -200. -200. -200. -200. -200. -2

In [35]:
NewReward=np.array([(-0.3,10),(0.3,5),(0.4,20),(0.6,200)])
DQN = DQNAgent(0,0.1, arrayNewReward = NewReward)
DQN.train(1, debug_mode = True)
DQN.env.close()

Episode: 1 starts
()
[0. 0. 0.]
Action 0 selected: 0


TypeError: customReward() got multiple values for argument 'currentReward'

In [40]:
DQN.env.close()