<a href="https://colab.research.google.com/github/udsey/SATO_RL/blob/main/DQN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from collections import deque

# Train model

In [2]:
class Train:


    def __init__(self, time_limit=2000, v_limit=100, s_limit=300, dt=10, v_delta=1):
        self.time_limit = time_limit # s
        self.v_limit = v_limit # km/h
        self.s_limit = s_limit # km
        self.state_space = np.arange(0, self.s_limit, 0.001) # m
        self.action_space = np.arange(-0.4, 0.5, 0.1) # m/s^2
        self.action_space_high = 0.5
        self.action_space_low = -0.5
        self.dt = dt / 3600 # h
        self.df_limit = pd.DataFrame({'start': [0],
                                      'stop': [self.s_limit],
                                      'limit': self.v_limit,})
        self.v_delta = v_delta
        self.state_dim = 3
        self.action_dim = 1

# Добавление ограничений скорости на участке

    def append_velocity_limit(self, limit, start, stop):
        self.df_limit = self.df_limit.append({'start': start,
                                              'stop': stop,
                                              'limit': limit,}, ignore_index=True)
        return self.df_limit
# Сброс скоростного ограничения к одному на всем участке
    def reset_velocity_limit(self):
        self.append_velocity_limit(self, limit=self.v_limit, 
                                   position=range(0, self.s_limit))
# Сброс окружения к начальному значению
    def reset(self):
        self.s = 0
        self.v = 0
        self.a = 0
        self.done = False
        self.total_time = 0
        self.reward = 0
        self.v_list = []
        self.s_list = []
        self.t_list = []
        self.a_list = []
        self.reward_list = []

        return self.return_state()

# Возвращает текущее состояние
    def return_state(self):
        return np.array([self.s*1000, self.v, self.speed_limit()])

# Возвращает скоростное ограничение на текущем участке 
    def speed_limit(self):
        if self.s < 0:
            return self.df_limit.iloc[0, 2]
    
        for i in range(self.df_limit.shape[0]):
            min_p = self.df_limit.iloc[i, 0]
            max_p = self.df_limit.iloc[i, 1]
            if min_p <= self.s <= max_p:
                return self.df_limit.iloc[i, 2]
            else:
                return min_p
# Возвращает награду в соответствии с текущим состоянием
    def reward_func(self):
        if self.s < 0: # Уехал в обратную сторону
            return -1000, True
        if self.s >= self.s_limit: # Доехал до точки назначения
            self.s = self.s_limit
            return 100, True
        if self.total_time > self.time_limit/3600: # Превысил время
            return -100, True
        if self.v > self.speed_limit()*self.v_delta: # Превысил скорость
            return -500, True
        if self.speed_limit()/2 <= self.v <= self.speed_limit():
            return 1, False
        else:
            return 0, False
        
        

# Шаг для заданного действия
    def step(self, action):
        
        self.total_time += self.dt # h
        self.a = action * (3600**2 / 1000) # km/h^2
        self.s += self.v * self.dt + (self.a * (self.dt ** 2))/2 #km
        self.v += (self.a * self.dt) # km/h

        self.reward, self.done = self.reward_func()

        self.v_list.append(self.v)
        self.t_list.append(self.total_time)
        self.s_list.append(self.s)
        self.a_list.append(self.a)
        self.reward_list.append(self.reward)
        return  self.return_state(), self.reward, self.done

# Справочная информация о движении
    def action_info(self, action):
        print('*'*20)
        print('Ускорение {:.2f} м/с^2'.format(action))
        print('Скорость {:.2f} км/ч' .format(self.v)),
        print('Пройденный путь {:.6f} км'.format(self.s))
        print('Штраф', self.reward)
        print('*'*20)

# Возвращает данные о скорости на каждом участке        
    def speed_legend(self):
        df = pd.DataFrame(columns=['time', 'position', 'speed', 'acceleration', 'reward'])
        df.time = self.t_list
        df.position = self.s_list
        df.speed = self.v_list
        df.acceleration = self.a_list
        df.reward = self.reward_list
        return df




# DQN

In [48]:
class Buffer():

    def __init__(self, buffer_size):

        self.state_buf = deque(maxlen=buffer_size)
        self.next_state_buf = deque(maxlen=buffer_size)
        self.action_buf = deque(maxlen=buffer_size)
        self.reward_buf = deque(maxlen=buffer_size)
        self.done_buf = deque(maxlen=buffer_size)
        self.buffer_size = 0

    def add(self, state, next_state, action, reward, done):

        self.state_buf.append(state)
        self.next_state_buf.append(next_state)
        self.action_buf.append(action)
        self.reward_buf.append(reward)
        self.done_buf.append(done)
        self.buffer_size +=1

    def sample(self, batch_size):
        
        indexes = np.random.randint(self.buffer_size, size=batch_size)
        mb_state = []
        mb_next_state = []
        mb_action = []
        mb_reward = []
        mb_done = []

        for i in indexes:
            mb_state.append(self.state_buf[i])
            mb_next_state.append(self.next_state_buf[i])
            mb_action.append(self.action_buf[i])
            mb_reward.append(self.reward_buf[i])
            mb_done.append(self.done_buf[i])

        return mb_state, mb_next_state, mb_action, mb_reward, mb_done

    def __len__(self):

        return self.buffer_size

In [49]:
def actor_model(state_dim, action_dim, initializer):

    model = Sequential([layers.InputLayer(input_shape=(state_dim,)),
                        layers.Dense(128, activation='relu', kernel_initializer=initializer),
                        layers.Dense(128, activation='relu'),
                        layers.Dropout(.1),
                        layers.Dense(action_dim)])
    return model

In [50]:
def e_greedy(state, epsilon):
    if np.random.rand() < epsilon:
        return np.random.uniform(action_space_low, action_space_high)
    else:
        state = np.expand_dims(state, 0)
        y = online_model(state).numpy()[0][0]
        # np.argmax(action_values)
        return y


In [51]:
def q_target_val(batch):
    mb_state, mb_next_state, mb_action, mb_reward, mb_done = batch
    ys = []
    for r, d, next_state in zip(mb_reward, mb_done, mb_next_state):
        if d == True:
            ys.append(r)
        else:
            next_state = np.expand_dims(next_state, 0)
            av = target_model(next_state)
            q_step = r + gamma*av
            ys.append(q_step)
    return ys


In [40]:
'''
def test_agent(env, ??, episodes=10):
    for episode in range(episodes):

        state = env.reset()
        done = False
        total_reward = 0

        while not done:

            action = e_greedy(epsilon)
            next_state, reward, done = env.step(action)
            buffer.add(state, next_state, action, reward, done)
            
            total_reward += reward
            state = next_state

        reward_list.append(total_reward)
    return reward_list
    '''

'\ndef test_agent(env, ??, episodes=10):\n    for episode in range(episodes):\n\n        state = env.reset()\n        done = False\n        total_reward = 0\n\n        while not done:\n\n            action = e_greedy(epsilon)\n            next_state, reward, done = env.step(action)\n            buffer.add(state, next_state, action, reward, done)\n            \n            total_reward += reward\n            state = next_state\n\n        reward_list.append(total_reward)\n    return reward_list\n    '

In [58]:
env = Train()
state_dim = env.state_dim
action_dim = env.action_dim
action_space_low = env.action_space_low
action_space_high = env.action_space_high
buffer_size = 1000
batch_size = 10
episodes = 10
epsilon = 0.1
gamma = 0.3
end_explor = 1
eps_decay = 0.001
min_b_size = 10
update_freq = 20
learning_rate = 0.001
start_explor = 1
end_explor = 0.4
explor_steps = 10
num_epochs = 1000
update_target_net = 2
reward_list = []
initializer = tf.keras.initializers.RandomUniform(minval=0., maxval=1.)
opt = tf.keras.optimizers.Adam(learning_rate)

In [56]:
def optimize(model, inputs, outputs):
    loss_fn = lambda: tf.keras.losses.mse(model(input), output)
    var_list_fn = lambda: model.trainable_weights
    for input, output in zip(inputs, outputs):
        opt.minimize(loss_fn, var_list_fn)

In [59]:
online_model = actor_model(state_dim, action_dim, initializer)
target_model = actor_model(state_dim, action_dim, initializer)
buffer = Buffer(buffer_size=buffer_size)

In [60]:
step_count = 0
last_update_loss = []
batch_reward = []
old_step_count = 0

state = env.reset()
buffer = Buffer(buffer_size=buffer_size)
initializer = tf.keras.initializers.RandomUniform(minval=0., maxval=1.)
online_model = actor_model(state_dim, action_dim, initializer)
target_model = actor_model(state_dim, action_dim, initializer)
epsilon = start_explor
eps_decay = (start_explor - end_explor) / explor_steps

for epoch in range(num_epochs):

    total_reward = 0
    done = False

    while not done:
        if epsilon > end_explor:
            epsilon -= eps_decay

        action = e_greedy(state, epsilon)
        next_state, reward, done = env.step(action)
        buffer.add(state, next_state, action, reward, done)
        state = next_state
        total_reward += reward
        step_count += 1

        if (len(buffer) > min_b_size) and (step_count % update_freq == 0):
            batch = buffer.sample(batch_size)
            mb_action = batch[2]
            y_r = q_target_val(batch)
            train_loss = optimize(online_model, y_r, mb_action)
            last_update_loss(train_loss)
        
        if (len(buffer) > min_b_size) and (step_count % update_target_net == 0):
            target_model.set_weights(online_model.get_weights())

        if done:
            state = env.reset()
            batch_reward.append(total_reward)
            total_reward = 0


ValueError: ignored