In [9]:
import numpy as np
import pandas as pd
import random
import tensorflow.compat.v1 as tf
from tensorflow.contrib import rnn
from tensorflow import nn
tf.disable_v2_behavior()

np.random.seed(1)
tf.set_random_seed(1)

In [12]:
# 数据处理
class DataProcess:
    def __init__(self, data_path):
        self.data_path = data_path
        # 分割数据
        self.data_split()
    
    def data_split(self):
        data_pd = pd.read_csv(self.data_path, encoding='gbk')
        data_pd = data_pd.iloc[:, 1:]
        data_pd = data_pd.rename({'实际时间':'actual_time','速度':'speed',
                              '延迟时间':'delay_time','行程时间':'road_time',
                              '气温':'temperature','降水概率':'rain_prob'}, axis=1)
        if not hasattr(self, 'trian_pd'):
            self.trian_pd = data_pd.loc[:1500,:]
        if not hasattr(self, 'test_pd'):
            self.test_pd = data_pd.loc[1500:,:]
            
class TrafficEnvironment(DataProcess):
    def __init__(self, data_path):
        DataProcess.__init__(self, data_path=data_path)
        self.reset_mode(mode='train')
    
    # 环境模式
    def reset_mode(self, mode):
        if not hasattr(self, 'mode'):
            self.mode = mode
        self.mode = mode
    
    # 环境交互函数
    # 输入状态s、行为，输出s,a,reward和s_,如果找不到满足条件的reward和s_则返回空[]
    def env_backward(self, s, a):
        check_s_ = self.check_action(s, a)
        if len(check_s_) == 0:
            return []
        s_ = random.choice(check_s_)
        reward = s[1] - s_[1]
        return s,a,reward,s_
    
    # 检查行为是否合法,合法则返回[s_1,s_2...],不合法则返回[]
    def check_action(self, s, a):
        data_pd = self.trian_pd if self.mode=='trian' else self.test_pd
        action_feature = [0 for _ in range(data_pd.shape[1])]
        
        decoded_str = bin(a).replace('0b','')
        for val in decoded_str:
            val_int = int(val)
            action_feature.append(val_int)
        # 表征行为的向量
        action_vector = action_feature[-self.trian_pd.shape[1]:]
        col_labels = self.trian_pd.columns.tolist()
        # 按照条件逐步筛选s_
        selected_pd = data_pd
        for filter_index in range(len(action_vector)):
            if action_vector[filter_index] == 1:
                selected_pd = selected_pd.loc[selected_pd[col_labels[filter_index]] >= s[filter_index]]
            else:
                selected_pd = selected_pd.loc[selected_pd[col_labels[filter_index]] < s[filter_index]]
            if selected_pd.shape[0] == 0:
                return []
        s_next_container = selected_pd.get_values().tolist()
        return s_next_container
        
    
class DeepRecurrentQNetwork:
    def __init__(self,n_features,learning_rate=0.01,reward_decay=0.9,
                 e_greedy=0.9,replace_target_iter=300,memory_size=500,
                 batch_size=32, e_greedy_increment=None,output_graph=False,):
        self.n_features = n_features # 表示状态s的特征
        self.n_actions = 2^self.n_features  # 可选行为的个数取决于s的特征长度
        
        self.lr = learning_rate  # 学习率
        self.gamma = reward_decay  # reward衰减的系数
        self.epsilon_max = e_greedy  # epsilon最大值
        self.replace_target_iter = replace_target_iter  # 每嗝多少次训练就轮换一下网络的参数
        self.memory_size = memory_size    # 记忆库最大容量
        self.batch_size = batch_size   # 训练的batchsize
        self.epsilon_increment = e_greedy_increment  # epsilon增加量，随着训练的迭代，逐步减少探索性
        self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max  

        # total learning step
        self.learn_step_counter = 0

        # initialize zero memory [s, a, r, s_]
        self.memory = np.zeros((self.memory_size, n_features * 2 + 2))

        # consist of [target_net, evaluate_net]
        self.lstm_units = 10
        self.build_net()
        t_params = tf.get_collection('target_net_params')
        e_params = tf.get_collection('eval_net_params')
        self.replace_target_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]

        self.sess = tf.Session()

        if output_graph:
            # $ tensorboard --logdir=logs
            # tf.train.SummaryWriter soon be deprecated, use following
            tf.summary.FileWriter("logs\\", self.sess.graph)
            
        self.sess.run(tf.global_variables_initializer())
        self.cost_his = []
        
    def store_transition(self, s, a, r, s_):
        if not hasattr(self, 'memory_counter'):
            self.memory_counter = 0
        transition = np.hstack((s, [a, r], s_))
        # 更新记忆，替换
        index = self.memory_counter % self.memory_size
        self.memory[index, :] = transition
        self.memory_counter += 1
        
    def choose_action(self, s):
        valid_actions = []
        for action in range(self.n_actions):
            env_backward = env.check_action(s, action)
            if len(env_backward) != 0:
                valid_actions.append(action)
        # 任何一个状态s不可能没有行为可以选
        # epsilon的概率从valid_actions中选择最优的那个
        # 1-epsilon的概率valid_actions中随机选择一个行为
        s = s[np.newaxis, :]
        rand_point = np.random.uniform()
        if rand_point < self.epsilon:
            # 从valid_actions中选择最优的那个
            actions_value = self.sess.run(self.q_eval, feed_dict={self.s: s})
            action = actions_value[0]
            for action_ in actions_value:
                action = action_ if actions_value[action_] > actions_value[action] else action
            return action
        else:
            action = np.random.randint(0, self.n_actions)
        return action
        
        
        pass
    
    # 搭建神经网络
    def build_net(self):
        step = 1
        # ------------------ build evaluate_net ------------------
        self.s = tf.placeholder(tf.float32, [None, step, self.n_features], name='s')
        self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target')
        
        with tf.variable_scope('eval_net'):
            c_names, n_l1, w_initializer, b_initializer = \
                ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 10, \
                tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1)  # config of layers
            with tf.variable_scope('layer_1'):
                lstm_cell = rnn.BasicLSTMCell(self.lstm_units)
                init_state = lstm_cell.zero_state(step, dtype=tf.float32)
                rnn_out, final_state = tf.nn.dynamic_rnn(lstm_cell, self.s, initial_state=init_state)
                self.q_eval = rnn_out
                
        with tf.variable_scope('loss'):
            self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval))
        with tf.variable_scope('train'):
            self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)
        # ------------------ build target_net ------------------
        self.s_ = tf.placeholder(tf.float32, [None, step, self.n_features], name='s_')    # input 
        with tf.variable_scope('target_net'):
            c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
            with tf.variable_scope('layer_1'):
                lstm_cell = rnn.BasicLSTMCell(self.lstm_units)
                init_state = lstm_cell.zero_state(step, dtype=tf.float32)
                rnn_out, final_state = tf.nn.dynamic_rnn(lstm_cell, self.s, initial_state=init_state)
                self.q_next = rnn_out

In [13]:
data_path = r'E:\PersonalProjects\ReinforcementLearning\DRQN_traffic\SourceData\TrafficData.csv'

env = TrafficEnvironment(data_path)



rl_network = DeepRecurrentQNetwork(
                        n_features=5,
                        learning_rate=0.01,
                        reward_decay=0.9,
                        e_greedy=0.9,
                        replace_target_iter=200,
                        memory_size=2000,
                        output_graph=True)

Instructions for updating:
Colocations handled automatically by placer.


ValueError: Dimensions must be equal, but are 7 and 10 for 'loss/SquaredDifference' (op: 'SquaredDifference') with input shapes: [?,7], [1,1,10].

[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[[42.6, 9.1, 23.2, 24.0, 0.07], [42.6, 9.1, 23.2, 24.0, 0.07], [37.1, 12.6, 26.6, 24.0, 0.07], [37.1, 12.6, 26.6, 24.0, 0.07], [42.5, 9.2, 23.3, 24.0, 0.06], [42.5, 9.2, 23.3, 24.0, 0.06], [47.2, 6.8, 20.9, 24.0, 0.06], [47.2, 6.8, 20.9, 24.0, 0.06], [51.5, 5.1, 19.2, 24.0, 0.06], [51.5, 5.1, 19.2, 24.0, 0.06], [51.4, 5.2, 19.2, 24.0, 0.08], [51.4, 5.2, 19.2, 24.0, 0.08], [50.1, 5.6, 19.7, 24.0, 0.08], [50.1, 5.6, 19.7, 24.0, 0.08], [51.2, 5.2, 19.3, 24.0, 0.08], [51.2, 5.2, 19.3, 24.0, 0.08], [50.7, 5.4, 19.5, 24.0, 0.08], [50.7, 5.4, 19.5, 24.0, 0.08], [52.1, 4.9, 19.0, 24.0, 0.08], [52.1, 4.9, 19.0, 25.0, 0.46], [50.5, 5.5, 19.6, 26.0, 0.54], [50.5, 5.5, 19.6, 26.0, 0.54], [44.7, 8.0, 22.1, 26.0, 0.54], [44.7, 8.0, 22.1, 26.0, 0.54], [41.6, 9.7, 23.8, 26.0, 0.54], [41.6, 9.7, 23.8, 26.0, 0.54], [46.1, 7.4, 21.5, 26.0, 0.54], [46.1, 7.4, 21.5, 26.0, 0.54], [47.4, 6.8, 20.9, 26.0, 0.54], [47.4, 6.8, 20.9, 26.0, 0.54], [48.8, 6.2, 20.3, 26.0



In [35]:
index_list = [2,5]
val_list = [1,2,3,4,5,6,7,8]

max_index = index_list[0]
for index in index_list:
    max_index = index if val_list[index] > val_list[max_index] else max_index






[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
([57.2, 3.2, 17.3, 25.0, 0.05], 13, -2.3999999999999995, [50.1, 5.6, 19.7, 24.0, 0.08])
[]
([57.2, 3.2, 17.3, 25.0, 0.05], 15, -0.19999999999999973, [56.4, 3.4, 17.5, 25.0, 0.07])
[]
([57.2, 3.2, 17.3, 25.0, 0.05], 17, 1.6, [63.1, 1.6, 15.7, 24.0, 0.05])
[]
([57.2, 3.2, 17.3, 25.0, 0.05], 19, 0.7000000000000002, [59.6, 2.5, 16.6, 27.0, 0.18])
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
([57.2, 3.2, 17.3, 25.0, 0.05], 31, 0.0, [57.2, 3.2, 17.3, 25.0, 0.07])




[61.3, 2.0, 16.1, 21.0, 0.07]