In [5]:
import numpy as np
import pandas as pd
import random
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

# Seed every random number generator this notebook draws from so that a fresh
# "Restart Kernel -> Run All" is repeatable. The stdlib `random` module is
# seeded as well because the environment samples next states through it.
SEED = 1
random.seed(SEED)
np.random.seed(SEED)
tf.set_random_seed(SEED)

In [6]:
# 数据处理
class DataProcess:
    """Load the traffic CSV and split it into a training pool and a test pool.

    Attributes:
        data_path: path of the CSV file passed to the constructor.
        trian_pd: the first ``TRAIN_ROWS`` rows of the data. The misspelled
            name is kept on purpose because other code in this notebook
            reads ``trian_pd``.
        test_pd: every row after the first ``TRAIN_ROWS`` rows.
    """

    # Number of leading rows that go into the training pool.
    TRAIN_ROWS = 1500

    def __init__(self, data_path):
        """Remember ``data_path`` and immediately load and split the data."""
        self.data_path = data_path
        # Split the data
        self.data_split()

    def data_split(self):
        """Read the CSV (GBK-encoded), rename columns and build both pools.

        The first CSV column is dropped (presumably a saved index column --
        TODO confirm against the file). Chinese column headers are mapped to
        English names. Pools that already exist on the instance are left
        untouched, so calling this method again does not overwrite them.
        """
        data_pd = pd.read_csv(self.data_path, encoding='gbk')
        data_pd = data_pd.iloc[:, 1:]
        data_pd = data_pd.rename(columns={'实际时间': 'actual_time', '速度': 'speed',
                                          '延迟时间': 'delay_time', '行程时间': 'road_time',
                                          '气温': 'temperature', '降水概率': 'rain_prob'})
        # Positional slicing: `.loc[:1500]` is label-inclusive, which put row
        # 1500 into BOTH pools (train/test overlap). `.iloc` gives a clean,
        # non-overlapping split at TRAIN_ROWS.
        if not hasattr(self, 'trian_pd'):
            self.trian_pd = data_pd.iloc[:self.TRAIN_ROWS, :]
        if not hasattr(self, 'test_pd'):
            self.test_pd = data_pd.iloc[self.TRAIN_ROWS:, :]
            
class TrafficEnvironment(DataProcess):
    """Traffic environment that answers (state, action) queries from a data pool.

    The pools ``trian_pd`` / ``test_pd`` come from ``DataProcess``. An action
    is an integer whose binary digits select, per data column, whether the
    next state must be ``>=`` (bit 1) or ``<`` (bit 0) the current state's
    value in that column.
    """

    def __init__(self, data_path):
        """Load and split the data, then start in training mode."""
        DataProcess.__init__(self, data_path=data_path)
        # Default mode so env_backward works even before reset_mode is called
        # (previously that raised AttributeError).
        self.mode = 'train'

    # Environment mode
    def reset_mode(self, mode='train'):
        """Select which pool env_backward samples from.

        ``'train'`` (or the legacy spelling ``'trian'``) selects the training
        pool; any other value selects the test pool.
        """
        self.mode = mode

    # Convert an action into a 0/1 string
    def action_decode(self, a):
        """Return action ``a`` as a binary string with one character per column.

        The string is left-padded with zeros to the number of data columns;
        if ``a`` has more bits than there are columns only the low-order
        (right-most) bits are kept.

        Raises:
            ValueError: if ``a`` is negative.
        """
        n_bits = self.trian_pd.shape[1]
        bits = bin(a).replace('0b', '')
        if bits.startswith('-'):
            raise ValueError('action must be a non-negative integer, got %r' % (a,))
        return bits.zfill(n_bits)[-n_bits:]

    # Environment interaction function
    # Given state s and action a, output s, a, reward and s_; returns an empty
    # list [] when no row satisfies the action's conditions.
    def env_backward(self, s, a):
        """Sample a next state compatible with ``(s, a)`` from the active pool.

        Args:
            s: current state, indexable by column position.
            a: non-negative integer action (see ``action_decode``).

        Returns:
            ``(s, a, reward, s_)`` where ``s_`` is a randomly chosen matching
            row as a list and ``reward = s[1] - s_[1]`` (the decrease in the
            value at column index 1), or ``[]`` when no row matches.
        """
        # Pick the data pool. The original compared against 'trian' only, so
        # the default mode 'train' silently fell through to the test pool.
        data_pd = self.trian_pd if self.mode in ('train', 'trian') else self.test_pd

        # One '0'/'1' flag per column describing the action
        action_bits = self.action_decode(a)
        col_labels = self.trian_pd.columns.tolist()

        # Narrow the candidates column by column
        selected_pd = data_pd
        for filter_index, bit in enumerate(action_bits):
            column = selected_pd[col_labels[filter_index]]
            if bit == '1':
                selected_pd = selected_pd.loc[column >= s[filter_index]]
            else:
                selected_pd = selected_pd.loc[column < s[filter_index]]
            if selected_pd.shape[0] == 0:
                return []

        # DataFrame.get_values() was removed in pandas 1.0; to_numpy() is the
        # supported replacement.
        candidates = selected_pd.to_numpy()
        s_ = candidates[random.randrange(len(candidates))].tolist()
        reward = s[1] - s_[1]
        return s, a, reward, s_
    
    

In [8]:
# Location of the raw traffic CSV. NOTE: absolute, machine-specific path.
data_path = r'E:\PersonalProjects\ReinforcementLearning\DRQN_traffic\SourceData\TrafficData.csv'

# Build the environment; its constructor reads the CSV and splits it into pools.
env = TrafficEnvironment(data_path=data_path)




Unnamed: 0,speed,delay_time,road_time,temperature,rain_prob
0,61.0,2.1,16.2,18,0.20
1,61.0,2.1,16.2,18,0.20
2,62.7,1.7,15.8,18,0.20
3,62.7,1.7,15.8,18,0.20
4,61.7,1.9,16.0,18,0.20
...,...,...,...,...,...
1496,62.2,1.8,15.9,24,0.07
1497,62.2,1.8,15.9,24,0.07
1498,58.1,2.9,17.0,24,0.07
1499,58.1,2.9,17.0,24,0.07


In [9]:
class DeepRecurrentQNetwork:
    """Agent scaffold for a deep recurrent Q-network (TensorFlow 1.x graph mode).

    Only bookkeeping is set up here: hyper-parameters, the replay memory
    array, the target-network replacement op and a session. The call that
    builds the networks is currently commented out.
    """

    def __init__(self, n_features, learning_rate=0.01, reward_decay=0.9,
                 e_greedy=0.9, replace_target_iter=300, memory_size=500,
                 batch_size=32, e_greedy_increment=None, output_graph=False,):
        """Store hyper-parameters and create the replay memory and session.

        Args:
            n_features: number of features describing a state ``s``. A sized
                collection of features is also accepted; its length is used.
            learning_rate: optimizer learning rate.
            reward_decay: discount factor applied to future rewards.
            e_greedy: upper bound for epsilon.
            replace_target_iter: number of learning steps between
                target-network parameter replacements.
            memory_size: maximum number of transitions held in memory.
            batch_size: training batch size.
            e_greedy_increment: amount epsilon grows by as training proceeds;
                when ``None`` epsilon starts at ``e_greedy``.
            output_graph: when true, write the graph to ``logs/`` for
                TensorBoard.
        """
        # Normalise to an integer count so both n_actions and the memory
        # shape below can be computed from it.
        if hasattr(n_features, '__len__'):
            n_features = len(n_features)
        self.n_features = n_features  # number of features in a state s
        # One binary choice per feature -> 2 ** n_features actions. The
        # original `2^len(...)` used bitwise XOR and called len() on a value
        # that is used as an int below, so it could not run.
        self.n_actions = 2 ** self.n_features

        self.lr = learning_rate  # learning rate
        self.gamma = reward_decay  # reward discount factor
        self.epsilon_max = e_greedy  # maximum epsilon
        self.replace_target_iter = replace_target_iter  # steps between target-net parameter swaps
        self.memory_size = memory_size  # replay memory capacity
        self.batch_size = batch_size  # training batch size
        self.epsilon_increment = e_greedy_increment  # epsilon growth; exploration shrinks as training proceeds
        # Start from 0 when epsilon is scheduled to grow, otherwise use the maximum.
        self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max

        # total learning step
        self.learn_step_counter = 0

        # initialize zero memory [s, a, r, s_]: two states plus action and reward per row
        self.memory = np.zeros((self.memory_size, self.n_features * 2 + 2))

        # consist of [target_net, evaluate_net]
        # NOTE(review): network construction is disabled here; unless something
        # else populated these graph collections, both lists are empty and
        # replace_target_op is an empty list.
        # self._build_net()
        t_params = tf.get_collection('target_net_params')
        e_params = tf.get_collection('eval_net_params')
        self.replace_target_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]

        self.sess = tf.Session()

        if output_graph:
            # $ tensorboard --logdir=logs
            # tf.train.SummaryWriter soon be deprecated, use following
            tf.summary.FileWriter("logs/", self.sess.graph)

        self.sess.run(tf.global_variables_initializer())
        self.cost_his = []

Unnamed: 0,speed,delay_time,road_time,temperature,rain_prob
0,61.0,2.1,16.2,18,0.20
1,61.0,2.1,16.2,18,0.20
2,62.7,1.7,15.8,18,0.20
3,62.7,1.7,15.8,18,0.20
4,61.7,1.9,16.0,18,0.20
5,61.7,1.9,16.0,18,0.20
6,62.3,1.8,15.9,18,0.20
7,62.3,1.8,15.9,18,0.20
8,64.0,1.4,15.4,18,0.20
9,64.0,1.4,15.4,18,0.20


  


[58.6, 2.8, 16.9, 23.0, 0.06]