In [1]:
import tensorflow as tf 
import numpy as np 
from tensorflow import keras 
import os 
import math 
import random 
import pickle 
import matplotlib.pyplot as plt 
from collections import deque 
from tensorflow.keras import layers
import time 

from vehicle_model_DDPG2 import Environment 
from cell_model import CellModel 

os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [2]:
drving_cycle = '../../OC_SIM_DB/OC_SIM_DB_Cycles/Highway/01_FTP72_fuds.mat'
battery_path = "../../OC_SIM_DB/OC_SIM_DB_Bat/OC_SIM_DB_Bat_e-4wd_Battery.mat"
motor_path = "../../OC_SIM_DB/OC_SIM_DB_Mot/OC_SIM_DB_Mot_id_75_110_Westinghouse.mat"
cell_model = CellModel()
env = Environment(cell_model, drving_cycle, battery_path, motor_path, 10)

num_states = 4

In [3]:
class OUActionNoise: 
    def __init__(self, mean, std_deviation, theta=0.15, dt=1e-2, x_initial=None): 
        self.theta = theta 
        self.mean = mean 
        self.std_dev = std_deviation 
        self.dt = dt 
        self.x_initial = x_initial 
        self.reset() 
        
    def reset(self): 
        if self.x_initial is not None: 
            self.x_prev = self.x_initial 
        else: 
            self.x_prev = 0 
            
    def __call__(self): 
        x = (
             self.x_prev + self.theta * (self.mean - self.x_prev) * self.dt 
            + self.std_dev * np.sqrt(self.dt) * np.random.normal() 
        )
        self.x_prev = x 
        return x 

In [4]:
class Buffer: 
    def __init__(self, buffer_capacity=100000, batch_size=64): 
        self.power_mean = 0 
        self.power_std = 0
        self.sum = 0 
        self.sum_deviation = 0 
        self.N = 0 
        
        self.buffer_capacity = buffer_capacity 
        self.batch_size = batch_size 
        self.buffer_counter = 0 
        
        self.state_buffer = np.zeros((self.buffer_capacity, num_states))
        self.action_buffer = np.zeros((self.buffer_capacity, 1))
        self.reward_buffer = np.zeros((self.buffer_capacity, 1))
        self.next_state_buffer = np.zeros((self.buffer_capacity, num_states))
        
    def record(self, obs_tuple):
        self.N += 1 
        index = self.buffer_counter % self.buffer_capacity 
        power = obs_tuple[0][0] 
        
        self.sum += power 
        self.power_mean = self.sum / self.N 
        self.sum_deviation += (power - self.power_mean) ** 2  
        self.power_std = np.sqrt(self.sum_deviation / self.N) 
            
        self.state_buffer[index] = obs_tuple[0]
        self.action_buffer[index] = obs_tuple[1]
        self.reward_buffer[index] = obs_tuple[2]
        self.next_state_buffer[index] = obs_tuple[3]
        
        self.buffer_counter += 1 
        
    def learn(self): 
        record_range = min(self.buffer_counter, self.buffer_capacity)
        batch_indices = np.random.choice(record_range, self.batch_size)
        
        state_batch = self.state_buffer[batch_indices]
        power_batch = (state_batch[:, 0] - self.power_mean) / self.power_std
        state_batch[:, 0] = power_batch 
        
        next_state_batch = self.next_state_buffer[batch_indices]
        power_batch = (next_state_batch[:, 0] - self.power_mean) / self.power_std
        next_state_batch[:, 0] = power_batch 
#         print(state_batch)
        
        state_batch = tf.convert_to_tensor(state_batch)
        action_batch = tf.convert_to_tensor(self.action_buffer[batch_indices])
        reward_batch = tf.convert_to_tensor(self.reward_buffer[batch_indices])
        reward_batch = tf.cast(reward_batch, dtype=tf.float32)
        next_state_batch = tf.convert_to_tensor(next_state_batch)
        
        with tf.GradientTape() as tape: 
            target_actions = target_actor(next_state_batch)
            y = reward_batch + gamma * target_critic([next_state_batch, target_actions])
            critic_value = critic_model([state_batch, action_batch])
            critic_loss = tf.math.reduce_mean(tf.square(y - critic_value)) 
        critic_grad = tape.gradient(critic_loss, critic_model.trainable_variables) 
        critic_optimizer.apply_gradients(
            zip(critic_grad, critic_model.trainable_variables)
        )
        
        with tf.GradientTape() as tape: 
            actions = actor_model(state_batch)
            critic_value = critic_model([state_batch, actions])
            actor_loss = - tf.math.reduce_mean(critic_value)
        actor_grad = tape.gradient(actor_loss, actor_model.trainable_variables) 
        actor_optimizer.apply_gradients(
            zip(actor_grad, actor_model.trainable_variables)
        )
        

In [5]:
def update_target(tau): 
    new_weights = [] 
    target_variables = target_critic.weights
    for i, variable in enumerate(critic_model.weights): 
        new_weights.append(target_variables[i] * (1 - tau) + tau * variable)
    target_critic.set_weights(new_weights)
    
    new_weights = [] 
    target_variables = target_actor.weights
    for i, variable in enumerate(actor_model.weights): 
        new_weights.append(target_variables[i] * (1 - tau) + tau * variable)
    target_actor.set_weights(new_weights)
    

In [6]:
def get_actor(): 
    last_init = tf.random_uniform_initializer(minval=-0.003, maxval=0.003)
    
    inputs = layers.Input(shape=(num_states))
    out = layers.Dense(512, activation="relu")(inputs)
#     out = layers.BatchNormalization()(out)
    out = layers.Dense(512, activation="relu")(out)
#     out = layers.BatchNormalization()(out)
    outputs = layers.Dense(1, activation="sigmoid", 
                          kernel_initializer=last_init)(out)
    model = tf.keras.Model(inputs, outputs)
    return model

In [7]:
def get_critic(): 
    state_input = layers.Input(shape=(num_states))
    state_out = layers.Dense(16, activation="relu")(state_input)
#     state_out = layers.BatchNormalization()(state_out)
    state_out = layers.Dense(32, activation="relu")(state_input)
#     state_out = layers.BatchNormalization()(state_out)
    
    action_input = layers.Input(shape=(1))
    action_out = layers.Dense(32, activation="relu")(action_input)
#     action_out = layers.BatchNormalization()(action_out)
    
    concat = layers.Concatenate()([state_out, action_out]) 
    
    out = layers.Dense(512, activation="relu")(concat)
#     out = layers.BatchNormalization()(out)
    out = layers.Dense(512, activation="relu")(out)
#     out = layers.BatchNormalization()(out)
    outputs = layers.Dense(1)(out)
    
    model = tf.keras.Model([state_input, action_input], outputs)
    return model 
    

In [8]:
def policy(state, noise_object): 
    j_min = state[0][2].numpy()
    j_max = state[0][3].numpy()
    sampled_action = tf.squeeze(actor_model(state)) 
    noise = noise_object()
    sampled_action = sampled_action.numpy() + noise 
    legal_action = sampled_action * j_max 
    legal_action = np.clip(legal_action, j_min, j_max)
#     print(j_min, j_max, legal_action, noise)
    return legal_action 
    

In [9]:
def policy_epsilon_greedy(state, eps): 
    j_min = state[0][-2].numpy()
    j_max = state[0][-1].numpy()

    if random.random() < eps: 
        a = random.randint(0, 9)
        return np.linspace(j_min, j_max, 10)[a]
    else: 
        sampled_action = tf.squeeze(actor_model(state)).numpy()  
        legal_action = sampled_action * j_max 
        legal_action = np.clip(legal_action, j_min, j_max)
        return legal_action

In [10]:
std_dev = 0.2 
ou_noise = OUActionNoise(mean=0, std_deviation=0.2)

critic_lr = 0.0005 
actor_lr = 0.00025 
critic_optimizer = tf.keras.optimizers.Adam(critic_lr)
actor_optimizer = tf.keras.optimizers.Adam(actor_lr)

total_episodes = 200
gamma = 0.95 
tau = 0.001 

MAX_EPSILON = 1 
MIN_EPSILON = 0.01 
DECAY_RATE = 0.00002
BATCH_SIZE = 32 
DELAY_TRAINING = 3000 

In [11]:
def initialization(reward_factor): 
    actor_model = get_actor() 
    critic_model = get_critic() 

    target_actor = get_actor() 
    target_critic = get_critic() 
    target_actor.set_weights(actor_model.get_weights())
    target_critic.set_weights(critic_model.get_weights())
    
    buffer = Buffer(500000, BATCH_SIZE)
    
    env = Environment(cell_model, drving_cycle, battery_path, motor_path, reward_factor)
    return actor_model, critic_model, target_actor, target_critic, buffer, env

In [12]:
def save_weights(actor_model, critic_model, target_actor, target_critic, root): 
    actor_model.save_weights("./{}/actor_model_checkpoint".format(root))
    critic_model.save_weights("./{}/critic_model_checkpoint".format(root))
    target_actor.save_weights("./{}/target_actor_checkpoint".format(root))
    target_critic.save_weights("./{}/target_critic_checkpoint".format(root))
    print("model is saved..")

In [None]:
print(env.version)

num_trials = 5
reward_factor = 10
results_dict = {} 
for trial in range(num_trials): 
    print()
    print("Trial {}".format(trial + 1))
    
    actor_model, critic_model, target_actor, target_critic, buffer, env = initialization(
        reward_factor
    )
    
    eps = MAX_EPSILON 
    steps = 0
    
    episode_rewards = [] 
    episode_SOCs = [] 
    episode_FCs = [] 
    for ep in range(total_episodes): 
        start = time.time() 
        state = env.reset() 
        episodic_reward = 0 

        while True: 
            tf_state = tf.expand_dims(tf.convert_to_tensor(state), 0)
            action = policy_epsilon_greedy(tf_state, eps)
    #         print(action)
            next_state, reward, done = env.step(action)
            if done: 
                next_state = [0] * num_states 

            buffer.record((state, action, reward, next_state))
            episodic_reward += reward 

            if steps > DELAY_TRAINING: 
                buffer.learn() 
                update_target(tau)
                eps = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * np.exp(-DECAY_RATE * steps)

            steps += 1

            if done: 
                break 

            state = next_state 

        elapsed_time = time.time() - start 
        print("elapsed_time: {:.3f}".format(elapsed_time))
        episode_rewards.append(episodic_reward) 
        episode_SOCs.append(env.SOC)
        episode_FCs.append(env.fuel_consumption) 

    #     print("Episode * {} * Avg Reward is ==> {}".format(ep, avg_reward))
        SOC_deviation_history = np.sum(np.abs(np.array(env.history["SOC"]) - 0.6)) 
        print(
              'Episode: {}'.format(ep + 1),
              "Exploration P: {:.4f}".format(eps),
              'Total reward: {}'.format(episodic_reward), 
              "SOC: {:.4f}".format(env.SOC), 
              "Cumulative_SOC_deviation: {:.4f}".format(SOC_deviation_history), 
              "Fuel Consumption: {:.4f}".format(env.fuel_consumption), 
        )
    
    root = "DDPG2_trial{}".format(trial+1)
    save_weights(actor_model, critic_model, target_actor, target_critic, root)
    
    results_dict[trial + 1] = {
        "rewards": episode_rewards, 
        "SOCs": episode_SOCs, 
        "FCs": episode_FCs, 
        "mean": buffer.power_mean, 
        "std": buffer.power_std, 
    }

2

Trial 1
maximum steps, simulation is done ... 
elapsed_time: 17.817
Episode: 1 Exploration P: 1.0000 Total reward: -995.6136010468043 SOC: 0.8193 Cumulative_SOC_deviation: 93.3047 Fuel Consumption: 62.5670
maximum steps, simulation is done ... 
elapsed_time: 16.736
Episode: 2 Exploration P: 1.0000 Total reward: -841.1839912199556 SOC: 0.7747 Cumulative_SOC_deviation: 78.2230 Fuel Consumption: 58.9540


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer const

maximum steps, simulation is done ... 
elapsed_time: 57.492
Episode: 25 Exploration P: 0.5083 Total reward: -662.8688534301347 SOC: 0.6905 Cumulative_SOC_deviation: 61.0669 Fuel Consumption: 52.1995
maximum steps, simulation is done ... 
elapsed_time: 57.567
Episode: 26 Exploration P: 0.4948 Total reward: -614.683984028929 SOC: 0.6794 Cumulative_SOC_deviation: 56.3277 Fuel Consumption: 51.4068
maximum steps, simulation is done ... 
elapsed_time: 57.829
Episode: 27 Exploration P: 0.4817 Total reward: -629.9066508487747 SOC: 0.6824 Cumulative_SOC_deviation: 57.8358 Fuel Consumption: 51.5485
maximum steps, simulation is done ... 
elapsed_time: 57.212
Episode: 28 Exploration P: 0.4689 Total reward: -621.313153270886 SOC: 0.6838 Cumulative_SOC_deviation: 56.9605 Fuel Consumption: 51.7080
maximum steps, simulation is done ... 
elapsed_time: 57.168
Episode: 29 Exploration P: 0.4565 Total reward: -539.1016307825864 SOC: 0.6688 Cumulative_SOC_deviation: 48.8538 Fuel Consumption: 50.5638
maximum

maximum steps, simulation is done ... 
elapsed_time: 62.989
Episode: 67 Exploration P: 0.1673 Total reward: -300.31572096466715 SOC: 0.6261 Cumulative_SOC_deviation: 25.3563 Fuel Consumption: 46.7526
maximum steps, simulation is done ... 
elapsed_time: 60.382
Episode: 68 Exploration P: 0.1630 Total reward: -282.9284324728633 SOC: 0.6170 Cumulative_SOC_deviation: 23.6775 Fuel Consumption: 46.1532
maximum steps, simulation is done ... 
elapsed_time: 59.605
Episode: 69 Exploration P: 0.1589 Total reward: -319.26287747670335 SOC: 0.6148 Cumulative_SOC_deviation: 27.3318 Fuel Consumption: 45.9453
maximum steps, simulation is done ... 
elapsed_time: 59.692
Episode: 70 Exploration P: 0.1548 Total reward: -325.81370467341566 SOC: 0.6201 Cumulative_SOC_deviation: 27.9256 Fuel Consumption: 46.5578
maximum steps, simulation is done ... 
elapsed_time: 59.630
Episode: 71 Exploration P: 0.1509 Total reward: -297.6322389887327 SOC: 0.6200 Cumulative_SOC_deviation: 25.1256 Fuel Consumption: 46.3761
ma

maximum steps, simulation is done ... 
elapsed_time: 57.227
Episode: 109 Exploration P: 0.0596 Total reward: -260.3270441987932 SOC: 0.6088 Cumulative_SOC_deviation: 21.4924 Fuel Consumption: 45.4035
maximum steps, simulation is done ... 
elapsed_time: 57.059
Episode: 110 Exploration P: 0.0583 Total reward: -250.7905200794421 SOC: 0.6141 Cumulative_SOC_deviation: 20.4562 Fuel Consumption: 46.2283
maximum steps, simulation is done ... 
elapsed_time: 57.297
Episode: 111 Exploration P: 0.0570 Total reward: -253.07637710119425 SOC: 0.6104 Cumulative_SOC_deviation: 20.7258 Fuel Consumption: 45.8186
maximum steps, simulation is done ... 
elapsed_time: 57.435
Episode: 112 Exploration P: 0.0557 Total reward: -265.3346768084512 SOC: 0.6126 Cumulative_SOC_deviation: 21.9647 Fuel Consumption: 45.6882
maximum steps, simulation is done ... 
elapsed_time: 57.229
Episode: 113 Exploration P: 0.0545 Total reward: -264.01710481482615 SOC: 0.6169 Cumulative_SOC_deviation: 21.7840 Fuel Consumption: 46.177

maximum steps, simulation is done ... 
elapsed_time: 69.369
Episode: 150 Exploration P: 0.0261 Total reward: -243.76683057004723 SOC: 0.6085 Cumulative_SOC_deviation: 19.8095 Fuel Consumption: 45.6718
maximum steps, simulation is done ... 
elapsed_time: 60.134
Episode: 151 Exploration P: 0.0257 Total reward: -233.86531013096402 SOC: 0.6095 Cumulative_SOC_deviation: 18.8164 Fuel Consumption: 45.7016
maximum steps, simulation is done ... 
elapsed_time: 61.553
Episode: 152 Exploration P: 0.0252 Total reward: -226.0000416406831 SOC: 0.6052 Cumulative_SOC_deviation: 18.0641 Fuel Consumption: 45.3589
maximum steps, simulation is done ... 
elapsed_time: 133.355
Episode: 153 Exploration P: 0.0248 Total reward: -240.2145847786676 SOC: 0.6149 Cumulative_SOC_deviation: 19.4011 Fuel Consumption: 46.2033
maximum steps, simulation is done ... 
elapsed_time: 74.546
Episode: 154 Exploration P: 0.0244 Total reward: -224.07862369633608 SOC: 0.6105 Cumulative_SOC_deviation: 17.8271 Fuel Consumption: 45.8

maximum steps, simulation is done ... 
elapsed_time: 63.850
Episode: 191 Exploration P: 0.0152 Total reward: -274.23324741788076 SOC: 0.6163 Cumulative_SOC_deviation: 22.7889 Fuel Consumption: 46.3440
maximum steps, simulation is done ... 
elapsed_time: 59.645
Episode: 192 Exploration P: 0.0151 Total reward: -272.57726259271897 SOC: 0.6162 Cumulative_SOC_deviation: 22.6280 Fuel Consumption: 46.2974
maximum steps, simulation is done ... 
elapsed_time: 82.393
Episode: 193 Exploration P: 0.0149 Total reward: -255.88724524653202 SOC: 0.6048 Cumulative_SOC_deviation: 21.0488 Fuel Consumption: 45.3993
maximum steps, simulation is done ... 
elapsed_time: 66.051
Episode: 194 Exploration P: 0.0148 Total reward: -262.5482261653755 SOC: 0.6156 Cumulative_SOC_deviation: 21.6306 Fuel Consumption: 46.2420
maximum steps, simulation is done ... 
elapsed_time: 62.725
Episode: 195 Exploration P: 0.0147 Total reward: -287.5743264748975 SOC: 0.6139 Cumulative_SOC_deviation: 24.1372 Fuel Consumption: 46.20

maximum steps, simulation is done ... 
elapsed_time: 57.497
Episode: 15 Exploration P: 0.6658 Total reward: -1508.6381363405021 SOC: 0.5448 Cumulative_SOC_deviation: 146.7309 Fuel Consumption: 41.3289
maximum steps, simulation is done ... 
elapsed_time: 57.497
Episode: 16 Exploration P: 0.6480 Total reward: -1737.214748676926 SOC: 0.5061 Cumulative_SOC_deviation: 169.8934 Fuel Consumption: 38.2809
maximum steps, simulation is done ... 
elapsed_time: 57.295
Episode: 17 Exploration P: 0.6307 Total reward: -1777.3551487615557 SOC: 0.5038 Cumulative_SOC_deviation: 173.9065 Fuel Consumption: 38.2905
maximum steps, simulation is done ... 
elapsed_time: 57.479
Episode: 18 Exploration P: 0.6139 Total reward: -1730.0022011802205 SOC: 0.5038 Cumulative_SOC_deviation: 169.1764 Fuel Consumption: 38.2381
maximum steps, simulation is done ... 
elapsed_time: 57.657
Episode: 19 Exploration P: 0.5976 Total reward: -1897.3216008268423 SOC: 0.4854 Cumulative_SOC_deviation: 186.0389 Fuel Consumption: 36.9

maximum steps, simulation is done ... 
elapsed_time: 58.466
Episode: 56 Exploration P: 0.2227 Total reward: -403.20223935517436 SOC: 0.6330 Cumulative_SOC_deviation: 35.6040 Fuel Consumption: 47.1622
maximum steps, simulation is done ... 
elapsed_time: 58.980
Episode: 57 Exploration P: 0.2170 Total reward: -375.5482816300528 SOC: 0.6312 Cumulative_SOC_deviation: 32.8529 Fuel Consumption: 47.0197
maximum steps, simulation is done ... 
elapsed_time: 70.110
Episode: 58 Exploration P: 0.2114 Total reward: -396.9463383042171 SOC: 0.6367 Cumulative_SOC_deviation: 34.9506 Fuel Consumption: 47.4400
maximum steps, simulation is done ... 
elapsed_time: 68.662
Episode: 59 Exploration P: 0.2059 Total reward: -397.7018625067235 SOC: 0.6368 Cumulative_SOC_deviation: 35.0183 Fuel Consumption: 47.5191
maximum steps, simulation is done ... 
elapsed_time: 68.351
Episode: 60 Exploration P: 0.2006 Total reward: -373.76657903758115 SOC: 0.6315 Cumulative_SOC_deviation: 32.6780 Fuel Consumption: 46.9871
max

In [None]:
with open("DDPG2.pkl", "wb") as f: 
    pickle.dump(results_dict, f, pickle.HIGHEST_PROTOCOL)