In [1]:
import tensorflow as tf 
import numpy as np 
from tensorflow import keras 
import os 
import math 
import random 
import pickle 
import matplotlib.pyplot as plt 
from collections import deque 
from tensorflow.keras import layers
import time 

from vehicle_model_DDPG12_2 import Environment 
from cell_model import CellModel 

# os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [2]:
# Input files handed to the vehicle Environment: a driving cycle, a battery
# description and a motor description (presumably MATLAB .mat files, judging
# by the extension -- TODO confirm against Environment).
# NOTE(review): "drving_cycle" is a typo for "driving_cycle"; the name is kept
# as-is because other cells may reference it.
drving_cycle = '../../OC_SIM_DB/OC_SIM_DB_Cycles/Highway/01_FTP72_fuds.mat'
battery_path = "../../OC_SIM_DB/OC_SIM_DB_Bat/OC_SIM_DB_Bat_e-4wd_Battery.mat"
motor_path = "../../OC_SIM_DB/OC_SIM_DB_Mot/OC_SIM_DB_Mot_id_75_110_Westinghouse.mat"
cell_model = CellModel()
# NOTE(review): the meaning of the trailing positional argument (10) is defined
# by Environment in vehicle_model_DDPG12_2 and is not visible here -- confirm
# and consider a named constant.
env = Environment(cell_model, drving_cycle, battery_path, motor_path, 10)

# Length of the state vector; sizes the network input layers and the replay
# buffer arrays, and the padding state written when an episode ends.
num_states = 4

In [3]:
class OUActionNoise:
    """Callable generator of temporally correlated exploration noise.

    Each call advances a discretised Ornstein-Uhlenbeck process by one step of
    size ``dt`` and returns the new value.

    Args:
        mean: value the process is pulled towards.
        std_deviation: scale of the random perturbation added per step.
        theta: strength of the pull towards ``mean``.
        dt: time step of the discretisation.
        x_initial: starting value; when None the process starts at 0.
    """

    def __init__(self, mean, std_deviation, theta=0.15, dt=1e-2, x_initial=None):
        self.theta = theta
        self.mean = mean
        self.std_dev = std_deviation
        self.dt = dt
        self.x_initial = x_initial
        self.reset()

    def reset(self):
        """Restart the process from ``x_initial`` (or 0 when none was given)."""
        self.x_prev = 0 if self.x_initial is None else self.x_initial

    def __call__(self):
        """Advance the process by one step and return the new value."""
        # Deterministic pull back towards the mean.
        drift = self.theta * (self.mean - self.x_prev) * self.dt
        # Random perturbation; a single standard-normal draw per call.
        diffusion = self.std_dev * np.sqrt(self.dt) * np.random.normal()
        x = self.x_prev + drift + diffusion
        self.x_prev = x
        return x

In [4]:
class Buffer:
    """Ring-buffer experience replay that also runs one DDPG update per `learn()`.

    Transitions are stored in pre-allocated float32 arrays (the Keras layers
    run in float32, so storing float64 only triggers an autocast on every
    batch). Once `buffer_capacity` transitions have been recorded the oldest
    entries are overwritten.

    `learn()` reads the module-level names `actor_model`, `critic_model`,
    `target_actor`, `target_critic`, `actor_optimizer`, `critic_optimizer`
    and `gamma`; they must exist before it is called.

    Args:
        buffer_capacity: maximum number of stored transitions.
        batch_size: number of transitions sampled per `learn()` call.
        state_dim: length of a state vector; defaults to the module-level
            `num_states` when None.
        action_dim: length of an action vector (1 by default).
    """

    def __init__(self, buffer_capacity=100000, batch_size=64,
                 state_dim=None, action_dim=1):
        self.buffer_capacity = buffer_capacity
        self.batch_size = batch_size
        # Total number of record() calls so far (not capped at the capacity).
        self.buffer_counter = 0

        if state_dim is None:
            state_dim = num_states

        self.state_buffer = np.zeros((self.buffer_capacity, state_dim), dtype=np.float32)
        self.action_buffer = np.zeros((self.buffer_capacity, action_dim), dtype=np.float32)
        self.reward_buffer = np.zeros((self.buffer_capacity, 1), dtype=np.float32)
        self.next_state_buffer = np.zeros((self.buffer_capacity, state_dim), dtype=np.float32)
        # 1.0 where the transition ended the episode, 0.0 otherwise.
        self.done_buffer = np.zeros((self.buffer_capacity, 1), dtype=np.float32)

    def record(self, obs_tuple):
        """Store one transition, overwriting the oldest slot when full.

        Args:
            obs_tuple: `(state, action, reward, next_state)` or
                `(state, action, reward, next_state, done)`. When `done` is
                omitted the transition is treated as non-terminal, which is
                the behaviour of a plain four-element tuple.
        """
        index = self.buffer_counter % self.buffer_capacity

        self.state_buffer[index] = obs_tuple[0]
        self.action_buffer[index] = obs_tuple[1]
        self.reward_buffer[index] = obs_tuple[2]
        self.next_state_buffer[index] = obs_tuple[3]
        self.done_buffer[index] = float(obs_tuple[4]) if len(obs_tuple) > 4 else 0.0

        self.buffer_counter += 1

    def learn(self):
        """Sample a minibatch and apply one critic and one actor gradient step.

        Does nothing when no transition has been recorded yet (sampling from
        an empty range would raise ValueError).
        """
        record_range = min(self.buffer_counter, self.buffer_capacity)
        if record_range == 0:
            return
        # Sampling is with replacement (np.random.choice default).
        batch_indices = np.random.choice(record_range, self.batch_size)

        state_batch = tf.convert_to_tensor(self.state_buffer[batch_indices])
        action_batch = tf.convert_to_tensor(self.action_buffer[batch_indices])
        reward_batch = tf.convert_to_tensor(self.reward_buffer[batch_indices])
        next_state_batch = tf.convert_to_tensor(self.next_state_buffer[batch_indices])
        done_batch = tf.convert_to_tensor(self.done_buffer[batch_indices])

        # TD target from the target networks. It does not depend on the online
        # critic's variables, so it is computed outside the gradient tape.
        # Transitions flagged as done contribute only their reward.
        # NOTE(review): the models are called without an explicit `training`
        # argument; confirm how the BatchNormalization layers should behave here.
        target_actions = target_actor(next_state_batch)
        y = reward_batch + gamma * (1.0 - done_batch) * target_critic(
            [next_state_batch, target_actions]
        )

        # Critic step: minimise the mean squared TD error.
        with tf.GradientTape() as tape:
            critic_value = critic_model([state_batch, action_batch])
            critic_loss = tf.math.reduce_mean(tf.square(y - critic_value))
        critic_grad = tape.gradient(critic_loss, critic_model.trainable_variables)
        critic_optimizer.apply_gradients(
            zip(critic_grad, critic_model.trainable_variables)
        )

        # Actor step: maximise the critic's value of the actor's own actions
        # (hence the minus sign on the loss).
        with tf.GradientTape() as tape:
            actions = actor_model(state_batch)
            critic_value = critic_model([state_batch, actions])
            actor_loss = - tf.math.reduce_mean(critic_value)
        actor_grad = tape.gradient(actor_loss, actor_model.trainable_variables)
        actor_optimizer.apply_gradients(
            zip(actor_grad, actor_model.trainable_variables)
        )
        

In [5]:
def update_target(tau):
    """Soft-update both target networks towards their online counterparts.

    Every weight of `target_critic` and `target_actor` is replaced by
    `target * (1 - tau) + tau * online`, using the module-level models
    `critic_model` and `actor_model` as the online networks.

    Args:
        tau: interpolation factor; larger values move the targets faster.
    """
    def _blend_into(target_model, online_model):
        # Pair weights positionally; both models come from the same builder.
        mixed = [
            target_weight * (1 - tau) + tau * online_weight
            for target_weight, online_weight in zip(target_model.weights,
                                                    online_model.weights)
        ]
        target_model.set_weights(mixed)

    _blend_into(target_critic, critic_model)
    _blend_into(target_actor, actor_model)
    

In [6]:
def get_actor():
    """Build the actor network mapping a state to a single action in (0, 1).

    Architecture: BatchNormalization on the raw state, two 512-unit ReLU
    layers, then one sigmoid output unit.

    Returns:
        tf.keras.Model taking a `(batch, num_states)` input and producing a
        `(batch, 1)` output.
    """
    # Tiny final-layer weights keep the initial pre-activation close to zero,
    # so the sigmoid starts away from its flat, saturated regions.
    last_init = tf.random_uniform_initializer(minval=-0.003, maxval=0.003)

    # `shape` must be a tuple: `(num_states)` is just the int `num_states`.
    inputs = layers.Input(shape=(num_states,))
    inputs_batchnorm = layers.BatchNormalization()(inputs)

    out = layers.Dense(512, activation="relu")(inputs_batchnorm)
    out = layers.Dense(512, activation="relu")(out)
    outputs = layers.Dense(1, activation="sigmoid",
                           kernel_initializer=last_init)(out)
    model = tf.keras.Model(inputs, outputs)
    return model

In [7]:
def get_critic():
    """Build the critic network estimating Q(state, action).

    The state passes through BatchNormalization and two ReLU layers (16 and
    32 units); the action passes through one 32-unit ReLU layer. Both
    branches are concatenated and fed through two 512-unit ReLU layers to a
    single linear output.

    Returns:
        tf.keras.Model taking `[state, action]` with shapes
        `(batch, num_states)` and `(batch, 1)` and producing `(batch, 1)`.
    """
    # `shape` must be a tuple: `(num_states)` is just the int `num_states`.
    state_input = layers.Input(shape=(num_states,))
    state_input_batchnorm = layers.BatchNormalization()(state_input)

    state_out = layers.Dense(16, activation="relu")(state_input_batchnorm)
    state_out = layers.Dense(32, activation="relu")(state_out)

    # Same tuple requirement for the one-dimensional action input.
    action_input = layers.Input(shape=(1,))
    action_out = layers.Dense(32, activation="relu")(action_input)

    concat = layers.Concatenate()([state_out, action_out])

    out = layers.Dense(512, activation="relu")(concat)
    out = layers.Dense(512, activation="relu")(out)
    # No activation: the value estimate is unbounded.
    outputs = layers.Dense(1)(out)

    model = tf.keras.Model([state_input, action_input], outputs)
    return model
    

In [8]:
def policy(state, noise_object):
    """Pick an action from the actor plus additive exploration noise.

    Args:
        state: batched state tensor with a single row; entries 2 and 3 of
            that row are read as the lower and upper action bounds.
        noise_object: zero-argument callable returning the noise to add to
            the actor output.

    Returns:
        The noisy actor output scaled by the upper bound and clipped to
        `[lower bound, upper bound]`.
    """
    lower_bound = state[0][2].numpy()
    upper_bound = state[0][3].numpy()

    # The module-level actor_model produces a value in (0, 1); the noise is
    # added before scaling.
    raw_action = tf.squeeze(actor_model(state)).numpy()
    noisy_action = raw_action + noise_object()

    return np.clip(noisy_action * upper_bound, lower_bound, upper_bound)
    

In [9]:
def policy_epsilon_greedy(state, eps):
    """Pick an action epsilon-greedily.

    With probability `eps` a random point from a 10-value grid spanning the
    action bounds is returned; otherwise the actor output is scaled by the
    upper bound and clipped to the bounds.

    Args:
        state: batched state tensor with a single row; its last two entries
            are read as the lower and upper action bounds.
        eps: exploration probability.

    Returns:
        The selected action.
    """
    lower_bound = state[0][-2].numpy()
    upper_bound = state[0][-1].numpy()

    explore = random.random() < eps
    if not explore:
        # Greedy branch: query the module-level actor_model.
        greedy_action = tf.squeeze(actor_model(state)).numpy()
        return np.clip(greedy_action * upper_bound, lower_bound, upper_bound)

    # Exploration branch: uniform choice among 10 evenly spaced actions.
    grid_index = random.randint(0, 9)
    return np.linspace(lower_bound, upper_bound, 10)[grid_index]

In [10]:
# Exploration noise for the OU-noise policy. The constructor now receives
# `std_dev` instead of a repeated literal, so editing `std_dev` takes effect.
# NOTE(review): the training loop in this notebook calls
# policy_epsilon_greedy, so `ou_noise` appears to be unused there -- confirm.
std_dev = 0.2
ou_noise = OUActionNoise(mean=0, std_deviation=std_dev)

# Separate Adam optimizers and learning rates for the critic and the actor.
critic_lr = 0.0005
actor_lr = 0.00025
critic_optimizer = tf.keras.optimizers.Adam(critic_lr)
actor_optimizer = tf.keras.optimizers.Adam(actor_lr)

total_episodes = 100  # episodes per trial
gamma = 0.95          # discount factor used in the critic's TD target
tau = 0.001           # soft-update factor passed to update_target

# Epsilon-greedy schedule: eps decays exponentially in the global step count
# from MAX_EPSILON towards MIN_EPSILON.
MAX_EPSILON = 1
MIN_EPSILON = 0.01
DECAY_RATE = 0.00002
BATCH_SIZE = 32        # minibatch size handed to the replay buffer
DELAY_TRAINING = 3000  # steps collected before learning (and eps decay) starts

In [11]:
def initialization():
    """Create fresh networks and an empty replay buffer for one trial.

    The target networks are built with the same builders as the online
    networks and then overwritten with the online weights, so each pair
    starts out identical.

    Returns:
        Tuple `(actor_model, critic_model, target_actor, target_critic,
        buffer)`.
    """
    # Build order (actor, critic, target actor, target critic) is kept fixed.
    online_nets = (get_actor(), get_critic())
    target_nets = (get_actor(), get_critic())

    for target_net, online_net in zip(target_nets, online_nets):
        target_net.set_weights(online_net.get_weights())

    replay_buffer = Buffer(500000, BATCH_SIZE)
    return (*online_nets, *target_nets, replay_buffer)

In [None]:
# Run `num_trials` independent trainings of `total_episodes` episodes each and
# collect per-episode reward, final SOC and fuel consumption for every trial.
print(env.version)

num_trials = 10
results_dict = {}
for trial in range(num_trials):
    print()
    print("Trial {}".format(trial))

    # These module-level names are the ones read by Buffer.learn,
    # update_target and the policy functions, so each trial rebinds them.
    actor_model, critic_model, target_actor, target_critic, buffer = initialization()

    eps = MAX_EPSILON
    steps = 0  # environment steps taken in this trial, across episodes

    episode_rewards = []
    episode_SOCs = []
    episode_FCs = []
    for ep in range(total_episodes):
        start = time.time()
        state = env.reset()
        episodic_reward = 0

        while True:
            # Add a batch dimension: the policy indexes the state as state[0][i].
            tf_state = tf.expand_dims(tf.convert_to_tensor(state), 0)
            action = policy_epsilon_greedy(tf_state, eps)
            next_state, reward, done = env.step(action)
            if done:
                # Placeholder next state for the final transition of an episode.
                next_state = [0] * num_states

            buffer.record((state, action, reward, next_state))
            episodic_reward += reward

            # Pure exploration (eps stays at MAX_EPSILON) until enough
            # transitions have been collected; then learn on every step.
            if steps > DELAY_TRAINING:
                buffer.learn()
                update_target(tau)
                eps = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * np.exp(-DECAY_RATE * steps)

            steps += 1

            if done:
                break

            state = next_state

        elapsed_time = time.time() - start
        print("elapsed_time: {:.3f}".format(elapsed_time))
        episode_rewards.append(episodic_reward)
        episode_SOCs.append(env.SOC)
        episode_FCs.append(env.fuel_consumption)

        # Sum of |SOC - 0.6| over the episode's recorded SOC history.
        SOC_deviation_history = np.sum(np.abs(np.array(env.history["SOC"]) - 0.6))
        print(
              'Episode: {}'.format(ep + 1),
              "Exploration P: {:.4f}".format(eps),
              'Total reward: {}'.format(episodic_reward),
              "SOC: {:.4f}".format(env.SOC),
              "Cumulative_SOC_deviation: {:.4f}".format(SOC_deviation_history),
              "Fuel Consumption: {:.4f}".format(env.fuel_consumption),
        )

    # The original called `model.save_weights`, but no `model` is defined in
    # this notebook (NameError on a fresh kernel). The trained policy lives in
    # `actor_model`, so that is what gets saved.
    # NOTE(review): confirm the actor is the network that was meant to be kept.
    actor_model.save_weights("./DDPG12_2_trial{}".format(trial+1))
    print("model is saved..")

    results_dict[trial + 1] = {
        "rewards": episode_rewards,
        "SOCs": episode_SOCs,
        "FCs": episode_FCs
    }

2
maximum steps, simulation is done ... 
elapsed_time: 28.732
Episode: 1 Exploration P: 1.0000 Total reward: -943.8410256002068 SOC: 0.7979 Cumulative_SOC_deviation: 88.3043 Fuel Consumption: 60.7985
maximum steps, simulation is done ... 
elapsed_time: 29.871
Episode: 2 Exploration P: 1.0000 Total reward: -1008.8101509087568 SOC: 0.8165 Cumulative_SOC_deviation: 94.6425 Fuel Consumption: 62.3851


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.


maximum steps, simulation is done ... 
elapsed_time: 215.789
Episode: 28 Exploration P: 0.4689 Total reward: -379.020813052396 SOC: 0.6343 Cumulative_SOC_deviation: 33.1313 Fuel Consumption: 47.7079
maximum steps, simulation is done ... 
elapsed_time: 216.578
Episode: 29 Exploration P: 0.4565 Total reward: -376.8070276904804 SOC: 0.6241 Cumulative_SOC_deviation: 32.9924 Fuel Consumption: 46.8826
maximum steps, simulation is done ... 
elapsed_time: 213.448
Episode: 30 Exploration P: 0.4444 Total reward: -386.3163411434918 SOC: 0.6361 Cumulative_SOC_deviation: 33.8621 Fuel Consumption: 47.6951
maximum steps, simulation is done ... 
elapsed_time: 216.094
Episode: 31 Exploration P: 0.4326 Total reward: -341.86125834531725 SOC: 0.6214 Cumulative_SOC_deviation: 29.5203 Fuel Consumption: 46.6585
maximum steps, simulation is done ... 
elapsed_time: 214.946
Episode: 32 Exploration P: 0.4212 Total reward: -359.49063836298205 SOC: 0.6184 Cumulative_SOC_deviation: 31.2840 Fuel Consumption: 46.6509

maximum steps, simulation is done ... 
elapsed_time: 214.565
Episode: 69 Exploration P: 0.1589 Total reward: -260.17069566120733 SOC: 0.6069 Cumulative_SOC_deviation: 21.4850 Fuel Consumption: 45.3212
maximum steps, simulation is done ... 
elapsed_time: 207.463
Episode: 70 Exploration P: 0.1548 Total reward: -229.3853677554245 SOC: 0.6037 Cumulative_SOC_deviation: 18.4101 Fuel Consumption: 45.2848
maximum steps, simulation is done ... 
elapsed_time: 204.853
Episode: 71 Exploration P: 0.1509 Total reward: -222.9677155807542 SOC: 0.6095 Cumulative_SOC_deviation: 17.7410 Fuel Consumption: 45.5579
maximum steps, simulation is done ... 
elapsed_time: 205.392
Episode: 72 Exploration P: 0.1471 Total reward: -230.52764715323892 SOC: 0.6020 Cumulative_SOC_deviation: 18.5472 Fuel Consumption: 45.0558
maximum steps, simulation is done ... 
elapsed_time: 206.184
Episode: 73 Exploration P: 0.1434 Total reward: -234.4263196405945 SOC: 0.6057 Cumulative_SOC_deviation: 18.8915 Fuel Consumption: 45.511



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

maximum steps, simulation is done ... 
elapsed_time: 99.143
Episode: 3 Exploration P: 0.9217 Total reward: -804.754974455981 SOC: 0.7511 Cumulative_SOC_deviation: 74.7633 Fuel Consumption: 57.1218
maximum steps, simulation is done ... 
elapsed_time: 116.188
Episode: 4 Exploration P: 0.8970 Total reward: -807.4095891315015 SOC: 0.7250 Cumulative_SOC_deviation: 75.2326 Fuel Consumption: 55.0832
ma

maximum steps, simulation is done ... 
elapsed_time: 118.239
Episode: 37 Exploration P: 0.3684 Total reward: -282.4113726620317 SOC: 0.6120 Cumulative_SOC_deviation: 23.6158 Fuel Consumption: 46.2532
maximum steps, simulation is done ... 
elapsed_time: 118.352
Episode: 38 Exploration P: 0.3587 Total reward: -283.532270480619 SOC: 0.6075 Cumulative_SOC_deviation: 23.7495 Fuel Consumption: 46.0368
maximum steps, simulation is done ... 
elapsed_time: 118.304
Episode: 39 Exploration P: 0.3493 Total reward: -279.62093144728277 SOC: 0.6157 Cumulative_SOC_deviation: 23.3037 Fuel Consumption: 46.5837
maximum steps, simulation is done ... 
elapsed_time: 118.435
Episode: 40 Exploration P: 0.3401 Total reward: -262.8484756687511 SOC: 0.6091 Cumulative_SOC_deviation: 21.6883 Fuel Consumption: 45.9653
maximum steps, simulation is done ... 
elapsed_time: 118.444
Episode: 41 Exploration P: 0.3311 Total reward: -266.973116971779 SOC: 0.6057 Cumulative_SOC_deviation: 22.1190 Fuel Consumption: 45.7836
m

maximum steps, simulation is done ... 
elapsed_time: 105.794
Episode: 78 Exploration P: 0.1263 Total reward: -257.66896870759336 SOC: 0.5966 Cumulative_SOC_deviation: 21.3180 Fuel Consumption: 44.4890
maximum steps, simulation is done ... 
elapsed_time: 105.623
Episode: 79 Exploration P: 0.1231 Total reward: -260.38182822186536 SOC: 0.5983 Cumulative_SOC_deviation: 21.5706 Fuel Consumption: 44.6754
maximum steps, simulation is done ... 
elapsed_time: 105.851
Episode: 80 Exploration P: 0.1200 Total reward: -255.6544072738537 SOC: 0.5979 Cumulative_SOC_deviation: 21.1050 Fuel Consumption: 44.6043
maximum steps, simulation is done ... 
elapsed_time: 105.867
Episode: 81 Exploration P: 0.1171 Total reward: -261.88868696667697 SOC: 0.5983 Cumulative_SOC_deviation: 21.7156 Fuel Consumption: 44.7327
maximum steps, simulation is done ... 
elapsed_time: 105.969
Episode: 82 Exploration P: 0.1142 Total reward: -259.87088134977347 SOC: 0.5958 Cumulative_SOC_deviation: 21.5427 Fuel Consumption: 44.4

maximum steps, simulation is done ... 
elapsed_time: 103.310
Episode: 5 Exploration P: 0.8730 Total reward: -852.1571816969532 SOC: 0.6935 Cumulative_SOC_deviation: 79.9478 Fuel Consumption: 52.6792
maximum steps, simulation is done ... 
elapsed_time: 103.504
Episode: 6 Exploration P: 0.8496 Total reward: -770.2019261841501 SOC: 0.7128 Cumulative_SOC_deviation: 71.6093 Fuel Consumption: 54.1086
maximum steps, simulation is done ... 
elapsed_time: 103.865
Episode: 7 Exploration P: 0.8269 Total reward: -837.0485488127127 SOC: 0.6822 Cumulative_SOC_deviation: 78.5302 Fuel Consumption: 51.7463
maximum steps, simulation is done ... 
elapsed_time: 103.981
Episode: 8 Exploration P: 0.8048 Total reward: -848.6171592884332 SOC: 0.6701 Cumulative_SOC_deviation: 79.7662 Fuel Consumption: 50.9556
maximum steps, simulation is done ... 
elapsed_time: 104.471
Episode: 9 Exploration P: 0.7832 Total reward: -928.6298404849157 SOC: 0.6483 Cumulative_SOC_deviation: 87.9369 Fuel Consumption: 49.2610
maxim

Available condition is not avail... SOC: 1
elapsed_time: 100.693
Episode: 46 Exploration P: 0.2961 Total reward: -3008.570680367199 SOC: 1.0000 Cumulative_SOC_deviation: 291.1958 Fuel Consumption: 96.6128
Available condition is not avail... SOC: 1
elapsed_time: 100.243
Episode: 47 Exploration P: 0.2887 Total reward: -3019.6155514741245 SOC: 1.0000 Cumulative_SOC_deviation: 292.2919 Fuel Consumption: 96.6962
Available condition is not avail... SOC: 1
elapsed_time: 100.469
Episode: 48 Exploration P: 0.2815 Total reward: -2967.4587797157924 SOC: 1.0000 Cumulative_SOC_deviation: 287.1409 Fuel Consumption: 96.0500
Available condition is not avail... SOC: 1
elapsed_time: 100.723
Episode: 49 Exploration P: 0.2745 Total reward: -3054.7130036707367 SOC: 1.0000 Cumulative_SOC_deviation: 295.7690 Fuel Consumption: 97.0225
Available condition is not avail... SOC: 1
elapsed_time: 100.769
Episode: 50 Exploration P: 0.2677 Total reward: -3002.1312120352977 SOC: 1.0000 Cumulative_SOC_deviation: 290.59

Available condition is not avail... SOC: 1
elapsed_time: 101.013
Episode: 86 Exploration P: 0.1107 Total reward: -3341.7487396454912 SOC: 1.0000 Cumulative_SOC_deviation: 323.7324 Fuel Consumption: 104.4250
Available condition is not avail... SOC: 1
elapsed_time: 101.234
Episode: 87 Exploration P: 0.1081 Total reward: -3425.696522009548 SOC: 1.0000 Cumulative_SOC_deviation: 331.8567 Fuel Consumption: 107.1296
Available condition is not avail... SOC: 1
elapsed_time: 101.004
Episode: 88 Exploration P: 0.1056 Total reward: -3396.9713512978965 SOC: 1.0000 Cumulative_SOC_deviation: 329.0045 Fuel Consumption: 106.9263
Available condition is not avail... SOC: 1
elapsed_time: 101.258
Episode: 89 Exploration P: 0.1031 Total reward: -3392.537964387631 SOC: 1.0000 Cumulative_SOC_deviation: 328.6430 Fuel Consumption: 106.1079
Available condition is not avail... SOC: 1
elapsed_time: 101.361
Episode: 90 Exploration P: 0.1007 Total reward: -3418.1125876771534 SOC: 1.0000 Cumulative_SOC_deviation: 331

maximum steps, simulation is done ... 
elapsed_time: 112.819
Episode: 13 Exploration P: 0.7028 Total reward: -1426.8665529193058 SOC: 0.5610 Cumulative_SOC_deviation: 138.4379 Fuel Consumption: 42.4876


In [None]:
# Persist the per-trial training curves collected in results_dict.
with open("DDPG12_2.pkl", "wb") as results_file:
    pickle.dump(results_dict, results_file, protocol=pickle.HIGHEST_PROTOCOL)