In [1]:
import tensorflow as tf 
import numpy as np 
from tensorflow import keras 
import os 
import math 
import random 
import pickle 
import matplotlib.pyplot as plt 
from collections import deque 
from tensorflow.keras import layers
import time 

from vehicle_model_DDPG1 import Environment 
from cell_model import CellModel 

# Restrict which CUDA devices this process may see.
# NOTE(review): presumably '-1' is meant to hide every GPU so the run stays on CPU.
# It is assigned after `import tensorflow` above — confirm it still takes effect
# with the TensorFlow version in use (setting it before the import is the safer order).
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [2]:
# Relative paths to the .mat data files handed to the vehicle Environment below.
# NOTE(review): `drving_cycle` looks like a typo for `driving_cycle`; the name is
# kept because other cells may reference it.
drving_cycle = '../../OC_SIM_DB/OC_SIM_DB_Cycles/Highway/01_FTP72_fuds.mat'
battery_path = "../../OC_SIM_DB/OC_SIM_DB_Bat/OC_SIM_DB_Bat_e-4wd_Battery.mat"
motor_path = "../../OC_SIM_DB/OC_SIM_DB_Mot/OC_SIM_DB_Mot_id_75_110_Westinghouse.mat"
cell_model = CellModel()
# NOTE(review): the meaning of the trailing positional argument (10) is not visible
# from this notebook — confirm against Environment.__init__ in vehicle_model_DDPG1.
env = Environment(cell_model, drving_cycle, battery_path, motor_path, 10)

# Length of the state vector; sizes the replay-buffer rows and the network inputs.
num_states = 4

In [3]:
class OUActionNoise:
    """Stateful, temporally correlated noise generator.

    Every call advances the internal value by one step of

        x_next = x_prev + theta * (mean - x_prev) * dt
                 + std_dev * sqrt(dt) * N(0, 1)

    and returns the new value.

    Args:
        mean: value the process is pulled towards.
        std_deviation: scale of the random term.
        theta: strength of the pull towards ``mean``.
        dt: step size.
        x_initial: starting value; ``None`` means start from 0.
    """

    def __init__(self, mean, std_deviation, theta=0.15, dt=1e-2, x_initial=None):
        self.theta = theta
        self.mean = mean
        self.std_dev = std_deviation
        self.dt = dt
        self.x_initial = x_initial
        self.reset()

    def reset(self):
        """Rewind the process to ``x_initial`` (or 0 when none was given)."""
        self.x_prev = 0 if self.x_initial is None else self.x_initial

    def __call__(self):
        """Advance one step and return the new noise value."""
        drift = self.theta * (self.mean - self.x_prev) * self.dt
        diffusion = self.std_dev * np.sqrt(self.dt) * np.random.normal()
        x_next = self.x_prev + drift + diffusion
        # Remember the sample so the next call continues from it.
        self.x_prev = x_next
        return x_next

In [4]:
class Buffer:
    """Circular experience-replay buffer that also performs one DDPG update.

    Transitions are stored as ``(state, action, reward, next_state)`` rows in
    pre-allocated arrays; once ``buffer_capacity`` rows have been written the
    oldest rows are overwritten.

    This class relies on notebook-level globals: ``num_states`` (in
    ``__init__``) and ``gamma``, ``actor_model``, ``critic_model``,
    ``target_actor``, ``target_critic``, ``actor_optimizer`` and
    ``critic_optimizer`` (in ``learn``).

    Args:
        buffer_capacity: maximum number of transitions kept.
        batch_size: number of transitions sampled per ``learn()`` call.
    """

    def __init__(self, buffer_capacity=100000, batch_size=64):
        self.buffer_capacity = buffer_capacity
        self.batch_size = batch_size
        # Total number of record() calls so far (not capped at capacity).
        self.buffer_counter = 0

        # Stored as float32: float64 batches would be autocast by every Keras
        # layer anyway (with a warning), so this keeps the values the networks
        # see unchanged while avoiding the cast and halving the memory.
        self.state_buffer = np.zeros((self.buffer_capacity, num_states), dtype=np.float32)
        self.action_buffer = np.zeros((self.buffer_capacity, 1), dtype=np.float32)
        self.reward_buffer = np.zeros((self.buffer_capacity, 1), dtype=np.float32)
        self.next_state_buffer = np.zeros((self.buffer_capacity, num_states), dtype=np.float32)

    def record(self, obs_tuple):
        """Store one transition, overwriting the oldest row once full.

        Args:
            obs_tuple: indexable as ``(state, action, reward, next_state)``.
        """
        index = self.buffer_counter % self.buffer_capacity

        self.state_buffer[index] = obs_tuple[0]
        self.action_buffer[index] = obs_tuple[1]
        self.reward_buffer[index] = obs_tuple[2]
        self.next_state_buffer[index] = obs_tuple[3]

        self.buffer_counter += 1

    def learn(self):
        """Sample a batch (with replacement) and update critic, then actor.

        Does nothing when no transition has been recorded yet.
        """
        record_range = min(self.buffer_counter, self.buffer_capacity)
        if record_range == 0:
            # np.random.choice(0, n) would raise; there is nothing to learn from.
            return
        batch_indices = np.random.choice(record_range, self.batch_size)

        # Buffers are already float32, so no explicit cast is needed.
        state_batch = tf.convert_to_tensor(self.state_buffer[batch_indices])
        action_batch = tf.convert_to_tensor(self.action_buffer[batch_indices])
        reward_batch = tf.convert_to_tensor(self.reward_buffer[batch_indices])
        next_state_batch = tf.convert_to_tensor(self.next_state_buffer[batch_indices])

        # NOTE(review): no terminal flag is stored, so the target below always
        # bootstraps from next_state, including for the last step of an episode.
        # NOTE(review): the networks are called without `training=True`;
        # presumably their BatchNormalization layers run in inference mode
        # here — confirm that is intended.

        # Critic step: minimise the squared TD error against the target networks.
        with tf.GradientTape() as tape:
            target_actions = target_actor(next_state_batch)
            y = reward_batch + gamma * target_critic([next_state_batch, target_actions])
            critic_value = critic_model([state_batch, action_batch])
            critic_loss = tf.math.reduce_mean(tf.square(y - critic_value))
        critic_grad = tape.gradient(critic_loss, critic_model.trainable_variables)
        critic_optimizer.apply_gradients(
            zip(critic_grad, critic_model.trainable_variables)
        )

        # Actor step: maximise the critic's value of the actor's own actions
        # (hence the negated mean as the loss).
        with tf.GradientTape() as tape:
            actions = actor_model(state_batch)
            critic_value = critic_model([state_batch, actions])
            actor_loss = - tf.math.reduce_mean(critic_value)
        actor_grad = tape.gradient(actor_loss, actor_model.trainable_variables)
        actor_optimizer.apply_gradients(
            zip(actor_grad, actor_model.trainable_variables)
        )
        

In [5]:
def update_target(tau):
    """Soft-update both target networks towards their online counterparts.

    For every weight: ``target <- target * (1 - tau) + tau * online``.
    Operates on the notebook globals ``target_critic``/``critic_model`` and
    ``target_actor``/``actor_model``.

    Args:
        tau: interpolation factor; larger values track the online network faster.
    """

    def _blend_into(target_model, online_model):
        # Update each target variable in place rather than building a full
        # list of new tensors and pushing it back through set_weights().
        for target_var, online_var in zip(target_model.weights, online_model.weights):
            target_var.assign(target_var * (1 - tau) + tau * online_var)

    _blend_into(target_critic, critic_model)
    _blend_into(target_actor, actor_model)
    

In [6]:
def get_actor():
    """Build the actor network: state vector -> one action value in [0, 1].

    Architecture: BatchNormalization on the input, two 512-unit ReLU layers,
    and a single sigmoid output unit. Reads the notebook global ``num_states``.

    Returns:
        An uncompiled ``tf.keras.Model``.
    """
    # Small uniform init on the last layer keeps the initial pre-activations
    # near zero, i.e. the initial sigmoid outputs near 0.5.
    last_init = tf.random_uniform_initializer(minval=-0.003, maxval=0.003)

    # `shape` must be a tuple; `(num_states)` without the comma is a bare int.
    inputs = layers.Input(shape=(num_states,))
    inputs_batchnorm = layers.BatchNormalization()(inputs)

    out = layers.Dense(512, activation="relu")(inputs_batchnorm)
    out = layers.Dense(512, activation="relu")(out)
    outputs = layers.Dense(1, activation="sigmoid",
                           kernel_initializer=last_init)(out)
    model = tf.keras.Model(inputs, outputs)
    return model

In [7]:
def get_critic():
    """Build the critic network: (state, action) -> scalar Q-value estimate.

    The state goes through BatchNormalization and two ReLU layers (16, 32),
    the action through one 32-unit ReLU layer; both branches are concatenated
    and fed to two 512-unit ReLU layers and a linear output unit. Reads the
    notebook global ``num_states``.

    Returns:
        An uncompiled ``tf.keras.Model`` taking ``[state, action]`` as inputs.
    """
    # `shape` must be a tuple; `(num_states)` without the comma is a bare int.
    state_input = layers.Input(shape=(num_states,))
    state_input_batchnorm = layers.BatchNormalization()(state_input)

    state_out = layers.Dense(16, activation="relu")(state_input_batchnorm)
    state_out = layers.Dense(32, activation="relu")(state_out)

    action_input = layers.Input(shape=(1,))
    action_out = layers.Dense(32, activation="relu")(action_input)

    concat = layers.Concatenate()([state_out, action_out])

    out = layers.Dense(512, activation="relu")(concat)
    out = layers.Dense(512, activation="relu")(out)
    # Linear output: the Q-value is unbounded.
    outputs = layers.Dense(1)(out)

    model = tf.keras.Model([state_input, action_input], outputs)
    return model
    

In [8]:
def policy(state, noise_object):
    """Pick an action from the actor with additive exploration noise.

    Args:
        state: batched state (batch of one); entries 2 and 3 of the first row
            are read as the lower and upper action bounds and must expose
            ``.numpy()``.
        noise_object: zero-argument callable returning the noise sample.

    Returns:
        The noisy actor output scaled by the upper bound and clipped to
        ``[lower, upper]``.
    """
    row = state[0]
    lower, upper = row[2].numpy(), row[3].numpy()
    actor_output = tf.squeeze(actor_model(state))
    perturbed = actor_output.numpy() + noise_object()
    # Scale by the upper bound, then clip into the legal range.
    return np.clip(perturbed * upper, lower, upper)
    

In [9]:
def policy_epsilon_greedy(state, eps):
    """Epsilon-greedy action selection on top of the actor network.

    Args:
        state: batched state (batch of one); the last two entries of the first
            row are read as the lower and upper action bounds and must expose
            ``.numpy()``.
        eps: probability of taking a random action.

    Returns:
        With probability ``eps`` one of 10 evenly spaced values between the
        bounds, otherwise the actor output scaled by the upper bound and
        clipped to ``[lower, upper]``.
    """
    lower = state[0][-2].numpy()
    upper = state[0][-1].numpy()

    explore = random.random() < eps
    if not explore:
        # Greedy branch: scale the actor output and clip into the legal range.
        greedy = tf.squeeze(actor_model(state)).numpy() * upper
        return np.clip(greedy, lower, upper)

    # Exploration branch: uniform choice over a 10-point grid of the range.
    pick = random.randint(0, 9)
    return np.linspace(lower, upper, 10)[pick]

In [10]:
# Exploration noise. `std_dev` is now actually passed to the noise object
# instead of a duplicated literal.
# NOTE(review): the training loop in this notebook uses policy_epsilon_greedy,
# so `ou_noise` only matters if `policy` is used instead.
std_dev = 0.2
ou_noise = OUActionNoise(mean=0, std_deviation=std_dev)

# Optimizers for the critic and the actor updates in Buffer.learn.
critic_lr = 0.0005
actor_lr = 0.00025
critic_optimizer = tf.keras.optimizers.Adam(critic_lr)
actor_optimizer = tf.keras.optimizers.Adam(actor_lr)

total_episodes = 200  # episodes per trial
gamma = 0.95          # discount factor used in the critic target
tau = 0.001           # soft-update factor passed to update_target

# Epsilon-greedy schedule: eps decays exponentially from MAX_EPSILON towards
# MIN_EPSILON as a function of the step count, once training has started.
MAX_EPSILON = 1
MIN_EPSILON = 0.01
DECAY_RATE = 0.00002
BATCH_SIZE = 32
# Number of environment steps collected before learning (and eps decay) begins.
DELAY_TRAINING = 3000

In [11]:
def initialization():
    """Create fresh networks and an empty replay buffer for one trial.

    The target networks are built separately and then given a copy of the
    online networks' weights, so both pairs start out identical.

    Returns:
        Tuple ``(actor_model, critic_model, target_actor, target_critic, buffer)``.
    """
    online_actor, online_critic = get_actor(), get_critic()
    frozen_actor, frozen_critic = get_actor(), get_critic()

    # Start each target network as an exact copy of its online counterpart.
    for target_net, online_net in ((frozen_actor, online_actor),
                                   (frozen_critic, online_critic)):
        target_net.set_weights(online_net.get_weights())

    replay = Buffer(500000, BATCH_SIZE)
    return online_actor, online_critic, frozen_actor, frozen_critic, replay

In [12]:
def save_weights(actor_model, critic_model, target_actor, target_critic, root):
    """Write the weights of all four networks under the directory ``./<root>/``.

    Args:
        actor_model, critic_model, target_actor, target_critic: objects
            exposing ``save_weights(path)``.
        root: directory name (relative to the working directory) that the
            checkpoint paths are built from.
    """
    destinations = (
        (actor_model, "./{}/actor_model_checkpoint"),
        (critic_model, "./{}/critic_model_checkpoint"),
        (target_actor, "./{}/target_actor_checkpoint"),
        (target_critic, "./{}/target_critic_checkpoint"),
    )
    for network, path_template in destinations:
        network.save_weights(path_template.format(root))
    print("model is saved..")

In [None]:
# Main experiment: run `num_trials` independent trainings of `total_episodes`
# episodes each on the shared `env`, collecting per-episode metrics per trial.
print(env.version)

num_trials = 3
results_dict = {} 
for trial in range(num_trials): 
    print()
    # NOTE(review): this prints the 0-based trial index, while the checkpoint
    # directory and the results_dict key below use trial + 1.
    print("Trial {}".format(trial))
    
    # Rebinds the module-level networks that Buffer.learn, update_target and
    # policy_epsilon_greedy read as globals, so every trial starts fresh.
    actor_model, critic_model, target_actor, target_critic, buffer = initialization()
    
    eps = MAX_EPSILON 
    # Environment steps taken in this trial, counted across episodes.
    steps = 0
    
    episode_rewards = [] 
    episode_SOCs = [] 
    episode_FCs = [] 
    for ep in range(total_episodes): 
        start = time.time() 
        state = env.reset() 
        episodic_reward = 0 

        while True: 
            # Add a leading batch dimension of 1 before querying the policy.
            tf_state = tf.expand_dims(tf.convert_to_tensor(state), 0)
            action = policy_epsilon_greedy(tf_state, eps)
            next_state, reward, done = env.step(action)
            if done: 
                # The terminal transition is stored with an all-zero next state.
                # NOTE(review): Buffer.learn has no terminal flag, so its target
                # still bootstraps from this zero state.
                next_state = [0] * num_states 

            buffer.record((state, action, reward, next_state))
            episodic_reward += reward 

            # Learning, target updates and the epsilon decay only start once
            # more than DELAY_TRAINING steps have been taken; until then eps
            # stays at MAX_EPSILON.
            if steps > DELAY_TRAINING: 
                buffer.learn() 
                update_target(tau)
                eps = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * np.exp(-DECAY_RATE * steps)

            steps += 1

            if done: 
                break 

            state = next_state 

        elapsed_time = time.time() - start 
        print("elapsed_time: {:.3f}".format(elapsed_time))
        episode_rewards.append(episodic_reward) 
        episode_SOCs.append(env.SOC)
        episode_FCs.append(env.fuel_consumption) 

        # Sum of absolute deviations of the episode's SOC history from 0.6.
        # NOTE(review): 0.6 is presumably the SOC reference used by the
        # environment's reward — confirm and lift it into a named constant.
        SOC_deviation_history = np.sum(np.abs(np.array(env.history["SOC"]) - 0.6)) 
        print(
              'Episode: {}'.format(ep + 1),
              "Exploration P: {:.4f}".format(eps),
              'Total reward: {}'.format(episodic_reward), 
              "SOC: {:.4f}".format(env.SOC), 
              "Cumulative_SOC_deviation: {:.4f}".format(SOC_deviation_history), 
              "Fuel Consumption: {:.4f}".format(env.fuel_consumption), 
        )
    
    # Weights are written once per trial, after the last episode.
    root = "DDPG1_trial{}".format(trial+1)
    save_weights(actor_model, critic_model, target_actor, target_critic, root)
    
    # Per-episode metrics for this trial, keyed by the 1-based trial number.
    results_dict[trial + 1] = {
        "rewards": episode_rewards, 
        "SOCs": episode_SOCs, 
        "FCs": episode_FCs 
    }

1

Trial 0
maximum steps, simulation is done ... 
elapsed_time: 16.211
Episode: 1 Exploration P: 1.0000 Total reward: -885.6240229025988 SOC: 0.7760 Cumulative_SOC_deviation: 82.6443 Fuel Consumption: 59.1808
maximum steps, simulation is done ... 
elapsed_time: 16.322
Episode: 2 Exploration P: 1.0000 Total reward: -1046.5361810815275 SOC: 0.8126 Cumulative_SOC_deviation: 98.4507 Fuel Consumption: 62.0288


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer cons

maximum steps, simulation is done ... 
elapsed_time: 76.668
Episode: 28 Exploration P: 0.4689 Total reward: -2572.8692460769494 SOC: 0.3816 Cumulative_SOC_deviation: 254.3472 Fuel Consumption: 29.3977
maximum steps, simulation is done ... 
elapsed_time: 81.527
Episode: 29 Exploration P: 0.4565 Total reward: -2668.8558523329675 SOC: 0.3586 Cumulative_SOC_deviation: 264.1328 Fuel Consumption: 27.5276
maximum steps, simulation is done ... 
elapsed_time: 84.060
Episode: 30 Exploration P: 0.4444 Total reward: -2388.1859985606093 SOC: 0.3996 Cumulative_SOC_deviation: 235.7477 Fuel Consumption: 30.7094
maximum steps, simulation is done ... 
elapsed_time: 74.517
Episode: 31 Exploration P: 0.4326 Total reward: -2664.7562093266993 SOC: 0.3644 Cumulative_SOC_deviation: 263.6527 Fuel Consumption: 28.2294
maximum steps, simulation is done ... 
elapsed_time: 74.472
Episode: 32 Exploration P: 0.4212 Total reward: -2801.549674598616 SOC: 0.3486 Cumulative_SOC_deviation: 277.4624 Fuel Consumption: 26.9

maximum steps, simulation is done ... 
elapsed_time: 78.751
Episode: 70 Exploration P: 0.1548 Total reward: -268.9687101072317 SOC: 0.6027 Cumulative_SOC_deviation: 22.4135 Fuel Consumption: 44.8342
maximum steps, simulation is done ... 
elapsed_time: 77.775
Episode: 71 Exploration P: 0.1509 Total reward: -290.6629113481004 SOC: 0.6027 Cumulative_SOC_deviation: 24.5831 Fuel Consumption: 44.8322
maximum steps, simulation is done ... 
elapsed_time: 77.959
Episode: 72 Exploration P: 0.1471 Total reward: -293.10938598315516 SOC: 0.6017 Cumulative_SOC_deviation: 24.8359 Fuel Consumption: 44.7501
maximum steps, simulation is done ... 
elapsed_time: 77.928
Episode: 73 Exploration P: 0.1434 Total reward: -265.65560259728386 SOC: 0.6008 Cumulative_SOC_deviation: 22.0953 Fuel Consumption: 44.7024
maximum steps, simulation is done ... 
elapsed_time: 77.887
Episode: 74 Exploration P: 0.1398 Total reward: -264.41317948139596 SOC: 0.6006 Cumulative_SOC_deviation: 21.9615 Fuel Consumption: 44.7980
ma

maximum steps, simulation is done ... 
elapsed_time: 74.494
Episode: 112 Exploration P: 0.0557 Total reward: -266.836619848691 SOC: 0.5954 Cumulative_SOC_deviation: 22.2672 Fuel Consumption: 44.1647
maximum steps, simulation is done ... 
elapsed_time: 76.340
Episode: 113 Exploration P: 0.0545 Total reward: -296.68402006438185 SOC: 0.5949 Cumulative_SOC_deviation: 25.2681 Fuel Consumption: 44.0026
maximum steps, simulation is done ... 
elapsed_time: 74.589
Episode: 114 Exploration P: 0.0533 Total reward: -270.7472403645104 SOC: 0.6009 Cumulative_SOC_deviation: 22.6251 Fuel Consumption: 44.4964
maximum steps, simulation is done ... 
elapsed_time: 74.213
Episode: 115 Exploration P: 0.0521 Total reward: -240.07629279229454 SOC: 0.6015 Cumulative_SOC_deviation: 19.5523 Fuel Consumption: 44.5535
maximum steps, simulation is done ... 
elapsed_time: 74.140
Episode: 116 Exploration P: 0.0510 Total reward: -291.1375327239198 SOC: 0.5979 Cumulative_SOC_deviation: 24.7048 Fuel Consumption: 44.0897

maximum steps, simulation is done ... 
elapsed_time: 75.497
Episode: 153 Exploration P: 0.0248 Total reward: -218.93856159686595 SOC: 0.6031 Cumulative_SOC_deviation: 17.4194 Fuel Consumption: 44.7449
maximum steps, simulation is done ... 
elapsed_time: 75.609
Episode: 154 Exploration P: 0.0244 Total reward: -236.03521172531944 SOC: 0.6048 Cumulative_SOC_deviation: 19.1210 Fuel Consumption: 44.8249
maximum steps, simulation is done ... 
elapsed_time: 75.599
Episode: 155 Exploration P: 0.0240 Total reward: -203.0522599951894 SOC: 0.6046 Cumulative_SOC_deviation: 15.8255 Fuel Consumption: 44.7973
maximum steps, simulation is done ... 
elapsed_time: 75.233
Episode: 156 Exploration P: 0.0237 Total reward: -199.3022994980784 SOC: 0.6041 Cumulative_SOC_deviation: 15.4479 Fuel Consumption: 44.8234
maximum steps, simulation is done ... 
elapsed_time: 75.330
Episode: 157 Exploration P: 0.0233 Total reward: -231.2137624267459 SOC: 0.6047 Cumulative_SOC_deviation: 18.6298 Fuel Consumption: 44.915

maximum steps, simulation is done ... 
elapsed_time: 75.457
Episode: 194 Exploration P: 0.0148 Total reward: -199.08282703529127 SOC: 0.6040 Cumulative_SOC_deviation: 15.4218 Fuel Consumption: 44.8653
maximum steps, simulation is done ... 
elapsed_time: 75.291
Episode: 195 Exploration P: 0.0147 Total reward: -202.03201524614371 SOC: 0.6030 Cumulative_SOC_deviation: 15.7295 Fuel Consumption: 44.7373
maximum steps, simulation is done ... 
elapsed_time: 75.441
Episode: 196 Exploration P: 0.0146 Total reward: -196.36690795049137 SOC: 0.6020 Cumulative_SOC_deviation: 15.1761 Fuel Consumption: 44.6055
maximum steps, simulation is done ... 
elapsed_time: 75.190
Episode: 197 Exploration P: 0.0144 Total reward: -203.86692197414928 SOC: 0.6046 Cumulative_SOC_deviation: 15.9081 Fuel Consumption: 44.7854
maximum steps, simulation is done ... 
elapsed_time: 78.133
Episode: 198 Exploration P: 0.0143 Total reward: -194.34098287018506 SOC: 0.6029 Cumulative_SOC_deviation: 14.9707 Fuel Consumption: 44.

maximum steps, simulation is done ... 
elapsed_time: 89.349
Episode: 21 Exploration P: 0.5662 Total reward: -2061.4163167342717 SOC: 0.4590 Cumulative_SOC_deviation: 202.6544 Fuel Consumption: 34.8722
maximum steps, simulation is done ... 
elapsed_time: 89.416
Episode: 22 Exploration P: 0.5511 Total reward: -2094.8116327678504 SOC: 0.4526 Cumulative_SOC_deviation: 206.0350 Fuel Consumption: 34.4614
maximum steps, simulation is done ... 
elapsed_time: 89.315
Episode: 23 Exploration P: 0.5364 Total reward: -2140.1109590409383 SOC: 0.4464 Cumulative_SOC_deviation: 210.6133 Fuel Consumption: 33.9778
maximum steps, simulation is done ... 
elapsed_time: 89.032
Episode: 24 Exploration P: 0.5222 Total reward: -2192.2509371580327 SOC: 0.4342 Cumulative_SOC_deviation: 215.9070 Fuel Consumption: 33.1808
maximum steps, simulation is done ... 
elapsed_time: 92.636
Episode: 25 Exploration P: 0.5083 Total reward: -1417.116707995286 SOC: 0.6714 Cumulative_SOC_deviation: 136.6182 Fuel Consumption: 50.9

In [None]:
# Persist the per-trial metrics gathered by the training loop.
with open("DDPG1.pkl", "wb") as results_file:
    pickle.dump(results_dict, results_file, protocol=pickle.HIGHEST_PROTOCOL)