In [1]:
import tensorflow as tf 
import numpy as np 
from tensorflow import keras 
import os 
import math 
import random 
import pickle 
import matplotlib.pyplot as plt 
from collections import deque 
from tensorflow.keras import layers

from vehicle_model_DDPG import Environment 
from cell_model import CellModel 

# os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [2]:
# Relative paths to the MATLAB .mat input files handed to the simulation Environment.
# NOTE(review): "drving_cycle" is a misspelling of "driving_cycle"; it is left as-is
# because other cells may reference the name.
drving_cycle = '../../OC_SIM_DB/OC_SIM_DB_Cycles/Highway/01_FTP72_fuds.mat'
battery_path = "../../OC_SIM_DB/OC_SIM_DB_Bat/OC_SIM_DB_Bat_e-4wd_Battery.mat"
motor_path = "../../OC_SIM_DB/OC_SIM_DB_Mot/OC_SIM_DB_Mot_id_75_110_Westinghouse.mat"
cell_model = CellModel()
# The trailing positional argument (10) is interpreted by Environment itself;
# its meaning is not visible in this notebook -- TODO confirm against vehicle_model_DDPG.
env = Environment(cell_model, drving_cycle, battery_path, motor_path, 10)

# Length of the state vector: sizes the replay-buffer state arrays and the
# actor/critic state inputs defined in later cells.
num_states = 4

In [3]:
class OUActionNoise:
    """Temporally correlated exploration noise (discretised Ornstein-Uhlenbeck process).

    Each call advances the internal value by one step of
    ``x += theta * (mean - x) * dt + std_dev * sqrt(dt) * N(0, 1)``
    and returns the new value.

    Args:
        mean: Value the process is pulled towards.
        std_deviation: Scale of the random term.
        theta: Strength of the pull towards ``mean``.
        dt: Step size of the discretisation.
        x_initial: Starting value; when ``None`` the process starts at 0.
    """

    def __init__(self, mean, std_deviation, theta=0.15, dt=1e-2, x_initial=None):
        self.theta, self.mean, self.std_dev = theta, mean, std_deviation
        self.dt, self.x_initial = dt, x_initial
        self.reset()

    def reset(self):
        """Put the process back at ``x_initial`` (or 0 when none was given)."""
        self.x_prev = 0 if self.x_initial is None else self.x_initial

    def __call__(self):
        """Advance the process by one step and return the new value."""
        # Deterministic pull towards the mean.
        pull = self.theta * (self.mean - self.x_prev) * self.dt
        # Random term: a single standard-normal draw scaled by std_dev * sqrt(dt).
        kick = self.std_dev * np.sqrt(self.dt) * np.random.normal()
        self.x_prev = self.x_prev + pull + kick
        return self.x_prev

In [4]:
class Buffer:
    """Fixed-capacity circular replay buffer that also performs the DDPG update.

    Transitions ``(state, action, reward, next_state)`` are written into
    preallocated NumPy arrays; once ``buffer_capacity`` records have been
    written, the oldest slots are overwritten.

    ``__init__`` and ``learn`` rely on module-level names defined in other
    cells: ``num_states``, ``gamma``, ``actor_model``, ``critic_model``,
    ``target_actor``, ``target_critic``, ``actor_optimizer`` and
    ``critic_optimizer``.

    Args:
        buffer_capacity: Maximum number of transitions kept.
        batch_size: Number of transitions sampled per call to ``learn``.
    """

    def __init__(self, buffer_capacity=100000, batch_size=64):
        self.buffer_capacity = buffer_capacity
        self.batch_size = batch_size
        # Total number of record() calls so far (not capped at capacity).
        self.buffer_counter = 0

        self.state_buffer = np.zeros((self.buffer_capacity, num_states))
        self.action_buffer = np.zeros((self.buffer_capacity, 1))
        self.reward_buffer = np.zeros((self.buffer_capacity, 1))
        self.next_state_buffer = np.zeros((self.buffer_capacity, num_states))

    def record(self, obs_tuple):
        """Store one transition given as ``(state, action, reward, next_state)``."""
        # Wrap around so new records overwrite the oldest ones once full.
        index = self.buffer_counter % self.buffer_capacity

        self.state_buffer[index] = obs_tuple[0]
        self.action_buffer[index] = obs_tuple[1]
        self.reward_buffer[index] = obs_tuple[2]
        self.next_state_buffer[index] = obs_tuple[3]

        self.buffer_counter += 1

    @staticmethod
    def _as_float32(array):
        """Convert a NumPy batch to a float32 tensor."""
        return tf.cast(tf.convert_to_tensor(array), dtype=tf.float32)

    def learn(self):
        """Sample a minibatch and apply one critic step followed by one actor step.

        Does nothing when no transition has been recorded yet.
        """
        record_range = min(self.buffer_counter, self.buffer_capacity)
        if record_range == 0:
            # np.random.choice(0, n) raises ValueError; there is nothing to learn from.
            return
        # Sampling is with replacement, so it works even with fewer than
        # batch_size stored transitions.
        batch_indices = np.random.choice(record_range, self.batch_size)

        # The buffers are float64; cast every batch explicitly so the models and
        # the target arithmetic see one consistent dtype instead of relying on
        # implicit casting.
        state_batch = self._as_float32(self.state_buffer[batch_indices])
        action_batch = self._as_float32(self.action_buffer[batch_indices])
        reward_batch = self._as_float32(self.reward_buffer[batch_indices])
        next_state_batch = self._as_float32(self.next_state_buffer[batch_indices])

        # Critic step: regress Q(s, a) onto r + gamma * Q'(s', mu'(s')).
        # NOTE(review): the bootstrap term is added for every sample, including
        # transitions recorded when the episode ended -- confirm this is intended.
        with tf.GradientTape() as tape:
            target_actions = target_actor(next_state_batch)
            y = reward_batch + gamma * target_critic([next_state_batch, target_actions])
            critic_value = critic_model([state_batch, action_batch])
            critic_loss = tf.math.reduce_mean(tf.square(y - critic_value))
        critic_grad = tape.gradient(critic_loss, critic_model.trainable_variables)
        critic_optimizer.apply_gradients(
            zip(critic_grad, critic_model.trainable_variables)
        )

        # Actor step: maximise the critic's value of the actor's own actions
        # (implemented as minimising the negated mean).
        with tf.GradientTape() as tape:
            actions = actor_model(state_batch)
            critic_value = critic_model([state_batch, actions])
            actor_loss = - tf.math.reduce_mean(critic_value)
        actor_grad = tape.gradient(actor_loss, actor_model.trainable_variables)
        actor_optimizer.apply_gradients(
            zip(actor_grad, actor_model.trainable_variables)
        )
        

In [5]:
def _blend_into_target(target_model, online_model, tau):
    """Set ``target_model`` weights to ``(1 - tau) * target + tau * online``."""
    current_target = target_model.weights
    blended = [
        current_target[idx] * (1 - tau) + tau * online_var
        for idx, online_var in enumerate(online_model.weights)
    ]
    target_model.set_weights(blended)


def update_target(tau):
    """Soft-update both target networks towards their online counterparts.

    Uses the module-level ``critic_model``/``target_critic`` and
    ``actor_model``/``target_actor``. The critic pair is updated first, then
    the actor pair.

    Args:
        tau: Interpolation factor; 0 leaves the targets unchanged, 1 copies
            the online weights.
    """
    _blend_into_target(target_critic, critic_model, tau)
    _blend_into_target(target_actor, actor_model, tau)
    

In [6]:
def get_actor():
    """Build the actor network: state vector -> one action value in (0, 1).

    Two 512-unit ReLU layers are followed by a single sigmoid unit. The output
    layer's kernel is initialised uniformly in [-0.003, 0.003], so the initial
    pre-activations are close to zero.

    Returns:
        An uncompiled ``tf.keras.Model`` taking a ``(batch, num_states)`` input.
    """
    last_init = tf.random_uniform_initializer(minval=-0.003, maxval=0.003)

    # shape must be a tuple: "(num_states)" is just an int, which newer Keras
    # versions reject.
    inputs = layers.Input(shape=(num_states,))
    out = layers.Dense(512, activation="relu")(inputs)
    out = layers.Dense(512, activation="relu")(out)
    outputs = layers.Dense(1, activation="sigmoid",
                           kernel_initializer=last_init)(out)
    model = tf.keras.Model(inputs, outputs)
    return model

In [7]:
def get_critic():
    """Build the critic network: (state, action) -> scalar Q-value estimate.

    The state passes through Dense(16) -> Dense(32), the action through
    Dense(32); the two branches are concatenated and fed through two 512-unit
    ReLU layers to a single linear output.

    Returns:
        An uncompiled ``tf.keras.Model`` taking ``[state, action]`` inputs of
        shapes ``(batch, num_states)`` and ``(batch, 1)``.
    """
    # shape must be a tuple: "(num_states)" / "(1)" are plain ints, which newer
    # Keras versions reject.
    state_input = layers.Input(shape=(num_states,))
    state_out = layers.Dense(16, activation="relu")(state_input)
    # Chain onto the previous layer's output. Feeding state_input here again
    # (as the code did before) leaves the Dense(16) layer disconnected from
    # the model.
    state_out = layers.Dense(32, activation="relu")(state_out)

    action_input = layers.Input(shape=(1,))
    action_out = layers.Dense(32, activation="relu")(action_input)

    concat = layers.Concatenate()([state_out, action_out])

    out = layers.Dense(512, activation="relu")(concat)
    out = layers.Dense(512, activation="relu")(out)
    # Linear output: the Q-value is unbounded.
    outputs = layers.Dense(1)(out)

    model = tf.keras.Model([state_input, action_input], outputs)
    return model
    

In [8]:
def policy(state, noise_object):
    """Return the actor's action with additive exploration noise, kept within bounds.

    Args:
        state: Batched state tensor of shape (1, num_states); entries 2 and 3
            of the single row are read as the lower and upper action bounds.
        noise_object: Zero-argument callable returning the noise to add.

    Returns:
        The noisy actor output scaled by the upper bound and clipped to
        ``[lower, upper]``.
    """
    lower, upper = state[0][2].numpy(), state[0][3].numpy()
    raw_output = tf.squeeze(actor_model(state)).numpy()
    # Noise is added to the raw network output before scaling.
    noisy_output = raw_output + noise_object()
    return np.clip(noisy_output * upper, lower, upper)
    

In [9]:
def policy_epsilon_greedy(state, eps):
    """Choose an action epsilon-greedily.

    With probability ``eps`` a random point from a 10-point even grid between
    the action bounds is returned; otherwise the actor's output, scaled by the
    upper bound and clipped to the bounds.

    Args:
        state: Batched state tensor of shape (1, num_states); entries 2 and 3
            of the single row are read as the lower and upper action bounds.
        eps: Probability of taking the random (exploration) branch.

    Returns:
        The selected action value.
    """
    lower = state[0][2].numpy()
    upper = state[0][3].numpy()

    explore = random.random() < eps
    if not explore:
        greedy_action = tf.squeeze(actor_model(state)).numpy() * upper
        return np.clip(greedy_action, lower, upper)

    # Exploration: pick uniformly among 10 evenly spaced candidate actions.
    pick = random.randint(0, 9)
    candidates = np.linspace(lower, upper, 10)
    return candidates[pick]

In [10]:
# Scale of the OU exploration noise. Passed through to the noise object so that
# editing this one constant actually takes effect (the literal used to be
# duplicated in the call below).
std_dev = 0.2
ou_noise = OUActionNoise(mean=0, std_deviation=std_dev)

# Online networks.
actor_model = get_actor()
critic_model = get_critic()

# Target networks start as exact copies of the online networks.
target_actor = get_actor()
target_critic = get_critic()
target_actor.set_weights(actor_model.get_weights())
target_critic.set_weights(critic_model.get_weights())

# Optimizers for the critic and actor updates.
critic_lr = 0.002
actor_lr = 0.001
critic_optimizer = tf.keras.optimizers.Adam(critic_lr)
actor_optimizer = tf.keras.optimizers.Adam(actor_lr)

total_episodes = 200
gamma = 0.95  # discount factor used in the critic target
tau = 0.001   # soft-update rate for the target networks

# Epsilon-greedy schedule: eps decays exponentially from MAX_EPSILON towards
# MIN_EPSILON as a function of the total number of environment steps.
MAX_EPSILON = 1
MIN_EPSILON = 0.01
DECAY_RATE = 0.00002
BATCH_SIZE = 32

In [None]:
# Training loop: epsilon-greedy interaction with the environment, with one
# gradient update and one soft target update after every environment step.
print(env.version)

# Replay buffer holding up to 50000 transitions, sampled BATCH_SIZE at a time.
buffer = Buffer(50000, BATCH_SIZE)
eps = MAX_EPSILON 
steps = 0  # total environment steps across all episodes; drives the epsilon decay

ep_reward_list = []   # total reward of each finished episode
avg_reward_list = []  # running mean over the last (up to) 40 episodes

for ep in range(total_episodes): 
    state = env.reset() 
    episodic_reward = 0 
    
    while True: 
        # Add a batch dimension so the actor receives a (1, num_states) tensor.
        tf_state = tf.expand_dims(tf.convert_to_tensor(state), 0)
        action = policy_epsilon_greedy(tf_state, eps)
#         print(action)
        next_state, reward, done = env.step(action)
        if done: 
            # The final transition is stored with an all-zero next state.
            next_state = [0] * num_states 
        
        buffer.record((state, action, reward, next_state))
        episodic_reward += reward 
        
        # One critic/actor update and one soft target update per step.
        buffer.learn() 
        update_target(tau)
        steps += 1 
        
        # Exponential decay of epsilon from MAX_EPSILON towards MIN_EPSILON.
        eps = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * np.exp(-DECAY_RATE * steps)
        
        if done: 
            break 
        
        state = next_state 
    
    ep_reward_list.append(episodic_reward) 
    avg_reward = np.mean(ep_reward_list[-40:])
    avg_reward_list.append(avg_reward)
    
#     print("Episode * {} * Avg Reward is ==> {}".format(ep, avg_reward))
    # Sum of absolute deviations of the logged SOC values from 0.6
    # (presumably the target state of charge -- TODO confirm).
    SOC_deviation_history = np.sum(np.abs(np.array(env.history["SOC"]) - 0.6)) 
    print(
          'Episode: {}'.format(ep + 1),
          'Total reward: {}'.format(episodic_reward), 
          "SOC: {:.4f}".format(env.SOC), 
          "Cumulative_SOC_deviation: {:.4f}".format(SOC_deviation_history), 
          "Fuel Consumption: {:.4f}".format(env.fuel_consumption), 
    )

0
maximum steps, simulation is done ... 
Episode: 1 Total reward: -935.9290733229453 SOC: 0.7970 Cumulative_SOC_deviation: 87.5138 Fuel Consumption: 60.7910
maximum steps, simulation is done ... 
Episode: 2 Total reward: -843.7325456267621 SOC: 0.7584 Cumulative_SOC_deviation: 78.5961 Fuel Consumption: 57.7718
maximum steps, simulation is done ... 
Episode: 3 Total reward: -883.4164374922801 SOC: 0.7645 Cumulative_SOC_deviation: 82.5274 Fuel Consumption: 58.1420
maximum steps, simulation is done ... 
Episode: 4 Total reward: -772.5179636428113 SOC: 0.7399 Cumulative_SOC_deviation: 71.6292 Fuel Consumption: 56.2258
maximum steps, simulation is done ... 
Episode: 5 Total reward: -768.9001875878322 SOC: 0.7191 Cumulative_SOC_deviation: 71.4262 Fuel Consumption: 54.6381
maximum steps, simulation is done ... 
Episode: 6 Total reward: -719.6701662205257 SOC: 0.7053 Cumulative_SOC_deviation: 66.6112 Fuel Consumption: 53.5586
maximum steps, simulation is done ... 
Episode: 7 Total reward: -758

Available condition is not avail... SOC: 1
Episode: 53 Total reward: -3150.3252689877845 SOC: 1.0000 Cumulative_SOC_deviation: 305.1149 Fuel Consumption: 99.1762
Available condition is not avail... SOC: 1
Episode: 54 Total reward: -3140.0588983073317 SOC: 1.0000 Cumulative_SOC_deviation: 304.1104 Fuel Consumption: 98.9547
Available condition is not avail... SOC: 1
Episode: 55 Total reward: -3095.0140961242246 SOC: 1.0000 Cumulative_SOC_deviation: 299.6206 Fuel Consumption: 98.8081
Available condition is not avail... SOC: 1
Episode: 56 Total reward: -3107.425984472269 SOC: 1.0000 Cumulative_SOC_deviation: 300.8737 Fuel Consumption: 98.6894
Available condition is not avail... SOC: 1
Episode: 57 Total reward: -3117.2843939553 SOC: 1.0000 Cumulative_SOC_deviation: 301.7903 Fuel Consumption: 99.3816
Available condition is not avail... SOC: 1
Episode: 58 Total reward: -3149.891740207598 SOC: 1.0000 Cumulative_SOC_deviation: 304.9941 Fuel Consumption: 99.9508
Available condition is not avail.