In [1]:
import tensorflow as tf 
import numpy as np 
from tensorflow import keras 
import os 
import math 
import random 
import pickle 
import matplotlib.pyplot as plt 
from collections import deque 
import scipy.io as sio

from vehicle_model_variant_1 import Environment 
from cell_model import CellModel 

# NOTE(review): '-1' presumably hides every CUDA device so TensorFlow runs on
# CPU only; this relies on TensorFlow not having initialised its devices yet
# even though it is imported above — confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [2]:
# Locations of the simulation database files used by this notebook.
driving_cycle_path = '../../OC_SIM_DB/OC_SIM_DB_Cycles/Highway/01_FTP72_fuds.mat'
battery_path = "../../OC_SIM_DB/OC_SIM_DB_Bat/OC_SIM_DB_Bat_nimh_6_240_panasonic_MY01_Prius.mat"
motor_path = "../../OC_SIM_DB/OC_SIM_DB_Mot/OC_SIM_DB_Mot_pm_95_145_X2.mat"

# Only column 1 of the "sch_cycle" array in the .mat file is kept.
driving_cycle = sio.loadmat(driving_cycle_path)["sch_cycle"][:, 1]

cell_model = CellModel()
# The trailing 1 sits in the argument slot that
# initialization_with_rewardFactor fills with its reward_factor.
env = Environment(
    cell_model,
    driving_cycle,
    battery_path,
    motor_path,
    1,
)


In [3]:
# --- Problem dimensions ------------------------------------------------------
# STATE_SIZE = env.calculation_comp["state_size"]
STATE_SIZE = 4  # hard-coded here instead of being read from the environment
ACTION_SIZE = env.calculation_comp["action_size"]  # number of discrete actions / Q-network outputs

# --- Optimisation ------------------------------------------------------------
LEARNING_RATE = 2.5e-4  # passed to the Adam optimizer
BATCH_SIZE = 32         # transitions requested from the replay memory per training step
GAMMA = 0.95            # multiplies the bootstrapped next-state Q-value in train()
TAU = 1e-3              # weight of the primary network in the target-network blend

# --- Run length --------------------------------------------------------------
TOTAL_EPISODES = 200
MAX_STEPS = 50_000      # NOTE(review): not referenced by any cell visible in this notebook

# --- Exploration schedule ----------------------------------------------------
# eps = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * exp(-DECAY_RATE * steps)
MAX_EPSILON = 1
MIN_EPSILON = 0.01
DECAY_RATE = 2e-5
DELAY_TRAINING = 3_000    # training and epsilon decay only start once steps exceeds this
EPSILON_MIN_ITER = 5_000  # NOTE(review): not referenced by any cell visible in this notebook

In [4]:
# Module-level primary and target Q-networks with identical architecture:
# two hidden ReLU layers of 30 units (He-normal initialised) followed by a
# linear layer with one output per discrete action.
primary_network = keras.Sequential([
    keras.layers.Dense(30, activation="relu", kernel_initializer=keras.initializers.he_normal()),
    keras.layers.Dense(30, activation="relu", kernel_initializer=keras.initializers.he_normal()),
    keras.layers.Dense(ACTION_SIZE),
])
target_network = keras.Sequential([
    keras.layers.Dense(30, activation="relu", kernel_initializer=keras.initializers.he_normal()),
    keras.layers.Dense(30, activation="relu", kernel_initializer=keras.initializers.he_normal()),
    keras.layers.Dense(ACTION_SIZE),
])

# Only the primary network is compiled; the target network is never compiled
# or fitted in this notebook, only blended towards the primary one.
# `learning_rate` is the supported keyword; `lr` is a deprecated alias that
# newer Keras releases no longer accept.
primary_network.compile(
    loss="mse",
    optimizer=keras.optimizers.Adam(learning_rate=LEARNING_RATE),
)

# for t, p in zip(target_network.trainable_variables, primary_network.trainable_variables): 
#     t.assign(p)

In [5]:
def update_network(primary_network, target_network, tau=None):
    """Blend the primary network's weights into the target network (soft update).

    Each trainable variable ``t`` of ``target_network`` is overwritten in place
    with ``t * (1 - tau) + p * tau``, where ``p`` is the variable at the same
    position in ``primary_network.trainable_variables``.

    Args:
        primary_network: Network whose trainable variables are the source.
        target_network: Network whose trainable variables are updated in place.
        tau: Interpolation weight given to the primary network. When ``None``
            (the default) the module-level ``TAU`` constant is used, which
            matches the previous behaviour.

    Returns:
        None.
    """
    if tau is None:
        tau = TAU
    # zip() stops at the shorter list, so variables without a counterpart in
    # the other network are left untouched.
    for t, p in zip(target_network.trainable_variables, primary_network.trainable_variables):
        t.assign(t * (1 - tau) + p * tau)

In [6]:
class Memory:
    """Bounded FIFO replay buffer.

    Samples are stored in insertion order; once more than ``max_memory``
    samples are held, the oldest one is discarded.
    """

    def __init__(self, max_memory):
        """Create an empty buffer that keeps at most ``max_memory`` samples."""
        self.max_memory = max_memory
        # A deque gives O(1) removal of the oldest entry; list.pop(0) shifts
        # every remaining element on each insert once the buffer is full.
        self._samples = deque()

    def add_sample(self, sample):
        """Append ``sample`` and evict the oldest entry if the buffer overflows."""
        self._samples.append(sample)
        # Compared against the attribute on every insert (rather than using
        # deque(maxlen=...)) so a later change to ``max_memory`` is honoured.
        if len(self._samples) > self.max_memory:
            self._samples.popleft()

    def sample(self, no_samples):
        """Return a list of randomly chosen stored samples, without replacement.

        At most ``no_samples`` entries are returned; if fewer are stored, all
        stored samples are returned (in random order).
        """
        return random.sample(self._samples, min(no_samples, len(self._samples)))

    @property
    def num_samples(self):
        """Number of samples currently stored."""
        return len(self._samples)
    

# memory = Memory(10000)

In [7]:
def choose_action(state, primary_network, eps):
    """Pick an action epsilon-greedily and map it onto the allowed range.

    With probability ``eps`` a uniformly random action index is drawn;
    otherwise the index with the largest output of ``primary_network`` for
    ``state`` is used.

    Args:
        state: Sequence whose last two entries are used as the lower and
            upper bound of the continuous action range.
        primary_network: Callable returning one value per discrete action for
            a ``(1, len(state))`` input.
        eps: Exploration probability.

    Returns:
        ``(action, action_continue)``: the discrete action index and the
        matching point of ``ACTION_SIZE`` evenly spaced values between the
        two bounds.
    """
    lower_bound, upper_bound = state[-2], state[-1]

    explore = random.random() < eps
    if explore:
        action = random.randint(0, ACTION_SIZE - 1)
    else:
        q_values = primary_network(np.array(state).reshape(1, -1))
        action = np.argmax(q_values)

    # Same discretisation grid for both branches.
    grid = np.linspace(lower_bound, upper_bound, ACTION_SIZE)
    return action, grid[action]

In [8]:
def train(primary_network, target_network, memory):
    """Run one Double-DQN training step on a minibatch drawn from ``memory``.

    For every sampled transition ``(state, action, reward, next_state)`` the
    regression target for the taken action is

        reward                                            if next_state is None
        reward + GAMMA * Q_target(next_state, a*)         otherwise,

    where ``a*`` is the argmax of the *primary* network's outputs for
    ``next_state``. Targets for all other actions are the primary network's
    current outputs, so they contribute no error.

    Args:
        primary_network: Compiled Keras model being trained.
        target_network: Model used to evaluate the selected next action.
        memory: Object with a ``sample(n)`` method returning a list of
            ``(state, action, reward, next_state)`` tuples; ``next_state`` is
            ``None`` for terminal transitions.

    Returns:
        The value returned by ``primary_network.train_on_batch``.
    """
    batch = memory.sample(BATCH_SIZE)
    # The memory may return fewer than BATCH_SIZE transitions, so every index
    # array below is sized from the batch actually received.
    batch_len = len(batch)

    # float32 matches the Keras default dtype and avoids the float64 autocast
    # warning emitted on the first call.
    states = np.array([val[0] for val in batch], dtype=np.float32)
    actions = np.array([val[1] for val in batch])
    # Float copy of the rewards so the in-place add below also works when the
    # environment hands back integer rewards.
    updates = np.array([val[2] for val in batch], dtype=np.float64)

    # Terminal transitions are identified by the explicit None marker rather
    # than by "next state sums to zero", which would also match a genuine
    # state whose features happen to cancel out.
    terminal = np.array([val[3] is None for val in batch], dtype=bool)
    valid_idxs = ~terminal
    # Terminal rows get an all-zero placeholder shaped like a real state; their
    # network outputs are computed but never used.
    next_states = np.array(
        [np.zeros_like(states[0]) if val[3] is None else val[3] for val in batch],
        dtype=np.float32,
    )

    prim_qt = primary_network(states)
    prim_qtp1 = primary_network(next_states)
    target_q = prim_qt.numpy()

    batch_idxs = np.arange(batch_len)
    # Action selection by the primary network, evaluation by the target network.
    prim_action_tp1 = np.argmax(prim_qtp1.numpy(), axis=1)
    q_from_target = target_network(next_states).numpy()
    updates[valid_idxs] += GAMMA * q_from_target[batch_idxs[valid_idxs],
                                                 prim_action_tp1[valid_idxs]]

    target_q[batch_idxs, actions] = updates
    loss = primary_network.train_on_batch(states, target_q)
    return loss
    
    
    

In [9]:
def initialization_with_rewardFactor(reward_factor):
    """Create a fresh environment, replay memory and Q-network pair.

    Args:
        reward_factor: Forwarded as the last positional argument of
            ``Environment``.

    Returns:
        ``(env, memory, primary_network, target_network)`` where ``memory`` is
        an empty ``Memory`` holding at most 10000 samples, both networks share
        the same architecture, and only ``primary_network`` is compiled (MSE
        loss, Adam with ``LEARNING_RATE``).
    """
    def _build_q_network():
        """Two 30-unit ReLU layers (He-normal init) and a linear ACTION_SIZE output."""
        return keras.Sequential([
            keras.layers.Dense(30, activation="relu", kernel_initializer=keras.initializers.he_normal()),
            keras.layers.Dense(30, activation="relu", kernel_initializer=keras.initializers.he_normal()),
            keras.layers.Dense(ACTION_SIZE),
        ])

    env = Environment(cell_model, driving_cycle, battery_path, motor_path, reward_factor)

    memory = Memory(10000)

    # NOTE(review): the two networks are initialised independently; the target
    # network is not given a copy of the primary weights — confirm intended.
    primary_network = _build_q_network()
    target_network = _build_q_network()

    # `learning_rate` is the supported keyword; `lr` is a deprecated alias that
    # newer Keras releases no longer accept.
    primary_network.compile(
        loss="mse",
        optimizer=keras.optimizers.Adam(learning_rate=LEARNING_RATE),
    )
    return env, memory, primary_network, target_network
    

In [10]:
print("environment version: {}".format(env.version))

num_trials = 3
results_dict = {}

# Each trial trains a freshly initialised agent for TOTAL_EPISODES episodes and
# records per-episode total reward, final SOC and fuel consumption.
for trial in range(num_trials):
    eps = MAX_EPSILON
    steps = 0  # environment steps taken in this trial, across all episodes
    episode_rewards = []
    episode_SOCs = []
    episode_FCs = []

    env, memory, primary_network, target_network = initialization_with_rewardFactor(10)
    for episode in range(TOTAL_EPISODES):
        state = env.reset()
        loss_sum = 0.0
        trained_steps = 0
        total_reward = 0

        while True:
            action, action_continue = choose_action(state, primary_network, eps)
            next_state, reward, done = env.step(action_continue)
            total_reward += reward
            if done:
                # Terminal transitions are stored with next_state=None.
                next_state = None
            memory.add_sample((state, action, reward, next_state))

            # Until DELAY_TRAINING steps have elapsed the agent only fills the
            # replay memory; eps stays at MAX_EPSILON during that phase.
            if steps > DELAY_TRAINING:
                loss = train(primary_network, target_network, memory)
                update_network(primary_network, target_network)
                eps = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * np.exp(-DECAY_RATE * steps)
                # Only real training losses are accumulated, so the episode
                # average is not skewed by placeholder values.
                loss_sum += loss
                trained_steps += 1

            steps += 1

            if done:
                break

            state = next_state

        if steps > DELAY_TRAINING:
            # Sum of |SOC - 0.6| over the SOC history recorded by the environment.
            SOC_deviation_history = np.sum(np.abs(np.array(env.history["SOC"]) - 0.6))
            # Mean training loss of this episode; kept as a notebook variable
            # for inspection, not printed.
            avg_loss = loss_sum / trained_steps if trained_steps else float("nan")
            print('Episode: {}'.format(episode + 1),
                  'Total reward: {}'.format(total_reward),
                  'Explore P: {:.4f}'.format(eps),
                  "SOC: {:.4f}".format(env.SOC),
                  "Cumulative_SOC_deviation: {:.4f}".format(SOC_deviation_history),
                  "Fuel Consumption: {:.4f}".format(env.fuel_consumption),
                  )
        else:
            # 1-based, consistent with the 'Episode: N' line printed above.
            print(f"Pre-training...Episode: {episode + 1}")

        episode_rewards.append(total_reward)
        episode_SOCs.append(env.SOC)
        episode_FCs.append(env.fuel_consumption)

    results_dict[trial] = {
        "rewards": episode_rewards,
        "SOCs": episode_SOCs,
        "FCs": episode_FCs
    }
            
    

environment version: 1
maximum steps, simulation is done ... 
Pre-training...Episode: 0
maximum steps, simulation is done ... 
Pre-training...Episode: 1


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

maximum steps, simulation is done ... 
Episode: 3 Total reward: -5047.567980420613 Explore P: 0.9217 SOC: 1.0000 Cumulative_SOC_deviation: 485.8036 Fuel Consumption: 189.5319
maximum steps, simulation is done ... 
Episode: 4 Total reward: -4897.

maximum steps, simulation is done ... 
Episode: 41 Total reward: -5192.017379876116 Explore P: 0.3311 SOC: 1.0000 Cumulative_SOC_deviation: 499.1914 Fuel Consumption: 200.1029
maximum steps, simulation is done ... 
Episode: 42 Total reward: -5092.2055850297165 Explore P: 0.3224 SOC: 1.0000 Cumulative_SOC_deviation: 490.1311 Fuel Consumption: 190.8942
maximum steps, simulation is done ... 
Episode: 43 Total reward: -5172.928161216047 Explore P: 0.3140 SOC: 0.9994 Cumulative_SOC_deviation: 498.6565 Fuel Consumption: 186.3630
maximum steps, simulation is done ... 
Episode: 44 Total reward: -4732.746053142113 Explore P: 0.3057 SOC: 1.0000 Cumulative_SOC_deviation: 455.3546 Fuel Consumption: 179.2001
maximum steps, simulation is done ... 
Episode: 45 Total reward: -4995.249084327413 Explore P: 0.2977 SOC: 0.9998 Cumulative_SOC_deviation: 480.8974 Fuel Consumption: 186.2751
maximum steps, simulation is done ... 
Episode: 46 Total reward: -4980.157260211343 Explore P: 0.2899 SOC: 0.9999 Cumul

maximum steps, simulation is done ... 
Episode: 88 Total reward: -5024.694940113041 Explore P: 0.0983 SOC: 0.9997 Cumulative_SOC_deviation: 483.8742 Fuel Consumption: 185.9529
maximum steps, simulation is done ... 
Episode: 89 Total reward: -5049.500022109033 Explore P: 0.0960 SOC: 1.0000 Cumulative_SOC_deviation: 487.1340 Fuel Consumption: 178.1601
maximum steps, simulation is done ... 
Episode: 90 Total reward: -5062.010979837224 Explore P: 0.0936 SOC: 1.0000 Cumulative_SOC_deviation: 487.3006 Fuel Consumption: 189.0045
maximum steps, simulation is done ... 
Episode: 91 Total reward: -5228.1864276819315 Explore P: 0.0914 SOC: 0.9962 Cumulative_SOC_deviation: 503.7317 Fuel Consumption: 190.8697
maximum steps, simulation is done ... 
Episode: 92 Total reward: -4991.241864465079 Explore P: 0.0892 SOC: 1.0000 Cumulative_SOC_deviation: 480.4996 Fuel Consumption: 186.2458
maximum steps, simulation is done ... 
Episode: 93 Total reward: -5118.238533985321 Explore P: 0.0870 SOC: 1.0000 Cumul

maximum steps, simulation is done ... 
Episode: 135 Total reward: -4657.894024475742 Explore P: 0.0343 SOC: 0.9963 Cumulative_SOC_deviation: 448.9914 Fuel Consumption: 167.9796
maximum steps, simulation is done ... 
Episode: 136 Total reward: -4494.926209820926 Explore P: 0.0336 SOC: 0.9963 Cumulative_SOC_deviation: 432.0267 Fuel Consumption: 174.6592
maximum steps, simulation is done ... 
Episode: 137 Total reward: -4275.036845912624 Explore P: 0.0330 SOC: 0.9963 Cumulative_SOC_deviation: 409.1677 Fuel Consumption: 183.3601
maximum steps, simulation is done ... 
Episode: 138 Total reward: -4783.329941097326 Explore P: 0.0324 SOC: 0.9891 Cumulative_SOC_deviation: 460.6366 Fuel Consumption: 176.9638
maximum steps, simulation is done ... 
Episode: 139 Total reward: -4656.564650772797 Explore P: 0.0318 SOC: 0.9947 Cumulative_SOC_deviation: 448.8380 Fuel Consumption: 168.1847
maximum steps, simulation is done ... 
Episode: 140 Total reward: -4504.718424575953 Explore P: 0.0312 SOC: 0.9946 

maximum steps, simulation is done ... 
Episode: 182 Total reward: -4996.052946718925 Explore P: 0.0167 SOC: 1.0000 Cumulative_SOC_deviation: 482.1682 Fuel Consumption: 174.3711
maximum steps, simulation is done ... 
Episode: 183 Total reward: -4718.432688056388 Explore P: 0.0165 SOC: 0.9979 Cumulative_SOC_deviation: 454.1654 Fuel Consumption: 176.7783
maximum steps, simulation is done ... 
Episode: 184 Total reward: -4140.844479273612 Explore P: 0.0163 SOC: 0.9979 Cumulative_SOC_deviation: 398.8709 Fuel Consumption: 152.1353
maximum steps, simulation is done ... 
Episode: 185 Total reward: -3533.0949656678704 Explore P: 0.0162 SOC: 0.8997 Cumulative_SOC_deviation: 338.6770 Fuel Consumption: 146.3249
maximum steps, simulation is done ... 
Episode: 186 Total reward: -4098.3210702017095 Explore P: 0.0160 SOC: 0.8954 Cumulative_SOC_deviation: 395.1557 Fuel Consumption: 146.7643
maximum steps, simulation is done ... 
Episode: 187 Total reward: -3610.2353918011536 Explore P: 0.0158 SOC: 0.91

In [11]:
# Persist the per-trial results collected in results_dict.
with open("DDQN3_1.pkl", "wb") as results_file:
    pickle.dump(
        results_dict,
        results_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

In [12]:
# with open("results/replay_memory_size_effect.pkl", "rb") as f: 
#     data = pickle.load(f)
    
# data