In [1]:
import tensorflow as tf 
import numpy as np 
from tensorflow import keras 
import os 
import math 
import random 
import pickle 
import matplotlib.pyplot as plt 
from collections import deque 
import scipy.io as sio

from vehicle_model_variant_2 import Environment 
from cell_model import CellModel 

# Set CUDA_VISIBLE_DEVICES to '-1' for this process; the intent appears to be a
# CPU-only run with no GPU exposed to TensorFlow.
# NOTE(review): this is executed after `import tensorflow`; presumably it still
# takes effect because no device has been used yet -- confirm, or move it above
# the TensorFlow import.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [2]:
# Locations of the simulation database files consumed below.
driving_cycle_path = '../../OC_SIM_DB/OC_SIM_DB_Cycles/Highway/01_FTP72_fuds.mat'
battery_path = "../../OC_SIM_DB/OC_SIM_DB_Bat/OC_SIM_DB_Bat_nimh_6_240_panasonic_MY01_Prius.mat"
motor_path = "../../OC_SIM_DB/OC_SIM_DB_Mot/OC_SIM_DB_Mot_pm_95_145_X2.mat"

# Load the .mat file and keep column 1 of its "sch_cycle" array as the cycle trace.
driving_cycle = sio.loadmat(driving_cycle_path)["sch_cycle"][:, 1]

# Module-level environment built with a last argument of 1; the training cell
# reads `env.version` from it and the constants cell reads `env.calculation_comp`.
cell_model = CellModel()
env = Environment(
    cell_model,
    driving_cycle,
    battery_path,
    motor_path,
    1,
)


In [3]:
# --- Problem dimensions ------------------------------------------------------
# Length of a state vector; train() uses it to size the all-zero placeholder
# substituted for a terminal next-state. Hard-coded here instead of being read
# from the environment (the original lookup is kept below for reference).
# STATE_SIZE = env.calculation_comp["state_size"]
STATE_SIZE = 4
# Number of discrete actions: width of the Q-network output layer and number of
# grid points choose_action() places between the state's last two entries.
ACTION_SIZE = env.calculation_comp["action_size"] 
# Step size handed to the Adam optimizer of the primary network.
LEARNING_RATE = 0.00025 

# --- Run length --------------------------------------------------------------
# Episodes executed per trial in the training loop.
TOTAL_EPISODES = 200
# NOTE(review): not referenced by any cell visible in this notebook.
MAX_STEPS = 50000 

# Discount factor applied to the bootstrapped next-state value in train().
GAMMA = 0.95 

# --- Exploration schedule ----------------------------------------------------
# eps = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * exp(-DECAY_RATE * steps),
# recomputed on every step once training has started.
MAX_EPSILON = 1 
MIN_EPSILON = 0.01 
DECAY_RATE = 0.00002
# Number of transitions requested from the replay memory per training step.
BATCH_SIZE = 32 
# Interpolation weight of the soft target-network update in update_network().
TAU = 0.001 
# Training only runs while the global step counter is greater than this value.
DELAY_TRAINING = 3000 
# NOTE(review): not referenced by any cell visible in this notebook.
EPSILON_MIN_ITER = 5000

In [4]:
# Module-level primary/target Q-networks: two hidden ReLU layers of 30 units
# (He-normal initialised) followed by a linear layer with ACTION_SIZE outputs.
# The training cell rebinds both names for every trial via
# initialization_with_rewardFactor(), so these instances only serve as defaults.
primary_network = keras.Sequential([
    keras.layers.Dense(30, activation="relu", kernel_initializer=keras.initializers.he_normal()),
    keras.layers.Dense(30, activation="relu", kernel_initializer=keras.initializers.he_normal()),
    keras.layers.Dense(ACTION_SIZE),
])
target_network = keras.Sequential([
    keras.layers.Dense(30, activation="relu", kernel_initializer=keras.initializers.he_normal()),
    keras.layers.Dense(30, activation="relu", kernel_initializer=keras.initializers.he_normal()),
    keras.layers.Dense(ACTION_SIZE),
])

# Only the primary network is compiled: it is the one fitted with
# train_on_batch(); the target network is updated by update_network() instead.
# `learning_rate` is the supported keyword; the former `lr` alias is deprecated
# and rejected by newer Keras releases.
primary_network.compile(
    loss="mse",
    optimizer=keras.optimizers.Adam(learning_rate=LEARNING_RATE)
)

# Disabled hard copy of the primary weights into the target network:
# for t, p in zip(target_network.trainable_variables, primary_network.trainable_variables): 
#     t.assign(p)

In [5]:
def update_network(primary_network, target_network, tau=None):
    """Soft-update the target network towards the primary network.

    Every trainable variable of ``target_network`` is overwritten with
    ``target * (1 - tau) + primary * tau``.

    Args:
        primary_network: Model whose ``trainable_variables`` are the source.
        target_network: Model whose ``trainable_variables`` are updated in place.
        tau: Interpolation weight. Defaults to the module-level ``TAU`` when
            omitted, which reproduces the previous behaviour.

    Raises:
        ValueError: If the two networks expose a different number of trainable
            variables (e.g. one of them has not created its weights yet).
            Pairing them with ``zip`` would otherwise skip variables silently.
    """
    if tau is None:
        tau = TAU

    target_vars = target_network.trainable_variables
    primary_vars = primary_network.trainable_variables
    if len(target_vars) != len(primary_vars):
        raise ValueError(
            "primary and target networks have a different number of trainable "
            "variables ({} vs {})".format(len(primary_vars), len(target_vars))
        )

    for t, p in zip(target_vars, primary_vars):
        t.assign(t * (1 - tau) + p * tau)

In [6]:
class Memory:
    """Bounded first-in/first-out store of samples with uniform random draws."""

    def __init__(self, max_memory):
        # Maximum number of samples retained; older ones are evicted first.
        self.max_memory = max_memory
        self._samples = []

    def add_sample(self, sample):
        """Append ``sample`` and evict the oldest entry once over capacity."""
        buffer = self._samples
        buffer.append(sample)
        if len(buffer) > self.max_memory:
            buffer.pop(0)

    def sample(self, no_samples):
        """Return up to ``no_samples`` stored items drawn without replacement.

        When fewer items are stored than requested, all of them are returned
        (in random order).
        """
        draw_count = min(no_samples, len(self._samples))
        return random.sample(self._samples, draw_count)

    @property
    def num_samples(self):
        """Number of samples currently stored."""
        return len(self._samples)
    

# memory = Memory(10000)

In [7]:
def choose_action(state, primary_network, eps):
    """Pick an action epsilon-greedily and map it onto the continuous range.

    With probability ``eps`` a uniformly random action index is drawn;
    otherwise the index with the largest output of ``primary_network`` for
    ``state`` is used. The index is then looked up on an ``ACTION_SIZE``-point
    linear grid spanning ``state[-2]`` .. ``state[-1]``.

    Returns:
        (action, action_continue): the discrete index and the grid value.
    """
    lower, upper = state[-2], state[-1]

    explore = random.random() < eps
    if explore:
        action = random.randint(0, ACTION_SIZE - 1)
    else:
        # Single-row batch: the network expects a 2-D input.
        q_values = primary_network(np.array(state).reshape(1, -1))
        action = np.argmax(q_values)

    action_grid = np.linspace(lower, upper, ACTION_SIZE)
    return action, action_grid[action]

In [8]:
def train(primary_network, target_network, memory):
    """Run one Double-DQN training step on a batch drawn from ``memory``.

    Each stored sample is a ``(state, action, reward, next_state)`` tuple in
    which ``next_state`` is ``None`` for a terminal transition. The primary
    network selects the best next action, the target network evaluates it, and
    the primary network is fitted towards ``reward + GAMMA * Q_target`` for the
    action that was actually taken (terminal transitions use the reward only).

    Args:
        primary_network: Compiled model being trained.
        target_network: Model used to evaluate the selected next action.
        memory: Replay memory exposing ``sample(n)``.

    Returns:
        The value returned by ``primary_network.train_on_batch``.
    """
    batch = memory.sample(BATCH_SIZE)
    # The memory may hold fewer than BATCH_SIZE samples; index by what we got.
    batch_len = len(batch)

    states = np.array([val[0] for val in batch])
    actions = np.array([val[1] for val in batch])
    # Float copy of the rewards: it is updated in place below, which would fail
    # on an integer array.
    updates = np.array([val[2] for val in batch], dtype=np.float64)
    # Terminal transitions are identified by the explicit None marker rather
    # than by the substituted zero vector, so a genuine next-state whose
    # entries happen to sum to zero is still bootstrapped.
    valid_idxs = np.array([val[3] is not None for val in batch], dtype=bool)
    next_states = np.array([np.zeros(STATE_SIZE) if val[3] is None else val[3]
                            for val in batch])

    prim_qt = primary_network(states)
    prim_qtp1 = primary_network(next_states)
    target_q = prim_qt.numpy()
    batch_idxs = np.arange(batch_len)

    # Double DQN: action chosen by the primary network, valued by the target.
    prim_action_tp1 = np.argmax(prim_qtp1.numpy(), axis=1)
    q_from_target = target_network(next_states).numpy()
    updates[valid_idxs] += GAMMA * q_from_target[batch_idxs[valid_idxs],
                                                 prim_action_tp1[valid_idxs]]

    # Only the taken action's Q-value is changed; the other outputs keep their
    # current predictions and therefore contribute zero error.
    target_q[batch_idxs, actions] = updates
    loss = primary_network.train_on_batch(states, target_q)
    return loss

In [9]:
def _build_q_network():
    """Return a new, uncompiled Q-network.

    Two hidden ReLU layers of 30 units (He-normal initialised) followed by a
    linear layer with ``ACTION_SIZE`` outputs.
    """
    return keras.Sequential([
        keras.layers.Dense(30, activation="relu", kernel_initializer=keras.initializers.he_normal()),
        keras.layers.Dense(30, activation="relu", kernel_initializer=keras.initializers.he_normal()),
        keras.layers.Dense(ACTION_SIZE),
    ])


def initialization_with_rewardFactor(reward_factor):
    """Create a fresh environment, replay memory and network pair for one run.

    Args:
        reward_factor: Passed through as the last argument of ``Environment``.

    Returns:
        (env, memory, primary_network, target_network) where ``memory`` holds at
        most 10000 samples and only ``primary_network`` is compiled (MSE loss,
        Adam with ``LEARNING_RATE``).
    """
    env = Environment(cell_model, driving_cycle, battery_path, motor_path, reward_factor)

    memory = Memory(10000)

    primary_network = _build_q_network()
    target_network = _build_q_network()

    # `learning_rate` is the supported keyword; the former `lr` alias is
    # deprecated and rejected by newer Keras releases.
    primary_network.compile(
        loss="mse",
        optimizer=keras.optimizers.Adam(learning_rate=LEARNING_RATE)
    )
    return env, memory, primary_network, target_network
    

In [10]:
# Reports the version of the module-level environment created earlier; `env` is
# rebound to a fresh instance at the start of every trial below.
print("environment version: {}".format(env.version))

num_trials = 3
# reward_factors = [10]
results_dict = {}  # trial index -> per-episode rewards, final SOCs and fuel consumptions

for trial in range(num_trials):
    eps = MAX_EPSILON
    steps = 0  # environment steps taken in this trial, across all episodes
    episode_rewards = []
    episode_SOCs = []
    episode_FCs = []

    # Every trial starts from a new environment, empty memory and new networks.
    env, memory, primary_network, target_network = initialization_with_rewardFactor(10)
    for episode in range(TOTAL_EPISODES):
        state = env.reset()
        avg_loss = 0
        total_reward = 0
        cnt = 1  # number of steps taken in the current episode
        trained_steps = 0  # steps of this episode on which train() actually ran

        while True:
            action, action_continue = choose_action(state, primary_network, eps)
            next_state, reward, done = env.step(action_continue)
            total_reward += reward
            if done:
                # train() recognises terminal transitions by this None marker.
                next_state = None
            memory.add_sample((state, action, reward, next_state))

            if steps > DELAY_TRAINING:
                loss = train(primary_network, target_network, memory)
                update_network(primary_network, target_network)
                eps = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * np.exp(-DECAY_RATE * steps)
                # Accumulate real losses only, so the "-1" placeholder used
                # before training starts cannot skew the episode average.
                avg_loss += loss
                trained_steps += 1
            else:
                loss = -1  # placeholder: no training happened on this step

            steps += 1

            if done:
                if steps > DELAY_TRAINING:
                    # Summed absolute distance of the recorded SOC history from 0.6.
                    SOC_deviation_history = np.sum(np.abs(np.array(env.history["SOC"]) - 0.6))
                    # Mean loss over the steps that were trained in this episode.
                    avg_loss /= max(trained_steps, 1)
                    print('Episode: {}'.format(episode + 1),
                          'Total reward: {}'.format(total_reward),
                          'Explore P: {:.4f}'.format(eps),
                          "SOC: {:.4f}".format(env.SOC),
                          "Cumulative_SOC_deviation: {:.4f}".format(SOC_deviation_history),
                          "Fuel Consumption: {:.4f}".format(env.fuel_consumption),
                          )
                else:
                    # 1-based, matching the numbering of the training log above.
                    print(f"Pre-training...Episode: {episode + 1}")

                episode_rewards.append(total_reward)
                episode_SOCs.append(env.SOC)
                episode_FCs.append(env.fuel_consumption)
                break

            state = next_state
            cnt += 1

    results_dict[trial] = {
        "rewards": episode_rewards,
        "SOCs": episode_SOCs,
        "FCs": episode_FCs
    }
            
    

environment version: 1
maximum steps, simulation is done ... 
Pre-training...Episode: 0
maximum steps, simulation is done ... 
Pre-training...Episode: 1


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

maximum steps, simulation is done ... 
Episode: 3 Total reward: -5077.251283073908 Explore P: 0.9217 SOC: 1.0000 Cumulative_SOC_deviation: 488.7329 Fuel Consumption: 189.9225
maximum steps, simulation is done ... 
Episode: 4 Total reward: -4983.

maximum steps, simulation is done ... 
Episode: 41 Total reward: -531.1191736620214 Explore P: 0.3311 SOC: 0.5800 Cumulative_SOC_deviation: 41.0096 Fuel Consumption: 121.0227
maximum steps, simulation is done ... 
Episode: 42 Total reward: -479.92909228514975 Explore P: 0.3224 SOC: 0.5780 Cumulative_SOC_deviation: 35.8413 Fuel Consumption: 121.5159
maximum steps, simulation is done ... 
Episode: 43 Total reward: -494.0037487650544 Explore P: 0.3140 SOC: 0.5820 Cumulative_SOC_deviation: 37.2903 Fuel Consumption: 121.1008
maximum steps, simulation is done ... 
Episode: 44 Total reward: -526.5384129282735 Explore P: 0.3057 SOC: 0.5734 Cumulative_SOC_deviation: 40.4798 Fuel Consumption: 121.7405
maximum steps, simulation is done ... 
Episode: 45 Total reward: -556.3512465383285 Explore P: 0.2977 SOC: 0.5919 Cumulative_SOC_deviation: 43.4811 Fuel Consumption: 121.5403
maximum steps, simulation is done ... 
Episode: 46 Total reward: -544.1156265346635 Explore P: 0.2899 SOC: 0.5824 Cumulative

maximum steps, simulation is done ... 
Episode: 88 Total reward: -156.01282674686706 Explore P: 0.0983 SOC: 0.6017 Cumulative_SOC_deviation: 4.2788 Fuel Consumption: 113.2250
maximum steps, simulation is done ... 
Episode: 89 Total reward: -151.18564009645948 Explore P: 0.0960 SOC: 0.5998 Cumulative_SOC_deviation: 3.8962 Fuel Consumption: 112.2241
maximum steps, simulation is done ... 
Episode: 90 Total reward: -159.01145166110072 Explore P: 0.0936 SOC: 0.5994 Cumulative_SOC_deviation: 4.6807 Fuel Consumption: 112.2045
maximum steps, simulation is done ... 
Episode: 91 Total reward: -141.2359523992491 Explore P: 0.0914 SOC: 0.5992 Cumulative_SOC_deviation: 2.9661 Fuel Consumption: 111.5747
maximum steps, simulation is done ... 
Episode: 92 Total reward: -158.20255155825518 Explore P: 0.0892 SOC: 0.5984 Cumulative_SOC_deviation: 4.5671 Fuel Consumption: 112.5317
maximum steps, simulation is done ... 
Episode: 93 Total reward: -165.20734351421729 Explore P: 0.0870 SOC: 0.5986 Cumulative_

maximum steps, simulation is done ... 
Episode: 135 Total reward: -155.1114878716294 Explore P: 0.0343 SOC: 0.5967 Cumulative_SOC_deviation: 4.3088 Fuel Consumption: 112.0239
maximum steps, simulation is done ... 
Episode: 136 Total reward: -148.81506076300272 Explore P: 0.0336 SOC: 0.5964 Cumulative_SOC_deviation: 3.7211 Fuel Consumption: 111.6040
maximum steps, simulation is done ... 
Episode: 137 Total reward: -156.89537643598374 Explore P: 0.0330 SOC: 0.5969 Cumulative_SOC_deviation: 4.5125 Fuel Consumption: 111.7700
maximum steps, simulation is done ... 
Episode: 138 Total reward: -158.51126001681473 Explore P: 0.0324 SOC: 0.5954 Cumulative_SOC_deviation: 4.6473 Fuel Consumption: 112.0385
maximum steps, simulation is done ... 
Episode: 139 Total reward: -163.4162816662561 Explore P: 0.0318 SOC: 0.5999 Cumulative_SOC_deviation: 5.2027 Fuel Consumption: 111.3891
maximum steps, simulation is done ... 
Episode: 140 Total reward: -155.0073047414013 Explore P: 0.0312 SOC: 0.5991 Cumulat

maximum steps, simulation is done ... 
Episode: 182 Total reward: -176.91194701762794 Explore P: 0.0167 SOC: 0.5973 Cumulative_SOC_deviation: 6.5669 Fuel Consumption: 111.2427
maximum steps, simulation is done ... 
Episode: 183 Total reward: -146.6414425807239 Explore P: 0.0165 SOC: 0.5977 Cumulative_SOC_deviation: 3.6615 Fuel Consumption: 110.0269
maximum steps, simulation is done ... 
Episode: 184 Total reward: -159.36610354400534 Explore P: 0.0163 SOC: 0.5984 Cumulative_SOC_deviation: 5.0106 Fuel Consumption: 109.2603
maximum steps, simulation is done ... 
Episode: 185 Total reward: -154.0391533181259 Explore P: 0.0162 SOC: 0.6034 Cumulative_SOC_deviation: 4.2543 Fuel Consumption: 111.4966
maximum steps, simulation is done ... 
Episode: 186 Total reward: -142.16174229050915 Explore P: 0.0160 SOC: 0.6028 Cumulative_SOC_deviation: 3.0724 Fuel Consumption: 111.4380
maximum steps, simulation is done ... 
Episode: 187 Total reward: -155.17212605721585 Explore P: 0.0158 SOC: 0.6007 Cumula

In [11]:
# Persist the per-trial results; serialised with the highest pickle protocol.
with open("DDQN3_2.pkl", "wb") as f:
    f.write(pickle.dumps(results_dict, pickle.HIGHEST_PROTOCOL))

In [12]:
# with open("results/replay_memory_size_effect.pkl", "rb") as f: 
#     data = pickle.load(f)
    
# data