In [1]:
import tensorflow as tf 
import numpy as np 
from tensorflow import keras 
import os 
import math 
import random 
import pickle 
import matplotlib.pyplot as plt 
from collections import deque 
import glob 

from vehicle_model_DDQN1 import Environment 
from cell_model import CellModel 

# Restrict which CUDA devices this process may see. The value '-1' presumably
# hides every GPU so TensorFlow runs on the CPU -- TODO confirm on the target
# machine. NOTE(review): this assignment executes after `import tensorflow`
# above; verify that it still takes effect in the TensorFlow version in use.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [2]:
# Cell model shared by every Environment built in this notebook
# (it is also read as a global by `initialization_env`).
cell_model = CellModel()

# Input files from the simulation database: one driving cycle, the battery
# description and the motor description.
# NOTE(review): `drving_cycle` is a misspelling of "driving_cycle"; the name is
# kept unchanged in case other cells refer to it.
drving_cycle = "../../OC_SIM_DB/OC_SIM_DB_Cycles/Highway/01_FTP72_fuds.mat"
battery_path = "../../OC_SIM_DB/OC_SIM_DB_Bat/OC_SIM_DB_Bat_nimh_6_240_panasonic_MY01_Prius.mat"
motor_path = "../../OC_SIM_DB/OC_SIM_DB_Mot/OC_SIM_DB_Mot_pm_95_145_X2.mat"

# Reference environment; the last positional argument is the reward factor
# (compare the call inside `initialization_env`).
env = Environment(cell_model, drving_cycle, battery_path, motor_path, 1)


In [3]:
# ---- Problem dimensions ---------------------------------------------------
# STATE_SIZE = env.calculation_comp["state_size"]
STATE_SIZE = 4                                      # width of one state vector
ACTION_SIZE = env.calculation_comp["action_size"]   # number of discrete actions

# ---- Optimisation ---------------------------------------------------------
LEARNING_RATE = 2.5e-4   # step size given to the Adam optimizer
BATCH_SIZE = 32          # transitions drawn from the replay memory per update
GAMMA = 0.95             # discount applied to the bootstrapped next-state value
TAU = 1e-3               # blend factor of the soft target-network update

# ---- Training schedule ----------------------------------------------------
TOTAL_EPISODES = 500     # episodes per trial
MAX_STEPS = 50000        # not referenced in the visible cells
DELAY_TRAINING = 10000   # steps collected before gradient updates start

# ---- Exploration (epsilon-greedy) -----------------------------------------
MAX_EPSILON = 1          # exploration probability at the start of a trial
MIN_EPSILON = 0.01       # floor of the exponential decay
DECAY_RATE = 2e-5        # decay constant per step after DELAY_TRAINING
EPSILON_MIN_ITER = 5000  # not referenced in the visible cells

In [4]:
# Module-level primary (online) and target Q-networks: two 30-unit ReLU layers
# followed by a linear layer with one output per action. The training loop
# later rebinds both names to the networks returned by `initialization()`.
primary_network = keras.Sequential([
    keras.layers.Dense(30, activation="relu", kernel_initializer=keras.initializers.he_normal()),
    keras.layers.Dense(30, activation="relu", kernel_initializer=keras.initializers.he_normal()),
    keras.layers.Dense(ACTION_SIZE),
])
target_network = keras.Sequential([
    keras.layers.Dense(30, activation="relu", kernel_initializer=keras.initializers.he_normal()),
    keras.layers.Dense(30, activation="relu", kernel_initializer=keras.initializers.he_normal()),
    keras.layers.Dense(ACTION_SIZE),
])

# Only the primary network is trained by gradient descent, so only it is
# compiled. `learning_rate` replaces the deprecated `lr` alias, which newer
# Keras optimizers no longer accept.
primary_network.compile(
    loss="mse",
    optimizer=keras.optimizers.Adam(learning_rate=LEARNING_RATE),
)

# for t, p in zip(target_network.trainable_variables, primary_network.trainable_variables): 
#     t.assign(p)

In [5]:
def update_network(primary_network, target_network): 
    """Soft-update the target network towards the primary network.

    Each trainable variable of ``target_network`` is overwritten in place with
    ``(1 - TAU) * target + TAU * primary``, pairing variables by position.
    """
    variable_pairs = zip(target_network.trainable_variables,
                         primary_network.trainable_variables)
    for target_var, primary_var in variable_pairs:
        blended = target_var * (1 - TAU) + primary_var * TAU
        target_var.assign(blended)

In [6]:
class Memory: 
    """Bounded first-in-first-out store of replay samples."""

    def __init__(self, max_memory): 
        # Capacity limit and the backing list, oldest sample first.
        self.max_memory = max_memory 
        self._samples = [] 

    def add_sample(self, sample): 
        """Append ``sample`` and evict the oldest entry if over capacity."""
        stored = self._samples
        stored.append(sample)
        if len(stored) > self.max_memory:
            del stored[0]

    def sample(self, no_samples): 
        """Return up to ``no_samples`` distinct stored samples, chosen at random.

        When fewer samples are stored than requested, all of them are
        returned (in random order).
        """
        how_many = min(no_samples, len(self._samples))
        return random.sample(self._samples, how_many)

    @property
    def num_samples(self):
        """Number of samples currently held."""
        return len(self._samples)
    

# memory = Memory(10000)

In [7]:
def choose_action(state, primary_network, eps): 
    """Pick an action epsilon-greedily.

    With probability ``eps`` a uniformly random action index in
    ``[0, ACTION_SIZE - 1]`` is returned; otherwise the index of the largest
    output of ``primary_network`` evaluated on ``state`` (reshaped to a single
    row) is returned.
    """
    explore = random.random() < eps
    if explore:
        return random.randint(0, ACTION_SIZE - 1)

    network_input = np.array(state).reshape(1, -1)
    q_values = primary_network(network_input)
    return np.argmax(q_values)

In [8]:
def train(primary_network, target_network, memory): 
    """Run one Double-DQN update of ``primary_network`` on a replay batch.

    A batch of up to ``BATCH_SIZE`` transitions ``(state, action, reward,
    next_state)`` is drawn from ``memory``; ``next_state`` is ``None`` for a
    terminal transition. For non-terminal transitions the regression target of
    the taken action is ``reward + GAMMA * Q_target(next_state, a*)``, where
    ``a*`` is the arg-max action of the primary network at ``next_state``.
    Terminal transitions use the reward alone. The targets of all other
    actions are the primary network's own predictions, so they contribute no
    error.

    Returns:
        The value returned by ``primary_network.train_on_batch``.
    """
    batch = memory.sample(BATCH_SIZE)
    # The memory may hold fewer than BATCH_SIZE samples, so size everything
    # from the batch actually returned.
    batch_size = len(batch)

    states = np.array([val[0] for val in batch]) 
    actions = np.array([val[1] for val in batch])
    # Explicit float dtype: bootstrapped values are added in place below, which
    # would fail on an integer reward array.
    updates = np.array([val[2] for val in batch], dtype=np.float64)
    # Terminal transitions are identified by their None marker, not by the
    # content of the placeholder state, so a genuine next state whose
    # components sum to zero is still bootstrapped.
    is_terminal = np.array([val[3] is None for val in batch], dtype=bool)
    next_states = np.array([np.zeros(STATE_SIZE) if val[3] is None else val[3]  
                            for val in batch])
    valid_idxs = ~is_terminal

    prim_qt = primary_network(states)
    prim_qtp1 = primary_network(next_states)
    target_q = prim_qt.numpy() 
    batch_idxs = np.arange(batch_size)

    # Double DQN: the primary network selects the next action, the target
    # network evaluates it.
    prim_action_tp1 = np.argmax(prim_qtp1.numpy(), axis=1)
    q_from_target = target_network(next_states).numpy()
    updates[valid_idxs] += GAMMA * q_from_target[batch_idxs[valid_idxs], 
                                                 prim_action_tp1[valid_idxs]]

    # Replace only the Q-value of the action that was actually taken.
    target_q[batch_idxs, actions] = updates 
    loss = primary_network.train_on_batch(states, target_q)
    return loss 

In [9]:
def _build_q_network():
    """Return an uncompiled Q-network.

    Two 30-unit ReLU layers with He-normal initialisation take a state of
    width ``STATE_SIZE``; a linear layer emits one value per action.
    """
    return keras.Sequential([
        keras.layers.Dense(30, activation="relu", input_shape=[STATE_SIZE], 
                           kernel_initializer=keras.initializers.he_normal()),
        keras.layers.Dense(30, activation="relu", kernel_initializer=keras.initializers.he_normal()),
        keras.layers.Dense(ACTION_SIZE),
    ])


def initialization(memory_size=10000):
    """Create a fresh replay memory plus primary and target Q-networks.

    Args:
        memory_size: capacity of the replay memory (defaults to 10000).

    Returns:
        ``(memory, primary_network, target_network)``. Only the primary network
        is compiled (MSE loss, Adam with ``LEARNING_RATE``).
    """
    memory = Memory(memory_size)

    # Both networks share one architecture but are initialised independently.
    # NOTE(review): the target network is not synchronised with the primary
    # network's weights here; confirm that this is intended.
    primary_network = _build_q_network()
    target_network = _build_q_network()

    # `learning_rate` replaces the deprecated `lr` alias, which newer Keras
    # optimizers no longer accept.
    primary_network.compile(
        loss="mse", 
        optimizer=keras.optimizers.Adam(learning_rate=LEARNING_RATE) 
    )
    return memory, primary_network, target_network 
    

In [10]:
def initialization_env(driving_path, reward_factor):
    """Build a new Environment for one driving cycle.

    The cell model, battery file and motor file come from the module-level
    ``cell_model``, ``battery_path`` and ``motor_path``; only the driving cycle
    and the reward factor vary per call.
    """
    return Environment(cell_model, driving_path, battery_path, motor_path, reward_factor)

In [11]:
def save_weights(primary_net, target_net, root): 
    """Write both networks' weights to checkpoints under ``./<root>/``.

    The primary network is saved first, then the target network, after which a
    confirmation line is printed.
    """
    checkpoints = (
        (primary_net, "./{}/primary_net_checkpoint"),
        (target_net, "./{}/target_net_checkpoint"),
    )
    for network, path_template in checkpoints:
        network.save_weights(path_template.format(root))
    print("model is saved..")

In [12]:
print("environment version: {}".format(env.version)) 

# One entry per independent training trial; the value is the reward factor
# handed to the Environment during that trial.
reward_factors = [10] * 3
results_dict = {} 
driving_cycle_paths = glob.glob("training/*.mat")
# Only the first 15 cycles returned by glob are used.
driving_cycle_paths = driving_cycle_paths[:15]
if not driving_cycle_paths:
    # Fail early with a clear message instead of inside np.random.choice.
    raise FileNotFoundError("no driving cycles found matching training/*.mat")

for trial, reward_factor in enumerate(reward_factors): 
    eps = MAX_EPSILON 
    steps = 0  # environment steps taken in this trial, across all episodes
    episode_rewards = [] 
    episode_SOCs = [] 
    episode_FCs = [] 
    
    # Fresh replay memory and networks for every trial.
    memory, primary_network, target_network = initialization()
    for episode in range(TOTAL_EPISODES): 
        # Each episode runs on a randomly chosen training cycle.
        driving_cycle_path = np.random.choice(driving_cycle_paths)
        print(driving_cycle_path)
        # Use the trial's own reward factor rather than a hard-coded literal.
        env = initialization_env(driving_cycle_path, reward_factor)
        state = env.reset() 

        loss_sum = 0.0      # sum of losses over the updates of this episode
        trained_steps = 0   # number of gradient updates in this episode
        total_reward = 0

        while True:
            action = choose_action(state, primary_network, eps)
            next_state, reward, done = env.step(action)
            total_reward += reward 
            if done: 
                # None marks a terminal transition for `train`.
                next_state = None 
            memory.add_sample((state, action, reward, next_state))

            # Collect experience with pure exploration for the first
            # DELAY_TRAINING steps, then train and decay epsilon every step.
            if steps > DELAY_TRAINING: 
                loss = train(primary_network, target_network, memory)
                update_network(primary_network, target_network)
                eps = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * np.exp(-DECAY_RATE * (steps - 
                                                                        DELAY_TRAINING))
                loss_sum += loss
                trained_steps += 1

            steps += 1 

            if done: 
                # Accumulated absolute deviation of the SOC trace from 0.6.
                SOC_deviation_history = np.sum(np.abs(np.array(env.history["SOC"]) - 0.6)) 
                # Average over real updates only; NaN when none happened yet.
                avg_loss = loss_sum / trained_steps if trained_steps else float("nan")
                print('Episode: {}'.format(episode),
                      'Total reward: {}'.format(total_reward), 
                      'Explore P: {:.4f}'.format(eps), 
                      "SOC: {:.4f}".format(env.SOC), 
                     "Cumulative_SOC_deviation: {:.4f}".format(SOC_deviation_history), 
                     "Fuel Consumption: {:.4f}".format(env.fuel_consumption), 
                     )
                
                episode_rewards.append(total_reward)
                episode_SOCs.append(env.SOC)
                episode_FCs.append(env.fuel_consumption)
                break 

            state = next_state 
    
    root = "DDQN6_trial{}".format(trial+2)
    save_weights(primary_network, target_network, root)
    
    # Key by trial index: every trial uses the same reward factor, so keying by
    # the factor would keep only the last trial's results.
    results_dict[trial] = {
        "reward_factor": reward_factor,
        "rewards": episode_rewards, 
        "SOCs": episode_SOCs, 
        "FCs": episode_FCs 
    }
            
    

environment version: 1
training\manhattan.mat
maximum steps, simulation is done ... 
Episode: 0 Total reward: -2781.0539897846947 Explore P: 1.0000 SOC: 1.0000 Cumulative_SOC_deviation: 273.1251 Fuel Consumption: 49.8026
training\VITO_RW_BUS_TMB_Line24N_1.mat
maximum steps, simulation is done ... 
Episode: 1 Total reward: -10251.209091366727 Explore P: 1.0000 SOC: 1.0000 Cumulative_SOC_deviation: 1012.1331 Fuel Consumption: 129.8781
training\VITO_DUBDC.mat
maximum steps, simulation is done ... 
Episode: 2 Total reward: -1301.8109333974578 Explore P: 1.0000 SOC: 0.8971 Cumulative_SOC_deviation: 126.2957 Fuel Consumption: 38.8543
training\FTP_75_cycle.mat
maximum steps, simulation is done ... 
Episode: 3 Total reward: -2031.8141513714315 Explore P: 1.0000 SOC: 0.7518 Cumulative_SOC_deviation: 194.8475 Fuel Consumption: 83.3390
training\ny_city_composite_truck.mat
maximum steps, simulation is done ... 
Episode: 4 Total reward: -2632.6442283069073 Explore P: 1.0000 SOC: 1.0000 Cumulative_S

  del_i = (1 / (2 * r_cha)) * (v_cha - (v_cha ** 2 - 4 * r_cha * p_bat) ** (0.5)) * (p_bat < 0) + (1 / (


maximum steps, simulation is done ... 
Episode: 13 Total reward: -1799.883493848245 Explore P: 0.7996 SOC: 0.0529 Cumulative_SOC_deviation: 177.6541 Fuel Consumption: 23.3425
training\06_udds_truck.mat
maximum steps, simulation is done ... 
Episode: 14 Total reward: -1027.4193459872065 Explore P: 0.7830 SOC: 0.6239 Cumulative_SOC_deviation: 98.2415 Fuel Consumption: 45.0046
training\FTP_75_cycle.mat


KeyboardInterrupt: 

In [None]:
# Persist the collected training results so they can be analysed later
# without re-running the training cell.
with open("DDQN6.pkl", "wb") as results_file:
    pickle.dump(results_dict, results_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# with open("results/replay_memory_size_effect.pkl", "rb") as f: 
#     data = pickle.load(f)
    
# data