In [1]:
import tensorflow as tf 
import numpy as np 
from tensorflow import keras 
import os 
import math 
import random 
import pickle 
import matplotlib.pyplot as plt 
from collections import deque 

from vehicle_model_DDQN import Environment 
from cell_model import CellModel 

os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [2]:
drving_cycle = '../../OC_SIM_DB/OC_SIM_DB_Cycles/Highway/01_FTP72_fuds.mat'
battery_path = "../../OC_SIM_DB/OC_SIM_DB_Bat/OC_SIM_DB_Bat_nimh_6_240_panasonic_MY01_Prius.mat"
motor_path = "../../OC_SIM_DB/OC_SIM_DB_Mot/OC_SIM_DB_Mot_pm_95_145_X2.mat"
cell_model = CellModel()
env = Environment(cell_model, drving_cycle, battery_path, motor_path, 1)


In [3]:
# STATE_SIZE = env.calculation_comp["state_size"]
STATE_SIZE = 4
ACTION_SIZE = env.calculation_comp["action_size"] 
LEARNING_RATE = 0.00025 

TOTAL_EPISODES = 200
MAX_STEPS = 50000 

GAMMA = 0.95 

MAX_EPSILON = 1 
MIN_EPSILON = 0.01 
DECAY_RATE = 0.00002
BATCH_SIZE = 32 
TAU = 0.001 
DELAY_TRAINING = 3000 
EPSILON_MIN_ITER = 5000

In [4]:
primary_network = keras.Sequential([
    keras.layers.Dense(30, activation="relu", kernel_initializer=keras.initializers.he_normal()), 
#     keras.layers.BatchNormalization(), 
    keras.layers.Dense(30, activation="relu", kernel_initializer=keras.initializers.he_normal()),
#     keras.layers.BatchNormalization(), 
    keras.layers.Dense(ACTION_SIZE),
])
target_network = keras.Sequential([
    keras.layers.Dense(30, activation="relu", kernel_initializer=keras.initializers.he_normal()), 
#     keras.layers.BatchNormalization(), 
    keras.layers.Dense(30, activation="relu", kernel_initializer=keras.initializers.he_normal()),
#     keras.layers.BatchNormalization(), 
    keras.layers.Dense(ACTION_SIZE),
])

primary_network.compile(
    loss="mse", 
    optimizer=keras.optimizers.Adam(lr=LEARNING_RATE) 
)

# for t, p in zip(target_network.trainable_variables, primary_network.trainable_variables): 
#     t.assign(p)

In [5]:
def update_network(primary_network, target_network): 
    for t, p in zip(target_network.trainable_variables, primary_network.trainable_variables): 
        t.assign(t * (1 - TAU) + p * TAU)

In [6]:
class Memory: 
    def __init__(self, max_memory): 
        self.max_memory = max_memory 
        self._samples = [] 
        
        self.power_mean = 0 
        self.power_std = 0 
        self.sum = 0 
        self.sum_deviation = 0 
        self.N = 0 
        
    def add_sample(self, sample): 
        self.N += 1 
        power = sample[0][0]
        self.sum += power 
        self.power_mean = self.sum / self.N 
        self.sum_deviation += (power - self.power_mean) ** 2
        self.power_std = np.sqrt(self.sum_deviation / self.N)
        
        self._samples.append(sample)
        if len(self._samples) > self.max_memory: 
            self._samples.pop(0)
        
    def sample(self, no_samples): 
        if no_samples > len(self._samples): 
            return random.sample(self._samples, len(self._samples))
        else: 
            return random.sample(self._samples, no_samples)
    
    @property
    def num_samples(self):
        return len(self._samples)
    

# memory = Memory(10000)

In [7]:
def choose_action(state, primary_network, eps): 
    if random.random() < eps: 
        return random.randint(0, ACTION_SIZE - 1)
    else: 
        return np.argmax(primary_network(np.array(state).reshape(1, -1))) 

In [8]:
def train(primary_network, target_network, memory): 
    batch = memory.sample(BATCH_SIZE)
    states = np.array([val[0] for val in batch]) 
    actions = np.array([val[1] for val in batch])
    rewards = np.array([val[2] for val in batch])
    next_states = np.array([np.zeros(STATE_SIZE) if val[3] is None else val[3]  
                            for val in batch])
    
    states[:, 0] = (states[:, 0] - memory.power_mean) / memory.power_std 
    next_states[:, 0] = (next_states[:, 0] - memory.power_mean) / memory.power_std
    
    prim_qt = primary_network(states)
    prim_qtp1 = primary_network(next_states)
    target_q = prim_qt.numpy() 
    updates = rewards 
    valid_idxs = next_states.sum(axis=1) != 0 
    batch_idxs = np.arange(BATCH_SIZE)
    prim_action_tp1 = np.argmax(prim_qtp1.numpy(), axis=1)
    q_from_target = target_network(next_states)
    updates[valid_idxs] += GAMMA * q_from_target.numpy()[batch_idxs[valid_idxs], 
                                                        prim_action_tp1[valid_idxs]]
    
    target_q[batch_idxs, actions] = updates 
    loss = primary_network.train_on_batch(states, target_q)
    return loss 
    
    
    

In [9]:
def initialization_with_rewardFactor(reward_factor):
    env = Environment(cell_model, drving_cycle, battery_path, motor_path, reward_factor)
    
    memory = Memory(10000)
    
    primary_network = keras.Sequential([
        keras.layers.Dense(30, activation="relu", kernel_initializer=keras.initializers.he_normal()),
#         keras.layers.BatchNormalization(),  
        keras.layers.Dense(30, activation="relu", kernel_initializer=keras.initializers.he_normal()),
#         keras.layers.BatchNormalization(), 
        keras.layers.Dense(ACTION_SIZE),
    ])
    target_network = keras.Sequential([
        keras.layers.Dense(30, activation="relu", kernel_initializer=keras.initializers.he_normal()), 
#         keras.layers.BatchNormalization(), 
        keras.layers.Dense(30, activation="relu", kernel_initializer=keras.initializers.he_normal()),
#         keras.layers.BatchNormalization(), 
        keras.layers.Dense(ACTION_SIZE),
    ])
    primary_network.compile(
        loss="mse", 
        optimizer=keras.optimizers.Adam(lr=LEARNING_RATE) 
    )
    return env, memory, primary_network, target_network 
    

In [10]:
print("environment version: {}".format(env.version)) 

reward_factors = [10]
results_dict = {} 
num_trials = 3

for trial in range(num_trials): 
    eps = MAX_EPSILON 
    steps = 0
    episode_rewards = [] 
    episode_SOCs = [] 
    episode_FCs = [] 
    
    env, memory, primary_network, target_network = initialization_with_rewardFactor(10)
    for episode in range(TOTAL_EPISODES): 
        state = env.reset() 
        avg_loss = 0 
        total_reward = 0
        cnt = 1 

        while True:
            action = choose_action(state, primary_network, eps)
            next_state, reward, done = env.step(action)
            total_reward += reward 
            if done: 
                next_state = None 
            memory.add_sample((state, action, reward, next_state))

            if steps > DELAY_TRAINING: 
                loss = train(primary_network, target_network, memory)
                update_network(primary_network, target_network)
                eps = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * np.exp(-DECAY_RATE * steps)
            else: 
                loss = -1

            avg_loss += loss 
            steps += 1 

            if done: 
                if steps > DELAY_TRAINING: 
                    SOC_deviation_history = np.sum(np.abs(np.array(env.history["SOC"]) - 0.6)) 
                    avg_loss /= cnt 
                    print('Episode: {}'.format(episode + 1),
                          'Total reward: {}'.format(total_reward), 
                          'Explore P: {:.4f}'.format(eps), 
                          "SOC: {:.4f}".format(env.SOC), 
                         "Cumulative_SOC_deviation: {:.4f}".format(SOC_deviation_history), 
                         "Fuel Consumption: {:.4f}".format(env.fuel_consumption), 
                        "Mean: {:.4f}, STD: {:.4f}".format(memory.power_mean, memory.power_std)
                         )
                else: 
                    print(f"Pre-training...Episode: {episode}")
                
                episode_rewards.append(total_reward)
                episode_SOCs.append(env.SOC)
                episode_FCs.append(env.fuel_consumption)
                break 

            state = next_state 
            cnt += 1 
    
    results_dict[trial + 1] = {
        "rewards": episode_rewards, 
        "SOCs": episode_SOCs, 
        "FCs": episode_FCs 
    }
            
    

environment version: 0
maximum steps, simulation is done ... 
Pre-training...Episode: 0
maximum steps, simulation is done ... 
Pre-training...Episode: 1


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

maximum steps, simulation is done ... 
Episode: 3 Total reward: -923.6689841056224 Explore P: 0.9217 SOC: 0.7945 Cumulative_SOC_deviation: 86.3274 Fuel Consumption: 60.3952 Mean: 2.1068, STD: 5.0145
maximum steps, simulation is done ... 
Episode

maximum steps, simulation is done ... 
Episode: 37 Total reward: -869.0773354564245 Explore P: 0.3684 SOC: 0.7558 Cumulative_SOC_deviation: 81.2185 Fuel Consumption: 56.8928 Mean: 2.1068, STD: 5.0177
maximum steps, simulation is done ... 
Episode: 38 Total reward: -794.0058563131655 Explore P: 0.3587 SOC: 0.7164 Cumulative_SOC_deviation: 74.0261 Fuel Consumption: 53.7447 Mean: 2.1068, STD: 5.0177
maximum steps, simulation is done ... 
Episode: 39 Total reward: -803.2158641481669 Explore P: 0.3493 SOC: 0.7351 Cumulative_SOC_deviation: 74.7938 Fuel Consumption: 55.2777 Mean: 2.1068, STD: 5.0177
maximum steps, simulation is done ... 
Episode: 40 Total reward: -919.9576960126468 Explore P: 0.3401 SOC: 0.7496 Cumulative_SOC_deviation: 86.3676 Fuel Consumption: 56.2821 Mean: 2.1068, STD: 5.0177
maximum steps, simulation is done ... 
Episode: 41 Total reward: -492.9527690549571 Explore P: 0.3311 SOC: 0.6695 Cumulative_SOC_deviation: 44.2549 Fuel Consumption: 50.4035 Mean: 2.1068, STD: 5.0177


maximum steps, simulation is done ... 
Episode: 78 Total reward: -1022.6403877672004 Explore P: 0.1263 SOC: 0.6298 Cumulative_SOC_deviation: 97.4677 Fuel Consumption: 47.9633 Mean: 2.1068, STD: 5.0179
maximum steps, simulation is done ... 
Episode: 79 Total reward: -992.0525658209082 Explore P: 0.1231 SOC: 0.6277 Cumulative_SOC_deviation: 94.3843 Fuel Consumption: 48.2091 Mean: 2.1068, STD: 5.0179
maximum steps, simulation is done ... 
Episode: 80 Total reward: -1022.0893925224107 Explore P: 0.1200 SOC: 0.6375 Cumulative_SOC_deviation: 97.3041 Fuel Consumption: 49.0483 Mean: 2.1068, STD: 5.0179
maximum steps, simulation is done ... 
Episode: 81 Total reward: -1121.348478247442 Explore P: 0.1171 SOC: 0.6664 Cumulative_SOC_deviation: 106.9870 Fuel Consumption: 51.4784 Mean: 2.1068, STD: 5.0179
maximum steps, simulation is done ... 
Episode: 82 Total reward: -1102.5555639945733 Explore P: 0.1142 SOC: 0.6597 Cumulative_SOC_deviation: 105.1644 Fuel Consumption: 50.9113 Mean: 2.1068, STD: 5.

maximum steps, simulation is done ... 
Episode: 119 Total reward: -624.2866314267153 Explore P: 0.0477 SOC: 0.6763 Cumulative_SOC_deviation: 57.4049 Fuel Consumption: 50.2372 Mean: 2.1068, STD: 5.0180
maximum steps, simulation is done ... 
Episode: 120 Total reward: -949.5928040095562 Explore P: 0.0467 SOC: 0.7716 Cumulative_SOC_deviation: 89.1428 Fuel Consumption: 58.1648 Mean: 2.1068, STD: 5.0180
maximum steps, simulation is done ... 
Episode: 121 Total reward: -832.589482909017 Explore P: 0.0457 SOC: 0.7445 Cumulative_SOC_deviation: 77.7149 Fuel Consumption: 55.4404 Mean: 2.1068, STD: 5.0180
maximum steps, simulation is done ... 
Episode: 122 Total reward: -416.15059178562893 Explore P: 0.0447 SOC: 0.6554 Cumulative_SOC_deviation: 36.7273 Fuel Consumption: 48.8776 Mean: 2.1068, STD: 5.0180
maximum steps, simulation is done ... 
Episode: 123 Total reward: -574.4621790594772 Explore P: 0.0438 SOC: 0.6168 Cumulative_SOC_deviation: 52.9008 Fuel Consumption: 45.4542 Mean: 2.1068, STD: 5.

maximum steps, simulation is done ... 
Episode: 160 Total reward: -957.7303726778144 Explore P: 0.0222 SOC: 0.7079 Cumulative_SOC_deviation: 90.2697 Fuel Consumption: 55.0334 Mean: 2.1068, STD: 5.0180
maximum steps, simulation is done ... 
Episode: 161 Total reward: -1123.7333995846693 Explore P: 0.0219 SOC: 0.7073 Cumulative_SOC_deviation: 106.8768 Fuel Consumption: 54.9650 Mean: 2.1068, STD: 5.0180
maximum steps, simulation is done ... 
Episode: 162 Total reward: -1212.749605267056 Explore P: 0.0216 SOC: 0.6925 Cumulative_SOC_deviation: 115.9115 Fuel Consumption: 53.6342 Mean: 2.1068, STD: 5.0180
maximum steps, simulation is done ... 
Episode: 163 Total reward: -1229.3808050333498 Explore P: 0.0213 SOC: 0.6370 Cumulative_SOC_deviation: 118.0155 Fuel Consumption: 49.2257 Mean: 2.1068, STD: 5.0180
maximum steps, simulation is done ... 
Episode: 164 Total reward: -1158.0874519461272 Explore P: 0.0210 SOC: 0.6765 Cumulative_SOC_deviation: 110.6039 Fuel Consumption: 52.0480 Mean: 2.1068, 

KeyboardInterrupt: 

In [None]:
with open("DDQN.pkl", "wb") as f: 
    pickle.dump(results_dict, f, pickle.HIGHEST_PROTOCOL)

In [None]:
# with open("results/replay_memory_size_effect.pkl", "rb") as f: 
#     data = pickle.load(f)
    
# data