In [1]:
from google.colab import drive
drive.mount('/content/drive')
%cd 'drive/My Drive/Colab Notebooks/Capstone'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/Colab Notebooks/Capstone


In [2]:
import datetime
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter

from Environments.WM import WMEnv

In [3]:
# Construct DQN                            

class Net(nn.Module):
    """Q-value network: one hidden layer of 100 ReLU units between input and output.

    The input width (N_STATES) and output width (N_ACTIONS) are read from
    module-level globals when the network is constructed.
    """

    def __init__(self):
        super().__init__()
        # Hidden layer: weights drawn from a normal distribution (mean 0, std 0.1);
        # the bias keeps PyTorch's default initialisation.
        self.fc1 = nn.Linear(N_STATES, 100)
        nn.init.normal_(self.fc1.weight, mean=0, std=0.1)
        # Output layer: one Q-value per action, same weight initialisation.
        self.out = nn.Linear(100, N_ACTIONS)
        nn.init.normal_(self.out.weight, mean=0, std=0.1)

    def forward(self, x):
        """Return the Q-value of every action for each state row in `x`."""
        hidden = F.relu(self.fc1(x))
        return self.out(hidden)



class DQN(object):
    """DQN agent: online network, target network and a ring-buffer replay memory.

    Hyperparameters are read from module-level globals: N_STATES, N_ACTIONS,
    MEMORY_CAPACITY and LR at construction time; EPSILON, BATCH_SIZE, GAMMA and
    TARGET_REPLACE_ITER at call time. `learn()` also logs to a module-level
    TensorBoard `writer`, which must exist before `learn()` is called.
    """

    def __init__(self):
        self.eval_net, self.target_net = Net(), Net()
        self.learn_step_counter = 0     # number of completed learn() updates
        self.memory_counter = 0         # total transitions ever stored (not capped)
        # One row per transition, laid out as [s (N_STATES), a, r, s_ (N_STATES)]
        self.memory = np.zeros((MEMORY_CAPACITY, N_STATES * 2 + 2))
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=LR)
        self.loss_func = nn.MSELoss()
        self.loss_memory = []           # loss of every update as a Python float, for plotting

    def choose_action(self, x):
        """Pick an action for state `x`.

        NOTE: EPSILON here is the probability of taking the GREEDY action (the
        opposite of the usual convention); with probability 1 - EPSILON a
        uniformly random action is returned.
        """
        if np.random.uniform() < EPSILON:
            # Inference only: no_grad avoids building an autograd graph per step
            with torch.no_grad():
                state = torch.unsqueeze(torch.FloatTensor(x), 0)
                actions_value = self.eval_net(state)
            action = torch.max(actions_value, 1)[1].numpy()[0]
        else:
            action = np.random.randint(0, N_ACTIONS)
        return action

    def store_transition(self, s, a, r, s_):
        """Write (s, a, r, s_) into the replay memory, overwriting the oldest row when full."""
        transition = np.hstack((s, [a, r], s_))
        # Use the buffer's real size so a later change of the MEMORY_CAPACITY
        # global cannot push the index out of bounds.
        index = self.memory_counter % self.memory.shape[0]
        self.memory[index, :] = transition
        self.memory_counter += 1
        # NOTE(review): no terminal flag is stored, so learn() bootstraps from s_
        # even on the last step of an episode - confirm this is intended for the env.

    def learn(self):
        """Run one gradient step on a random minibatch drawn from the replay memory.

        Does nothing if no transition has been stored yet.
        """
        # Only rows that have actually been written may be sampled; the unwritten
        # remainder of the buffer is all zeros and would act as fake transitions.
        n_stored = min(self.memory_counter, self.memory.shape[0])
        if n_stored == 0:
            return

        # Synchronize the target net with the evaluation net every TARGET_REPLACE_ITER updates
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
        self.learn_step_counter += 1

        # Sample BATCH_SIZE transitions (with replacement) and split them into columns
        sample_index = np.random.choice(n_stored, BATCH_SIZE)
        b_memory = self.memory[sample_index, :]
        b_s = torch.FloatTensor(b_memory[:, :N_STATES])                         # current states
        b_a = torch.LongTensor(b_memory[:, N_STATES:N_STATES+1].astype(int))    # actions taken
        b_r = torch.FloatTensor(b_memory[:, N_STATES+1:N_STATES+2])             # rewards
        b_s_ = torch.FloatTensor(b_memory[:, -N_STATES:])                       # next states

        # Q(s, a) of the action actually taken, from the network being trained
        q_eval = self.eval_net(b_s).gather(1, b_a)
        # Q(s_, .) from the target network; detached so no gradient reaches it
        q_next = self.target_net(b_s_).detach()
        # Bellman target: r + GAMMA * max_a' Q_target(s_, a')
        q_target = b_r + GAMMA * q_next.max(1, keepdim=True)[0]

        loss = self.loss_func(q_eval, q_target)
        loss_value = loss.item()
        self.loss_memory.append(loss_value)

        # Backpropagate and update the evaluation network
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        writer.add_scalar('training loss', loss_value, self.learn_step_counter)

In [4]:
# Load dataset and hold out a fixed set of whole days for testing
SPLIT_SEED = 123     # seed of the train/test day split
N_TEST_DAYS = 30     # number of distinct dates held out for testing

df = pd.read_csv('Datasets/df.csv')
date_list = df.date.unique()
if len(date_list) <= N_TEST_DAYS:
    raise ValueError('need more than %d distinct dates to build a train/test split, got %d'
                     % (N_TEST_DAYS, len(date_list)))

# A dedicated, seeded Random instance makes the split identical on every run
# (so saved weights are always evaluated on days they were not trained on)
# without touching the global `random` state used elsewhere.
split_rng = random.Random(SPLIT_SEED)
test = date_list[split_rng.sample(range(len(date_list)), N_TEST_DAYS)]
df_train = df.loc[~df.date.isin(test)]
df_test = df.loc[df.date.isin(test)]

In [None]:
# Define hyperparameters
N_ACTIONS = 2
N_STATES = 6
BATCH_SIZE = 128
LR_INITIAL = 0.001                      # learning rate during the first half of training
LR_LATE = 0.0005                        # learning rate during the second half of training
LR = LR_INITIAL                         # read by DQN() when it builds its optimizer
EPSILON = 0.7                           # probability of the GREEDY action (see DQN.choose_action)
GAMMA_SET = [0.9, 0.95]
TARGET_REPLACE_ITER_SET = [500, 1000]
MEMORY_CAPACITY_SET = [2400, 24000]

# Make sure the checkpoint directory exists up front, so a missing folder cannot
# fail the save after a full training run.
os.makedirs('weights', exist_ok=True)

cnt = 0

# Grid search: one full training run per (GAMMA, TARGET_REPLACE_ITER, MEMORY_CAPACITY) combination
for ga in GAMMA_SET:
    for tri in TARGET_REPLACE_ITER_SET:
        for mc in MEMORY_CAPACITY_SET:
            cnt += 1
            writer = SummaryWriter('runs/exp'+str(cnt)+'|GAMMA-'+str(ga)+'|TRI-'+str(tri)+'|MC-'+str(mc))
            GAMMA = ga
            TARGET_REPLACE_ITER = tri
            MEMORY_CAPACITY = mc
            # Every configuration must start from the same learning rate; previously the
            # lowered value from the preceding run leaked into the next DQN().
            LR = LR_INITIAL

            # Load environment and DQN
            env = WMEnv(df_train, 8, 22, 3, 10, 15, 10, mode = 'training')
            dqn = DQN()

            # Per-episode histories of the current run (overwritten for each configuration)
            reward = []
            # numope = []
            energy = []
            loss = []
            total_energy = []

            episode = 5000
            for i in range(episode):    # original author's note: each episode contains 24 steps (hours)
                if i % 10 == 0:
                    print('<<<<<<<<<Model ' + str(cnt) + '  Episode: %s' % i)
                s = env.reset()
                episode_reward_sum = 0

                # Schedule: explore more in the first half, then act almost greedily
                # with a smaller learning rate in the second half.
                if i < episode // 2:
                    EPSILON = 0.7
                else:
                    EPSILON = 0.99
                    # Rebinding the LR global has no effect on an optimizer that already
                    # exists; the rate has to be changed on its parameter groups.
                    for param_group in dqn.optimizer.param_groups:
                        param_group['lr'] = LR_LATE

                while True:
                    # Interaction
                    a = dqn.choose_action(s)
                    s_, r, done, info = env.step(a)
                    # Save experience sequences
                    dqn.store_transition(s, a, r, s_)
                    episode_reward_sum += r
                    s = s_

                    # Start learning only once the replay memory has been filled
                    if dqn.memory_counter > MEMORY_CAPACITY:
                        dqn.learn()

                    if done:
                        energy.append(env.total_energy_cost)
                        reward.append(round(episode_reward_sum, 2))
                        writer.add_scalar('Daily total reward', round(episode_reward_sum), i)
                        # numope.append(env.cur_ope)
                        total_energy.append(env.total_net_energy_cost)
                        break

            # Persist both networks of the finished run, then flush and release the log writer
            print('saving model')
            torch.save(dqn.eval_net.state_dict(), 'weights/dqn_eval_weights_v'+str(cnt)+'.pth')        # Saving eval_net weights as .pth file
            torch.save(dqn.target_net.state_dict(), 'weights/dqn_target_weights_v'+str(cnt)+'.pth')    # Saving target_net weights as .pth file
            writer.close()

<<<<<<<<<Model 1  Episode: 0
<<<<<<<<<Model 1  Episode: 10
<<<<<<<<<Model 1  Episode: 20
<<<<<<<<<Model 1  Episode: 30
<<<<<<<<<Model 1  Episode: 40
<<<<<<<<<Model 1  Episode: 50
<<<<<<<<<Model 1  Episode: 60
<<<<<<<<<Model 1  Episode: 70
<<<<<<<<<Model 1  Episode: 80
<<<<<<<<<Model 1  Episode: 90
<<<<<<<<<Model 1  Episode: 100
<<<<<<<<<Model 1  Episode: 110
<<<<<<<<<Model 1  Episode: 120
<<<<<<<<<Model 1  Episode: 130
<<<<<<<<<Model 1  Episode: 140
<<<<<<<<<Model 1  Episode: 150
<<<<<<<<<Model 1  Episode: 160
<<<<<<<<<Model 1  Episode: 170
<<<<<<<<<Model 1  Episode: 180
<<<<<<<<<Model 1  Episode: 190
<<<<<<<<<Model 1  Episode: 200
<<<<<<<<<Model 1  Episode: 210
<<<<<<<<<Model 1  Episode: 220
<<<<<<<<<Model 1  Episode: 230
<<<<<<<<<Model 1  Episode: 240
<<<<<<<<<Model 1  Episode: 250
<<<<<<<<<Model 1  Episode: 260
<<<<<<<<<Model 1  Episode: 270
<<<<<<<<<Model 1  Episode: 280
<<<<<<<<<Model 1  Episode: 290
<<<<<<<<<Model 1  Episode: 300
<<<<<<<<<Model 1  Episode: 310
<<<<<<<<<Model 1  E

In [None]:
# Open TensorBoard inside the notebook on the `runs` directory, which is where the
# training cell's SummaryWriter instances write their logs.
%load_ext tensorboard
%tensorboard --logdir=runs

In [None]:
# sns.lineplot(x = list(range(len(reward))), y = reward)

In [None]:
# sns.lineplot(x = list(range(episode)), y = energy)

In [None]:
# Loss curve of the agent currently bound to `dqn`: one point per learn() call.
# loss_memory already holds plain floats, so no conversion is needed.
sns.lineplot(x=np.arange(len(dqn.loss_memory)), y=dqn.loss_memory)

In [None]:
# BATCH_SIZE = 64                               
# LR = 0.1                                   
# EPSILON = 0.99                               
# GAMMA = 0.9                                     
# TARGET_REPLACE_ITER = 1000                 
# MEMORY_CAPACITY = 2400                                 
# N_ACTIONS = 2                                   
# N_STATES = 5

# # Load environment and DQN
# env = WMEnv(df, 8, 22, 5, 10, 15, 10) 
# dqn = DQN() 

# # Load previous training weights 
# dqn.eval_net.load_state_dict(torch.load('dqn_eval_weights_v1.pth'))        
# dqn.target_net.load_state_dict(torch.load('dqn_target_weights_v1.pth')) 

In [None]:
# Testing: roll out the greedy policy of the agent currently bound to `dqn` on the
# held-out data and record one row of environment readings per step.
date = []
time = []
price = []
en_cost = []
total_en = []
generation = []
fixed = []
net = []

# NOTE(review): the training environment is built with 3 as its 4th argument, this one
# with 5 - confirm the difference is intended.
env = WMEnv(df_test, 8, 22, 5, 10, 15, 10, mode = 'testing')

# np.random.uniform() < 1 always holds, so choose_action never takes the random branch
EPSILON = 1

# Run as many episodes as there are held-out dates in `test`. The loop variable is
# deliberately not called `episode`, which holds the training episode count.
for test_episode in range(len(test)):
    s = env.reset()
    while True:
        action = dqn.choose_action(s)

        # Read the environment's attributes for the current step *before* stepping it
        en_cost.append(env.cur_energy)
        date.append(env.cur_date)
        time.append(env.time)
        price.append(env.price)
        generation.append(env.generation)
        fixed.append(env.fixed_cost)
        net.append(env.fixed_cost + env.cur_energy - env.generation)

        s_, rewards, done, info = env.step(action)
        #env.render(action)
        s = s_
        if done:
            total_en.append(env.total_energy_cost)
            break

In [None]:
# One panel per recorded test day: price on the left axis; generation, fixed and net
# series on a twin right axis.
re = pd.DataFrame({'date':date, 't':time, 'energy_cost':en_cost, 'price':price, 'generation':generation,
                   'fixed': fixed, 'net': net})
d = list(re.date.unique())

# Size the figure from the days actually present instead of a hard-coded 30, so the
# cell keeps working when the number of test days changes. squeeze=False keeps `axes`
# indexable even when there is a single day.
n_days = len(d)
fig, axes = plt.subplots(n_days, 1, figsize = (15, 3 * n_days), squeeze = False)
axes = axes[:, 0]
plt.subplots_adjust(hspace = 0.5)

for i, day in enumerate(d):
    day_df = re[re.date == day]      # filter once per day instead of once per plotted series
    axes2 = axes[i].twinx()
    axes[i].plot(day_df.t, day_df.price, color = 'grey')
#    axes2.bar(day_df.t, day_df.energy_cost, color = 'skyblue', alpha = 0.9)
    axes2.plot(day_df.t, day_df.generation, color = 'orange')
    axes2.plot(day_df.t, day_df.fixed, color = 'red')
    axes2.plot(day_df.t, day_df.net, color = 'purple')
    # axes2.set_ylim([0, 1.5])
    axes[i].grid()

    # First time step of the day whose energy_cost is >= 1, if there is one.
    # An explicit emptiness check replaces the former bare `except:`.
    operation_times = day_df[day_df.energy_cost >= 1].t.values
    if len(operation_times) > 0:
        operation_label = str(operation_times[0])
    else:
        operation_label = 'No operation'
    axes[i].title.set_text('Deep Q-Network\n date: ' + day + ' WM operation time: ' + operation_label)
    # axes[i].title.set_text(d[i] + ' The total energy of all appliances = ' + str(total_en[i]))

    axes[i].legend(['TOU price'], loc = 'upper left')
    axes2.legend(['PV Generation', 'fixed', 'net_cost'], bbox_to_anchor=(1.2, 1.06), loc="upper right")
    # axes2.legend(['PV Generation', 'fixed', 'net_cost', 'Washing Machine Energy Consumption'], bbox_to_anchor=(1.35, 1.06), loc="upper right")
    # Set the ticks on this panel explicitly rather than on pyplot's implicit "current" axes
    axes[i].set_xticks(np.arange(0,24))
plt.show()