## Q-learning for a washing machine (Yunzhe and Weisheng)

In [1]:
import numpy as np
from decimal import Decimal


### State Space

In [28]:
# Define the states
def callspace(lower,upper,interval,decimal):
    """Enumerate every (hour, consumption) state of a 24-hour day.

    Consumption levels are ``np.arange(lower, upper, interval)`` rounded to
    ``decimal`` places. Each state is stored under a label of the form
    ``"(<hour> AM|PM, <level>)"`` (hours 0-11 are tagged AM, 12-23 PM, and the
    hour is kept in 24-hour form) and maps to ``[hour, level]``.

    Returns:
        dict mapping the label string to its ``[hour, level]`` pair, ordered
        by hour and then by level.
    """
    # The consumption grid does not depend on the hour, so build it once.
    levels = [round(value, decimal) for value in np.arange(lower, upper, interval)]

    states = {}
    for hour in range(24):
        suffix = "PM" if hour >= 12 else "AM"
        for level in levels:
            label = f"({hour} {suffix}, {level!s})"
            states[label] = [hour, level]
    return states

In [39]:
# Label -> [hour, consumption] lookup for the whole state space:
# consumption levels run from 0.00 up to (but excluding) 1.28 in steps of 0.01.
time_to_state = callspace(lower=0.00, upper=1.28, interval=0.01, decimal=2)

### Action space

In [30]:
# 0 defined for turn off and 1 defined for turn on
actions = [0,1]

### Reward formula

In [31]:
# Q-table
def createQ(total):
    """Return a zero-initialised Q-table with ``total`` rows and 2 columns.

    Each row stands for one state and each column for one of the two actions.
    """
    q_table = np.zeros((total, 2))
    return q_table

# One Q-table row per enumerated state; print the shape as a sanity check.
Q = createQ(total=len(time_to_state))
print(Q.shape)

(3072, 2)


In [32]:
# Reverse lookup: (hour, consumption) tuple -> state label.
state_to_time = {tuple(state): label for label, state in time_to_state.items()}

In [33]:
# Q-learning hyper-parameters.
gamma = 0.75  # discount factor
alpha = 0.9   # learning rate

In [34]:
class QAgent():
    """Tabular Q-learning agent over (hour, consumption) states.

    The Q-table ``Q`` has one row per state and two columns (action 0 = off,
    action 1 = on). Rows are addressed as ``hour * interval + level``, where
    ``level`` is the index of the consumption value on the grid between
    ``lower`` and ``upper``.
    """

    # Cost charged per unit of energy above ``max_limit``.
    OVER_LIMIT_WEIGHT = 999
    # During training, states at or after this hour wrap back to hour 0
    # instead of advancing by one hour.
    WRAP_HOUR = 22

    def __init__(self, alpha, gamma, time_to_state, actions, state_to_time, Q, price, energy_cost, penalty, t_prefer_s, t_prefer_e, max_limit, interval, lower,upper,decimal):
        """Store the hyper-parameters, lookup tables and cost model.

        Args:
            alpha: learning rate used in the Q update.
            gamma: discount factor used in the Q update.
            time_to_state: dict mapping a state label to ``[hour, consumption]``.
            actions: list of action codes (stored; not read by the methods here).
            state_to_time: dict mapping ``(hour, consumption)`` to a state label.
            Q: 2-D array of Q-values, updated in place by ``training``.
            price: per-hour prices, indexed by hour.
            energy_cost: per-hour consumption used when replaying the policy.
            penalty: cost per hour of distance from the preferred window.
            t_prefer_s: first hour of the preferred window.
            t_prefer_e: last hour of the preferred window.
            max_limit: consumption above this value incurs the over-limit cost.
            interval: row stride between consecutive hours in ``Q``;
                ``interval - 1`` is the number of steps between ``lower`` and
                ``upper`` when converting a consumption value to a row.
            lower: smallest consumption value of the grid.
            upper: upper end of the consumption grid.
            decimal: multiplied by the level index to obtain the consumption
                during training (i.e. used as the grid step size).
        """
        self.gamma = gamma
        self.alpha = alpha
        self.time_to_state = time_to_state
        self.actions = actions
        self.state_to_time = state_to_time
        self.Q = Q
        self.price = price
        self.energy_cost = energy_cost
        self.penalty = penalty
        self.t_prefer_s = t_prefer_s
        self.t_prefer_e = t_prefer_e
        self.max_limit = max_limit
        self.interval = interval
        self.lower = lower
        self.upper = upper
        self.decimal = decimal

    def calReward(self, current_state, time, energy):
        """Return the (non-positive) reward for consuming ``energy`` at hour ``time``.

        The cost is the energy price, plus ``penalty`` per hour outside the
        preferred window ``[t_prefer_s, t_prefer_e]``, plus
        ``OVER_LIMIT_WEIGHT`` per unit of energy above ``max_limit``.
        ``current_state`` is accepted for interface compatibility but unused.
        """
        cost = self.price[time] * energy
        if time < self.t_prefer_s:
            cost = cost + self.penalty * (self.t_prefer_s - time)
        elif time > self.t_prefer_e:
            cost = cost + self.penalty * (time - self.t_prefer_e)
        if energy > self.max_limit:
            cost = cost + self.OVER_LIMIT_WEIGHT * (energy - self.max_limit)
        return -cost

    def training (self, start_state, end_state,iterations):
        """Run ``iterations`` random Q-updates, then print the greedy schedule.

        Each iteration samples a random row of ``Q`` and a random action,
        moves one hour forward (or wraps to hour 0 from ``WRAP_HOUR`` on) at
        the same consumption level, and applies the temporal-difference update
        in place. Finally ``get_optimal_action(start_state, end_state)`` is
        called to print the result.
        """
        n_states = self.Q.shape[0]
        for _ in range(iterations):
            current_state = np.random.randint(0, n_states)
            hour = int(current_state // self.interval)
            level = current_state % self.interval
            consumption = level * self.decimal + self.lower
            # The Q-table has exactly two action columns: 0 (off) and 1 (on).
            next_action = np.random.choice([0, 1])

            if hour >= self.WRAP_HOUR:
                next_state = level
            else:
                next_state = current_state + self.interval

            # NOTE(review): the reward is computed from the hour and the
            # consumption only; the sampled action does not enter it, so both
            # action columns receive the same kind of target. Confirm whether
            # the reward was meant to depend on the action.
            reward = self.calReward(next_state, hour, consumption)
            td_error = (reward
                        + self.gamma * np.max(self.Q[next_state])
                        - self.Q[current_state, next_action])
            self.Q[current_state, next_action] += self.alpha * td_error

        self.get_optimal_action(start_state, end_state)

    def print_welcome(self, idx):
        """Print the welcome banner (``idx == 0``) or the table header (``idx == 1``)."""
        if idx == 0:
            print("------------------------------------")
            print("|        WELCOME TO Q-Learning        |")
            print("------------------------------------")
        elif idx == 1:
            print("t -     STATE  -  ACTION")
            print("================================")

    def get_optimal_action(self,start_state, end_state):
        """Print the greedy action for every hour from ``start_state`` up to ``end_state``.

        The first row uses the consumption of ``start_state``; every later
        hour uses ``energy_cost[hour]``. The end hour itself is not printed,
        and nothing but the header is printed when the start hour is not
        before the end hour.

        Raises:
            KeyError: if a label or an ``(hour, consumption)`` pair is missing
                from the agent's lookup tables.
        """
        self.print_welcome(1)

        # Use the mappings this agent was built with, not module globals.
        start_hour, start_consumption = self.time_to_state[start_state]
        end_hour = self.time_to_state[end_state][0]
        step = (self.upper - self.lower) / (self.interval - 1)

        for count, hour in enumerate(range(start_hour, end_hour)):
            if hour == start_hour:
                consumption = start_consumption
            else:
                consumption = self.energy_cost[hour]

            # Round rather than truncate: float division can land just below
            # the intended integer (e.g. 0.29 / 0.01 == 28.999999999999996).
            level = int(round((consumption - self.lower) / step))
            row = hour * self.interval + level

            next_action = np.argmax(self.Q[row])
            names_state = self.state_to_time[(hour, consumption)]
            if next_action == 0:
                print(count," -", names_state, "Turn Off")
            else:
                print(count," -", names_state, "Turn On")

In [38]:
# Hourly prices, looked up by hour of day (24 entries).
# NOTE(review): the integer entries "2, 65", "2, 78" and "2, 21" look like
# decimal points typed as commas (2.65, 2.78, 2.21). Merging them would leave
# only 21 prices, so the values are kept unchanged here -- confirm against the
# original price data.
price = [
    1.23, 2.34, 1.45, 3.42, 2.476, 3.21, 1.77, 2.31,
    2.35, 3.22, 1.45, 3.23, 3.56, 2, 65, 2,
    78, 4.11, 4.23, 2, 21, 2.18, 2.19, 2.78,
]
# Hourly consumption of the machine: non-zero only for hours 10 to 13.
energy_cost = [0.0] * 10 + [0.5, 0.16, 0.54, 0.13] + [0.00] * 10

penalty = 0.7
# Preferred operating window (hours).
t_prefer_s = 11
t_prefer_e = 13

qagent = QAgent(
    alpha, gamma, time_to_state, actions, state_to_time, Q,
    price, energy_cost, penalty, t_prefer_s, t_prefer_e,
    max_limit=555,
    interval=129,
    lower=0.00,
    upper=1.28,
    decimal=0.01,
)
qagent.print_welcome(0)
qagent.training('(0 AM, 0.0)', '(23 PM, 0.0)', 10000)

------------------------------------
|        WELCOME TO Q-Learning        |
------------------------------------
t -     STATE  -  ACTION
0  - (0 AM, 0.0) Turn On
1  - (1 AM, 0.0) Turn On
2  - (2 AM, 0.0) Turn On
3  - (3 AM, 0.0) Turn Off
4  - (4 AM, 0.0) Turn On
5  - (5 AM, 0.0) Turn On
6  - (6 AM, 0.0) Turn Off
7  - (7 AM, 0.0) Turn Off
8  - (8 AM, 0.0) Turn Off
9  - (9 AM, 0.0) Turn Off
10  - (10 AM, 0.5) Turn Off
11  - (11 AM, 0.16) Turn Off
12  - (12 PM, 0.54) Turn Off
13  - (13 PM, 0.13) Turn On
14  - (14 PM, 0.0) Turn On
15  - (15 PM, 0.0) Turn On
16  - (16 PM, 0.0) Turn On
17  - (17 PM, 0.0) Turn On
18  - (18 PM, 0.0) Turn On
19  - (19 PM, 0.0) Turn On
20  - (20 PM, 0.0) Turn Off
21  - (21 PM, 0.0) Turn Off
22  - (22 PM, 0.0) Turn On
