## Loading libraries and data

In [1]:
import pandas as pd
import os
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

def load_data(FileName):
    """Open the Excel workbook at ``FileName`` and return it as a ``pd.ExcelFile``.

    The workbook is only opened here; reading individual sheets (and closing
    the returned handle) is left to the caller.
    """
    return pd.ExcelFile(os.path.join(FileName))

# Read sheet '1' of the workbook; the context manager closes the underlying
# file handle as soon as the sheet has been loaded.
with load_data("trace.xlsx") as workbook:
    data = pd.read_excel(workbook, '1')

# Shuffle with a fixed seed so the row order (and therefore the train/test
# split and the training run that follow) is reproducible across kernel restarts.
data = shuffle(data, random_state=42)

## Data preprocessing

In [18]:
from sklearn.preprocessing import LabelEncoder
from sklearn.externals.joblib import dump
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the rows for testing; the split itself is seeded.
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

# The first 12 columns are used as features and column 12 as the label.
X_train, y_train = train_set.iloc[:, :12], train_set.iloc[:, 12]
X_test, y_test = test_set.iloc[:, :12], test_set.iloc[:, 12]

# Fit the scaler on the training features only, then apply the same
# transform to both splits so no test statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Aliases consumed by the training cells below.
train_data, train_data_label = X_train, y_train

In [19]:
import sklearn.preprocessing

# `train_data` was already standardized in the previous cell by `scaler`, which
# was fit on the raw training features.  Fitting a second StandardScaler on that
# standardized data (as this cell used to do) learns mean ~0 / scale ~1, so the
# pickled object would leave raw inputs essentially unscaled when the tester
# loads it.  Persist the scaler that was fit on the raw features instead.
dump(scaler, 'std_scaler.bin', compress=True)

<class 'numpy.ndarray'>


['std_scaler.bin']

In [None]:
class State:
    """Container for the current task id and a table of device specifications.

    The state is a dict with two keys: ``"task_id"`` (a string) and
    ``"device_specs"`` (a DataFrame with one row per device).
    """

    def __init__(self):
        """Initialise the state with a fixed task id and three example devices."""
        specs_table = pd.DataFrame({
            'Name': ['fridge_01', 'smart_tv_01', 'pc_01'],
            'total_memory': [97, 600, 200],
            'available_memory': [2200, 775, 300],
        })

        self.state = dict(task_id="task_0001", device_specs=specs_table)

    def get_state(self):
        """Return the state dict itself (not a copy)."""
        return self.state

    def update_state(self, task_id):
        """Replace the whole state with whatever the device query returns."""
        # NOTE(review): `get_data_from_devies` is not defined in the visible part
        # of this notebook and `task_id` is unused -- confirm the helper exists
        # (and its spelling) before relying on this method.
        self.state = get_data_from_devies()

In [23]:
class Environment1:
    """Sequential classification environment over a labelled feature matrix.

    Each row of ``data`` is one observation; the agent is rewarded +1 when its
    action equals the label of the current row and -1 otherwise.

    Args:
        data: 2-D array indexed as ``data[t, :]`` and supporting ``len()``.
        label: per-row labels accessed positionally via ``label.iloc[t]``
            (e.g. a pandas Series).
    """

    def __init__(self, data, label):
        self.data = data
        self.label = label
        self.reset()

    def reset(self):
        """Rewind to the first row and return it as the initial observation."""
        self.t = 0
        self.done = False

        return self.data[self.t, :]

    def step(self, act):
        """Score ``act`` against the current row's label and advance one row.

        Returns:
            ``(obs, reward, done)``. ``done`` becomes True once the last row has
            been consumed; in that case ``obs`` is the last row again, because
            there is no further row to observe.

        Raises:
            IndexError: if called again after the episode has finished.
        """
        if self.done:
            raise IndexError("episode is finished; call reset() before stepping again")

        # act = 0: unoccupied, 1: occupied
        reward = 1 if act == self.label.iloc[self.t] else -1

        # set next time
        self.t += 1

        if self.t >= len(self.data):
            # Previously this indexed past the end of `data` and `done` was
            # never set; flag the end of the episode instead.
            self.done = True
            obs = self.data[len(self.data) - 1, :]
        else:
            obs = self.data[self.t, :]

        return obs, reward, self.done # obs, reward, done

In [24]:
import time
import copy
import chainer
import chainer.functions as F
import chainer.links as L
from plotly import tools
from plotly.graph_objs import *
from plotly.offline import init_notebook_mode, iplot, iplot_mpl
import numpy as np

def train_dqn(env, epoch_num=20, memory_size=800, batch_size=20, gamma=0.97):
    """Train a small fully connected DQN on ``env`` using experience replay.

    Actions are chosen epsilon-greedily (epsilon decays linearly from 1.0 to
    0.1 after the first 200 steps). Transitions are kept in a bounded replay
    memory; once it is full, the network is trained on shuffled mini-batches
    every ``train_freq`` steps against a periodically refreshed target copy.

    Args:
        env: environment exposing ``reset() -> obs``,
            ``step(act) -> (obs, reward, done)`` and a ``data`` attribute whose
            length determines the number of steps per epoch.
        epoch_num: number of passes over the environment.
        memory_size: capacity of the replay memory; training only starts once
            it is full.
        batch_size: number of transitions per gradient step.
        gamma: discount factor applied to the target network's max Q-value.

    Returns:
        ``(Q, total_losses, total_rewards)``: the trained network and the
        per-epoch summed loss and summed reward.
    """

    class Q_Network(chainer.Chain):
        """Three-layer MLP mapping an observation to one Q-value per action."""

        def __init__(self, input_size, hidden_size, output_size):
            super(Q_Network, self).__init__(
                fc1 = L.Linear(input_size, hidden_size),
                fc2 = L.Linear(hidden_size, hidden_size),
                fc3 = L.Linear(hidden_size, output_size)
            )

        def __call__(self, x):
            h = F.relu(self.fc1(x))
            h = F.relu(self.fc2(h))
            y = self.fc3(h)
            return y

        def reset(self):
            self.zerograds()

    # Size the input layer from an actual observation instead of hard-coding
    # it; observations are flattened the same way when fed to the network.
    input_size = np.asarray(env.reset(), dtype=np.float32).size

    Q = Q_Network(input_size=input_size, hidden_size=100, output_size=2) # 2 actions
    Q_ast = copy.deepcopy(Q)
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(Q)

    # Stop one row early: each step needs a following row as next observation.
    step_max = len(env.data)-1
    epsilon = 1.0
    epsilon_decrease = 1e-3
    epsilon_min = 0.1
    start_reduce_epsilon = 200
    train_freq = 10
    update_q_freq = 20
    show_log_freq = 5

    memory = []
    total_step = 0
    total_rewards = []
    total_losses = []

    start = time.time()
    for epoch in range(epoch_num):

        pobs = env.reset()
        step = 0
        done = False
        total_reward = 0
        total_loss = 0

        while not done and step < step_max:

            # select act: random by default, greedy with probability 1 - epsilon
            pact = np.random.randint(2)
            if np.random.rand() > epsilon:
                pact = Q(np.array(pobs, dtype=np.float32).reshape(1, -1))
                pact = np.argmax(pact.data)

            # act
            obs, reward, done = env.step(pact)

            # add memory, dropping the oldest transition once over capacity
            memory.append((pobs, pact, reward, obs, done))
            if len(memory) > memory_size:
                memory.pop(0)

            # train or update q
            if len(memory) == memory_size:
                if total_step % train_freq == 0:
                    # Shuffle indices rather than the transitions themselves:
                    # the tuples mix arrays and scalars, and NumPy can no
                    # longer build an array from such ragged data.
                    order = np.random.permutation(len(memory))
                    for first in range(0, len(order), batch_size):
                        batch = [memory[k] for k in order[first:first+batch_size]]
                        # The last batch may be short if batch_size does not
                        # divide memory_size.
                        n_batch = len(batch)

                        b_pobs = np.array([tr[0] for tr in batch], dtype=np.float32).reshape(n_batch, -1)
                        b_pact = np.array([tr[1] for tr in batch], dtype=np.int32)
                        # float32 so non-integer rewards are not truncated
                        b_reward = np.array([tr[2] for tr in batch], dtype=np.float32)
                        b_obs = np.array([tr[3] for tr in batch], dtype=np.float32).reshape(n_batch, -1)
                        # builtin bool: the `np.bool` alias was removed from NumPy
                        b_done = np.array([tr[4] for tr in batch], dtype=bool)

                        q = Q(b_pobs)
                        maxq = np.max(Q_ast(b_obs).data, axis=1)
                        # Only the taken action's Q-value gets a new target; the
                        # other entries stay equal to the prediction (zero error).
                        target = copy.deepcopy(q.data)
                        for j in range(n_batch):
                            target[j, b_pact[j]] = b_reward[j]+gamma*maxq[j]*(not b_done[j])
                        Q.reset()
                        loss = F.mean_squared_error(q, target)
                        total_loss += loss.data
                        loss.backward()
                        optimizer.update()

                # refresh the target network
                if total_step % update_q_freq == 0:
                    Q_ast = copy.deepcopy(Q)

            # epsilon
            if epsilon > epsilon_min and total_step > start_reduce_epsilon:
                epsilon -= epsilon_decrease

            # next step
            total_reward += reward
            pobs = obs
            step += 1
            total_step += 1

        total_rewards.append(total_reward)
        total_losses.append(total_loss)

        if (epoch+1) % show_log_freq == 0:
            # averages over the last `show_log_freq` epochs
            log_reward = sum(total_rewards[((epoch+1)-show_log_freq):])/show_log_freq
            log_loss = sum(total_losses[((epoch+1)-show_log_freq):])/show_log_freq
            elapsed_time = time.time()-start
            print('\t'.join(map(str, [epoch+1, epsilon, total_step, log_reward, log_loss, elapsed_time])))
            start = time.time()

    return Q, total_losses, total_rewards

In [None]:
# Train the DQN on the standardized training split. This is slow: the logged
# output below shows roughly 15 minutes of wall time per 5 epochs.
Q, total_losses, total_rewards = train_dqn(Environment1(train_data,train_data_label))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
5	0.0999999999999992	35035	1412.6	8022.357741842791	868.3909432888031
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
10	0.0999999999999992	70070	1596.6	9739.635128593445	933.5282363891602
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
15	0.0999999999999992	105105	1958.6	10680.776064751297	907.0877966880798
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [None]:
from chainer import serializers
# Create the output directory first so saving also works on a fresh checkout
# where 'SavedModels/' does not exist yet.
os.makedirs('SavedModels', exist_ok=True)
serializers.save_npz('SavedModels/Q.model', Q)

In [None]:
def plot_loss_reward(total_losses, total_rewards):
    """Show per-epoch loss and reward as two side-by-side line charts.

    Args:
        total_losses: sequence of summed loss values, one per epoch.
        total_rewards: sequence of summed rewards, one per epoch.
    """
    fig = tools.make_subplots(rows=1, cols=2, subplot_titles=('loss', 'reward'), print_grid=False)

    # (series, line colour, x-axis key) for the left and right panel
    panels = (
        (total_losses, 'skyblue', 'xaxis1'),
        (total_rewards, 'orange', 'xaxis2'),
    )
    for col, (series, colour, _) in enumerate(panels, start=1):
        fig.append_trace(Scatter(y=series, mode='lines', line=dict(color=colour)), 1, col)
    for _, _, axis_key in panels:
        fig['layout'][axis_key].update(title='epoch')

    fig['layout'].update(height=400, width=900, showlegend=False)
    iplot(fig)

# Render the per-epoch loss and reward curves collected during training.
plot_loss_reward(total_losses, total_rewards)