In [1]:
% load_ext autoreload
% autoreload 2
import tensorflow as tf
import numpy as np
from model.cifar_utils import load_data
import time
import os 
from model.funcs import *
from model.objective_function import *
from model.DQN import DQN

In [2]:
# Load the data the objective network is trained / evaluated on.
X_train, y_train, X_test, y_test = load_data()
print('Training data shape: ', X_train.shape)
print('Training labels shape: ', y_train.shape)
print('Test data shape: ', X_test.shape)
print('Test labels shape: ', y_test.shape)

# Zero-centre both splits with the per-feature mean of the TRAINING split only.
# The mean is cast to float32 once and reused, so the train and test arrays end
# up with the same dtype and precision (previously the test split was centred
# with the un-cast mean, while the train split used the float32 cast).
mean_image = np.mean(X_train, axis=0).astype(np.float32)
X_train = X_train.astype(np.float32) - mean_image
X_test = X_test.astype(np.float32) - mean_image

Training data shape:  (50000, 3072)
Training labels shape:  (50000,)
Test data shape:  (10000, 3072)
Test labels shape:  (10000,)


In [3]:
# Build the objective network (the classifier whose learning rate the DQN
# adjusts later in this notebook) together with the placeholders and ops that
# the following cells run.
with tf.name_scope('obj_inputs'):
    # Per-sample feature vectors (3072 values each) and integer class labels.
    ob_xs = tf.placeholder(shape=[None,3072],dtype=tf.float32)
    ob_ys = tf.placeholder(shape=[None,],dtype=tf.int64)
    
    
# Full-split feeds: every sess.run that receives one of these feeds the whole
# split. Note that the "val" feed is built from the test split of load_data().
train_feed_dict = {ob_xs:X_train,ob_ys:y_train}
val_feed_dict = {ob_xs:X_test,ob_ys:y_test}

obj_model = ObjectiveNets(input_dim=3072,hidden_dims=[100],num_classes=10)
# Generic feed points for overwriting variables from Python values: ob_ws
# carries a new 2-D weight value, ob_lr a new scalar learning rate.
ob_ws = tf.placeholder(shape=[None,None],dtype=tf.float32) # used in assign ops
ob_lr = tf.placeholder(shape=[],dtype=tf.float32)  # used in assign ops


# initial learning rate; the training loop also resets the learning rate to
# this value at the start of each episode and whenever action 1 is taken
lr_c = 1
with tf.variable_scope('ObjNets',reuse=tf.AUTO_REUSE):
    # The learning rate is a graph variable so it can be re-assigned between
    # steps and logged as a scalar summary.
    obj_lr = tf.get_variable(name='learning_rate',shape=[],initializer=tf.constant_initializer(lr_c,dtype=tf.float32))
    lr_sum = tf.summary.scalar('learning_rate',obj_lr)

# get loss,weight,gradient
# NOTE(review): weights / grads are read only after loss() has been called --
# presumably loss() is what populates them; confirm in ObjectiveNets before
# reordering these statements.
obj_ls = obj_model.loss(ob_xs,ob_ys)
# The training loop indexes the fetched values of these two by the keys
# 'weight_<i>' and 'grad_<i>'.
obj_w = obj_model.weights
obj_g = obj_model.grads

# get update and evaluate op
# update() is called for its side effect; the resulting op(s) are then read
# back from the `updates` attribute.
obj_model.update(obj_lr)
obj_up = obj_model.updates
# Only the first value returned by evaluate() is kept; it is logged under the
# 'val_acc' tag below.
obj_eva,_ = obj_model.evaluate(ob_xs,ob_ys)
eva_sum = tf.summary.scalar('val_acc',obj_eva)

# get summary: the model's own summary merged with the learning-rate scalar
# (the 'val_acc' summary is kept separate and run on its own in the loop)
obj_sum = obj_model.summary
obj_merge = tf.summary.merge([obj_sum,lr_sum])

In [4]:
# Build the DQN that picks the learning-rate action for the objective network.
# It consumes a 6-dimensional state feature vector and emits one value per
# action (two actions: 0 = halve the learning rate, 1 = keep).
with tf.name_scope('dqn_inputs'):
    dqn_xs = tf.placeholder(tf.float32, shape=[None, 6])
    dqn_ys = tf.placeholder(tf.float32, shape=[None, 2])

dqn = DQN(input_dim=6, hidden_dims=[32], num_classes=2)

# Prediction and loss tensors. No update op is requested from the model here;
# the optimizer for `dqn_ls` is created in the session-setup cell.
dqn_fw = dqn.forward(dqn_xs)
dqn_ls = dqn.loss(dqn_xs, dqn_ys)

# Merge the summaries the model exposes into a single op for TensorBoard.
dqn_sum = dqn.summary
dqn_merge = tf.summary.merge(inputs=dqn_sum)

In [5]:
# Session / optimizer / logging setup for Q-learning with experience replay.
sess = tf.InteractiveSession()
global_step = tf.Variable(0, trainable=False, name='global_step')

# The DQN is trained with Adam; each optimizer step also increments global_step.
train_dqn = tf.train.AdamOptimizer(learning_rate=0.05).minimize(
    loss=dqn_ls,
    global_step=global_step,
)

# Each run logs into its own timestamped directory under ./log/.
# `log_dir` keeps its trailing slash because it is concatenated with
# `model_name` rather than joined.
log_dir = '{}/log/'.format(os.path.abspath('.'))
model_name = 'model_%d' % int(time.time())
writer = tf.summary.FileWriter(logdir=log_dir + model_name, graph=sess.graph)
saver = tf.train.Saver()

# Initialise only after the optimizer exists, so its variables are included.
sess.run(tf.global_variables_initializer())

In [6]:
# Hyper-parameters of the Q-learning experiment.
episodes, T = 200, 100   # number of episodes / DQN-controlled steps per episode
epsilon = 0.2            # exploration rate handed to e_greedy
batch_size = 64          # size of the experience batch requested per DQN update
M = 3                    # plain update steps run at the start of every episode
A = 1000                 # argument of Experience_Memory -- presumably its capacity; confirm

# Helper objects built from the settings above.
fg = FeatureGenerator(M)
EM = Experience_Memory(A)

In [None]:
# Snapshot the objective network's freshly initialised weights; the training
# loop restores them at the start of every episode.
x_init = sess.run(fetches=obj_w, feed_dict=train_feed_dict)

In [None]:
# Q-learning with experience replay.
#
# Every episode restarts the objective network from `x_init` with learning rate
# `lr_c`, runs M plain update steps, and then lets the DQN choose for T steps
# between action 0 (halve the learning rate and retry from the last accepted
# point) and action 1 (accept the current point and reset the learning rate).
# The mean reward of each episode is collected in `rewards_sum`.
rewards_sum = []

# Epsilon-greedy schedule: linear decay from eps_start to eps_end over the first
# eps_decay_episodes episodes, then constant. It is computed from the episode
# index alone, so re-running this cell does not inherit a decayed epsilon from
# a previous run.
eps_start, eps_end, eps_decay_episodes = 0.2, 0.05, 10

# Build the assign ops ONCE. Calling `Variable.assign(...)` inside the loops
# would add new nodes to the graph on every step, so the graph (and every
# checkpoint's meta-graph) would grow without bound.
weight_names = ['weight_{}'.format(i) for i in range(len(x_init))]
with tf.variable_scope('ObjNets',reuse=tf.AUTO_REUSE):
    weight_vars = {name: tf.get_variable(name) for name in weight_names}
    assign_weight_ops = {name: var.assign(ob_ws) for name, var in weight_vars.items()}
assign_lr_op = obj_lr.assign(ob_lr)

for e in range(episodes):
    reward_per_ep = []

    # reset every objective-net weight to its initial value ...
    for weight_name in weight_names:
        sess.run(assign_weight_ops[weight_name],feed_dict={ob_ws:x_init[weight_name]})
    # ... and validate the reset for ALL weights, not only the last one
    for weight_name in weight_names:
        assert np.allclose(sess.run(weight_vars[weight_name]),x_init[weight_name])

    # reset learning rate to lr_c
    sess.run(assign_lr_op,feed_dict={ob_lr:lr_c})
    assert np.allclose(sess.run(obj_lr),lr_c)

    # epsilon decay (floored at exactly eps_end)
    epsilon = max(eps_end, eps_start - e*(eps_start - eps_end) / eps_decay_episodes)
    print('episode {}  epsilon:{}'.format(e,epsilon))

    # plain updates for the first M steps, feeding the feature generator
    for t in range(M):
        ls_t,g_t,lr_t = sess.run([obj_ls,obj_g,obj_lr],feed_dict=train_feed_dict)
        fg.loss_memory_update(ls_t)
        fg.ali_update(g_t)
        # update x
        sess.run(obj_up,feed_dict=train_feed_dict)

    # from M to T, train dqn
    # x_, dx_, ls_ hold the last ACCEPTED point: its weights, gradients and loss.
    x_,dx_,ls_ = sess.run([obj_w,obj_g,obj_ls],feed_dict=train_feed_dict)
    for t in range(T):
        # generate state feature vector s_t
        ls_t,g_t,lr_t,w_t = sess.run([obj_ls,obj_g,obj_lr,obj_w],feed_dict=train_feed_dict)
        s_t = fg.generate_feature(t,lr_t,ls_t,g_t)
        # NOTE(review): the same state was already pushed into `fg` as the
        # "t+1" state of the previous step, so each state is recorded twice --
        # confirm this is intended.
        fg.loss_memory_update(ls_t)
        fg.ali_update(g_t)

        # time t dqn predict
        # NOTE(review): the DQN is queried on the raw features here, but trained
        # on per-batch min-max normalised features below -- confirm intended.
        forward_t = sess.run(dqn_fw,feed_dict={dqn_xs:s_t})
        # time t action
        action_t = e_greedy(forward_t,epsilon)
        # action: 0 for half, 1 for keep
        if action_t == 0:
            lr_tn = 0.5*lr_t
        else:
            # accept the current point and restart from the initial learning rate
            x_,dx_ = w_t,g_t
            ls_ = ls_t
            lr_tn = lr_c
        sess.run(assign_lr_op,feed_dict={ob_lr:lr_tn})

        # update w to next state: one gradient step from the last accepted point
        for i, weight_name in enumerate(weight_names):
            grad_name = 'grad_{}'.format(i)
            w_new = x_[weight_name] - lr_tn*dx_[grad_name]
            sess.run(assign_weight_ops[weight_name],feed_dict={ob_ws:w_new})

        # time t+1 feature s_tn
        ls_tn,g_tn,lr_tn = sess.run([obj_ls,obj_g,obj_lr],feed_dict=train_feed_dict)
        s_tn = fg.generate_feature(t+1,lr_tn,ls_tn,g_tn)
        fg.loss_memory_update(ls_tn)
        fg.ali_update(g_tn)

        if action_t == 0:
            reward_t = reward_function(ls_tn)
        else:
            reward_t = reward_function(ls_,c=0.12)
        reward_per_ep.append(reward_t)

        # add experience to memory; the last step of an episode has no successor
        if t == T - 1:
            forward_tn = None
        else:
            forward_tn = sess.run(dqn_fw,feed_dict={dqn_xs:s_tn})
        labels_t = DQN_labels(forward_t,action_t,reward_t,forward_tn,gamma=0.99)
        EM.add_experience(s_t,action_t,reward_t,s_tn,labels_t)

        # get experience batch
        experience_batch = EM.get_experience(batch_size)
        # get inputs and labels for training DQN
        states_trainDQN = get_csf_from_experience(experience_batch)
        labels_trainDQN = get_labels_from_experience(experience_batch)

        # normalize state features to [-1, 1] per feature over the batch
        # (the 1e-10 guards against division by zero for constant features)
        max_features = np.max(states_trainDQN,axis=0)
        min_features = np.min(states_trainDQN,axis=0)

        normalized_features = 1 - 2*(states_trainDQN-min_features)/(max_features-min_features+1e-10)

        dqn_feed_dict = {dqn_xs:normalized_features,dqn_ys:labels_trainDQN}
        ls_dqn,_ = sess.run([dqn_ls,train_dqn],feed_dict=dqn_feed_dict)

        # Serialized summaries get their own names so they do not overwrite the
        # `obj_sum` / `dqn_sum` summary ops defined in the graph-building cells.
        obj_summary = sess.run(obj_merge,feed_dict=train_feed_dict)
        dqn_summary = sess.run(dqn_merge,feed_dict=dqn_feed_dict)

        step = e*T+t
        if t % 50 == 0 or (t+1)==T:
            obj_val,obj_eva_sum = sess.run([obj_eva,eva_sum],feed_dict=val_feed_dict)
            writer.add_summary(obj_eva_sum,step)
        if t + 1 == T:
            # checkpoint once per episode
            checkpoint_path = os.path.join(log_dir+model_name,'my_DQN.ckpt')
            saver.save(sess,checkpoint_path,global_step=global_step)

        writer.add_summary(obj_summary,step)
        writer.add_summary(dqn_summary,step)

    print('episode {} mean reward:{}'.format(e,np.mean(reward_per_ep)))
    rewards_sum.append(np.mean(reward_per_ep))

# Flush pending events to disk before tearing the session down.
writer.close()
sess.close()

episode 0  epsilon:0.2
episode 0 mean reward:0.05017929087122718
episode 1  epsilon:0.185
episode 1 mean reward:0.05181982501036267
episode 2  epsilon:0.17
episode 2 mean reward:0.05100051264967341
episode 3  epsilon:0.155
episode 3 mean reward:0.04998419615148851
episode 4  epsilon:0.14
episode 4 mean reward:0.051638724167718024
episode 5  epsilon:0.125
episode 5 mean reward:0.0491439898575212
episode 6  epsilon:0.11
episode 6 mean reward:0.0507019265703145
episode 7  epsilon:0.09499999999999999
episode 7 mean reward:0.0490632008242719
episode 8  epsilon:0.07999999999999999
episode 8 mean reward:0.04875857834289673
episode 9  epsilon:0.065
episode 9 mean reward:0.048842652701486154
episode 10  epsilon:0.04999999999999999
episode 10 mean reward:0.047952806863573415
episode 11  epsilon:0.04999999999999999
episode 11 mean reward:0.05039105895780759
episode 12  epsilon:0.04999999999999999
episode 12 mean reward:0.049349299740161834
episode 13  epsilon:0.04999999999999999
episode 13 mean r