DQN.py
# -*- coding: utf-8 -*-
import tensorflow as tf
import numpy as np
import pickle
import matplotlib as mp
mp.use('Agg')
from matplotlib import pyplot as plt
#import cv2
from skimage.transform import resize
from skimage.color import rgb2gray
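
# Deep Q-Network (DQN) agent in the style of Mnih et al. (2015): an online
# Q-network, a periodically synchronized target network, an experience-replay
# buffer, and epsilon-greedy exploration with linear annealing.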
class DQN_AGENT:
    def __init__(self, flags):
        self.flags = flags
        self.sess = tf.Session()
        # seed NumPy/TensorFlow for reproducibility
        tf.set_random_seed(flags.seed)
        np.random.seed(flags.seed)
        # placeholders for a minibatch of transitions (s, a, r, s', done)
        self.current_state_pl = tf.placeholder(tf.float32, shape=(None, flags.frame_dim, flags.frame_dim, flags.num_frame))
        self.newstate_pl = tf.placeholder(tf.float32, shape=(None, flags.frame_dim, flags.frame_dim, flags.num_frame))
        self.rewards_pl = tf.placeholder(tf.float32, shape=(None))
        self.action_mask_pl = tf.placeholder(tf.float32, shape=(None, flags.num_action))
        self.finished_pl = tf.placeholder(tf.float32, shape=(None))
        # online Q-network plus a frozen target network (no gradients flow into it)
        self.action_value_network = self.__init_network(self.current_state_pl, 'Qvars')
        self.target_network = tf.stop_gradient(self.__init_network(self.newstate_pl, 'target'))
        self.__init_training()
        self.__init_memory()
        self.__init_state()
        self.updateTargetNetwork()
        self.input_states = [np.zeros((flags.frame_dim, flags.frame_dim), dtype='float32')] * flags.num_frame
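
    # Q-network following the Nature DQN architecture: three conv layers
    # (8x8 stride 4, 4x4 stride 2, 3x3 stride 1) feeding a 512-unit fully
    # connected layer, then a linear layer with one output per action.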
    def __init_network(self, input_frame, collection):
        flags = self.flags
        # network layer shapes
        CONV1_SHAPE = (8, 8, flags.num_frame, 32)
        CONV2_SHAPE = (4, 4, 32, 64)
        CONV3_SHAPE = (3, 3, 64, 64)
        FC1_SHAPE = 512
        with tf.variable_scope(collection):
            with tf.variable_scope('conv1'):
                kernel = tf.get_variable('weights', shape=CONV1_SHAPE,
                                         initializer=tf.contrib.layers.xavier_initializer_conv2d(uniform=True))
                conv = tf.nn.conv2d(input_frame, kernel, [1, 4, 4, 1], padding='SAME')
                biases = tf.get_variable('biases', shape=CONV1_SHAPE[-1],
                                         initializer=tf.random_uniform_initializer(minval=0,
                                                                                   maxval=flags.bias, dtype=tf.float32))
                hidden1 = tf.nn.relu(tf.nn.bias_add(conv, biases))
            with tf.variable_scope('conv2'):
                kernel = tf.get_variable('weights', shape=CONV2_SHAPE,
                                         initializer=tf.contrib.layers.xavier_initializer_conv2d(uniform=True))
                conv = tf.nn.conv2d(hidden1, kernel, [1, 2, 2, 1], padding='SAME')
                biases = tf.get_variable('biases', shape=CONV2_SHAPE[-1],
                                         initializer=tf.random_uniform_initializer(minval=0,
                                                                                   maxval=flags.bias, dtype=tf.float32))
                hidden2 = tf.nn.relu(tf.nn.bias_add(conv, biases))
            with tf.variable_scope('conv3'):
                kernel = tf.get_variable('weights', shape=CONV3_SHAPE,
                                         initializer=tf.contrib.layers.xavier_initializer_conv2d(uniform=True))
                conv = tf.nn.conv2d(hidden2, kernel, [1, 1, 1, 1], padding='SAME')
                biases = tf.get_variable('biases', shape=CONV3_SHAPE[-1],
                                         initializer=tf.random_uniform_initializer(minval=0,
                                                                                   maxval=flags.bias, dtype=tf.float32))
                hidden3 = tf.nn.relu(tf.nn.bias_add(conv, biases))
            with tf.variable_scope('fc1'):
                # flatten the conv output before the fully connected layer
                dims = [hidden3.get_shape()[i].value for i in range(1, len(hidden3.get_shape()))]
                reshape_hidden3 = tf.reshape(hidden3, [-1, np.prod(dims)])
                weights = tf.get_variable('weights', shape=[reshape_hidden3.get_shape()[1], FC1_SHAPE],
                                          initializer=tf.contrib.layers.xavier_initializer(uniform=True))
                biases = tf.get_variable('biases', shape=FC1_SHAPE,
                                         initializer=tf.random_uniform_initializer(minval=0,
                                                                                   maxval=flags.bias, dtype=tf.float32))
                hidden4 = tf.nn.relu(tf.matmul(reshape_hidden3, weights) + biases)
            with tf.variable_scope('action_value'):
                # linear output layer: one Q-value per action
                weights = tf.get_variable('weights', shape=[hidden4.get_shape()[1], flags.num_action],
                                          initializer=tf.contrib.layers.xavier_initializer(uniform=True))
                biases = tf.get_variable('biases', shape=[flags.num_action],
                                         initializer=tf.constant_initializer(value=flags.bias, dtype=tf.float32))
                action_value = tf.matmul(hidden4, weights) + biases
        return action_value
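
    # The training objective is the squared one-step TD error:
    #   L = E[(r + gamma * max_a' Q_target(s', a') - Q(s, a))^2]
    # where the bootstrap term is zeroed on terminal transitions.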
    def __init_training(self):
        flags = self.flags
        self.greedy_action = tf.argmax(self.action_value_network, dimension=1)
        Qvars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Qvars')
        target_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='target')
        # ops that copy the online weights into the target network
        self.target_network_update = [tvar.assign(qvar) for qvar, tvar in zip(Qvars, target_vars)]
        # finished_pl is fed as NOT(done), so the bootstrap term vanishes at terminal states
        max_one_step = self.rewards_pl + flags.gamma * tf.reduce_max(self.target_network, reduction_indices=[1, ]) * self.finished_pl
        masked_action_qvals = tf.reduce_sum(self.action_value_network * self.action_mask_pl, reduction_indices=[1, ])
        Q_loss = tf.reduce_mean(tf.square(max_one_step - masked_action_qvals))
        # note: 'is' tests object identity, not string equality, so these
        # branches could silently never fire; compare with '==' instead
        if flags.opt_type == 'RMSprop':
            optimizer = tf.train.RMSPropOptimizer(flags.lr, decay=flags.decay, momentum=flags.momentum, epsilon=flags.opt_eps)
        elif flags.opt_type == 'Adam':
            optimizer = tf.train.AdamOptimizer(flags.lr, beta1=flags.beta1, beta2=flags.beta2, epsilon=flags.opt_eps)
        Qgrads = optimizer.compute_gradients(Q_loss, var_list=Qvars)
        # clip each gradient element to [-1, 1] for stability
        Qgrads = [(tf.clip_by_value(grad, -1, 1), var) for grad, var in Qgrads]
        self.apply_Qgrads = optimizer.apply_gradients(Qgrads)
    def __init_memory(self):
        flags = self.flags
        # replay buffer: individual frames plus per-transition action/reward/done
        self.state_buffer = np.empty((flags.buffer_size, flags.frame_dim, flags.frame_dim), dtype='float16')
        self.action_buffer = np.empty(flags.buffer_size, dtype='uint8')
        self.reward_buffer = np.empty(flags.buffer_size, dtype='int16')
        self.finished_buffer = np.empty(flags.buffer_size, dtype='uint8')
        self.buffer_count = 0
        self.buffer_index = 0
        # preallocated minibatch arrays
        self.current_states = np.empty((flags.batch_size, flags.num_frame, flags.frame_dim, flags.frame_dim), dtype='float32')
        self.new_states = np.empty((flags.batch_size, flags.num_frame, flags.frame_dim, flags.frame_dim), dtype='float32')
        self.sample_inds = np.empty(flags.batch_size, dtype='int32')
    def __init_state(self):
        flags = self.flags
        self.saver = tf.train.Saver(max_to_keep=1)
        if flags.resume:
            # restore network weights and training statistics from the last checkpoint
            self.saver.restore(self.sess, "/tmp/Qmodel.ckpt")
            fh = open('/tmp/state_info.pkl', 'rb')
            self.epsilon, self.update_num, self.action_num, self.reward_list, \
                self.running_reward_list, self.running_reward, \
                self.episode_num = pickle.load(fh)
            fh.close()
        else:
            init = tf.global_variables_initializer()  # initialize_all_variables is deprecated
            self.sess.run(init)
            self.epsilon = flags.eps_init
            self.update_num = 1
            self.action_num = 0
            self.reward_list = []
            self.running_reward_list = []
            self.running_reward = None
            self.episode_num = 0
    def initGame(self, env):
        flags = self.flags
        self.reward_sum = 0
        observation = env.reset()
        self.start_lives = env.ale.lives()
        self.input_states = [self.preprocess(observation).astype('float32')] * flags.num_frame
        return observation

    # def preprocess(self, screen):  # as in devsisters/DQN-tensorflow
    #     flags = self.flags
    #     return cv2.resize(cv2.cvtColor(screen, cv2.COLOR_RGB2GRAY)/255.,
    #                       (flags.frame_dim, flags.frame_dim))
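
    # Preprocessing as in the DQN paper: grayscale, downscale to
    # 110 x frame_dim, then crop rows 16:-10 to drop the score area,
    # yielding an 84 x 84 frame when flags.frame_dim == 84.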
    def preprocess(self, screen):
        flags = self.flags
        out_screen = resize(rgb2gray(screen), (110, flags.frame_dim))[16:-10, :]
        return out_screen
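
    # States are reassembled on demand: a state is the stack of the
    # num_frame most recent frames ending at `index` in the circular buffer.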
    def getState(self, index, buffer_count, buffer_index, state_buffer):
        flags = self.flags
        index = index % buffer_count
        if index >= flags.num_frame - 1:
            # use faster slicing
            return state_buffer[(index - (flags.num_frame - 1)):(index + 1), :, :]
        else:
            # otherwise normalize indexes and use slower list-based access
            indexes = [(index - i) % buffer_count for i in reversed(range(flags.num_frame))]
            return state_buffer[indexes, :, :]
    def updateTargetNetwork(self):
        self.sess.run(self.target_network_update)
    def observeState(self, observation):
        # push the newest frame into the sliding window and drop the oldest
        self.state = self.preprocess(observation)
        self.input_states.append(self.state.astype('float32'))
        del self.input_states[0]
        # stack to (frame_dim, frame_dim, num_frame) and add a batch dimension
        self.formatted_input = np.stack(self.input_states, axis=2)
        self.formatted_input = np.reshape(self.formatted_input, (1,) + self.formatted_input.shape)
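
    # Epsilon-greedy policy: with probability epsilon pick a uniformly random
    # action, otherwise the action with the highest predicted Q-value.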
    def chooseAction(self):
        flags = self.flags
        self.action_num += 1
        if np.random.rand() < self.epsilon:
            action = np.random.randint(0, flags.num_action)
        else:
            action = self.sess.run(self.greedy_action, feed_dict={self.current_state_pl: self.formatted_input})[0]
        return action
    def takeAction(self, env, action):
        self.lives = env.ale.lives()
        observation, reward, done, info = env.step(action)
        # penalize losing a life so the agent learns to avoid it
        if self.lives > env.ale.lives():
            reward -= 1
        return observation, reward, done
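
    # Linearly anneal epsilon from eps_init toward eps_final over flags.anneal
    # steps, starting once the buffer holds enough transitions to train on.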
    def annealExplore(self):
        flags = self.flags
        if self.epsilon > flags.eps_final and self.buffer_count > flags.start_train:
            self.epsilon -= (flags.eps_init - flags.eps_final) / flags.anneal
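
    # Only the newest frame of each transition is stored; full states are
    # rebuilt by getState, keeping the buffer num_frame times smaller than
    # storing whole frame stacks.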
    def storeReplay(self, action, reward, done):
        flags = self.flags
        self.state_buffer[self.buffer_index, :, :] = self.state
        self.action_buffer[self.buffer_index] = action
        self.reward_buffer[self.buffer_index] = np.clip(reward, -1, 1)  # reward clipping as in the DQN paper
        self.finished_buffer[self.buffer_index] = done
        self.buffer_count = max(self.buffer_count, self.buffer_index + 1)
        self.buffer_index = (self.buffer_index + 1) % flags.buffer_size
        self.reward_sum += reward
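
    # Every train_int actions, sample a minibatch of valid indices (windows
    # must not cross an episode boundary or straddle the buffer's write
    # position) and take one gradient step on the TD loss.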
    def train(self):
        flags = self.flags
        if self.action_num % flags.train_int == 0 and self.buffer_count >= flags.start_train:
            self.update_num += 1
            for i in range(flags.batch_size):
                while True:
                    ind = np.random.randint(flags.num_frame, self.buffer_count)
                    # reject windows that span an episode boundary
                    if self.finished_buffer[(ind - flags.num_frame):(ind - 1)].any():
                        continue
                    # reject windows that straddle the buffer's write position
                    if ind >= self.buffer_index and (ind - flags.num_frame) < self.buffer_index:
                        continue
                    break
                self.current_states[i, :, :, :] = self.getState(ind - 1, self.buffer_count, self.buffer_index, self.state_buffer)
                self.new_states[i, :, :, :] = self.getState(ind, self.buffer_count, self.buffer_index, self.state_buffer)
                self.sample_inds[i] = ind - 1
            rewards = self.reward_buffer[self.sample_inds]
            actions = self.action_buffer[self.sample_inds]
            finished = self.finished_buffer[self.sample_inds]
            # one-hot mask selecting the Q-value of the action actually taken
            action_mask = np.zeros((flags.batch_size, flags.num_action))
            action_mask[np.arange(flags.batch_size), actions] = 1
            feed_dict = {self.current_state_pl: np.transpose(self.current_states, (0, 2, 3, 1)),
                         self.newstate_pl: np.transpose(self.new_states, (0, 2, 3, 1)), self.rewards_pl: rewards,
                         self.action_mask_pl: action_mask, self.finished_pl: np.logical_not(finished)}
            self.sess.run(self.apply_Qgrads, feed_dict=feed_dict)
            if self.update_num % flags.tn_update_freq == 0:
                self.updateTargetNetwork()
    def recordProgress(self):
        flags = self.flags
        self.episode_num += 1
        # add back the per-life penalties from takeAction so the logged total matches the game score
        self.reward_sum += self.start_lives
        self.reward_list.append(self.reward_sum)
        # exponential moving average of episode reward (~100-episode timescale)
        self.running_reward = self.reward_sum if self.running_reward is None else self.running_reward * 0.99 + self.reward_sum * 0.01
        self.running_reward_list.append(self.running_reward)
        print('episode_num: %d, action_num: %d, epsilon: %2.2f, resetting env. episode reward total was %f. running mean: %f' \
              % (self.episode_num, self.action_num, self.epsilon, self.reward_sum, self.running_reward))
        plt.figure(0)
        plt.clf()
        plt.plot(self.reward_list, label='per episode reward')
        plt.plot(self.running_reward_list, label='running average (~100 eps)', color='r')
        plt.xlabel('episode')
        plt.ylabel('reward')
        plt.legend(loc=2)
        plt.savefig('Q_learning_performance_%slr_%sbias_large.png' % (flags.lr, flags.bias))
        if self.episode_num % 100 == 0:
            self.saver.save(self.sess, "/tmp/Qmodel.ckpt")
            fh = open('/tmp/state_info.pkl', 'wb')
            pickle.dump((self.epsilon, self.update_num, self.action_num,
                         self.reward_list, self.running_reward_list,
                         self.running_reward, self.episode_num), fh)
            fh.close()
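

# A minimal usage sketch, not part of the original file: it shows how the
# agent's methods are meant to be wired together in a Gym Atari loop. The
# Flags values below are illustrative placeholders, not tuned hyperparameters,
# and the env is assumed to expose env.ale.lives() as old Atari gym envs did.
if __name__ == '__main__':
    import gym

    class Flags(object):
        seed = 42
        frame_dim = 84      # preprocess crops 110-row frames to 84 rows
        num_frame = 4       # frames stacked per state
        num_action = 6      # action-space size of the chosen env
        bias = 0.01
        gamma = 0.99
        opt_type = 'RMSprop'
        lr = 0.00025; decay = 0.95; momentum = 0.0; opt_eps = 0.01
        buffer_size = 100000; batch_size = 32
        resume = False
        eps_init = 1.0; eps_final = 0.1; anneal = 1000000
        start_train = 5000     # transitions collected before training begins
        train_int = 4          # act this many times per gradient step
        tn_update_freq = 2500  # updates between target-network syncs

    env = gym.make('Pong-v0')
    agent = DQN_AGENT(Flags())
    observation = agent.initGame(env)
    while True:
        agent.observeState(observation)
        action = agent.chooseAction()
        observation, reward, done = agent.takeAction(env, action)
        agent.storeReplay(action, reward, done)
        agent.annealExplore()
        agent.train()
        if done:
            agent.recordProgress()
            observation = agent.initGame(env)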