import random
import sys
import time
from collections import deque
import keras
import numpy as np
import skimage.color
import skimage.exposure
import skimage.transform
from keras import layers, models
from game import wrapped_flappy_bird as game
from util import logging
ACTIONS = 2              # number of valid actions (flap / do nothing)
GAMMA = 0.99             # discount factor for future rewards
TOTAL_EXPLORE = 30_000   # frames over which to anneal epsilon (reduced from 3_000_000)
INITIAL_EPSILON = 0.1    # starting value of epsilon
FINAL_EPSILON = 0.0001   # final value of epsilon
REPLAY_MEMORY = 20_000   # number of previous transitions to remember (previously 30_000)
BATCH = 32               # size of minibatch
FRAME_PER_ACTION = 1     # select a new action every N frames
LEARNING_RATE = 1e-4
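
# Epsilon anneals linearly once the observation phase ends: each step subtracts
# (INITIAL_EPSILON - FINAL_EPSILON) / TOTAL_EXPLORE, i.e. roughly
# (0.1 - 0.0001) / 30_000 ≈ 3.3e-6 per frame, so exploration fades out over
# TOTAL_EXPLORE frames after the first `observe` steps.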

def init_network(observe, epsilon, mode, filename=None):
    """Build the Q-network; in test mode, load a saved model from `filename` instead."""
    print("Now we init the network")
    img_rows, img_cols = 80, 80
    img_channels = 4  # we stack 4 greyscale frames as input channels
    network = models.Sequential()
    network.add(layers.Conv2D(32, (8, 8), activation='relu', strides=(4, 4),
                              padding='same', input_shape=(img_rows, img_cols, img_channels)))  # 80x80x4
    network.add(layers.Conv2D(64, (4, 4), activation='relu', strides=(2, 2), padding='same'))
    network.add(layers.Conv2D(64, (3, 3), activation='relu', strides=(1, 1), padding='same'))
    network.add(layers.Flatten())
    network.add(layers.Dense(512, activation='relu'))
    network.add(layers.Dense(ACTIONS))  # one Q-value per action
    network.compile(loss='mse', optimizer=keras.optimizers.Adam(learning_rate=LEARNING_RATE))
    print("We finished init of the network")
    if mode == 'test':
        observe = 999999999  # keep observing forever, never train
        epsilon = FINAL_EPSILON
        print("Now we load the model")
        try:
            network = keras.models.load_model(filename)
        except (OSError, IOError) as e:
            print(e)
            sys.exit(1)
        print("The model has been loaded successfully")
        print("***** We are in testing mode *****")
    else:
        assert mode == 'train'
        print("***** We are in training mode *****")
    # return observe and epsilon too, so the test-mode overrides reach the caller
    return network, observe, epsilon
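
# The convolutional stack (32 filters 8x8/4, 64 filters 4x4/2, 64 filters 3x3/1,
# then a 512-unit dense layer) follows the DQN architecture of Mnih et al., with
# the output layer emitting one Q-value per action; the MSE loss lets
# train_on_batch regress those Q-values directly toward the Bellman targets.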

def get_init_stack(game_state):
    # get the first state by doing nothing, then preprocess the frame to 80x80
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    x_t_colored, _, _ = game_state.frame_step(do_nothing)
    x_t = skimage.color.rgb2gray(x_t_colored)
    x_t = skimage.transform.resize(x_t, (80, 80))
    x_t = skimage.exposure.rescale_intensity(x_t, out_range=(0, 255))
    x_t = x_t / 255.0  # normalize to [0, 1]
    # duplicate the first frame 4 times to fill the initial stack
    s_t0 = np.stack((x_t, x_t, x_t, x_t), axis=2)
    # Keras expects a batch dimension: reshape to 1x80x80x4
    s_t0 = s_t0.reshape(1, s_t0.shape[0], s_t0.shape[1], s_t0.shape[2])
    return s_t0

def get_next_stack(game_state, a_t, s_t0):
    # run the selected action and observe the next frame, reward, and terminal flag
    x_t1_colored, r_t, terminal = game_state.frame_step(a_t)
    x_t1 = skimage.color.rgb2gray(x_t1_colored)
    x_t1 = skimage.transform.resize(x_t1, (80, 80))
    x_t1 = skimage.exposure.rescale_intensity(x_t1, out_range=(0, 255))
    x_t1 = x_t1 / 255.0
    x_t1 = x_t1.reshape(1, x_t1.shape[0], x_t1.shape[1], 1)  # 1x80x80x1
    # prepend the new frame and drop the oldest of the previous four
    s_t1 = np.append(x_t1, s_t0[:, :, :, :3], axis=3)
    return s_t1, r_t, terminal
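
# The stack is a sliding window over time: after this call channel 0 holds the
# newest frame and channel 3 the oldest, so the network always sees the last
# four frames and can infer the bird's velocity from frame-to-frame motion.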

def train_network(queue, network):
    # sample a minibatch of transitions from replay memory
    minibatch = random.sample(queue, BATCH)
    # experience replay: unpack the transitions into batched arrays
    state_t, action_t, reward_t, state_t1, terminal = zip(*minibatch)
    state_t = np.concatenate(state_t)
    state_t1 = np.concatenate(state_t1)
    targets = network.predict(state_t)
    Q_sa = network.predict(state_t1)
    # Bellman update: r_t for terminal transitions, r_t + gamma * max_a Q(s', a) otherwise
    targets[range(BATCH), action_t] = reward_t + GAMMA * np.max(Q_sa, axis=1) * np.invert(terminal)
    loss = network.train_on_batch(state_t, targets)
    return loss, Q_sa
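
# For example, a non-terminal transition with r_t = 0.1, gamma = 0.99 and
# max_a Q(s', a) = 2.0 gets the target 0.1 + 0.99 * 2.0 = 2.08, while a
# terminal transition keeps only its immediate reward (np.invert zeroes the
# bootstrap term). Only the entry for the action actually taken is changed;
# the other action's target stays at the network's own prediction, so the
# MSE loss propagates error for the taken action alone.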

def choose_action(network, s_t, t, epsilon):
    # choose an action epsilon-greedily
    if t % FRAME_PER_ACTION == 0:
        if random.random() <= epsilon:
            print("----------Random Action----------")
            action_index = random.randrange(ACTIONS)
        else:
            q = network.predict(s_t)  # input a stack of 4 frames, get the Q-value prediction
            action_index = np.argmax(q)
    else:
        # unreachable while FRAME_PER_ACTION == 1; guards against unsupported settings
        assert False
    return action_index
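
# Epsilon-greedy trades exploration for exploitation: early on (epsilon near
# INITIAL_EPSILON) roughly 1 in 10 actions is random, and after annealing
# (epsilon = FINAL_EPSILON) almost every action is greedy with respect to Q.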

def q_learning(mode, filename=None, total_observation=None):
    # fill the replay memory for `total_observation` steps before training starts
    if total_observation is None:
        total_observation = 1_000 if mode == 'test' else 3_200
    observe = total_observation
    epsilon = INITIAL_EPSILON
    # init network (test mode overrides observe and epsilon, so take the returned values)
    network, observe, epsilon = init_network(observe, epsilon, mode, filename)
    # open up a game state to communicate with the emulator
    game_state = game.GameState()
    # store the previous observations in replay memory
    queue = deque(maxlen=REPLAY_MEMORY)
    s_t0 = get_init_stack(game_state)
    t = 0
    time0 = time.time()
    total_loss = 0
    while True:
        a_t = np.zeros([ACTIONS])
        action_index = choose_action(network, s_t0, t, epsilon)
        a_t[action_index] = 1
        # reduce epsilon gradually once the observation phase is over
        if epsilon > FINAL_EPSILON and t > observe:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / TOTAL_EXPLORE
        s_t1, r_t, terminal = get_next_stack(game_state, a_t, s_t0)
        queue.append((s_t0, action_index, r_t, s_t1, terminal))
        if t > observe:
            # only train once done observing
            loss, q_sa = train_network(queue, network)
        else:
            loss, q_sa = 0, 0
        total_loss += loss
        s_t0, t = s_t1, t + 1
        logging(mode, t, time0, network, observe, epsilon, action_index, r_t, q_sa, loss, total_loss, TOTAL_EXPLORE)

if __name__ == '__main__':
    # make the observation phase much smaller, and call train directly
    q_learning('train', total_observation=32)
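
# To evaluate a trained model instead, one would call q_learning in test mode,
# e.g. (the checkpoint name here is just an example; pass whatever path your
# training run saved):
#   q_learning('test', filename='model.h5')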