In [1]:
import numpy as np
import tensorflow as tf
import gym
from keras.models import Sequential
from keras.layers import Dense, Flatten, Activation, Dropout, Conv2D, MaxPooling2D
from keras.optimizers import Adam
from collections import deque

Using TensorFlow backend.


In [2]:
env = gym.make('CarRacing-v0')

[2017-07-04 23:55:00,354] Making new env: CarRacing-v0


In [3]:
env.reset()
for i in range(50):
    # env.render()
    action = env.action_space.sample()
    state, reward, done, prob = env.step(action)
    print('frame:{} reward:{} done:{}'.format(i, reward, done))
    if done:
        break
env.close()

Track generation: 913..1153 -> 240-tiles track
frame:0 reward:8.268200836820084 done:False
frame:1 reward:-0.09999999999999964 done:False
frame:2 reward:-0.09999999999999964 done:False
frame:3 reward:-0.09999999999999964 done:False
frame:4 reward:-0.09999999999999964 done:False
frame:5 reward:-0.09999999999999964 done:False
frame:6 reward:-0.09999999999999964 done:False
frame:7 reward:-0.09999999999999964 done:False
frame:8 reward:-0.09999999999999964 done:False
frame:9 reward:-0.09999999999999964 done:False
frame:10 reward:-0.09999999999999964 done:False
frame:11 reward:-0.09999999999999964 done:False
frame:12 reward:-0.09999999999999964 done:False
frame:13 reward:-0.09999999999999964 done:False
frame:14 reward:-0.09999999999999964 done:False
frame:15 reward:-0.09999999999999964 done:False
frame:16 reward:-0.09999999999999964 done:False
frame:17 reward:-0.09999999999999964 done:False
frame:18 reward:-0.09999999999999964 done:False
frame:19 reward:-0.09999999999999964 done:False
frame:

In [3]:
print(env.observation_space.shape)
print(env.action_space.shape)

(96, 96, 3)
(3,)


In [4]:
print(env.action_space.sample())

[ 0.09762701  0.71518937  0.60276338]


In [5]:
# Hyper Params
dropout = 0.8
learning_rate = 0.001
memory_size = 1000

episodes = 10
time = 3000

gamma = 0.99
epsilon = 1
epsilon_min = 0.01
epsilon_decay = 0.999
batch_size = 32

In [6]:
bot_racer = Sequential()

bot_racer.add(Conv2D(32, (3, 3), activation='relu', input_shape=(96, 96, 3)))
bot_racer.add(MaxPooling2D(pool_size=(3,3)))
bot_racer.add(Dropout(dropout))

print(bot_racer.output.shape)

bot_racer.add(Conv2D(64, (3, 3), activation='relu'))
bot_racer.add(MaxPooling2D(pool_size=(3,3)))
bot_racer.add(Dropout(dropout))

print(bot_racer.output.shape)

bot_racer.add(Flatten())
bot_racer.add(Dense(64, activation='relu'))
bot_racer.add(Dropout(dropout))
bot_racer.add(Dense(3, activation='sigmoid'))

bot_racer.compile(loss='mse', optimizer=Adam(learning_rate))

(?, 31, 31, 32)
(?, 9, 9, 64)


In [7]:
memory = deque(maxlen=memory_size)

In [8]:
for episode in range(episodes):
    state = env.reset()
    total_reward = 0
    state = state/255
    for frame in range(time):
        if epsilon > np.random.rand():
            action = env.action_space.sample()
        else:
            action = bot_racer.predict(np.reshape(state, [1, state.shape[0], state.shape[1], state.shape[2]]))
        
        action = action.reshape((3))
        next_state, reward, done, prob = env.step(action)
        memory.append((state, action, reward, next_state, done))
        state = next_state
        state = state/255
        total_reward = total_reward + reward
        
        if done:
            print('episode:{} frames:{} total rewards:{} epsilon:{}'.format(episode + 1, frame + 1, total_reward, epsilon))
            break
        
    if epsilon > epsilon_min:
        epsilon *= epsilon * epsilon_decay
        
    minibatch = [memory[ii] for ii in np.random.choice(range(len(memory)), batch_size)]
    for start, action, reward, next_state, done in minibatch:
        Q_target = reward + gamma * bot_racer.predict(np.reshape(next_state, [1, state.shape[0], state.shape[1], state.shape[2]]))
        bot_racer.fit(np.reshape(state, [1, state.shape[0], state.shape[1], state.shape[2]]), Q_target, epochs=1, verbose=0)

Track generation: 1262..1578 -> 316-tiles track
episode:1 frames:1000 total rewards:-39.68253968254033 epsilon:1
Track generation: 1171..1468 -> 297-tiles track
episode:2 frames:1000 total rewards:-29.054054054054426 epsilon:0.999
Track generation: 1228..1539 -> 311-tiles track
episode:3 frames:1000 total rewards:-35.48387096774241 epsilon:0.997002999
Track generation: 1136..1424 -> 288-tiles track
episode:4 frames:1000 total rewards:-33.79790940766605 epsilon:0.993020965034979
Track generation: 1177..1475 -> 298-tiles track
episode:5 frames:1000 total rewards:-39.393939393939995 epsilon:0.9851045463620021
Track generation: 1472..1843 -> 371-tiles track
episode:6 frames:1000 total rewards:-45.94594594594669 epsilon:0.9694605362958227
Track generation: 1217..1535 -> 318-tiles track
episode:7 frames:1000 total rewards:-43.21766561514266 epsilon:0.9389138777035492
Track generation: 1127..1413 -> 286-tiles track
episode:8 frames:1000 total rewards:-40.350877192983106 epsilon:0.880677710474

In [9]:
state = env.reset()
print(state.shape)
action = bot_racer.predict(np.reshape(state, [1, state.shape[0], state.shape[1], state.shape[2]]))
print(action)

Track generation: 1060..1329 -> 269-tiles track
(96, 96, 3)
[[ 0.39381853  0.61909968  0.60351008]]
