In [9]:
import numpy as np
import tensorflow as tf
import gym
from keras.models import Sequential
from keras.layers import Dense, Flatten, Activation, Dropout, Conv2D, MaxPooling2D
from keras.optimizers import Adam
from collections import deque

In [3]:
# Create the CarRacing-v0 Gym environment; every later cell shares this `env`.
env = gym.make('CarRacing-v0')

[2017-07-01 23:47:59,179] Making new env: CarRacing-v0


In [19]:
# Smoke test: drive with uniformly random actions and log the reward of each
# frame, stopping when the environment reports `done` or after 5000 frames.
env.reset()
for i in range(5000):
    # env.render()
    action = env.action_space.sample()
    step_result = env.step(action)
    state, reward, done, prob = step_result
    frame_log = 'frame:{} reward:{} done:{}'.format(i, reward, done)
    print(frame_log)
    if done:
        break
env.close()

Track generation: 1091..1368 -> 277-tiles track
frame:0 reward:7.1463768115942035 done:False
frame:1 reward:-0.09999999999999964 done:False
frame:2 reward:-0.09999999999999964 done:False
frame:3 reward:-0.09999999999999964 done:False
frame:4 reward:-0.09999999999999964 done:False
frame:5 reward:-0.09999999999999964 done:False
frame:6 reward:-0.09999999999999964 done:False
frame:7 reward:-0.09999999999999964 done:False
frame:8 reward:-0.09999999999999964 done:False
frame:9 reward:-0.09999999999999964 done:False
frame:10 reward:-0.09999999999999964 done:False
frame:11 reward:-0.09999999999999964 done:False
frame:12 reward:-0.09999999999999964 done:False
frame:13 reward:-0.09999999999999964 done:False
frame:14 reward:-0.09999999999999964 done:False
frame:15 reward:3.523188405797102 done:False
frame:16 reward:-0.09999999999999964 done:False
frame:17 reward:-0.09999999999999964 done:False
frame:18 reward:-0.09999999999999964 done:False
frame:19 reward:-0.09999999999999964 done:False
frame:2

In [5]:
# Show the observation shape first, then the action shape.
for space in (env.observation_space, env.action_space):
    print(space.shape)

(96, 96, 3)
(3,)


In [6]:
# Draw one random action to see what an action vector looks like.
print(env.action_space.sample())

[-0.17207501  0.6296183   0.77858426]


In [46]:
# Hyperparameters shared by the model definition and the experience-collection loop.
# `dropout` is passed to every Dropout layer of the model.
# NOTE(review): Keras Dropout takes the fraction of units to DROP, so 0.8
# discards 80% of activations during training - confirm this was not meant
# as a keep-probability.
dropout = 0.8
learning_rate = 0.001  # handed to the Adam optimizer when the model is compiled
memory_size = 1000  # maxlen of the replay deque; older transitions are evicted

episodes = 10  # number of episodes the collection loop runs
# Upper bound on frames per episode.
# NOTE(review): the name shadows the stdlib `time` module if that is ever imported.
time = 3000

gamma = 0.99  # presumably the reward discount factor; not referenced in the visible cells
epsilon = 1  # initial probability of taking a random (exploratory) action
epsilon_min = 0.01  # epsilon stops being decayed once it is no longer above this value
epsilon_decay = 0.995  # multiplicative factor used when shrinking epsilon after each episode
batch_size = 32  # number of transitions drawn from memory per minibatch

In [47]:
# Convolutional network mapping a 96x96 RGB frame to a 3-component output
# (the action space has shape (3,)).
bot_racer = Sequential()

# Two identical conv -> max-pool -> dropout stages; only the filter count
# differs, and the first stage additionally declares the input shape.
conv_stages = [
    (32, {'input_shape': (96, 96, 3)}),
    (64, {}),
]
for filters, extra_kwargs in conv_stages:
    bot_racer.add(Conv2D(filters, (3, 3), activation='relu', **extra_kwargs))
    bot_racer.add(MaxPooling2D(pool_size=(3, 3)))
    bot_racer.add(Dropout(dropout))

    # Report the feature-map shape produced by this stage.
    print(bot_racer.output.shape)

# Fully connected head with a linear 3-unit output layer.
bot_racer.add(Flatten())
bot_racer.add(Dense(64, activation='relu'))
bot_racer.add(Dropout(dropout))
bot_racer.add(Dense(3, activation='linear'))

bot_racer.compile(loss='mse', optimizer=Adam(learning_rate))

(?, 31, 31, 32)
(?, 9, 9, 64)


In [48]:
# Replay buffer: a bounded deque that silently drops its oldest entry once
# `memory_size` transitions are stored.
memory = deque(maxlen=memory_size)

In [49]:
# Experience-collection loop.
#
# Runs `episodes` episodes with an epsilon-greedy policy: with probability
# `epsilon` a random action is sampled, otherwise the network's prediction is
# used. Every transition (state, action, reward, next_state, done) is appended
# to `memory`, and epsilon is decayed once per episode.
for episode in range(episodes):
    state = env.reset()
    total_reward = 0
    for frame in range(time):
        if epsilon > np.random.rand():
            # Explore: uniformly random action from the action space.
            action = env.action_space.sample()
        else:
            # Exploit: the network expects a leading batch axis.
            action = bot_racer.predict(np.expand_dims(state, axis=0))

        # predict() returns shape (1, 3); flatten so env.step always gets (3,).
        action = action.reshape((3))
        # The fourth value returned by step() is not used here.
        next_state, reward, done, prob = env.step(action)
        memory.append((state, action, reward, next_state, done))
        state = next_state
        total_reward = total_reward + reward

        if done:
            print('episode:{} frames:{} total rewards:{}'.format(episode + 1, frame + 1, total_reward))
            break

    # Decay exploration by a single factor of `epsilon_decay` per episode.
    # The previous `epsilon *= epsilon * epsilon_decay` squared epsilon on
    # every episode, collapsing exploration far faster than intended.
    # Clamp so the value never drops below the configured floor.
    if epsilon > epsilon_min:
        epsilon = max(epsilon_min, epsilon * epsilon_decay)

    # Draw a random minibatch of stored transitions (sampled with replacement).
    # NOTE(review): nothing in the visible code trains on `minibatch` yet, so
    # the network weights are never updated by this loop.
    minibatch = [memory[ii] for ii in np.random.choice(range(len(memory)), batch_size)]

Track generation: 1148..1439 -> 291-tiles track
episode:1 frames:1000 total rewards:-31.03448275862107
Track generation: 1191..1496 -> 305-tiles track
retry to generate track (normal if there are not many of this messages)
Track generation: 1060..1329 -> 269-tiles track
episode:2 frames:1000 total rewards:-29.104477611940624
Track generation: 1197..1500 -> 303-tiles track
episode:3 frames:1000 total rewards:-33.77483443708658
Track generation: 1322..1656 -> 334-tiles track
episode:4 frames:1000 total rewards:-42.94294294294365
Track generation: 1067..1338 -> 271-tiles track
episode:5 frames:1000 total rewards:-33.33333333333373
Track generation: 1071..1343 -> 272-tiles track
episode:6 frames:1000 total rewards:-37.26937269372742
Track generation: 1199..1503 -> 304-tiles track
episode:7 frames:1000 total rewards:-50.4950495049513
Track generation: 1060..1329 -> 269-tiles track
episode:8 frames:1000 total rewards:-66.41791044776178
Track generation: 1210..1517 -> 307-tiles track
episode:

In [37]:
# Sanity check: push one freshly reset observation through the network and
# look at the raw prediction.
state = env.reset()
print(state.shape)
batched_state = np.expand_dims(state, axis=0)  # add the leading batch axis
action = bot_racer.predict(batched_state)
print(action)

Track generation: 1055..1323 -> 268-tiles track
(96, 96, 3)
[[ 0.  0.  0.]]


In [50]:
# Number of transitions currently held in the replay buffer (never exceeds memory_size).
len(memory)

1000