In [1]:
# INITIALIZATION: libraries, parameters, network...

from keras.models import Sequential      # One layer after the other
from keras.layers import Dense, Flatten, Conv2D  # Dense layers are fully connected layers, Flatten layers flatten out multidimensional inputs
from collections import deque            # For storing moves
from controler import Controller
import numpy as np
import gym                                # To train our network
env = Controller()          # Choose game (any in the gym should work)

import random     # For sampling batches from the observations


# Create network. Input is two consecutive game states, output is Q-values of the possible moves.
model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3), input_shape=(60, 232, 3)))
# model.add(Dense(20, input_shape=(2,) + env.observation_space.shape, init='uniform', activation='relu'))
model.add(Flatten())       # Flatten input so as to have no problems with processing
model.add(Dense(128, init='uniform', activation='relu'))
model.add(Dense(10, init='uniform', activation='relu'))
model.add(Dense(1, init='uniform', activation='linear'))    # Same number of outputs as possible actions

model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])

# FIRST STEP: Knowing what each action does (Observing)

observetime = 100                         # Number of timesteps we will be acting on the game and observing results
epsilon = 0.5                             # Probability of doing a random move
gamma = 0.9                                # Discounted future reward. How much we care about steps further in time
mb_size = 32                               # Learning minibatch size
num_episode = 10000

for episode in range(num_episode):
    env.reset()                     # Game begins
    observation, reward, done, _ = env.step(1, epsilon)
    # (Formatting issues) Making the observation the first element of a batch of inputs
    # obs = np.expand_dims(observation, axis=0)
    state = np.stack((observation, observation), axis=0)
    print(state)
    
    # Parameters
    D = deque()                                # Register where the actions will be stored

    for t in range(observetime):
        if np.random.rand() <= epsilon:
            action = np.random.randint(0, 1, size=1)[0]  # jump or not
        else:
            Q = model.predict(state)          # Q-values predictions
            print(Q)
            action = np.argmax(Q)             # Move with highest Q-value is the chosen one

        # See state of the game, reward... after performing the action
        observation_new, reward, done, info = env.step(action, epsilon)
        print('action: {}, observation shape: {}, reward: {}, done: {}'.format(action, observation_new.shape, reward, done))
        obs_new = np.expand_dims(observation_new, axis=0)  # (Formatting issues)

        # Update the input with the new state of the game
        state_new = np.append(obs_new, state[1:, :, :, :], axis=0)
        D.append((state, action, reward, state_new, done))         # 'Remember' action and consequence
        state = state_new         # Update state
        if done:
            env.reset()           # Restart game if it's finished

            # (Formatting issues) Making the observation the first element of a batch of inputs
            obs = np.expand_dims(observation, axis=0)
            state = np.stack((observation, observation), axis=0)
        print('Observing Finished Run New Episode')

    minibatch = random.sample(D, mb_size)  # Sample some moves

    inputs_shape = (mb_size,) + state.shape[1:]
    inputs = np.zeros(inputs_shape)
    targets = np.zeros((mb_size, 1))

    for i in range(0, mb_size):
        state = minibatch[i][0]
        action = minibatch[i][1]
        reward = minibatch[i][2]
        state_new = minibatch[i][3]
        done = minibatch[i][4]

        # Build Bellman equation for the Q function
        try:
            inputs[i:i+2] = np.expand_dims(state, axis=0)
            predict = model.predict(state)
            targets[i] = predict[0]
            targets[i+1] = predict[1]
            Q_sa = model.predict(state_new)

            if done:
                targets[i, action] = reward
            else:
                targets[i, action] = reward + gamma * np.max(Q_sa)

            # Train network to output the Q function
            model.train_on_batch(inputs, targets)
        except Exception as ex:
            print(ex)
    print('Learning Finished')
    model.save('saved_model.h5')

Using TensorFlow backend.


[[[[1. 1. 1.]
   [1. 1. 1.]
   [1. 1. 1.]
   ...
   [1. 1. 1.]
   [1. 1. 1.]
   [1. 1. 1.]]

  [[1. 1. 1.]
   [1. 1. 1.]
   [1. 1. 1.]
   ...
   [1. 1. 1.]
   [1. 1. 1.]
   [1. 1. 1.]]

  [[1. 1. 1.]
   [1. 1. 1.]
   [1. 1. 1.]
   ...
   [1. 1. 1.]
   [1. 1. 1.]
   [1. 1. 1.]]

  ...

  [[1. 1. 1.]
   [1. 1. 1.]
   [1. 1. 1.]
   ...
   [1. 1. 1.]
   [1. 1. 1.]
   [1. 1. 1.]]

  [[1. 1. 1.]
   [1. 1. 1.]
   [1. 1. 1.]
   ...
   [1. 1. 1.]
   [1. 1. 1.]
   [1. 1. 1.]]

  [[1. 1. 1.]
   [1. 1. 1.]
   [1. 1. 1.]
   ...
   [1. 1. 1.]
   [1. 1. 1.]
   [1. 1. 1.]]]


 [[[1. 1. 1.]
   [1. 1. 1.]
   [1. 1. 1.]
   ...
   [1. 1. 1.]
   [1. 1. 1.]
   [1. 1. 1.]]

  [[1. 1. 1.]
   [1. 1. 1.]
   [1. 1. 1.]
   ...
   [1. 1. 1.]
   [1. 1. 1.]
   [1. 1. 1.]]

  [[1. 1. 1.]
   [1. 1. 1.]
   [1. 1. 1.]
   ...
   [1. 1. 1.]
   [1. 1. 1.]
   [1. 1. 1.]]

  ...

  [[1. 1. 1.]
   [1. 1. 1.]
   [1. 1. 1.]
   ...
   [1. 1. 1.]
   [1. 1. 1.]
   [1. 1. 1.]]

  [[1. 1. 1.]
   [1. 1. 1.]
   [1. 1. 1.]
   ...
   [1

action: 0, observation shape: (60, 232, 3), reward: 25, done: False
Observing Finished Run New Episode
[[0.02037815]
 [0.0129923 ]]
action: 0, observation shape: (60, 232, 3), reward: -1, done: False
Observing Finished Run New Episode
action: 0, observation shape: (60, 232, 3), reward: 54, done: False
Observing Finished Run New Episode
action: 0, observation shape: (60, 232, 3), reward: 359, done: False
Observing Finished Run New Episode
action: 0, observation shape: (60, 232, 3), reward: 45, done: False
Observing Finished Run New Episode
[[0.03783924]
 [0.0129923 ]]
action: 0, observation shape: (60, 232, 3), reward: 45, done: True
Observing Finished Run New Episode
[[0.0129923]
 [0.0129923]]
action: 0, observation shape: (60, 232, 3), reward: -1, done: False
Observing Finished Run New Episode
action: 0, observation shape: (60, 232, 3), reward: 7, done: False
Observing Finished Run New Episode
[[0.02052002]
 [0.0129923 ]]
action: 0, observation shape: (60, 232, 3), reward: 12, done: F

action: 1, observation shape: (60, 232, 3), reward: 53, done: True
Observing Finished Run New Episode
[[162.16652]
 [162.16652]]
action: 0, observation shape: (60, 232, 3), reward: -1, done: False
Observing Finished Run New Episode
action: 0, observation shape: (60, 232, 3), reward: 7, done: False
Observing Finished Run New Episode
action: 0, observation shape: (60, 232, 3), reward: 135, done: False
Observing Finished Run New Episode
[[162.64993]
 [162.16652]]
action: 0, observation shape: (60, 232, 3), reward: 18, done: False
Observing Finished Run New Episode
[[162.6461 ]
 [162.16652]]
action: 0, observation shape: (60, 232, 3), reward: 25, done: False
Observing Finished Run New Episode
action: 0, observation shape: (60, 232, 3), reward: -1, done: False
Observing Finished Run New Episode
[[162.0303 ]
 [162.16652]]
action: 1, observation shape: (60, 232, 3), reward: 54, done: False
Observing Finished Run New Episode
[[161.50233]
 [162.16652]]
action: 1, observation shape: (60, 232, 3)

action: 1, observation shape: (60, 232, 3), reward: 44, done: False
Observing Finished Run New Episode
[[160.95287]
 [162.16652]]
action: 1, observation shape: (60, 232, 3), reward: 49, done: False
Observing Finished Run New Episode
[[161.11458]
 [162.16652]]
action: 1, observation shape: (60, 232, 3), reward: 53, done: False
Observing Finished Run New Episode
[[158.56808]
 [162.16652]]
action: 1, observation shape: (60, 232, 3), reward: 53, done: True
Observing Finished Run New Episode
action: 0, observation shape: (60, 232, 3), reward: -1, done: False
Observing Finished Run New Episode
action: 0, observation shape: (60, 232, 3), reward: 7, done: False
Observing Finished Run New Episode
[[162.6921 ]
 [162.16652]]
action: 0, observation shape: (60, 232, 3), reward: 12, done: False
Observing Finished Run New Episode
[[162.68211]
 [162.16652]]
action: 0, observation shape: (60, 232, 3), reward: 18, done: False
Observing Finished Run New Episode
action: 0, observation shape: (60, 232, 3),

action: 0, observation shape: (60, 232, 3), reward: 359, done: False
Observing Finished Run New Episode
[[194.26233]
 [194.90323]]
action: 1, observation shape: (60, 232, 3), reward: 44, done: False
Observing Finished Run New Episode
[[193.06369]
 [194.90323]]
action: 1, observation shape: (60, 232, 3), reward: 50, done: False
Observing Finished Run New Episode
[[193.89685]
 [194.90323]]
action: 1, observation shape: (60, 232, 3), reward: 53, done: False
Observing Finished Run New Episode
action: 0, observation shape: (60, 232, 3), reward: 53, done: True
Observing Finished Run New Episode
[[194.90323]
 [194.90323]]
action: 0, observation shape: (60, 232, 3), reward: -1, done: False
Observing Finished Run New Episode
action: 0, observation shape: (60, 232, 3), reward: 8, done: False
Observing Finished Run New Episode
action: 0, observation shape: (60, 232, 3), reward: 14, done: False
Observing Finished Run New Episode
[[196.5668 ]
 [194.90323]]
action: 0, observation shape: (60, 232, 3)

action: 0, observation shape: (60, 232, 3), reward: 1, done: False
Observing Finished Run New Episode
action: 0, observation shape: (60, 232, 3), reward: -1, done: False
Observing Finished Run New Episode
[[216.51083]
 [213.47005]]
action: 0, observation shape: (60, 232, 3), reward: 8, done: False
Observing Finished Run New Episode
action: 0, observation shape: (60, 232, 3), reward: 135, done: False
Observing Finished Run New Episode
action: 0, observation shape: (60, 232, 3), reward: 18, done: False
Observing Finished Run New Episode
[[216.45015]
 [213.47005]]
action: 0, observation shape: (60, 232, 3), reward: 25, done: False
Observing Finished Run New Episode
[[216.45325]
 [213.47005]]
action: 0, observation shape: (60, 232, 3), reward: 28, done: False
Observing Finished Run New Episode
action: 0, observation shape: (60, 232, 3), reward: 35, done: False
Observing Finished Run New Episode
action: 0, observation shape: (60, 232, 3), reward: 359, done: False
Observing Finished Run New 

action: 0, observation shape: (60, 232, 3), reward: 45, done: False
Observing Finished Run New Episode
action: 0, observation shape: (60, 232, 3), reward: 45, done: True
Observing Finished Run New Episode
[[213.47005]
 [213.47005]]
action: 0, observation shape: (60, 232, 3), reward: -1, done: False
Observing Finished Run New Episode
[[216.32233]
 [213.47005]]
action: 0, observation shape: (60, 232, 3), reward: 7, done: False
Observing Finished Run New Episode
action: 0, observation shape: (60, 232, 3), reward: 12, done: False
Observing Finished Run New Episode
action: 0, observation shape: (60, 232, 3), reward: 18, done: False
Observing Finished Run New Episode
[[216.41388]
 [213.47005]]
action: 0, observation shape: (60, 232, 3), reward: -1, done: False
Observing Finished Run New Episode
action: 0, observation shape: (60, 232, 3), reward: -1, done: False
Observing Finished Run New Episode
[[215.7933 ]
 [213.47005]]
action: 0, observation shape: (60, 232, 3), reward: 54, done: False
Ob

action: 0, observation shape: (60, 232, 3), reward: 45, done: False
Observing Finished Run New Episode
action: 0, observation shape: (60, 232, 3), reward: 45, done: True
Observing Finished Run New Episode
action: 0, observation shape: (60, 232, 3), reward: -1, done: False
Observing Finished Run New Episode
action: 0, observation shape: (60, 232, 3), reward: 7, done: False
Observing Finished Run New Episode
[[293.72342]
 [287.60983]]
action: 0, observation shape: (60, 232, 3), reward: 12, done: False
Observing Finished Run New Episode
[[293.75992]
 [287.60983]]
action: 0, observation shape: (60, 232, 3), reward: 17, done: False
Observing Finished Run New Episode
[[293.7733 ]
 [287.60983]]
action: 0, observation shape: (60, 232, 3), reward: 25, done: False
Observing Finished Run New Episode
action: 0, observation shape: (60, 232, 3), reward: 28, done: False
Observing Finished Run New Episode
[[293.7833 ]
 [287.60983]]
action: 0, observation shape: (60, 232, 3), reward: 35, done: False
Ob

action: 0, observation shape: (60, 232, 3), reward: 7, done: False
Observing Finished Run New Episode
action: 0, observation shape: (60, 232, 3), reward: 12, done: False
Observing Finished Run New Episode
[[384.8826 ]
 [375.87155]]
action: 0, observation shape: (60, 232, 3), reward: 18, done: False
Observing Finished Run New Episode
[[384.88455]
 [375.87155]]
action: 0, observation shape: (60, 232, 3), reward: 25, done: False
Observing Finished Run New Episode
[[384.86594]
 [375.87155]]
action: 0, observation shape: (60, 232, 3), reward: 28, done: False
Observing Finished Run New Episode
action: 0, observation shape: (60, 232, 3), reward: 35, done: False
Observing Finished Run New Episode
action: 0, observation shape: (60, 232, 3), reward: 359, done: False
Observing Finished Run New Episode
action: 0, observation shape: (60, 232, 3), reward: 45, done: False
Observing Finished Run New Episode
action: 0, observation shape: (60, 232, 3), reward: 45, done: True
Observing Finished Run New E

KeyboardInterrupt: 

In [None]:

# SECOND STEP: Learning from the observations (Experience replay)

def retrain_model(model_training):
    
    return model_training


In [None]:
# THIRD STEP: Play!

env.reset()
observation, reward, done, _ = env.step(1, epsilon)
state = np.stack((observation, observation), axis=0)
done = False
tot_reward = 0.0
while not done:
    Q = model.predict(state)
    action = np.argmax(Q)
    observation, reward, done, info = env.step(action, epsilon)
    obs = np.expand_dims(observation, axis=0)
    state = np.stack((observation, observation), axis=0)
    tot_reward += reward
print('Game ended! Total reward: {}'.format(reward))
