In [1]:
 ###import libraries

import gym
import random
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

Using TensorFlow backend.


In [2]:
###prepare environment
env = gym.make('CartPole-v1') #our environment is CartPole-v1

In [None]:
###understand environment

#Description: in a 2d setting, a cart has to run back and forth to balance a pole.

#Observation: 
#   Type: Box(4)
#    Num	Observation            Min            Max
#    0	Cart Position             -4.8            4.8
#    1	Cart Velocity             -Inf            Inf
#    2	Pole Angle                 -24 deg        24 deg
#    3	Pole Velocity At Tip      -Inf            Inf

#Actions:
#    Type: Discrete(2)
#    Num	Action
#    0	Push cart to the left
#    1	Push cart to the right

#Reward:
#    Reward is 1 for every step taken, including the termination step

#Starting State:
#    All observations are assigned a uniform random value in interval [-0.05, 0.05]

#Episode Termination:
#    Pole Angle is more than 12 degrees
#    Cart Position is more than 2.4 (center of the cart reaches the edge of the display)
#    Episode length is greater than 200
#    Solved Requirements
#    Considered solved when the average reward is greater than or equal to 195.0 over 100 consecutive trials.

In [3]:
###test environment

env.reset() #initializes environment: initial state is shown below
for index in range(1000): #for each step (large number guarantees environment will eventually be complete)...
    env.render() #renders environment
    action = env.action_space.sample() #random action
    observation, reward, done, info = env.step(action) #env.step() executes action
    #the following is returned from each action
    #    observation/state
    #    reward
    #    whether environment is complete
    #    misc. info
    print('step \t\t{}'.format(index+1))
    if action == 0: print('action: \t{} (left)'.format(action))
    elif action == 1: print('action: \t{} (right)'.format(action))
    print('observation: \t{}'.format(observation))
    print('reward: \t{}'.format(reward))
    print('done: \t\t{}\n'.format(done))
#    print('info: \t\t{}'.format(info))
    if done: #if environment is complete, no next step will be taken
        break

step 		1
action: 	0 (left)
observation: 	[-0.01153163 -0.21668337  0.02333213  0.34048251]
reward: 	1.0
done: 		False

step 		2
action: 	1 (right)
observation: 	[-0.0158653  -0.02190103  0.03014178  0.05524755]
reward: 	1.0
done: 		False

step 		3
action: 	0 (left)
observation: 	[-0.01630332 -0.21744192  0.03124673  0.35728598]
reward: 	1.0
done: 		False

step 		4
action: 	0 (left)
observation: 	[-0.02065216 -0.41299384  0.03839245  0.65965573]
reward: 	1.0
done: 		False

step 		5
action: 	1 (right)
observation: 	[-0.02891204 -0.21842663  0.05158556  0.37930459]
reward: 	1.0
done: 		False

step 		6
action: 	1 (right)
observation: 	[-0.03328057 -0.02407376  0.05917165  0.1033225 ]
reward: 	1.0
done: 		False

step 		7
action: 	0 (left)
observation: 	[-0.03376204 -0.21999159  0.0612381   0.41407124]
reward: 	1.0
done: 		False

step 		8
action: 	1 (right)
observation: 	[-0.03816188 -0.02578867  0.06951953  0.14130566]
reward: 	1.0
done: 		False

step 		9
action: 	1 (right)
observation: 	[-

In [4]:
###prepare training data

env.reset()
goal_steps = 500 #highest number of steps possible
score_requirement = 60  #minimum score of each game so that its observations and actions will be stored in training data
initial_games = 10000 #number of games played to collect data

In [5]:
def model_data_preparation():
    training_data = []
    accepted_scores = []
    for game_index in range(initial_games): #for each game played...
        score = 0 #(total) score starts at 0
        game_memory = [] #initialize game memory
        previous_observation = [] #initialize previous observation
        for index in range(goal_steps): #for each step...
            action = env.action_space.sample() #random action
            observation, reward, done, info = env.step(action) #execute action
            
            if len(previous_observation) > 0:
                game_memory.append([previous_observation, action]) #game_memory stores all observations and actions but the last ones
            
            previous_observation = observation
            score += reward #reward is added to the score
            if done: #if environment is complete, no next step will be taken
                break

        if score >= score_requirement: #if score is greater than or equal to the score requirement...
            accepted_scores.append(score)
            for data in game_memory: #observations and actions of this game are stored in the training data
                action = data[1]
                observation = data[0]
                if action == 0:
                    output = [1, 0] #hot encoding of variables: [left, right]
                elif action == 1:
                    output = [0, 1]
                training_data.append([observation, output])
        
        env.reset() #reset environment for next game

    print(accepted_scores)
    return training_data

training_data = model_data_preparation()
training_data

[61.0, 66.0, 75.0, 73.0, 65.0, 65.0, 69.0, 63.0, 82.0, 68.0, 61.0, 75.0, 60.0, 74.0, 79.0, 80.0, 65.0, 98.0, 67.0, 65.0, 74.0, 81.0, 61.0, 66.0, 81.0, 60.0, 65.0, 71.0, 65.0, 64.0, 75.0, 72.0, 62.0, 64.0, 85.0, 100.0, 62.0, 71.0, 63.0, 76.0, 62.0, 101.0, 60.0, 71.0, 61.0, 66.0, 68.0, 63.0, 133.0, 62.0, 64.0, 85.0, 62.0, 73.0, 74.0, 64.0, 66.0, 73.0, 61.0, 83.0, 62.0, 64.0, 79.0, 72.0, 81.0, 72.0, 104.0, 77.0, 66.0, 65.0, 101.0, 89.0, 73.0, 64.0, 70.0, 81.0, 83.0, 65.0, 63.0, 97.0, 76.0, 73.0, 75.0, 79.0, 74.0, 73.0, 62.0, 86.0, 62.0, 75.0, 73.0, 82.0, 118.0, 76.0, 64.0, 79.0, 60.0, 65.0, 69.0, 61.0, 64.0, 68.0, 63.0, 67.0, 66.0, 62.0, 89.0, 64.0, 62.0, 83.0, 71.0, 69.0, 61.0, 64.0, 89.0, 64.0, 72.0, 109.0, 68.0, 73.0, 92.0, 76.0, 69.0, 77.0, 62.0, 63.0, 60.0, 67.0, 65.0, 77.0, 60.0, 72.0, 70.0, 66.0, 61.0, 64.0, 84.0, 63.0, 62.0, 75.0, 74.0, 66.0, 63.0, 111.0, 62.0, 64.0, 72.0, 64.0, 65.0, 63.0]


[[array([ 0.01405773,  0.16559331,  0.03149199, -0.28841029]), [0, 1]],
 [array([ 0.01736959,  0.36025235,  0.02572379, -0.57099696]), [0, 1]],
 [array([ 0.02457464,  0.5550043 ,  0.01430385, -0.85546628]), [1, 0]],
 [array([ 0.03567473,  0.35969038, -0.00280548, -0.55832018]), [0, 1]],
 [array([ 0.04286853,  0.5548516 , -0.01397188, -0.85188567]), [0, 1]],
 [array([ 0.05396557,  0.75016122, -0.0310096 , -1.1489291 ]), [1, 0]],
 [array([ 0.06896879,  0.55545747, -0.05398818, -0.86612925]), [1, 0]],
 [array([ 0.08007794,  0.3611102 , -0.07131076, -0.59089791]), [1, 0]],
 [array([ 0.08730014,  0.1670553 , -0.08312872, -0.32150295]), [1, 0]],
 [array([ 0.09064125, -0.02679053, -0.08955878, -0.05614999]), [1, 0]],
 [array([ 0.09010544, -0.22052182, -0.09068178,  0.20698721]), [1, 0]],
 [array([ 0.085695  , -0.41423792, -0.08654203,  0.46974355]), [0, 1]],
 [array([ 0.07741025, -0.21800689, -0.07714716,  0.15108697]), [0, 1]],
 [array([ 0.07305011, -0.02186989, -0.07412542, -0.16490224]), [

In [6]:
###build model

def build_model(input_size, output_size):
    model = Sequential()
    model.add(Dense(units=128, input_dim=input_size, activation='relu'))
    model.add(Dense(units=52, activation='relu'))
    model.add(Dense(units=output_size, activation='linear'))
    model.compile(loss='mse', optimizer=Adam())
    return model

In [7]:
###train model

def train_model(training_data):
    X = np.array([i[0] for i in training_data])
    y = np.array([i[1] for i in training_data])
    model = build_model(input_size=len(X[0]), output_size=len(y[0])) #call build_model function to build model
    model.fit(X, y, epochs=10)
    return model

In [8]:
trained_model = train_model(training_data)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
