In [1]:
 ###import libraries

import gym
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

Using TensorFlow backend.


In [2]:
###prepare environment
env = gym.make('CartPole-v1') #our environment is CartPole-v1
env.reset() #initialize environment: initial state is shown below

array([ 0.03633624, -0.03525033,  0.02439144,  0.03009817])

In [None]:
###understand environment

#Description: in a 2d setting, a cart has to run back and forth to balance a pole.

#Observation: 
#   Type: Box(4)
#    Num	Observation            Min            Max
#    0	Cart Position             -4.8            4.8
#    1	Cart Velocity             -Inf            Inf
#    2	Pole Angle                 -24 deg        24 deg
#    3	Pole Velocity At Tip      -Inf            Inf

#Actions:
#    Type: Discrete(2)
#    Num	Action
#    0	Push cart to the left
#    1	Push cart to the right

#Reward:
#    Reward is 1 for every step taken, including the termination step

#Starting State:
#    All observations are assigned a uniform random value in interval [-0.05, 0.05]

#Episode Termination:
#    Pole Angle is more than 12 degrees
#    Cart Position is more than 2.4 (center of the cart reaches the edge of the display)
#    Episode length is greater than 200
#    Solved Requirements
#    Considered solved when the average reward is greater than or equal to 195.0 over 100 consecutive trials.

In [3]:
###test environment

for step in range(1000): #for each step (large number guarantees environment will eventually be complete)...
    #env.render() #render environment
    action = env.action_space.sample() #random action
    observation, reward, done, info = env.step(action) #env.step() executes action
    #the following is returned from each action
    #    observation/state
    #    reward
    #    whether environment is complete
    #    misc. info
    print('step \t\t{}'.format(step+1))
    if action == 0: print('action: \t{} (left)'.format(action))
    elif action == 1: print('action: \t{} (right)'.format(action))
    print('observation: \t{}'.format(observation))
    print('reward: \t{}'.format(reward))
    print('done: \t\t{}\n'.format(done))
    #print('info: \t\t{}'.format(info))
    if done: #if environment is complete, no next step will be taken
        break

step 		1
action: 	1 (right)
observation: 	[ 0.03563124  0.15951349  0.0249934  -0.25479029]
reward: 	1.0
done: 		False

step 		2
action: 	0 (left)
observation: 	[ 0.03882151 -0.03595623  0.0198976   0.04567002]
reward: 	1.0
done: 		False

step 		3
action: 	1 (right)
observation: 	[ 0.03810238  0.15887483  0.020811   -0.24066921]
reward: 	1.0
done: 		False

step 		4
action: 	0 (left)
observation: 	[ 0.04127988 -0.03653813  0.01599761  0.05850474]
reward: 	1.0
done: 		False

step 		5
action: 	0 (left)
observation: 	[ 0.04054911 -0.23188576  0.01716771  0.35619179]
reward: 	1.0
done: 		False

step 		6
action: 	0 (left)
observation: 	[ 0.0359114  -0.42724753  0.02429155  0.65423833]
reward: 	1.0
done: 		False

step 		7
action: 	1 (right)
observation: 	[ 0.02736645 -0.23247209  0.03737631  0.36930237]
reward: 	1.0
done: 		False

step 		8
action: 	1 (right)
observation: 	[ 0.02271701 -0.03790058  0.04476236  0.08863502]
reward: 	1.0
done: 		False

step 		9
action: 	0 (left)
observation: 	[ 0

In [4]:
###prepare training data

env.reset()
goal_steps = 500 #highest number of steps possible
score_requirement = 60  #minimum score of each game so that its observations and actions will be stored in training data
initial_games = 10000 #number of games played to collect data

In [5]:
def model_data_preparation():
    training_data = []
    accepted_scores = []
    for game in range(initial_games): #for each game played...
        score = 0 #(total) score
        game_memory = [] #memory of each game
        previous_observation = [] #previous observations
        for step in range(goal_steps): #for each step...
            action = env.action_space.sample() #random action
            observation, reward, done, info = env.step(action) #execute action
            
            if len(previous_observation) > 0:
                game_memory.append([previous_observation, action]) #game_memory stores all observations and actions but the last ones
            
            previous_observation = observation
            score += reward #reward is added to the score
            if done: #if environment is complete, no next step will be taken
                break

        if score >= score_requirement: #if score is greater than or equal to the score requirement...
            accepted_scores.append(score)
            for data in game_memory: #observations and actions of this game are stored in the training data
                action = data[1]
                observation = data[0]
                if action == 0:
                    output = [1, 0] #hot encoding of variables: [left, right]
                elif action == 1:
                    output = [0, 1]
                training_data.append([observation, output])
        
        env.reset() #reset environment for next game

    print(accepted_scores)
    return training_data

In [6]:
training_data = model_data_preparation()
training_data

[63.0, 71.0, 61.0, 66.0, 61.0, 63.0, 62.0, 82.0, 61.0, 62.0, 91.0, 61.0, 90.0, 61.0, 64.0, 60.0, 78.0, 66.0, 61.0, 68.0, 81.0, 60.0, 71.0, 80.0, 60.0, 62.0, 61.0, 70.0, 105.0, 99.0, 66.0, 75.0, 61.0, 62.0, 72.0, 108.0, 61.0, 61.0, 64.0, 71.0, 61.0, 85.0, 61.0, 64.0, 71.0, 60.0, 68.0, 94.0, 71.0, 60.0, 84.0, 64.0, 61.0, 60.0, 70.0, 65.0, 78.0, 69.0, 81.0, 71.0, 66.0, 73.0, 67.0, 71.0, 61.0, 64.0, 65.0, 61.0, 64.0, 90.0, 115.0, 67.0, 61.0, 61.0, 70.0, 67.0, 73.0, 74.0, 76.0, 60.0, 77.0, 79.0, 69.0, 95.0, 63.0, 87.0, 60.0, 63.0, 61.0, 98.0, 76.0, 61.0, 74.0, 67.0, 63.0, 63.0, 60.0, 68.0, 80.0, 60.0, 68.0, 73.0, 62.0, 66.0, 60.0, 68.0, 72.0, 62.0, 103.0, 61.0, 68.0, 70.0, 71.0, 63.0, 75.0, 78.0, 66.0, 70.0, 75.0, 61.0, 75.0, 67.0, 65.0, 76.0, 64.0, 61.0, 80.0, 67.0, 70.0, 64.0, 67.0, 65.0, 65.0, 73.0, 65.0, 96.0, 70.0, 60.0, 75.0, 68.0, 70.0, 62.0, 72.0, 73.0, 74.0, 63.0, 100.0, 61.0, 63.0, 63.0, 91.0, 92.0, 81.0, 105.0, 61.0, 64.0, 81.0]


[[array([ 0.02595027,  0.23584875,  0.03266933, -0.28889652]), [0, 1]],
 [array([ 0.03066725,  0.43048998,  0.0268914 , -0.57109949]), [1, 0]],
 [array([ 0.03927705,  0.23500144,  0.01546941, -0.2700676 ]), [1, 0]],
 [array([0.04397708, 0.0396622 , 0.01006806, 0.02745404]), [0, 1]],
 [array([ 0.04477032,  0.23463833,  0.01061714, -0.26203537]), [1, 0]],
 [array([0.04946309, 0.03936644, 0.00537644, 0.03397733]), [1, 0]],
 [array([ 0.05025041, -0.15583219,  0.00605598,  0.32835171]), [0, 1]],
 [array([0.04713377, 0.03920302, 0.01262302, 0.0375847 ]), [0, 1]],
 [array([ 0.04791783,  0.23414171,  0.01337471, -0.25108896]), [0, 1]],
 [array([ 0.05260067,  0.42907014,  0.00835293, -0.53952341]), [1, 0]],
 [array([ 0.06118207,  0.23383177, -0.00243754, -0.24422033]), [1, 0]],
 [array([ 0.0658587 ,  0.03874472, -0.00732194,  0.04769274]), [1, 0]],
 [array([ 0.0666336 , -0.15627148, -0.00636809,  0.33805656]), [0, 1]],
 [array([0.06350817, 0.03894051, 0.00039304, 0.04337231]), [0, 1]],
 [array(

In [7]:
###build model

def build_model(input_size, output_size):
    model = Sequential()
    model.add(Dense(units=128, input_dim=input_size, activation='relu'))
    model.add(Dense(units=64, activation='relu'))
    model.add(Dense(units=output_size, activation='linear'))
    model.compile(loss='mse', optimizer=Adam())
    return model

In [8]:
###train model

def train_model(training_data):
    X = np.array([i[0] for i in training_data]).reshape(-1, len(training_data[0][0]))
    y = np.array([i[1] for i in training_data]).reshape(-1, len(training_data[0][1]))
    model = build_model(input_size=len(X[0]), output_size=len(y[0])) #call build_model function to build model
    model.fit(X, y, epochs=10)
    return model

In [9]:
trained_model = train_model(training_data)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [10]:
###test model

scores = []
env.reset()
for game in range(100):
    score = 0
    previous_observation = []
    for step in range(goal_steps):
        #env.render()
        if len(previous_observation) == 0:
            action = env.action_space.sample() #first action is random
        else:
            action = np.argmax(trained_model.predict(previous_observation.reshape(-1, len(previous_observation)))[0])
            #rest of the actions are chosen by the trained model
        
        observation, reward, done, info = env.step(action)
        previous_observation = observation
        score += reward
        if done:
            break
    
    env.reset()
    scores.append(score)
    
print(scores)
print('average score: {}'.format(sum(scores)/len(scores)))

[500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 448.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 440.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0]
average score: 498.88
