In [1]:
###import libraries

import gym
import numpy as np
from keras.models import Sequential, load_model
from keras.layers import Dense
from keras.optimizers import Adam

Using TensorFlow backend.


In [2]:
###prepare environment
env = gym.make('CartPole-v1') #our environment is CartPole-v1
env.reset() #initialize environment: initial state is shown below

array([ 0.032902  , -0.04035456, -0.0195682 , -0.0346325 ])

In [None]:
###understand environment

#Description: in a 2d setting, a cart has to run back and forth to balance a pole.

#Observation: 
#   Type: Box(4)
#    Num	Observation            Min            Max
#    0	Cart Position             -4.8            4.8
#    1	Cart Velocity             -Inf            Inf
#    2	Pole Angle                 -24 deg        24 deg
#    3	Pole Velocity At Tip      -Inf            Inf

#Actions:
#    Type: Discrete(2)
#    Num	Action
#    0	Push cart to the left
#    1	Push cart to the right

#Reward:
#    Reward is 1 for every step taken, including the termination step

#Starting State:
#    All observations are assigned a uniform random value in interval [-0.05, 0.05]

#Episode Termination:
#    Pole Angle is more than 12 degrees
#    Cart Position is more than 2.4 (center of the cart reaches the edge of the display)
#    Episode length is greater than 200
#    Solved Requirements
#    Considered solved when the average reward is greater than or equal to 195.0 over 100 consecutive trials.

In [3]:
###test environment

for step in range(1000): #for each step (large number guarantees environment will eventually be complete)...
    #env.render() #render environment
    action = env.action_space.sample() #random action
    observation, reward, done, info = env.step(action) #env.step() executes action
    #the following is returned from each action
    #    observation/state
    #    reward
    #    whether environment is complete
    #    misc. info
    print('step \t\t{}'.format(step+1))
    if action == 0: print('action: \t{} (left)'.format(action))
    elif action == 1: print('action: \t{} (right)'.format(action))
    print('observation: \t{}'.format(observation))
    print('reward: \t{}'.format(reward))
    print('done: \t\t{}\n'.format(done))
    #print('info: \t\t{}'.format(info))
    if done: #if environment is complete, no next step will be taken
        break

step 		1
action: 	0 (left)
observation: 	[ 0.03209491 -0.23519051 -0.02026085  0.25181279]
reward: 	1.0
done: 		False

step 		2
action: 	0 (left)
observation: 	[ 0.0273911  -0.43001738 -0.01522459  0.53803682]
reward: 	1.0
done: 		False

step 		3
action: 	0 (left)
observation: 	[ 0.01879075 -0.62492201 -0.00446386  0.82588404]
reward: 	1.0
done: 		False

step 		4
action: 	0 (left)
observation: 	[ 0.00629231 -0.81998263  0.01205382  1.11715968]
reward: 	1.0
done: 		False

step 		5
action: 	1 (right)
observation: 	[-0.01010734 -0.62502093  0.03439702  0.82828211]
reward: 	1.0
done: 		False

step 		6
action: 	0 (left)
observation: 	[-0.02260776 -0.82059585  0.05096266  1.13158169]
reward: 	1.0
done: 		False

step 		7
action: 	1 (right)
observation: 	[-0.03901968 -0.62617679  0.07359429  0.85530827]
reward: 	1.0
done: 		False

step 		8
action: 	0 (left)
observation: 	[-0.05154321 -0.82222036  0.09070046  1.17019483]
reward: 	1.0
done: 		False

step 		9
action: 	0 (left)
observation: 	[-0.0

In [4]:
###prepare training data

env.reset()
goal_steps = 500 #highest number of steps possible
score_requirement = 60  #minimum score of each game so that its observations and actions will be stored in training data
initial_games = 10000 #number of games played to collect data

In [5]:
def model_data_preparation():
    training_data = []
    accepted_scores = []
    for game in range(initial_games): #for each game played...
        score = 0 #(total) score
        game_memory = [] #memory of each game
        previous_observation = [] #previous observations
        for step in range(goal_steps): #for each step...
            action = env.action_space.sample() #random action
            observation, reward, done, info = env.step(action) #execute action
            
            if len(previous_observation) > 0:
                game_memory.append([previous_observation, action]) #game_memory stores all observations and actions but the last ones
            
            previous_observation = observation
            score += reward #reward is added to the score
            if done: #if environment is complete, no next step will be taken
                break

        if score >= score_requirement: #if score is greater than or equal to the score requirement...
            accepted_scores.append(score)
            for data in game_memory: #observations and actions of this game are stored in the training data
                action = data[1]
                observation = data[0]
                if action == 0:
                    output = [1, 0] #hot encoding of variables: [left, right]
                elif action == 1:
                    output = [0, 1]
                training_data.append([observation, output])
        
        env.reset() #reset environment for next game

    print(accepted_scores)
    return training_data

In [6]:
training_data = model_data_preparation()
training_data

[66.0, 72.0, 64.0, 61.0, 75.0, 61.0, 143.0, 72.0, 80.0, 65.0, 63.0, 89.0, 77.0, 60.0, 81.0, 68.0, 64.0, 65.0, 68.0, 82.0, 62.0, 63.0, 60.0, 65.0, 81.0, 80.0, 60.0, 70.0, 62.0, 62.0, 62.0, 69.0, 64.0, 79.0, 66.0, 71.0, 66.0, 71.0, 70.0, 63.0, 63.0, 67.0, 62.0, 94.0, 70.0, 69.0, 77.0, 64.0, 75.0, 66.0, 75.0, 65.0, 62.0, 61.0, 90.0, 60.0, 61.0, 60.0, 67.0, 60.0, 71.0, 61.0, 65.0, 67.0, 117.0, 72.0, 74.0, 85.0, 61.0, 62.0, 62.0, 66.0, 65.0, 73.0, 61.0, 64.0, 65.0, 64.0, 61.0, 70.0, 67.0, 70.0, 75.0, 72.0, 60.0, 60.0, 75.0, 78.0, 60.0, 86.0, 64.0, 74.0, 60.0, 63.0, 67.0, 61.0, 75.0, 61.0, 72.0, 65.0, 72.0, 72.0, 121.0, 69.0, 74.0, 73.0, 71.0, 67.0, 79.0, 70.0, 94.0, 98.0, 116.0, 69.0, 66.0, 62.0, 69.0, 74.0, 62.0, 68.0, 77.0, 73.0, 63.0, 76.0, 68.0, 61.0, 62.0, 75.0, 72.0, 71.0, 70.0, 63.0, 95.0, 60.0, 69.0, 101.0, 70.0, 67.0, 71.0, 60.0, 74.0, 69.0, 68.0, 63.0, 60.0, 60.0, 93.0, 78.0, 60.0, 73.0, 82.0, 69.0, 69.0, 64.0, 60.0, 63.0, 69.0, 81.0, 73.0, 61.0, 79.0, 64.0, 69.0, 61.0, 68.0, 64.0

[[array([-0.02954986, -0.15282497,  0.01923557,  0.3043654 ]), [0, 1]],
 [array([-0.03260636,  0.04201765,  0.02532288,  0.01781045]), [0, 1]],
 [array([-0.03176601,  0.23676746,  0.02567909, -0.26677648]), [0, 1]],
 [array([-0.02703066,  0.43151368,  0.02034356, -0.55125069]), [1, 0]],
 [array([-0.01840039,  0.236112  ,  0.00931855, -0.25222822]), [0, 1]],
 [array([-0.01367815,  0.43109966,  0.00427398, -0.5419574 ]), [1, 0]],
 [array([-0.00505615,  0.2359179 , -0.00656517, -0.24793088]), [0, 1]],
 [array([-3.37794199e-04,  4.31132993e-01, -1.15237846e-02, -5.42677360e-01]),
  [0, 1]],
 [array([ 0.00828487,  0.62641499, -0.02237733, -0.83896882]), [1, 0]],
 [array([ 0.02081317,  0.43160563, -0.03915671, -0.55340632]), [0, 1]],
 [array([ 0.02944528,  0.62725493, -0.05022483, -0.85816446]), [1, 0]],
 [array([ 0.04199038,  0.43285185, -0.06738812, -0.58168744]), [1, 0]],
 [array([ 0.05064741,  0.23873559, -0.07902187, -0.31097106]), [0, 1]],
 [array([ 0.05542213,  0.43488924, -0.08524129

In [7]:
###build model

def build_model(input_size, output_size):
    model = Sequential()
    model.add(Dense(units=128, input_dim=input_size, activation='relu'))
    model.add(Dense(units=64, activation='relu'))
    model.add(Dense(units=output_size, activation='linear'))
    model.compile(loss='mse', optimizer=Adam())
    return model

In [8]:
###train model

def train_model(training_data):
    X = np.array([i[0] for i in training_data]).reshape(-1, len(training_data[0][0]))
    y = np.array([i[1] for i in training_data]).reshape(-1, len(training_data[0][1]))
    model = build_model(input_size=len(X[0]), output_size=len(y[0])) #call build_model function to build model
    model.fit(X, y, epochs=10)
    return model

In [9]:
#trained_model = train_model(training_data)
#trained_model.save('model.h5')
#del trained_model
trained_model = load_model('model.h5')

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


In [10]:
###test model

scores = []
env.reset()
for game in range(100):
    score = 0
    previous_observation = []
    for step in range(goal_steps):
        #env.render()
        if len(previous_observation) == 0:
            action = env.action_space.sample() #first action is random
        else:
            action = np.argmax(trained_model.predict(previous_observation.reshape(-1, len(previous_observation)))[0])
            #rest of the actions are chosen by the trained model
        
        observation, reward, done, info = env.step(action)
        previous_observation = observation
        score += reward
        if done:
            break
    
    env.reset()
    scores.append(score)
    
print(scores)
print('average score: {}'.format(sum(scores)/len(scores)))

[500.0, 496.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 491.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 470.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0]
average score: 499.57
