In [1]:
###import libraries

import gym
import random
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

Using TensorFlow backend.


In [2]:
###prepare environment
env = gym.make('CartPole-v1') #our environment is CartPole-v1

In [None]:
###understand environment

#Description: in a 2d setting, a cart has to run back and forth to balance a pole.

#Observation: 
#   Type: Box(4)
#    Num	Observation            Min            Max
#    0	Cart Position             -4.8            4.8
#    1	Cart Velocity             -Inf            Inf
#    2	Pole Angle                 -24 deg        24 deg
#    3	Pole Velocity At Tip      -Inf            Inf

#Actions:
#    Type: Discrete(2)
#    Num	Action
#    0	Push cart to the left
#    1	Push cart to the right

#Reward:
#    Reward is 1 for every step taken, including the termination step

#Starting State:
#    All observations are assigned a uniform random value in interval [-0.05, 0.05]

#Episode Termination:
#    Pole Angle is more than 12 degrees
#    Cart Position is more than 2.4 (center of the cart reaches the edge of the display)
#    Episode length is greater than 200
#    Solved Requirements
#    Considered solved when the average reward is greater than or equal to 195.0 over 100 consecutive trials.

In [3]:
###test environment

env.reset() #initializes environment: initial state is shown below
for index in range(1000): #for each step (large number guarantees environment will eventually be complete)...
    env.render() #renders environment
    action = env.action_space.sample() #random action
    observation, reward, done, info = env.step(action) #env.step() executes action
    #the following is returned from each action
    #    observation/state
    #    reward
    #    whether environment is complete
    #    misc. info
    print('step \t\t{}'.format(index+1))
    if action == 0: print('action: \t{} (left)'.format(action))
    elif action == 1: print('action: \t{} (right)'.format(action))
    print('observation: \t{}'.format(observation))
    print('reward: \t{}'.format(reward))
    print('done: \t\t{}\n'.format(done))
#    print('info: \t\t{}'.format(info))
    if done: #if environment is complete, no next step will be taken
        break

step 		1
action: 	0 (left)
observation: 	[ 0.03342328 -0.15057826 -0.01268253  0.32049639]
reward: 	1.0
done: 		False

step 		2
action: 	1 (right)
observation: 	[ 0.03041172  0.04472199 -0.00627261  0.02384101]
reward: 	1.0
done: 		False

step 		3
action: 	0 (left)
observation: 	[ 0.03130615 -0.15030944 -0.00579579  0.31453828]
reward: 	1.0
done: 		False

step 		4
action: 	1 (right)
observation: 	[0.02829997 0.04489459 0.00049498 0.0200332 ]
reward: 	1.0
done: 		False

step 		5
action: 	0 (left)
observation: 	[ 0.02919786 -0.15023446  0.00089564  0.31287225]
reward: 	1.0
done: 		False

step 		6
action: 	0 (left)
observation: 	[ 0.02619317 -0.34536916  0.00715309  0.6058375 ]
reward: 	1.0
done: 		False

step 		7
action: 	1 (right)
observation: 	[ 0.01928579 -0.15034796  0.01926984  0.31541618]
reward: 	1.0
done: 		False

step 		8
action: 	0 (left)
observation: 	[ 0.01627883 -0.34573902  0.02557816  0.61411334]
reward: 	1.0
done: 		False

step 		9
action: 	0 (left)
observation: 	[ 0.0093

In [4]:
env.reset()
goal_steps = 500 #highest number of steps possible
score_requirement = 60  #minimum score of each game so that its observations and actions will be stored in training data
initial_games = 10000 #number of games played to collect data

In [5]:
def model_data_preparation():
    training_data = []
    accepted_scores = []
    for game_index in range(initial_games): #for each game played...
        score = 0 #(total) score starts at 0
        game_memory = [] #initialize game memory
        previous_observation = [] #initialize previous observation
        for index in range(goal_steps): #for each step...
            action = env.action_space.sample() #random action
            observation, reward, done, info = env.step(action) #execute action
            
            if len(previous_observation) > 0:
                game_memory.append([previous_observation, action]) #game_memory stores all observations and actions but the last ones
            
            previous_observation = observation
            score += reward #reward is added to the score
            if done: #if environment is complete, no next step will be taken
                break

        if score >= score_requirement: #if score is greater than or equal to the score requirement...
            accepted_scores.append(score)
            for data in game_memory: #observations and actions of this game are stored in the training data
                action = data[1]
                observation = data[0]
                if action == 0:
                    output = [1, 0] #what is output in this context?
                elif action == 1:
                    output = [0, 1]
                training_data.append([observation, output])
        
        env.reset() #reset environment for next game

    print(accepted_scores)
    return training_data

model_data_preparation()

[61.0, 63.0, 88.0, 82.0, 68.0, 64.0, 68.0, 66.0, 82.0, 77.0, 60.0, 64.0, 68.0, 74.0, 68.0, 66.0, 67.0, 82.0, 60.0, 72.0, 61.0, 67.0, 62.0, 67.0, 69.0, 78.0, 65.0, 68.0, 60.0, 60.0, 67.0, 64.0, 72.0, 61.0, 63.0, 60.0, 86.0, 61.0, 77.0, 60.0, 81.0, 75.0, 73.0, 78.0, 82.0, 64.0, 72.0, 109.0, 67.0, 78.0, 85.0, 61.0, 91.0, 63.0, 64.0, 67.0, 61.0, 64.0, 89.0, 62.0, 97.0, 68.0, 102.0, 80.0, 77.0, 91.0, 62.0, 76.0, 80.0, 101.0, 69.0, 78.0, 77.0, 65.0, 63.0, 62.0, 65.0, 89.0, 63.0, 60.0, 67.0, 65.0, 76.0, 75.0, 65.0, 79.0, 68.0, 60.0, 78.0, 67.0, 68.0, 81.0, 60.0, 74.0, 62.0, 65.0, 75.0, 103.0, 89.0, 63.0, 60.0, 67.0, 70.0, 68.0, 61.0, 69.0, 80.0, 70.0, 75.0, 91.0, 60.0, 86.0, 72.0, 66.0, 60.0, 62.0, 69.0, 61.0, 76.0, 80.0, 63.0, 63.0, 97.0, 61.0, 63.0, 70.0, 69.0, 65.0, 72.0, 98.0, 61.0, 62.0, 63.0, 61.0, 61.0, 73.0, 60.0, 70.0, 80.0, 61.0, 89.0, 66.0, 109.0, 61.0, 64.0, 71.0, 60.0, 83.0, 63.0, 97.0, 62.0, 60.0]


[[array([-0.01216641,  0.18812251,  0.02361646, -0.25773914]), [0, 1]],
 [array([-0.00840396,  0.38289948,  0.01846168, -0.54288052]), [1, 0]],
 [array([-0.00074597,  0.187523  ,  0.00760407, -0.24443832]), [1, 0]],
 [array([ 0.00300449, -0.00770673,  0.0027153 ,  0.05063339]), [1, 0]],
 [array([ 0.00285036, -0.20286751,  0.00372797,  0.34417178]), [1, 0]],
 [array([-0.00120699, -0.3980423 ,  0.01061141,  0.63802795]), [0, 1]],
 [array([-0.00916784, -0.20306991,  0.02337197,  0.34870553]), [1, 0]],
 [array([-0.01322924, -0.39851635,  0.03034608,  0.64866584]), [0, 1]],
 [array([-0.02119956, -0.20382999,  0.04331939,  0.36569114]), [0, 1]],
 [array([-0.02527616, -0.00934957,  0.05063322,  0.0869761 ]), [1, 0]],
 [array([-0.02546316, -0.20515936,  0.05237274,  0.39519417]), [0, 1]],
 [array([-0.02956634, -0.01081813,  0.06027662,  0.11947257]), [0, 1]],
 [array([-0.0297827 ,  0.18339073,  0.06266607, -0.15360107]), [1, 0]],
 [array([-0.02611489, -0.01256992,  0.05959405,  0.1581747 ]), [