In [196]:
%reset
'''
From OpenAI gym.
https://gym.openai.com/envs/CartPole-v1/

A pole is attached by an un-actuated joint to a cart, which moves along a frictionless track. 
The system is controlled by applying a force of +1 or -1 to the cart. 
The pendulum starts upright, and the goal is to prevent it from falling over.
A reward of +1 is provided for every timestep that the pole remains upright. 
The episode ends when the pole is more than 15 degrees from vertical, 
or the cart moves more than 2.4 units from the center.
'''

'\nFrom OpenAI gym.\nhttps://gym.openai.com/envs/CartPole-v1/\n\nA pole is attached by an un-actuated joint to a cart, which moves along a frictionless track. \nThe system is controlled by applying a force of +1 or -1 to the cart. \nThe pendulum starts upright, and the goal is to prevent it from falling over.\nA reward of +1 is provided for every timestep that the pole remains upright. \nThe episode ends when the pole is more than 15 degrees from vertical, \nor the cart moves more than 2.4 units from the center.\n\nSource: https://keon.io/deep-q-learning/\n'

In [197]:
import numpy as np
#import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import gym
from tensorflow import keras

import warnings
# Silences every warning category for the rest of the session, which also
# hides deprecation warnings raised by the libraries imported above.
warnings.filterwarnings('ignore')

In [200]:
# Episode budget for the experiments in this notebook.
EPISODES = 10

# Create the CartPole-v1 environment and keep the first observation
# returned by reset() for later inspection.
env = gym.make('CartPole-v1')
state_initial = env.reset()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [181]:
# Take one random step to inspect what the environment returns.
# np.random.random_integers(low=0, high=1) is deprecated (and removed in
# NumPy 2.0); randint's upper bound is exclusive, so randint(0, 2) draws
# from the same set {0, 1}.
action = np.random.randint(0, 2)
env.reset()
next_state, reward, done, _ = env.step(action)
print(next_state)  # observation after the step
print(reward)      # reward returned for this step
print(done)        # whether the episode has terminated

[ 0.01337734 -0.1915353   0.03364685  0.31667388]
1.0
False


In [87]:
# Length of the observation vector. For CartPole this is 4 (cart position,
# cart velocity, pole angle, pole angular velocity), not just angle and position.
state_size = env.observation_space.shape[0]
# Number of discrete actions: 0 or 1, pushing the cart left or right.
action_size = env.action_space.n

In [195]:
# `build_model` refers to `tf.keras` / `tf.train`, but the notebook's import
# cell only does `from tensorflow import keras`, which does not bind `tf`.
# Import it here so this cell runs on a fresh kernel.
import tensorflow as tf

# ------------------------------------
LEARNING_RATE = 0.001
# ------------------------------------

def build_model(state_size, action_size):
    """Build and compile the neural net used for Deep-Q learning.

    The network is a stack of two 24-unit ReLU hidden layers followed by a
    linear output layer with one unit per action.

    Args:
        state_size: number of input features (length of the observation vector).
        action_size: number of output units (one value per available action).

    Returns:
        A compiled `tf.keras.Sequential` model using MSE loss, an MAE metric
        and an Adam optimizer with learning rate `LEARNING_RATE`.
    """
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.InputLayer(input_shape=(state_size,)))
    model.add(tf.keras.layers.Dense(24, activation='relu'))
    model.add(tf.keras.layers.Dense(24, activation='relu'))
    # Linear output: the values are unbounded regression targets.
    model.add(tf.keras.layers.Dense(action_size, activation='linear'))

    # NOTE: tf.train.AdamOptimizer is the TensorFlow 1.x optimizer API; under
    # TensorFlow 2.x it is only available as tf.compat.v1.train.AdamOptimizer.
    model.compile(
        loss='mse',
        metrics=['mae'],
        optimizer=tf.train.AdamOptimizer(learning_rate=LEARNING_RATE))
    return model

model = build_model(state_size, action_size)

In [194]:
# Inspect the shape of the observation returned by env.step().
next_state.shape

(4,)

In [19]:
"""
Solves the cartpole-v1 environment on OpenAI gym using policy search
Same algorithm as for cartpole-v0
A neural network is used to store the policy
At the end of each episode the target value for each taken action is
updated with the total normalized reward (up to a learning rate)
Then a standard supervised learning backprop on the entire batch is
executed
"""

import numpy as np
import numpy.matlib 

import gym
from gym import wrappers

import keras
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import SGD, Adam, RMSprop
from keras.utils import np_utils

# ------------------------------------
# Tunable settings for this experiment
NumEpisodes = 300                 # number of training episodes
EPS_DECAY = 0.01                  # amount epsilon is reduced by before each episode
OBS_SCALE = (2.5, 2.5, 0.2, 2.5)  # per-component divisor used to normalize observations
MAX_EPISODE_REWARD = 500.0        # maximum reward per episode, used to normalize rewards
ALPHA = 0.1                       # step size for moving a taken action's value toward the episode reward
# ------------------------------------


def normalize_observation(observation):
    """Return a scaled copy of `observation`, dividing each component by OBS_SCALE.

    A new array is returned so the array handed back by the environment is
    never modified in place.
    """
    return observation / np.asarray(OBS_SCALE, dtype=observation.dtype)


#initialize neural network to store policy
# Keras 2 names the weight-initializer argument `kernel_initializer`
# (`init` was the Keras 1 spelling).
ActorNet = Sequential()
ActorNet.add(Dense(200, kernel_initializer='he_normal', input_dim=4, activation='relu'))
ActorNet.add(Dense(200, kernel_initializer='he_normal', activation='relu'))
ActorNet.add(Dense(2, kernel_initializer='he_normal', activation='sigmoid'))
ActorNet.compile(loss='mse', optimizer='RMSprop', metrics=['mae'])

#load environment
env = gym.make('CartPole-v1')
# NOTE(review): Monitor writes its files into ./monitor; re-running this cell
# against a directory that already holds monitor files may fail unless the
# directory is cleared or force=True is passed -- confirm for your gym version.
env = gym.wrappers.Monitor(env, 'monitor')

TotalReward = 0
BufferSize = 0  # not used by this cell
eps = 1

# try/finally guarantees the environment (and its Monitor) is closed even if
# training is interrupted or raises part-way through.
try:
    #start learning
    for episode in range(NumEpisodes):

        #observe and normalize the initial state
        observation = normalize_observation(env.reset())

        #per-episode memory: one entry per time step
        States = []
        ActionValues = []
        Actions = []

        EpisodeReward = 0

        #decrease epsilon after each episode, never below zero
        eps = max(eps - EPS_DECAY, 0)

        done = False
        while not done:

            #show graphical environment
            #env.render()

            #evaluate NN to find action values for current (normalized) state
            ActionValue = ActorNet.predict(observation.reshape(1, 4), verbose=0).reshape(2,)

            #select best action eps-greedy with decay
            if np.random.random() < eps:
                Action = np.random.randint(2)
            else:
                Action = np.argmax(ActionValue)

            #execute action
            observation_new, reward, done, info = env.step(Action)

            #normalize reward so a full-length episode sums to 1
            EpisodeReward += reward / MAX_EPISODE_REWARD

            #save current movement in memory to assign rewards at end of episode
            States.append(observation)
            ActionValues.append(ActionValue)
            Actions.append(Action)

            #update state
            observation = normalize_observation(observation_new)

        #number of time steps in the finished episode (always >= 1)
        t = len(States)

        #assemble the training batch for the finished episode
        batch_in = np.array(States, dtype=np.float64).reshape(t, 4)         #input states
        batch_tar = np.array(ActionValues, dtype=np.float64).reshape(t, 2)  #target action values

        #only update action value for actions that were taken, leave others unchanged
        taken = (np.arange(t), np.asarray(Actions))
        batch_tar[taken] = batch_tar[taken] * (1 - ALPHA) + EpisodeReward * ALPHA

        #update weights of NN based on last completed episode
        #train_on_batch returns [loss, mae] because a metric was compiled in
        loss = ActorNet.train_on_batch(batch_in, batch_tar)[0]

        print('Episode {0}, reward = {1}'.format(episode,EpisodeReward))

        TotalReward += EpisodeReward

    print('Total reward = {0}'.format(TotalReward))
    #ActorNet.save('CPv1_model.h5')
finally:
    env.close()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
episode: 0/50, score: 24, e: 1.0
episode: 1/50, score: 15, e: 0.92
episode: 2/50, score: 10, e: 0.83
episode: 3/50, score: 9, e: 0.76
episode: 4/50, score: 16, e: 0.65
episode: 5/50, score: 9, e: 0.59
episode: 6/50, score: 18, e: 0.49
episode: 7/50, score: 18, e: 0.41
episode: 8/50, score: 15, e: 0.36
episode: 9/50, score: 9, e: 0.32
episode: 10/50, score: 9, e: 0.3
episode: 11/50, score: 9, e: 0.27
episode: 12/50, score: 11, e: 0.24
episode: 13/50, score: 40, e: 0.16
episode: 14/50, score: 85, e: 0.069
episode: 15/50, score: 119, e: 0.021
episode: 16/50, score: 76, e: 0.0099
episode: 17/50, score: 99, e: 0.0099
episode: 18/50, score: 62, e: 0.0099
episode: 19/50, score: 42, e: 0.0099
episode: 20/50, score: 61, e: 0.0099
episode: 21/50, score: 78, e: 0.0099
episode: 22/50, score: 109, e: 0.0099
episode: 23/50, score: 88, e: 0.0099
episode: 24/50, score: 76, e: 0.0099
episode: 25/