In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = ""

import gym # openai to render environment for agent to take action in and receive feedback in
import random
import numpy as np
from collections import deque # for memory of agent
from keras.models import Sequential # to build neural network for aproximating optimal Q
from keras.layers import Dense # theta weights - weights and bias of neurons in between layers of dense network
from keras.optimizers import Adam # stochastic gradient descent

#### Set Parameters

In [2]:
# Build the CartPole-v0 environment; `env` is the handle the later cells
# query for its observation/action spaces and then reset, step and render.
env = gym.make("CartPole-v0")

In [3]:
# Width of the network's input layer: the length of the first axis of the
# environment's observation space.
observation_shape = env.observation_space.shape
state_size = observation_shape[0]
state_size

4

In [4]:
# Width of the network's output layer: one Q-value per discrete action.
action_space = env.action_space
action_size = action_space.n
action_size

2

In [5]:
# Training configuration.
batch_size = 32  # transitions sampled from memory per replay step (vary by powers of 2)
n_episodes = 1001  # number of games to play
output_dir = "model_output/cartpole"  # directory that receives the weight checkpoints

# Create the checkpoint directory (and any missing parents). `exist_ok=True`
# makes re-running this cell a no-op and avoids the check-then-create race
# of testing `os.path.exists` first.
os.makedirs(output_dir, exist_ok=True)

The idea is that the more games we play, the more data we collect for training. After each episode, the agent trains on a random sample of the experiences it remembers from previous episodes.

#### Agent Class

In [18]:
class Agent:
    """Deep Q-learning agent with an experience-replay memory.

    A small dense Keras network approximates the action-value function:
    it maps a state of shape ``(1, state_size)`` to one Q-value per action.
    Actions are chosen epsilon-greedily, and the network is trained on
    random minibatches drawn from a bounded memory of past transitions.
    """

    def __init__(self, state_size, action_size):
        """Store the problem dimensions, set hyperparameters and build the network.

        Args:
            state_size: number of features in one observation (network input width).
            action_size: number of discrete actions (network output width).
        """
        self.state_size = state_size
        self.action_size = action_size

        # Replay memory. The bounded deque silently drops the oldest
        # transitions, so training never has to revisit the whole history.
        self.memory = deque(maxlen=2000)

        # --- hyperparameters ---
        self.gamma = 0.95  # discount factor applied to estimated future reward

        # Two modes of action:
        #   exploration:  pick a random action to discover new behaviour
        #   exploitation: pick the action the network currently rates best
        self.epsilon = 1.0  # initial probability of exploring
        self.epsilon_decay = 0.995  # multiplicative decay applied once per replay() call
        self.epsilon_min = 0.01  # floor that decay must not go below

        self.learning_rate = 0.001  # optimizer step size

        # The leading underscore marks _build_model as an internal helper.
        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the dense network that approximates Q(state, action).

        Returns:
            A compiled Keras ``Sequential`` model with ``state_size`` inputs and
            ``action_size`` linear outputs.
        """
        model = Sequential()

        model.add(Dense(24, input_dim=self.state_size, activation="relu"))  # hidden layer
        model.add(Dense(24, activation="relu"))  # hidden layer

        # Output layer: one neuron per action. Linear activation because the
        # outputs are Q-value estimates, not probabilities.
        model.add(Dense(self.action_size, activation="linear"))

        # NOTE(review): newer Keras releases name this argument `learning_rate`
        # instead of `lr` -- confirm against the installed Keras version.
        model.compile(loss="mse", optimizer=Adam(lr=self.learning_rate))

        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory.

        The stored ``(state, action, reward, next_state, done)`` tuple is what
        replay() later samples to build training targets.
        """
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action for ``state`` using an epsilon-greedy policy.

        With probability ``epsilon`` a random action is returned (explore);
        otherwise the action with the highest predicted Q-value is returned
        (exploit). As epsilon decays, exploitation becomes more likely.

        Args:
            state: observation shaped ``(1, state_size)``.

        Returns:
            Index of the chosen action.
        """
        # explore
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)

        # exploit: row 0 of the prediction holds the Q-values for this state
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])

    def replay(self, batch_size):
        """Train the network on a random minibatch of remembered transitions.

        For every sampled transition the target for the taken action is the
        reward alone when the episode ended there, otherwise the reward plus
        the discounted best Q-value predicted for the next state. After the
        batch, epsilon is decayed once, never dropping below ``epsilon_min``.

        Args:
            batch_size: number of transitions to sample. If fewer are stored,
                all stored transitions are used; with an empty memory the
                call does nothing.
        """
        if not self.memory:
            return

        # Never request more samples than exist; random.sample would raise.
        sample_size = min(batch_size, len(self.memory))
        minibatch = random.sample(self.memory, sample_size)

        for state, action, reward, next_state, done in minibatch:
            # Terminal transition: there is no future reward to add.
            target = reward
            if not done:
                # Reward plus discounted estimate of the best future value.
                target = (reward + self.gamma * np.amax(self.model.predict(next_state)[0]))

            # Start from the current predictions so that only the Q-value of
            # the action actually taken contributes to the loss.
            target_f = self.model.predict(state)
            target_f[0][action] = target

            self.model.fit(state, target_f, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            # Decay the exploration rate, clamped so one multiplication
            # cannot carry it below the configured floor.
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def load(self, name):
        """Load network weights from the file ``name``."""
        self.model.load_weights(name)

    def save(self, name):
        """Save the current network weights to the file ``name``."""
        self.model.save_weights(name)

#### Create an Instance of the Game

In [19]:
# Build the agent with input/output widths matching the environment.
agent = Agent(state_size=state_size, action_size=action_size)

#### Agent interacts with Environment

In [20]:
# Main training loop: play `n_episodes` games, storing every transition in
# the agent's memory, training on a random minibatch after each episode and
# checkpointing the weights every 50 episodes.
done = False
for e in range(n_episodes):

    # start each episode at the beginning state
    state = env.reset()

    # add a leading batch axis so the observation has shape (1, state_size)
    state = np.reshape(state, [1, state_size])

    # iterate over the time steps of the game
    for time in range(5000):

        env.render()
        action = agent.act(state)
        # values returned from taking one step forward
        next_state, reward, done, info = env.step(action)

        # Penalise only a real failure. When the episode merely ran into the
        # environment's step limit the normal reward is kept.
        # NOTE(review): relies on Gym's TimeLimit wrapper setting
        # info["TimeLimit.truncated"]; on versions that do not set it, every
        # `done` is penalised as before.
        truncated = bool(info.get("TimeLimit.truncated", False))
        if done and not truncated:
            reward = -10

        next_state = np.reshape(next_state, [1, state_size])

        # store the transition for later replay
        agent.remember(state, action, reward, next_state, done)

        # move into the next state
        state = next_state

        if done:
            print("episode: {}/{}, score: {}, e: {:.2}".format(e, n_episodes, time, agent.epsilon))
            break

    if len(agent.memory) > batch_size:
        # train the network on a random minibatch of remembered transitions
        agent.replay(batch_size)

    if e % 50 == 0:
        # Checkpoint the weights so an earlier state can be restored if the
        # agent regresses. os.path.join puts the file inside output_dir;
        # plain string concatenation dropped the path separator.
        agent.save(os.path.join(output_dir, "weights_{:04d}.hdf5".format(e)))

episode: 0/1001, score: 15, e: 1.0
episode: 1/1001, score: 14, e: 1.0
episode: 2/1001, score: 49, e: 1.0
episode: 3/1001, score: 10, e: 0.99
episode: 4/1001, score: 21, e: 0.99
episode: 5/1001, score: 18, e: 0.99
episode: 6/1001, score: 15, e: 0.98
episode: 7/1001, score: 19, e: 0.98
episode: 8/1001, score: 24, e: 0.97
episode: 9/1001, score: 12, e: 0.97
episode: 10/1001, score: 14, e: 0.96
episode: 11/1001, score: 15, e: 0.96
episode: 12/1001, score: 37, e: 0.95
episode: 13/1001, score: 28, e: 0.95
episode: 14/1001, score: 46, e: 0.94
episode: 15/1001, score: 12, e: 0.94
episode: 16/1001, score: 46, e: 0.93
episode: 17/1001, score: 22, e: 0.93
episode: 18/1001, score: 8, e: 0.92
episode: 19/1001, score: 13, e: 0.92
episode: 20/1001, score: 10, e: 0.91
episode: 21/1001, score: 50, e: 0.91
episode: 22/1001, score: 33, e: 0.9
episode: 23/1001, score: 10, e: 0.9
episode: 24/1001, score: 13, e: 0.9
episode: 25/1001, score: 15, e: 0.89
episode: 26/1001, score: 8, e: 0.89
episode: 27/1001, s

episode: 220/1001, score: 91, e: 0.34
episode: 221/1001, score: 43, e: 0.33
episode: 222/1001, score: 39, e: 0.33
episode: 223/1001, score: 19, e: 0.33
episode: 224/1001, score: 78, e: 0.33
episode: 225/1001, score: 71, e: 0.33
episode: 226/1001, score: 67, e: 0.33
episode: 227/1001, score: 44, e: 0.32
episode: 228/1001, score: 75, e: 0.32
episode: 229/1001, score: 57, e: 0.32
episode: 230/1001, score: 51, e: 0.32
episode: 231/1001, score: 30, e: 0.32
episode: 232/1001, score: 33, e: 0.32
episode: 233/1001, score: 83, e: 0.31
episode: 234/1001, score: 67, e: 0.31
episode: 235/1001, score: 49, e: 0.31
episode: 236/1001, score: 29, e: 0.31
episode: 237/1001, score: 79, e: 0.31
episode: 238/1001, score: 52, e: 0.31
episode: 239/1001, score: 44, e: 0.3
episode: 240/1001, score: 128, e: 0.3
episode: 241/1001, score: 28, e: 0.3
episode: 242/1001, score: 36, e: 0.3
episode: 243/1001, score: 91, e: 0.3
episode: 244/1001, score: 85, e: 0.3
episode: 245/1001, score: 51, e: 0.3
episode: 246/1001,

KeyboardInterrupt: 