In [0]:
# Source: https://towardsdatascience.com/reinforcement-learning-with-ohttps://github.com/openai/gym/wiki/Environmentspenai-d445c2c687d2
#
# https://gym.openai.com/envs/#classic_control
# https://github.com/openai/gym/wiki/Environments

# conda install -n py37 gym[atari] -c conda-forge
# conda install -n py37 gym[all] -c conda-forge
# conda install -n py37 atari_py -c conda-forge
# conda install -n py37 pybox2d -c conda-forge

# Acrobot-v1
# CartPole-v0
# MountainCar-v0
# MountainCarContinuous-v0

# Pong-v0
# MsPacman-v0

# CarRacing-v0

# FrozenLake8x8-v0

# DE AQUI

In [0]:
# https://deeplizard.com/learn/video/a-SnJtmBtyA
# https://www.youtube.com/watch?v=nyjbcRQ-uQ8&list=PLZbbT5o_s2xoWNVdDudn51XM8lOuZ_Njv

# https://pythonprogramming.net/deep-q-learning-dqn-reinforcement-learning-python-tutorial/
# https://www.freecodecamp.org/news/an-introduction-to-deep-q-learning-lets-play-doom-54d02d8017d8/
# https://simoninithomas.github.io/Deep_reinforcement_learning_Course/
# https://lilianweng.github.io/lil-log/2018/05/05/implementing-deep-reinforcement-learning-models.html

In [0]:
import numpy as np
import random
import gym
from itertools import count
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam

In [0]:
# Create an environment for a chosen game -- see docs for more games
env = gym.make('FrozenLake-v0', is_slippery=False)
env.close()

In [0]:
# Agent i/o parameters
in_shape = env.observation_space.n # Size of state space
print(f"State space length: {in_shape}")
out_shape = env.action_space.n         # Size of action space
print(f"Action space length: {out_shape}")

State space length: 16
Action space length: 4


In [0]:
def create_agent():
  '''Definition of agent'''
  model = Sequential()
  model.add(Dense(16, input_shape=(in_shape,), activation='relu', kernel_initializer='he_normal', use_bias=False))
  model.add(Dense(64, activation='relu', kernel_initializer='he_normal', use_bias=False))
  model.add(Dense(16, activation='relu', kernel_initializer='he_normal', use_bias=False))
  model.add(Dense(out_shape, activation='linear', kernel_initializer='he_normal', use_bias=False))
  model.compile(loss="hinge", optimizer=Adam(learning_rate=0.005))
  return model

In [0]:
def sample_action(state, eps):
  ''' Generate an action.
      Given the current exploration rate (defined by the number of samples taken
      so far, 'global samples_taken'), the action can be generated either
      by exploration (random) or exploitation (using current knowledge,
      i.e., current model and current state, 'state').
  '''
  # Sample by exploitation or exploration
  if np.random.uniform() > eps:
      state = to_categorical(state, in_shape).reshape(-1, in_shape) # adjust accordinly
      action = np.argmax(net_policy.predict(state))
  else:
      action = env.action_space.sample()
  return action

In [0]:
def init_mem_replay():
  experience = {"state": 0,
                "action": 0,
                "reward": 0,
                "next_state": 0
                }
  return [experience for _ in range(memory_len)]

In [0]:
def push_experience(state, action, reward, next_state, done):
  '''
  Stores experience in replay memory.
  Values of experience are encapsulated into a dictionary.
  Append if replay memory has not yet reached max length, or
  replace oldest register.
  '''
  experience = {"state": state,
                "action": action,
                "reward": reward,
                "next_state": next_state,
                "done": done
                }
  replay_mem[push_count % memory_len] = experience

In [0]:
def decouple_batch(batch):
  states = []
  actions = []
  rewards = []
  next_states = []
  dones = []
  for experience in batch:
    states.append(experience['state'])
    actions.append(experience['action'])
    rewards.append(experience['reward'])
    next_states.append(experience['next_state'])
    dones.append(experience['done'])
  return np.array(states), np.array(actions), np.array(rewards), np.array(next_states), np.array(dones)

In [0]:
# Train - initialize environment
env = gym.make('FrozenLake-v0', is_slippery=False)

# Parameters
n_episodes = 2000
batch_size = 64
memory_len = 50_000
gamma = 0.99
update_rate = 10

# Exploration/exploitation rate parameters
EPS_START = 1.0
EPS_END = 0.01
EPS_DECAY = 0.98

# Initialize agent
net_policy = create_agent()
net_target = create_agent()
net_target.set_weights(net_policy.get_weights())

# Variables
replay_mem = init_mem_replay()
eps = EPS_START
push_count = 0
n_wins, max_wins = 0, 0 # Count consecutive wins
loss = 1.

# Start training
for episode in range(n_episodes):
  print(f"Episode:{episode+1:4d}/{n_episodes} --", end=' ')

  # Reset environment and get initial state
  state = env.reset()
  for t_step in count():
    # Probability of exploration/exploitation. And number of samples already taken
    eps *= EPS_DECAY
    if eps < EPS_END:
      eps = EPS_START

    # Simulate one time step
    action = sample_action(state, eps)                # Sample and action with prob eps
    next_state, reward, done, info = env.step(action)      # Evaluate sampled action

    # Adjust reward (custom reward function)
    if done and reward == 0:    # Lost
      reward = -20
    elif done and reward == 1:  # Win
      reward = 20
    else:                       # Just still alive
      reward = -0.1 * t_step

    # Store experience in replay memory
    push_experience(state, action, reward, next_state, done)    # Push new experience into replay memory
    push_count += 1                                             # Update count of pushes into memory
    state = next_state                                          # Update state as given by the last action

    if push_count >= batch_size:
      # sample a random batch from replay memory
      lim_mem = min(push_count, memory_len)
      batch = random.sample(replay_mem[:lim_mem], batch_size)
      batch_states, batch_actions, batch_rewards, batch_next_states, batch_dones = decouple_batch(batch)

      # Compute q-values for current and next state, and update for the chosen action
      batch_states = to_categorical(batch_states, in_shape)
      batch_next_states = to_categorical(batch_next_states, in_shape)
      q_vals = net_policy.predict(batch_states)
      q_vals_next = net_target.predict(batch_next_states)

      # Updte Q-values by Bellman equation (general update vs wins)
      for q_ind, (don, act, rew, q_next) in enumerate(zip(batch_dones, batch_actions, batch_rewards, q_vals_next)):
        if not don:
          q_vals[q_ind, act] = rew + gamma * q_next.max()
        else:
          q_vals[q_ind, act] = rew

      # Retrain network
      loss = net_policy.train_on_batch(batch_states, q_vals)

    # Check for episode termination
    if done:
      break

  # Update target network
  if episode % update_rate == 0:
    net_target.set_weights(net_policy.get_weights())
  
  # Print episode's info
  if reward <= 0:
    print(" failed  ", end=' ')
    n_wins = 0
  elif reward > 0:
    print("*success*", end=' ')
    n_wins += 1
  max_wins = max(max_wins, n_wins)
  print(f"(wins={n_wins:2d}/{max_wins:2d})", end=' ')
  print(f"in {t_step+1:3d} t-steps, @state {state:2d}, eps={eps:.2f}({push_count:6d}), loss={loss:.4f}")
  
env.close()

Episode:   1/2000 --  failed   (wins= 0/ 0) in   3 t-steps, @state  5, eps=0.94(     3), loss=1.0000
Episode:   2/2000 --  failed   (wins= 0/ 0) in   5 t-steps, @state 12, eps=0.85(     8), loss=1.0000
Episode:   3/2000 --  failed   (wins= 0/ 0) in   4 t-steps, @state  5, eps=0.78(    12), loss=1.0000
Episode:   4/2000 --  failed   (wins= 0/ 0) in   6 t-steps, @state 12, eps=0.70(    18), loss=1.0000
Episode:   5/2000 --  failed   (wins= 0/ 0) in   2 t-steps, @state  5, eps=0.67(    20), loss=1.0000
Episode:   6/2000 --  failed   (wins= 0/ 0) in  10 t-steps, @state 12, eps=0.55(    30), loss=1.0000
Episode:   7/2000 --  failed   (wins= 0/ 0) in   8 t-steps, @state  7, eps=0.46(    38), loss=1.0000
Episode:   8/2000 --  failed   (wins= 0/ 0) in   2 t-steps, @state  5, eps=0.45(    40), loss=1.0000
Episode:   9/2000 -- *success* (wins= 1/ 1) in   9 t-steps, @state 15, eps=0.37(    49), loss=1.0000
Episode:  10/2000 --  failed   (wins= 0/ 1) in   6 t-steps, @state  7, eps=0.33(    55), lo

In [0]:
# Validate on a series of several consecutive runs
env = gym.make('FrozenLake-v0', is_slippery=True)
wins = 0
max_wins = 0
for i in range(10):
  state = env.reset()
  done = False
  for ind_step in count():
    #env.render()                    # Uncomment to see game running
    state = to_categorical(state, in_shape).reshape(-1, in_shape)
    action = net_policy.predict(state).argmax()
    state, reward, done, info = env.step(action)
    if done:
        break
  if reward == 1:
    #print("Win!")
    wins += 1
  else:
    #print("Lose!")
    wins = 0
  max_wins = max(max_wins, wins)
env.close()

print('Max number of consecutive wins: {}'.format(max_wins))

Max number of consecutive wins: 0


In [0]:
# See output for one single episode
env = gym.make('FrozenLake-v0', is_slippery=True)
state = env.reset()
tot_r, done = 0, False
for ind_step in count():
    env.render()                    # Uncomment to see game running
    state = to_categorical(state, in_shape).reshape(-1, in_shape)
    action = net_policy.predict(state).argmax()
    state, reward, done, info = env.step(action)
    tot_r += reward
    if done:
        break
env.render()        
env.close()
print('Game ended! Total reward: {}'.format(tot_r))


[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Left)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Left)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Left)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
Game ended! Total reward: 0.0


In [0]:
# Save trained model for future use
#net_policy.save("net_policy.h5")