In [0]:
# Source: https://towardsdatascience.com/reinforcement-learning-with-ohttps://github.com/openai/gym/wiki/Environmentspenai-d445c2c687d2
#
# https://gym.openai.com/envs/#classic_control
# https://github.com/openai/gym/wiki/Environments

# conda install -n py37 gym[atari] -c conda-forge
# conda install -n py37 gym[all] -c conda-forge
# conda install -n py37 atari_py -c conda-forge
# conda install -n py37 pybox2d -c conda-forge

# Acrobot-v1
# CartPole-v0
# MountainCar-v0
# MountainCarContinuous-v0

# Pong-v0
# MsPacman-v0

# CarRacing-v0

# FrozenLake8x8-v0

# DE AQUI

In [0]:
# https://deeplizard.com/learn/video/a-SnJtmBtyA
# https://www.youtube.com/watch?v=nyjbcRQ-uQ8&list=PLZbbT5o_s2xoWNVdDudn51XM8lOuZ_Njv

# https://arxiv.org/abs/1709# https://deeplizard.com/learn/video/a-SnJtmBtyA.06560

# https://pythonprogramming.net/deep-q-l# https://deeplizard.com/learn/video/a-SnJtmBtyAearning-dqn-reinforcement-learning-python-tutorial/
# https://www.freecodecamp.org/news/an-introduction-to-deep-q-learning-lets-play-doom-54d02d8017d8/
# https://simoninithomas.github.io/Deep_reinforcement_learning_Course/
# https://lilianweng.github.io/lil-log/2018/05/05/implementing-deep-reinforcement-learning-models.html

In [0]:
import numpy as np
import random
import gym
from itertools import count
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam

In [4]:
# Create an environment for a chosen game -- see docs for more games
env = gym.make('FrozenLake-v0', is_slippery=False)

# Show a few random moves on the environment
state = env.reset()
for t in range(5):
    #env.render()
    print("State:", state)
    action = env.action_space.sample()
    print("Action:", action)
    state, reward, done, info = env.step(action)
env.close()

State: 0
Action: 3
State: 0
Action: 1
State: 4
Action: 2
State: 5
Action: 1
State: 5
Action: 1


In [5]:
# PARAMETERS

# Networks (agent)
in_shape = env.observation_space.n # Size of state space
print(f"State space length: {in_shape}")
out_shape = env.action_space.n         # Size of action space
print(f"Action space length: {out_shape}")

# Training
n_episodes = 5000       # Numper of epochs
memory_len = 50_000    # Length of replay memory
batch_size = 128
gamma      = 0.9        # Discount factor for Bellman equation

# Exploration rate
EPS_START = 0.9
EPS_END = 0.01
EPS_DECAY = 2000

State space length: 16
Action space length: 4


In [0]:
def create_agent():
  '''Definition of agent'''
  model = Sequential()
  model.add(Dense(256, input_shape=(in_shape,), activation='relu', kernel_initializer='he_uniform'))
  model.add(Dense(64, activation='relu', kernel_initializer='he_uniform'))
  model.add(Dense(16, activation='relu', kernel_initializer='he_uniform'))
  model.add(Dense(out_shape, activation='linear', kernel_initializer='he_uniform'))
  model.compile(loss="hinge", optimizer='rmsprop')
  return model

In [7]:
# Create both networks
net_policy = create_agent()
net_target = create_agent()
net_target.set_weights(net_policy.get_weights())

net_policy.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 256)               4352      
_________________________________________________________________
dense_1 (Dense)              (None, 64)                16448     
_________________________________________________________________
dense_2 (Dense)              (None, 16)                1040      
_________________________________________________________________
dense_3 (Dense)              (None, 4)                 68        
Total params: 21,908
Trainable params: 21,908
Non-trainable params: 0
_________________________________________________________________


In [0]:
def sample_action(state, eps):
  ''' Generate an action.
      Given the current exploration rate (defined by the number of samples taken
      so far, 'global samples_taken'), the action can be generated either
      by exploration (random) or exploitation (using current knowledge,
      i.e., current model and current state, 'state').
  '''
  # Sample by exploitation or exploration
  if np.random.uniform() > eps:
      state = to_categorical(state, in_shape).reshape(-1, in_shape)
      action = np.argmax(net_policy.predict(state))
  else:
      action = env.action_space.sample()
  return action

In [0]:
def init_mem_replay():
  experience = {"state": 0,
                "action": 0,
                "reward": 0,
                "next_state": 0
                }
  return [experience for _ in range(memory_len)]

In [0]:
def push_experience(state, action, reward, next_state):
  '''
  Stores experience in replay memory.
  Values of experience are encapsulated into a dictionary.
  Append if replay memory has not yet reached max length, or
  replace oldest register.
  '''
  experience = {"state": state,
                "action": action,
                "reward": reward,
                "next_state": next_state
                }
  replay_mem[push_count % memory_len] = experience

In [0]:
def decouple_batch(batch):
  states = []
  actions = []
  rewards = []
  next_states = []
  for experience in batch:
    states.append(experience['state'])
    actions.append(experience['action'])
    rewards.append(experience['reward'])
    next_states.append(experience['next_state'])
  return np.array(states), np.array(actions), np.array(rewards), np.array(next_states)

In [12]:
# Train - initialize environment
env = gym.make('FrozenLake-v0')

# Variables
samples_taken = 0
replay_mem = init_mem_replay()
push_count = 0
n_wins, max_wins = 0, 0 # Count consecutive wins
loss = 1
for episode in range(n_episodes):
  print(f"Episode:{episode+1:4d}/{n_episodes} --", end=' ')

  # Start episode and environment. Get initial state
  state = env.reset()
  for t_step in count():

    # Compute probability of exploration/exploitation given number of samples already taken
    eps = EPS_END + (EPS_START - EPS_END) * np.exp(-1. * samples_taken / EPS_DECAY)
    eps = max(eps, EPS_END)
    samples_taken += 1

    # Simulate one time step
    action = sample_action(state, eps)                    # Sample and action with prob eps
    next_state, reward, done, info = env.step(action)      # Evaluate sampled action

    # Evaluate status and adjust reward
    if done and reward == 0:
      reward = -100                 # we lost, give larger negative reward
    elif not done and reward == 0:
      reward = t_step               # still in the game, reward = num of time steps alive
    elif done and reward != 0:
      reward = 100                  # we won, give large positive reward
    
    # Populate replay memory
    push_experience(state, action, reward, next_state)    # Push new experience into replay memory
    push_count += 1                                       # Update count of pushes into memory
    state = next_state                                    # Update state as given by the last action

    # Optimization
    if push_count >= batch_size:
      # sample a random batch from replay memory
      batch = random.sample(replay_mem, batch_size)
      batch_states, batch_actions, batch_rewards, batch_next_states = decouple_batch(batch)

      # Compute q-values for current and next state, and update for the chosen action
      batch_states = to_categorical(batch_states, in_shape)
      batch_next_states = to_categorical(batch_next_states, in_shape)
      q_vals = net_policy.predict(batch_states)
      q_vals_next = net_target.predict(batch_next_states)
      
      # Updte Q-values by Bellman equation
      if not done:
        q_vals[range(len(q_vals)), batch_actions] = batch_rewards + (gamma * q_vals_next.max(axis=1))
      else:
        q_vals[range(len(q_vals)), batch_actions] = batch_rewards
          
      # Optimize policy network on current batch
      loss = net_policy.train_on_batch(batch_states, q_vals)

    # Update targets network
    #if done or push_count % 50 == 0:
    #    net_target.set_weights(net_policy.get_weights())

    # Check for episode termination
    if done:
      break
  
  net_target.set_weights(net_policy.get_weights())
  
  # Print episode's info
  if reward < 0:
    print(f" failed  ", end=' ')
    n_wins = 0
  elif reward > 0:
    print("*success*", end=' ')
    n_wins += 1
  max_wins = max(max_wins, n_wins)
  print(f"(wins={n_wins:2d}/{max_wins:2d})", end=' ')
  print(f"in {t_step+1:2d} t-steps, @state {state:2d}, eps={eps:.2f}({samples_taken}), loss={loss:.4f}")
    
env.close()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Episode:   2/5000 --  failed   (wins= 0/ 0) in  5 t-steps, @state  5, eps=0.89(23), loss=1.0000
Episode:   3/5000 --  failed   (wins= 0/ 0) in  7 t-steps, @state  5, eps=0.89(30), loss=1.0000
Episode:   4/5000 --  failed   (wins= 0/ 0) in  6 t-steps, @state  5, eps=0.88(36), loss=1.0000
Episode:   5/5000 --  failed   (wins= 0/ 0) in  4 t-steps, @state  7, eps=0.88(40), loss=1.0000
Episode:   6/5000 --  failed   (wins= 0/ 0) in 17 t-steps, @state  7, eps=0.88(57), loss=1.0000
Episode:   7/5000 --  failed   (wins= 0/ 0) in 14 t-steps, @state  5, eps=0.87(71), loss=1.0000
Episode:   8/5000 --  failed   (wins= 0/ 0) in 13 t-steps, @state  5, eps=0.86(84), loss=1.0000
Episode:   9/5000 --  failed   (wins= 0/ 0) in  7 t-steps, @state  5, eps=0.86(91), loss=1.0000
Episode:  10/5000 --  failed   (wins= 0/ 0) in  2 t-steps, @state  5, eps=0.86(93), loss=1.0000
Episode:  11/5000 --  failed   (wins= 0/ 0) in 11 t-steps, @state 12, e

In [19]:
# Once trained, put it to play
state = env.reset()
tot_r, done = 0, False
for ind_step in count():
    env.render()                    # Uncomment to see game running
    state = to_categorical(state, in_shape).reshape(-1, in_shape)
    action = net_policy.predict(state).argmax()
    state, reward, done, info = env.step(action)
    tot_r += reward
    if done:
        break
env.render()        
env.close()
print('Game ended! Total reward: {}'.format(tot_r))


[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Up)
SF[41mF[0mF
FHFH
FFFH
HFFG
  (Right)
SFFF
FH[41mF[0mH
FFFH
HFFG
  (Right)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Right)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Up)
SFFF
FHFH
FFFH
HFF[41mG[0m
Game ended! Total reward: 1.0


In [0]:
# Save trained model for future use
#net_policy.save("net_policy.h5")

In [24]:
max_wins = 0
for i in range(1000):
  state = env.reset()
  done = False
  wins = 0
  for ind_step in count():
    #env.render()                    # Uncomment to see game running
    state = to_categorical(state, in_shape).reshape(-1, in_shape)
    action = net_policy.predict(state).argmax()
    state, reward, done, info = env.step(action)
    if done:
        break
  if reward == 1:
    wins += 1
  else:
    wins = 0
  max_wins = max(max_wins, wins)
env.close()

print('Max number of consecutive wins: {}'.format(max_wins))

Max number of consecutive wins: 1
