In [1]:
import numpy as np
import gym
import random
from tqdm import tqdm

In [3]:
env = gym.make("Breakout-v0")

[2018-11-13 00:27:47,448] Making new env: Breakout-v0


## Create and Init Q Table

In [36]:
action_size = env.action_space.n
state_size = env.observation_space.n
qtable = np.zeros((state_size, action_size))

## Hyperparameters

In [37]:
total_episodes = 50000
total_test_episodes = 100
max_steps = 99

learning_rate = 0.7
discount_rate = 0.618

min_epsilon = 0.01
decay_rate = 0.9999
epsilon = 1.0

## Run Q-Learning Algo


In [38]:
for episode in tqdm(range(total_episodes)):
    
    state = env.reset()
    step = 0
    done = False
    
    for step in range(max_steps):
        
        greedy_probabiliy = random.uniform(0, 1)
        
        if greedy_probabiliy > epsilon:
            # choose action with highest Q value for that state
            action = np.argmax(qtable[state,:])
        else:
            # randomly pick
            action = env.action_space.sample()
            
        new_state, reward, done, _ = env.step(action)
        
        #update q table
        qtable[state, action] = qtable[state, action] + \
            learning_rate * (reward + discount_rate * np.max(qtable[new_state,:]) - qtable[state, action]) 
            
        state = new_state
        
        if done:
            break
            
    epsilon = max(min_epsilon, epsilon*decay_rate)

100%|██████████| 50000/50000 [00:17<00:00, 2923.27it/s]


## See Q Table

In [39]:
qtable

array([[  0.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ],
       [ -1.89494435,  -1.44813002,  -1.89494435,  -1.44813002,
         -0.72512948, -10.44813002],
       [ -0.72512948,   0.4447743 ,  -0.72512948,   0.4447743 ,
          2.33782249,  -8.5552257 ],
       ...,
       [  2.33782249,   5.40100727,   2.33782249,   0.4447743 ,
         -6.66217751,  -6.66217751],
       [ -1.89494435,  -1.44813002,  -1.89494435,  -1.44813002,
        -10.89494435, -10.89494435],
       [ 18.37802094,  10.35761694,  18.37802094,  31.35602094,
          9.37802094,   9.37802094]])

## Play

In [40]:
env.reset()
rewards = []

for episode in tqdm(range(3)):
    state = env.reset()
    step = 0
    done = False
    total_rewards = 0
    
    for step in range(max_steps):
        action = np.argmax(qtable[state,:])
        new_state, reward, done, _ = env.step(action)
        total_rewards += reward
        
        if done:
            rewards.append(total_rewards)
            break
        state = new_state
        
env.close()
print ("Score over time: " +  str(sum(rewards)/total_test_episodes))

100%|██████████| 3/3 [00:00<00:00, 4484.29it/s]

Score over time: 0.33



