# Q Neural Network

### Dependencies

In [1]:
import os
import sys

import gym
import numpy as np
import tensorflow as tf

### Helpers

In [2]:
def one_hot(length, idx):
    encode = np.zeros(shape=[length])
    encode[idx] = 1.
    return encode

### Environment

In [3]:
env_name = 'FrozenLake8x8-v0'
env = gym.make(env_name)

In [4]:
n_states = env.env.nS
n_actions = env.env.nA
print(f'{env_name.replace("-v0", "")} has {n_states:,} states & {n_actions:,} actions')

FrozenLake8x8 has 64 states & 4 actions


### Network

In [5]:
tf.reset_default_graph()

# inputs & targets (states & actions)
inputs = tf.placeholder(tf.float32, shape=[n_states])
target = tf.placeholder(tf.float32, shape=[1, n_actions])

# reshape
X_reshape = tf.reshape(inputs, shape=[1, n_states])

# weights
weight = tf.Variable(tf.random_normal(shape=[n_states, n_actions], mean=0, stddev=0.4))

# Q value prediction
Q_value = tf.matmul(X_reshape, weight)
predict = tf.argmax(Q_value, axis=1)

### Loss & Optimizer

In [6]:
loss = tf.reduce_mean(tf.squared_difference(target, Q_value))

global_step = tf.Variable(0, trainable=False, name='global_step')
optimizer = tf.train.RMSPropOptimizer(learning_rate=1e-1)
train = optimizer.minimize(loss, global_step=global_step)

## Training

### Tensorflow's `Session`

In [7]:
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

In [8]:
save_dir = f'saved/{env_name}/'
tensorboard_dir = os.path.join(save_dir, 'tensorboard')
logdir = os.path.join(tensorboard_dir, 'summary')
model_dir = os.path.join(save_dir, 'models/')
model_path = os.path.join(model_dir, 'model.ckpt')

saver = tf.train.Saver()
writer = tf.summary.FileWriter(logdir=logdir, graph=sess.graph)

tf.summary.scalar('loss', loss)
merged = tf.summary.merge_all()

In [9]:
if tf.gfile.Exists(model_dir):
    try:
        sys.stdout.write('INFO: Attempting to load latest checkpoint\n')
        last_ckpt = tf.train.latest_checkpoint(model_dir)
        saver.restore(sess=sess, save_path=last_ckpt)
        sys.stdout.write(f'INFO: Successfully loaded ckeckpoint – {last_ckpt}\n')
        sys.stdout.flush()
    except Exception as e:
        sys.stderr.write(f'WARN: Could not load checkpoint. {e}\n')
        sys.stderr.flush()
else:
    tf.gfile.MakeDirs(model_dir)
    sys.stdout.write(f'Created checkpoint directory — {model_dir}\n')

INFO: Attempting to load latest checkpoint
INFO:tensorflow:Restoring parameters from saved/FrozenLake8x8-v0/models/model.ckpt-259347
INFO: Successfully loaded ckeckpoint – saved/FrozenLake8x8-v0/models/model.ckpt-259347


### Training Hyperparameters

In [10]:
gamma = 0.9
epsilon = 0.1
episodes = 10000
max_trans_per_episode = 50

### Train loop

In [None]:
total_reward, wins = 0, 0
for episode in range(episodes):
    state, done = env.reset(), False
    max_trans = 0
    
    while max_trans < max_trans_per_episode:
        max_trans += 1
        action, Q = sess.run([predict, Q_value], 
                             feed_dict={inputs: one_hot(n_states, state)})
        # Epsilon Greedy Exploration
        if np.random.randn(1) < epsilon:
            action[0] = env.action_space.sample()
        # Take the action
        new_state, reward, done, _ = env.step(action[0])
        # Get Q´ values for the next_state
        new_Q = sess.run(Q_value, feed_dict={inputs: one_hot(n_states, new_state)})
        Q[0, action[0]] = reward + gamma * np.max(new_Q)
        # Train network
        _, _i_global = sess.run([train, global_step], 
                                feed_dict={inputs: one_hot(n_states, state), target: Q})
        state = new_state
        total_reward += reward
        if done:
            wins += 1
            sys.stdout.write(f'\rEpisode: {episode+1:,}\tGlobal steps: {_i_global:,}\tWins: {wins:,}\t'
                             f'Num transistions: {max_trans:,}\tTotal reward: {total_reward}')
            sys.stdout.flush()
            break
    if episode % 1000 == 0:
        saver.save(sess=sess, save_path=model_path, global_step=global_step)
        summary = sess.run(merged, feed_dict={inputs: one_hot(n_states, state), target: Q})
        writer.add_summary(summary=summary, global_step=_i_global)
        print('')

Episode: 1	Global steps: 259,396	Wins: 1	Num transistions: 49	Total reward: 0.0
Episode: 1,001	Global steps: 288,562	Wins: 817	Num transistions: 10	Total reward: 1.0
Episode: 2,001	Global steps: 316,607	Wins: 1,632	Num transistions: 36	Total reward: 1.0
Episode: 2,365	Global steps: 327,256	Wins: 1,925	Num transistions: 27	Total reward: 1.0