# Assessment Guide for NICF – Practical Reinforcement Learning for Beginners

## Part 1: Q-Learning

In [None]:
import numpy as np
import gym
import time

# Hyperparameters for the tabular Q-learning run in Part 1.
# Step size applied to the temporal-difference error in the Q update.
lr = 0.33
# Discount factor applied to the best next-state value.
gamma = 0.8
# Probability of taking a random action instead of the greedy one.
epsilon = 0.1
# Number of training episodes.
episodes = 10000

env = gym.make('FrozenLake-v0')
# Alternative: the 8x8 variant of the environment (swap with the line above).
#env = gym.make("FrozenLake8x8-v0")

In [None]:
# Size of the observation space; reused later as the number of states when
# the policy and value function are extracted from Q.
N_STATES = env.observation_space.n
print(N_STATES)
# Show the action space object for reference.
print("Action space:", env.action_space)

In [None]:
## Initialize Q Table
# One row per state and one column per action; every estimate starts at zero.

Q = np.zeros((env.observation_space.n, env.action_space.n))
print(Q)

In [None]:
## Q-Learning
# Act epsilon-greedily, then move Q[s, a] towards the reward plus the
# discounted best value of the next state.

for episode_number in range(1, episodes + 1):
    print("Episode {}/{}".format(episode_number, episodes))
    s, done = env.reset(), False
    while True:
        # Draw the exploration coin first, then pick the matching action.
        explore = np.random.random() < epsilon
        a = env.action_space.sample() if explore else np.argmax(Q[s])

        s_, r, done, _ = env.step(a)

        td_target = r + gamma * np.max(Q[s_])
        Q[s, a] = Q[s, a] + lr * (td_target - Q[s, a])
        s = s_

        if done:
            break

In [None]:
# Print Final Q Table
# Shows the action-value estimates as they stand after the training loop.
print(Q)

In [None]:
## Compute the # of Steps and Total Rewards
# Play a single episode, always taking the highest-valued action for the
# current state, rendering every frame and tallying steps and reward.

step_count, total_reward = 0, 0
s, done = env.reset(), False

while True:
    env.render()
    a = Q[s].argmax()
    s_, r, done, _ = env.step(a)
    s = s_
    step_count = step_count + 1
    total_reward = total_reward + r
    # Short pause so the rendered frames are readable.
    time.sleep(0.1)
    if done:
        break

In [None]:
# Report the step and reward tallies collected by the evaluation episode.
print("Total steps: ",step_count)
print("Total rewards: ",total_reward)

In [None]:
## Compute Optimal Policy and Value Function
# For each state: the policy is the index of the largest Q entry and the
# value is that largest entry itself.

policy = {S: np.argmax(Q[S]) for S in range(N_STATES)}
V = np.array([np.max(Q[S]) for S in range(N_STATES)], dtype=float)
print('Optimal policy :', policy)
print('Optimal value function: ', V)

## Part 2: Q-Learning with Epsilon Decay

In [None]:
import numpy as np
import gym
import time

# Hyperparameters for Q-learning with decaying learning rate and exploration.
# Initial step size for the Q update.
lr = 0.33
# The training loop stops shrinking lr once it is no longer above this value.
lrMin = 0.001
# Factor lr is multiplied by after every environment step.
lrDecay = 0.9999
# Discount factor applied to the best next-state value.
gamma = 0.8
# Initial probability of a random action (1.0 = fully random at the start).
epsilon = 1.0
# The training loop stops shrinking epsilon once it is no longer above this value.
epsilonMin = 0.001
# Factor epsilon is multiplied by after a step that returned a non-zero reward.
epsilonDecay = 0.97
# Number of training episodes.
episodes = 10000

env = gym.make('FrozenLake-v0')
# Alternative: the 8x8 variant of the environment (swap with the line above).
#env = gym.make("FrozenLake8x8-v0")

In [None]:
## Initialize Q Table
# One row per state and one column per action; every estimate starts at zero.

Q = np.zeros((env.observation_space.n, env.action_space.n))
print(Q)

In [None]:
## Q-Learning
# Epsilon-greedy Q-learning in which both the learning rate and the
# exploration rate shrink during training.

for i in range(episodes):
    print("Episode {}/{}".format(i + 1, episodes))
    s = env.reset()
    done = False

    while not done:
        # Behaviour policy: random action with probability epsilon,
        # otherwise the action with the highest current estimate.
        if np.random.random() < epsilon:
            a = env.action_space.sample()
        else:
            a = np.argmax(Q[s, :])

        s_, r, done, _ = env.step(a)

        # Move Q[s, a] towards reward + discounted best next-state value.
        Q[s, a] += lr * (r + gamma * np.max(Q[s_, :]) - Q[s, a])
        s = s_

        # The learning rate decays on every step. Clamp the product so the
        # final multiplication cannot take it below lrMin.
        if lr > lrMin:
            lr = max(lrMin, lr * lrDecay)

        # Exploration only decays on steps that returned a non-zero reward,
        # and is clamped the same way so epsilonMin is a true floor.
        if r != 0 and epsilon > epsilonMin:
            epsilon = max(epsilonMin, epsilon * epsilonDecay)

In [None]:
### Print Final Q Table
# Shows the action-value estimates as they stand after the training loop.
print(Q)

In [None]:
## Compute the # of Steps and Total Rewards
# Greedy rollout of one episode using the learned table: render, act on the
# best-valued action, and keep running totals of steps and reward.

s = env.reset()
step_count, total_reward = 0, 0
done = False

while True:
    env.render()
    a = Q[s].argmax()
    s_, r, done, _ = env.step(a)
    s = s_
    step_count = step_count + 1
    total_reward = total_reward + r
    # Short pause so the rendered frames are readable.
    time.sleep(0.1)
    if done:
        break



In [None]:
# Report the step and reward tallies collected by the evaluation episode.
print("Total steps: ",step_count)
print("Total rewards: ",total_reward)

In [None]:
## Compute Optimal Policy and Value Function
# The number of states is taken from Q itself, so this cell works when this
# part is run on its own and stays correct if the environment is swapped.
# Policy: index of the largest Q entry per state. Value: that largest entry.

policy = dict(enumerate(np.argmax(Q, axis=1)))
V = np.max(Q, axis=1)
print('Optimal policy :', policy)
print('Optimal value function: ', V)

## Part 3: On-Policy Q-Learning (SARSA) with Epsilon Decay

In [None]:
import numpy as np
import gym
from IPython.display import clear_output
import time

# Hyperparameters for the on-policy run with decaying learning rate and exploration.
# Initial step size for the Q update.
lr = 0.33
# The training loop stops shrinking lr once it is no longer above this value.
lrMin = 0.001
# Factor lr is multiplied by after every environment step.
lrDecay = 0.9999
# Discount factor; 1.0 means the next state-action value is not discounted.
gamma = 1.0
# Initial probability of a random action (1.0 = fully random at the start).
epsilon = 1.0
# The training loop stops shrinking epsilon once it is no longer above this value.
epsilonMin = 0.001
# Factor epsilon is multiplied by after a step that returned a non-zero reward.
epsilonDecay = 0.97
# Number of training episodes.
episodes = 2000

env = gym.make('FrozenLake-v0')
# Alternative: the 8x8 variant of the environment (swap with the line above).
#env = gym.make("FrozenLake8x8-v0")

In [None]:
## Initialize Q Table
# One row per state and one column per action; every estimate starts at zero.

Q = np.zeros((env.observation_space.n, env.action_space.n))
print(Q)

In [None]:
## Action Policy

def choose_action(s, Q):
    """Select an action for state ``s`` with an epsilon-greedy rule.

    With probability ``epsilon`` (read from module scope) a uniformly random
    action index in ``[0, env.action_space.n)`` is returned; otherwise the
    index of the largest entry in ``Q[s]`` is returned.
    """
    explore = np.random.random() < epsilon
    if explore:
        return np.random.randint(0, env.action_space.n)
    return np.argmax(Q[s])

In [None]:
## SARSA (on-policy TD control)
# Unlike the Q-learning update, the target here uses the value of the action
# that the epsilon-greedy policy actually selects in the next state.

for i in range(episodes):
    print("Episode {}/{}".format(i + 1, episodes))
    s = env.reset()
    done = False

    # The first action is chosen before the loop; afterwards each step reuses
    # the action that was already selected for the update.
    a = choose_action(s, Q)
    while not done:
        s_, r, done, _ = env.step(a)
        a_ = choose_action(s_, Q)

        # Move Q[s, a] towards reward + discounted value of (s_, a_).
        Q[s, a] = Q[s, a] + lr * (r + gamma * Q[s_, a_] - Q[s, a])
        s = s_
        a = a_

        # The learning rate decays on every step. Clamp the product so the
        # final multiplication cannot take it below lrMin.
        if lr > lrMin:
            lr = max(lrMin, lr * lrDecay)

        # Exploration only decays on steps that returned a non-zero reward,
        # and is clamped the same way so epsilonMin is a true floor.
        if r != 0 and epsilon > epsilonMin:
            epsilon = max(epsilonMin, epsilon * epsilonDecay)

print(Q)

In [None]:
## Compute the # of Steps and Total Rewards
# Run one rendered episode that always follows the best-valued action in the
# learned table, then report how many steps it took and the reward earned.

total_reward, step_count = 0, 0
s, done = env.reset(), False

while True:
    env.render()
    a = Q[s].argmax()
    s_, r, done, _ = env.step(a)
    s = s_
    step_count = step_count + 1
    total_reward = total_reward + r
    # Short pause so the rendered frames are readable.
    time.sleep(0.1)
    if done:
        break

print("Total steps: ",step_count)
print("Total rewards: ",total_reward)

In [None]:
## Compute Optimal Policy and Value Function
# The number of states is taken from Q itself, so this cell works when this
# part is run on its own and stays correct if the environment is swapped.
# Policy: index of the largest Q entry per state. Value: that largest entry.

policy = dict(enumerate(np.argmax(Q, axis=1)))
V = np.max(Q, axis=1)
print('Optimal policy :', policy)
print('Optimal value function: ', V)

## Part 4: Deep Q Network

In [None]:
import os
import gym
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential;
from tensorflow.keras.layers import Dense, Activation;
from tensorflow.keras import initializers;
from tensorflow.keras.optimizers import Adam, SGD;
from tensorflow.keras import backend as K
from tqdm import tqdm, trange
import pandas as pd
%matplotlib inline

In [None]:
def loss(Qtarget, Q):
    """Return the sum of squared differences between ``Qtarget`` and ``Q``."""
    squared_error = K.square(Qtarget - Q)
    return K.sum(squared_error)

In [None]:
def _plot_training_history(step_list, reward_list, loss_list, e_list, episodes):
    """Plot the per-episode training history as a four-row figure.

    The first three panels show rolling means of steps, reward and loss; the
    last panel shows the random-action probability recorded for each episode.
    """
    # Window is a tenth of the run, but never narrower than one episode.
    window = max(1, int(episodes / 10))

    plt.figure(figsize=[9, 16])

    smoothed_panels = [
        (411, step_list, 'Step Moving Average ({}-episode window)', 'Moves'),
        (412, reward_list, 'Reward Moving Average ({}-episode window)', 'Reward'),
        (413, loss_list, 'Loss Moving Average ({}-episode window)', 'Loss'),
    ]
    for position, values, title, ylabel in smoothed_panels:
        plt.subplot(position)
        plt.plot(pd.Series(values).rolling(window).mean())
        plt.title(title.format(window))
        plt.ylabel(ylabel)
        plt.xlabel('Episode')

    plt.subplot(414)
    plt.plot(e_list)
    plt.title('Random Action Parameter')
    plt.ylabel('Chance Random Action')
    plt.xlabel('Episode')

    plt.tight_layout(pad=2)
    plt.show()


def frozen_lake(env, e, learning_rate, gamma, episodes, steps):
    """Train a single-layer Q-network on ``env`` and plot the training history.

    The network is one bias-free dense layer that maps a one-hot encoded
    state to one Q-value per action. It is fitted after every environment
    step on the target for the action just taken.

    Args:
        env: Environment with discrete observation and action spaces
            (``.n`` is read from both), a ``reset()`` that returns the state
            and a ``step()`` that returns ``(state, reward, done, info)``.
        e: Initial probability of taking a random action. After an episode
            that ends with a positive reward it is set to
            ``1 / (i / 50 + 10)`` for episode index ``i``.
        learning_rate: Learning rate passed to the Adam optimizer.
        gamma: Discount factor applied to the best next-state Q-value.
        episodes: Number of training episodes.
        steps: Maximum number of environment steps per episode.

    Returns:
        None. Prints the fraction of episodes with positive total reward and
        shows the history plots.
    """
    # Sizes come from the environment so the one-hot width always matches the
    # network input and the output width matches the number of actions.
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    # Initialize history memory
    step_list = []
    reward_list = []
    loss_list = []
    e_list = []

    def create_model():
        initializer = initializers.RandomUniform(minval=0, maxval=0.1, seed=1)
        model = Sequential([Dense(n_actions, input_dim=n_states,
                                  kernel_initializer=initializer,
                                  use_bias=False)])
        model.compile(loss=loss, optimizer=Adam(learning_rate=learning_rate))
        return model

    model = create_model()

    # Transform a state index into a one-hot row vector of shape (1, l)
    def OH(l, x):
        return np.identity(l)[x].reshape(1, l)

    for i in trange(episodes):
        # Initialize
        state = env.reset()
        reward_all = 0    # Reward counter
        done = False
        steps_taken = 0   # Number of environment steps actually executed
        loss_sum = 0      # Accumulated training loss over the episode

        for step in range(steps):
            steps_taken = step + 1

            # Choose action randomly or through agent model. The current
            # Q-values are needed either way, to build the training target.
            explore = np.random.rand() < e
            q_values = model.predict(OH(n_states, state), batch_size=1)
            if explore:
                action = env.action_space.sample()
            else:
                action = np.argmax(q_values)

            # Take action and obtain new state and reward
            new_state, reward, done, _ = env.step(action)

            # Adjust reward if done without reaching end
            if done and reward == 0.0:
                reward = -1

            # Bellman target. An episode that has ended has no future return,
            # so do not bootstrap from the next state's Q-values, which were
            # never trained for states that end an episode.
            if done:
                target = reward
            else:
                next_q = model.predict(OH(n_states, new_state), batch_size=1)
                target = reward + gamma * np.max(next_q)

            # Only the entry for the action taken changes; the other entries
            # keep the current predictions and contribute zero error.
            target_q = q_values.copy()
            target_q[0, action] = target

            # Train on target Q value
            history = model.fit(OH(n_states, state), target_q,
                                verbose=0, batch_size=1)

            # Update history and set current state
            loss_sum += history.history['loss'][0]
            reward_all += reward
            state = new_state

            if done:
                # Reduce e if current episode is successful
                if reward > 0:
                    e = 1. / ((i / 50) + 10)
                break

        # Update history. steps_taken is a count rather than the last loop
        # index, so the mean loss divides by the real number of fits and
        # cannot divide by zero when an episode ends on its first step.
        step_list.append(steps_taken)
        reward_list.append(reward_all)
        loss_list.append(loss_sum / steps_taken if steps_taken else 0.0)
        e_list.append(e)
    print('\nSuccessful episodes: {}'.format(np.sum(np.array(reward_list)>0.0)/episodes))

    _plot_training_history(step_list, reward_list, loss_list, e_list, episodes)

In [None]:
# Alternative environment id.
# NOTE(review): this id presumably has to be registered separately before use - confirm.
#env = gym.make('FrozenLakeNotSlippery-v0')
env = gym.make('FrozenLake-v0')
# Initial chance of random action
e = 0.1
# Learning rate handed to the optimizer inside frozen_lake
learning_rate = 0.01
# Discount Rate
gamma = 0.99
# Training Episodes
episodes = 500
# Max Steps per episode
steps = 50

# Train the Q-network and show the training-history plots.
frozen_lake(env, e, learning_rate, gamma, episodes, steps)