In [1]:
# Implementation of a Q-table solution to the hello world example from: https://towardsdatascience.com/qrash-course-deep-q-networks-from-the-ground-up-1bbda41d3677

For this example, there are 16 possible states. The last state, i.e. the state in which all slots have been filled, is the terminal state.

Following the OpenAI Gym model, we'll build up the reward table.

In [None]:
import numpy as np

In [82]:
class HelloRL:
    """Deterministic "fill every slot" toy environment with a tabular model.

    A state is an integer bitmask over ``num_slots`` slots (bit set = slot
    filled). Action ``a`` targets the slot whose bit value is
    ``2 ** (num_actions - 1 - a)``:

    * filling an empty slot yields reward ``+1`` and sets that bit;
    * targeting an already filled slot yields reward ``-1`` and leaves the
      state unchanged;
    * the state with every bit set (``num_states - 1``) is terminal: every
      action there yields reward ``0`` and stays in place.

    ``P[state][action]`` holds a single ``(prob, reward, next_state, done)``
    tuple. ``done`` is ``True`` both for transitions that *enter* the terminal
    state and for the self-loops inside it, so an episode ends on the step
    that fills the last slot.
    """

    def __init__(self, num_slots=4):
        """Build the transition table and place the environment in state 0.

        num_slots: number of slots (and therefore of actions); the default of
            4 gives the 16-state problem.

        Raises ValueError if ``num_slots`` is smaller than 1.
        """
        if num_slots < 1:
            raise ValueError("num_slots must be at least 1, got %d" % num_slots)

        self.num_actions = num_slots
        self.num_states = 2 ** num_slots  # 16 possible states for 4 slots
        terminal_state = self.num_states - 1  # all slots filled

        # (prob, reward, next_state, done)
        self.P = {state: {} for state in range(self.num_states)}

        for state in range(self.num_states):
            for action in range(self.num_actions):
                if state == terminal_state:
                    transition = (1.0, 0, state, True)
                else:
                    slot_bit = 2 ** (self.num_actions - 1 - action)
                    if slot_bit & state:
                        # Slot already filled: penalise and stay put.
                        transition = (1.0, -1, state, False)
                    else:
                        next_state = state + slot_bit
                        # Signal termination on the step that fills the last
                        # slot rather than one step later.
                        transition = (1.0, 1, next_state, next_state == terminal_state)
                self.P[state][action] = transition
        self.current_state = 0

    def take_action(self, action):
        """Apply ``action`` to the current state and return its transition.

        Returns the ``(prob, reward, next_state, done)`` tuple stored in ``P``
        and moves ``current_state`` to ``next_state``.

        Raises ValueError if ``action`` is outside ``[0, num_actions)``.
        """
        # Reject negative actions too; they would otherwise surface as a
        # KeyError from the table lookup.
        if not 0 <= action < self.num_actions:
            raise ValueError("Invalid action: %d" % action)

        res = self.P[self.current_state][action]
        self.current_state = res[2]
        return res

    def reset(self):
        """Return the environment to the initial state (no slot filled)."""
        self.current_state = 0
                    

In [83]:
class HelloAgent:
    """Tabular Q-learning agent that acts epsilon-greedily on a wrapped environment.

    The action values live in ``qtable``, a zero-initialised array of shape
    ``(env.num_states, env.num_actions)``.
    """

    def __init__(self, env, alpha=0.1, epsilon=0.1, gamma=0.6):
        """
        env: environment exposing num_states, num_actions, current_state,
             take_action(action) and reset()
        alpha: learning rate
        epsilon: probability of taking a random action (exploration) instead
                 of the greedy one (exploitation)
        gamma: discount factor
        """
        self.env = env
        self.alpha, self.epsilon, self.gamma = alpha, epsilon, gamma
        self.qtable = np.zeros((env.num_states, env.num_actions))

    def step(self):
        """Take one action in the environment, learn from it, and return (reward, done)."""
        origin = self.env.current_state
        chosen = self.pick_action(origin)
        _prob, reward, landed, done = self.env.take_action(chosen)
        self.update_q_table(origin, chosen, landed, reward)
        return reward, done

    def update_q_table(self, state, action, next_state, reward):
        """Blend the stored value of (state, action) with the one-step Q-learning target."""
        current = self.qtable[state, action]
        best_next = self.qtable[next_state].max()
        target = reward + self.gamma * best_next
        self.qtable[state, action] = (1 - self.alpha) * current + self.alpha * target

    def pick_action(self, state):
        """Return a random action with probability epsilon, else the greedy action for ``state``."""
        explore = np.random.random() < self.epsilon
        if not explore:
            # Greedy choice; ties resolve to the lowest action index.
            return np.argmax(self.qtable[state, :])
        return np.random.randint(0, self.env.num_actions)

    def reset(self):
        """Reset the wrapped environment; the learned Q-table is kept."""
        self.env.reset()
        

In [101]:
# Seed NumPy's global RNG, which the agent's epsilon-greedy policy draws from,
# so the collected statistics are reproducible on Restart & Run All.
SEED = 0
np.random.seed(SEED)

env = HelloRL()
agent = HelloAgent(env)
num_epochs = 100000
all_penalties = []  # per episode: number of steps with a negative reward
step_count = []     # per episode: number of agent steps until done
total_rewards = []  # per episode: sum of all rewards
for idx in range(num_epochs):
    penalties = 0
    steps = 0
    rewards = 0
    # Run one episode until the environment reports done.
    while True:
        reward, done = agent.step()
        rewards += reward
        if reward < 0:
            penalties += 1
        steps += 1

        if done:
            break
    # Put the environment back in its start state; the Q-table carries over.
    agent.reset()
    all_penalties.append(penalties)
    step_count.append(steps)
    total_rewards.append(rewards)

In [108]:
# Lowest total episode reward observed across all training episodes.
min(total_rewards)

-2