# Model-based reinforcement learning
This notebook implements the policy iteration and value iteration algorithms. Both algorithms are evaluated on the [FrozenLake](https://www.gymlibrary.dev/environments/toy_text/frozen_lake/) environment provided by OpenAI Gym.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import gym
# Shared environment used by every cell below: the 8x8 FrozenLake map with
# slipping disabled (is_slippery=False).
env = gym.make('FrozenLake-v1', map_name="8x8", is_slippery=False)

# Policy iteration


In [None]:
class PolicyIteration:
    """Tabular policy iteration for a discrete environment with a known model.

    The environment must expose ``observation_space.n``, ``action_space.n``
    and a transition table ``P`` in which ``P[state][action]`` is an iterable
    of ``(prob, next_state, reward, done)`` tuples.
    """

    def __init__(self, env):
        self.env = env

    def __call__(self, gamma=0.9, beta=500, epsilon=1e-5):
        """Alternate policy evaluation and greedy improvement until stable.

        Args:
            gamma: Discount factor applied to successor-state values.
            beta: Accepted for call compatibility; not used by this algorithm.
            epsilon: Policy evaluation stops once the largest per-state value
                change within one sweep falls below this threshold.

        Returns:
            A ``(V, policy)`` tuple: ``V`` is a float array with one value per
            state, ``policy`` an integer array with one action per state.
        """
        # Always use the environment handed to the constructor; relying on a
        # module-level ``env`` breaks as soon as the instance wraps another one.
        env = self.env
        n_states = env.observation_space.n
        n_actions = env.action_space.n
        V = np.zeros(n_states)
        # Start from an arbitrary deterministic policy.
        policy = np.random.randint(0, n_actions, n_states)

        def policy_evaluation():
            """Sweep the states in place until V settles for the current policy."""
            while True:
                delta = 0
                for state in range(n_states):
                    v_previous = V[state]
                    v_current = 0
                    action = policy[state]
                    for prob, next_state, reward, done in env.P[state][action]:
                        if done:
                            # Terminal transition: only the immediate reward
                            # counts, and the state that was entered is pinned
                            # to a value of zero.
                            v_current += prob * reward
                            V[next_state] = 0
                        else:
                            v_current += prob * (reward + gamma * V[next_state])
                    delta = max(delta, np.abs(v_current - v_previous))
                    V[state] = v_current
                if delta < epsilon:
                    break

        def policy_improvement():
            """Make the policy greedy w.r.t. V; return True if nothing changed."""
            policy_stable = True
            for state in range(n_states):
                previous_action = policy[state]
                Qs = np.zeros(n_actions)
                for action in range(n_actions):
                    for prob, next_state, reward, done in env.P[state][action]:
                        if done:
                            Qs[action] += prob * reward
                        else:
                            Qs[action] += prob * (reward + gamma * V[next_state])
                # np.argmax breaks ties by taking the lowest action index.
                policy[state] = np.argmax(Qs)
                if previous_action != policy[state]:
                    policy_stable = False
            return policy_stable

        while True:
            policy_evaluation()
            if policy_improvement():
                break

        return V, policy

In [None]:
# Solve the shared environment with policy iteration and print the resulting
# state values. `policy` here holds one action index per state.
# Note: `beta` is accepted by PolicyIteration.__call__ but not used by it.
policy_iteration = PolicyIteration(env)
V, policy = policy_iteration(gamma=0.9, beta=500)
print(V)

# Value iteration

In [None]:
class ValueIteration:
    """Tabular value iteration that returns a softmax policy over actions.

    The environment must expose ``observation_space.n``, ``action_space.n``
    and a transition table ``P`` in which ``P[state][action]`` is an iterable
    of ``(prob, next_state, reward, done)`` tuples.
    """

    def __init__(self, env):
        self.env = env

    def __call__(self, gamma=0.9, beta=500, epslion=1e-5, epsilon=None):
        """Run value iteration, then derive a softmax policy from the values.

        Args:
            gamma: Discount factor applied to successor-state values.
            beta: Inverse temperature of the softmax; larger values put more
                probability mass on the highest-valued action.
            epslion: Convergence threshold (historical spelling, kept so that
                existing keyword callers continue to work).
            epsilon: Correctly spelled alias for ``epslion``; takes precedence
                when it is not ``None``.

        Returns:
            A ``(V, policy)`` tuple: ``V`` is a float array with one value per
            state, ``policy`` a ``(n_states, n_actions)`` array whose rows are
            action probabilities.
        """
        # Always use the environment handed to the constructor; relying on a
        # module-level ``env`` breaks as soon as the instance wraps another one.
        env = self.env
        n_states = env.observation_space.n
        n_actions = env.action_space.n
        tolerance = epslion if epsilon is None else epsilon
        V = np.zeros(n_states)

        def compute_action_value(state):
            """Return the one-step lookahead value of every action in `state`."""
            qA = np.zeros(n_actions)
            for action in range(n_actions):
                for prob, next_state, reward, done in env.P[state][action]:
                    if done:
                        # The episode ends here, so nothing is bootstrapped
                        # from the successor state.
                        qA[action] += prob * reward
                    else:
                        qA[action] += prob * (reward + gamma * V[next_state])
            return qA

        def compute_softmax_policy(beta):
            """Turn the action values into per-state softmax probabilities."""
            logits = np.zeros([n_states, n_actions])
            for state in range(n_states):
                logits[state] = beta * compute_action_value(state)
            # Subtract the row maximum before exponentiating to avoid overflow
            # for large beta.
            logits -= logits.max(axis=1, keepdims=True)
            weights = np.exp(logits)
            return weights / weights.sum(axis=1, keepdims=True)

        while True:
            delta = 0
            for state in range(n_states):
                max_q = compute_action_value(state).max()
                delta = max(delta, np.abs(max_q - V[state]))
                V[state] = max_q  # in-place update, visible to later states
            if delta < tolerance:
                break

        policy = compute_softmax_policy(beta)

        return V, policy

In [None]:
# Solve the shared environment with value iteration and print the resulting
# state values. `policy` here is a (n_states, n_actions) table of softmax
# action probabilities, sharpened by the inverse temperature beta.
value_iteration = ValueIteration(env)
V, policy = value_iteration(gamma=0.9, beta=500)
print(V)

In [None]:
# Roll out one episode with the current `policy`, rendering every step and
# summing the rewards.
# NOTE(review): this cell expects reset() to return the observation alone,
# step() to return a 4-tuple and render() to accept `mode` -- confirm that the
# installed gym version still provides that call shape.
# NOTE(review): `policy[state]` must be a probability vector over actions
# (as produced by the value-iteration cell); the per-state action indices
# from the policy-iteration cell would not fit np.random.multinomial.
state = env.reset()
done = False
total_rewards = 0
while not done:
    print(env.render(mode='ansi'))
    # Draw a one-hot sample from the action distribution and take its index.
    action = np.random.multinomial(1, policy[state]).argmax()
    state, reward, done, _ = env.step(action)
    total_rewards += reward
# Show the final board after the episode has ended.
print(env.render(mode='ansi'))
print('total rewards: %f' % (total_rewards))