In [13]:
import gym
import numpy as np
import time
#make a python file of dynamic_programming_functions as dp.py to import policy-value iteration
from dp import policy_iteration, value_iteration

In [14]:
action_mappings = {
    0: '\u2191', #up
    1: '\u2192', #right 
    2: '\u2193', #down
    3: '\u2190', #left
}

#print(''.join([action_mappings[action] for action in range(4)]))

In [15]:
#Remove the comment portion of the code only if one needs to see the render process 
#and the environment
n_episodes=10000

def play_episodes(environment, n_episodes, policy):
    wins =0
    total_reward=0
#    l=0
#    k=0
    for episode in range(n_episodes):
        terminated=False
        state=environment.reset()
        
        while not terminated:
            
            action=np.argmax(policy[state])
            next_state, reward, terminated, info = environment.step(action)#

#            if l<=10:
#                print(k)
#                environment.render()
#                k+=1

            total_reward+=reward
            state=next_state
            if terminated and reward ==1.0:
                wins +=1

#                print('win')
#        l=l+1
    
    average_reward=total_reward/n_episodes
    
    return wins, total_reward, average_reward


In [16]:
solvers=[('Policy Iteration', policy_iteration),('Value Iteration', value_iteration)]

In [17]:
for iteration_name, iteration_func in solvers:
     
        environment=gym.make('FrozenLake8x8-v0')
        print('Environment Created.')
        start=time.time()
#Print Start and end if one needs to get familiar with time library.
#        print(start)                
        print('Running %s .....' %iteration_name)
        policy, V =iteration_func(environment.env)
        print('Done.')
        end=time.time()
#        print(end)
        
        print('\n Final Policy derived using %s:' %iteration_name)
        print(''.join([action_mappings[action] for action in np.argmax(policy, axis=1)]))
#        print(policy)
        print('%f seconds taken by %s' %(end-start, iteration_name))
        wins, total_reward, average_reward = play_episodes(environment, n_episodes, policy)
        
        print('%s :: number of wins over %d episodes = %d' %(iteration_name,n_episodes, wins))
        print('%s :: average reward over %d episodes = %.2f \n' %(iteration_name, n_episodes, average_reward))


Environment Created.
Running Policy Iteration .....
Policy evaluated in 203 iteration
Policy evaluated in 862 iteration
Policy evaluated in 907 iteration
Policy evaluated in 958 iteration
Policy evaluated in 1065 iteration
Policy evaluated in 936 iteration
Evaluated 7 policies.
Done.

 Final Policy derived using Policy Iteration:
→↓↓↓↓↓↓↓←←←←←←←↓↑↑↑↑↓←←↓↑↑↑→↑↑↓↓↑←↑↑↓→←↓↑↑↑→←↑↑↓↑↑→↑↑↑↑↓↑→↑↑→↓→↑
1.920724 seconds taken by Policy Iteration
Policy Iteration :: number of wins over 10000 episodes = 8830
Policy Iteration :: average reward over 10000 episodes = 0.88 

Environment Created.
Running Value Iteration .....
Value iteration converged at iteration #914
Done.

 Final Policy derived using Value Iteration:
↓↓↓↓↓↓↓↓←←←←←←←↓↑↑↑↑↓←←↓↑↑↑→↑↑↓↓↑←↑↑↓→←↓↑↑↑→←↑↑↓↑↑→↑↑↑↑↓↑→↑↑→↓→↑
0.642869 seconds taken by Value Iteration
Value Iteration :: number of wins over 10000 episodes = 8827
Value Iteration :: average reward over 10000 episodes = 0.88 



# Some links that may help:

https://www.youtube.com/watch?v=5R2vErZn0yw

Navigating a Virtual World Using Dynamic Programming

https://github.com/search?utf8=%E2%9C%93&q=dynamic+programming+RL&type=Commits

Links to RL dynamic programming cases.

https://github.com/josjo80/deep-learning/commit/cdf5d5afc4efb4f4d4c4c0f7d3615cceff1593ef
    
Guided links to some codes.

https://github.com/karim-berrada/Reinforcement_Learning_projects/commit/5746c3b8b2ea61ff905bcb60b6c22bd423121de6

To solve some interesting problems based on Dynamic programming."One site cutting tree problem" using dp.