In [1]:
import numpy as np

def one_step_lookahead(environment, state, V, discount=None, discount_factor=None):
    """
    Compute the expected value of every action available from `state`.

    For each action the transitions listed in environment.P[state][action]
    are combined as

        sum(probability * (reward + discount * V[next_state]))

    Args:
        environment: object exposing `nA` (number of actions) and
            `P[state][action]`, an iterable of
            (probability, next_state, reward, terminated) tuples.
        state: state to look ahead from (used as a key into environment.P).
        V: current state-value estimates, indexable by next_state.
        discount: discount applied to the value of the successor state.
        discount_factor: keyword alias for `discount`, matching the name used
            by the other functions in this notebook. Takes precedence over
            `discount` when both are given.

    Returns:
        NumPy vector of length environment.nA; entry `a` is the expected
        value of taking action `a` in `state`.

    Raises:
        TypeError: if neither `discount` nor `discount_factor` is supplied.
    """
    if discount_factor is None:
        if discount is None:
            raise TypeError(
                "one_step_lookahead() requires a discount "
                "(pass `discount` or `discount_factor`)"
            )
        discount_factor = discount

    # One accumulator slot per action.
    action_values = np.zeros(environment.nA)

    for action in range(environment.nA):
        # The `terminated` flag is not needed for the expectation itself.
        for probability, next_state, reward, _terminated in environment.P[state][action]:
            action_values[action] += probability * (reward + discount_factor * V[next_state])

    return action_values

In [2]:
def policy_evaluation(policy, environment, discount_factor=1.0, theta=1e-9,
                      max_iteration=1e9, max_iterations=None):
    """
    Iteratively estimate the state-value function of a given policy.

    Each sweep replaces V[state] with the expectation, over the policy's
    action probabilities and the environment's transition probabilities, of
    (reward + discount_factor * V[next_state]). Values are updated in place,
    so later states in a sweep already see the new values of earlier states.

    Args:
        policy: matrix indexed as policy[state][action]; each entry is the
            probability of taking that action in that state.
        environment: object exposing `nS` (number of states) and
            `P[state][action]`, an iterable of
            (probability, next_state, reward, terminated) tuples.
        discount_factor: weight applied to the value of the successor state.
        theta: convergence threshold. Evaluation stops once the largest
            change of any state value within a sweep is below theta.
        max_iteration: upper bound on the number of sweeps (original
            parameter name, kept for backward compatibility).
        max_iterations: alias for `max_iteration`, matching the name used by
            the other functions in this notebook. Takes precedence when given.

    Returns:
        V: NumPy vector of length environment.nS with the value estimate for
        the given policy. If the sweep limit is reached before convergence,
        the latest estimate is returned.
    """
    if max_iterations is None:
        max_iterations = max_iteration

    V = np.zeros(environment.nS)

    for sweep in range(int(max_iterations)):
        delta = 0  # largest value change seen in this sweep (for early stopping)

        for state in range(environment.nS):
            v = 0

            for action, action_probability in enumerate(policy[state]):
                for state_probability, next_state, reward, _terminated in environment.P[state][action]:
                    v += action_probability * state_probability * (reward + discount_factor * V[next_state])

            delta = max(delta, abs(V[state] - v))
            V[state] = v

        if delta < theta:
            # `sweep` is zero-based, so sweep + 1 sweeps have been performed.
            print('policy evaluated in %d iterations' % (sweep + 1))
            return V

    # Sweep limit reached without meeting theta: hand back the best estimate
    # instead of falling off the end of the function and returning None.
    return V

In [3]:
def policy_iteration(environment, discount_factor=1.0, max_iterations=1e9):
    """
    Find a policy by alternating policy evaluation and greedy improvement.

    Starts from the uniform random policy. Each round evaluates the current
    policy with policy_evaluation, then makes every state greedy with respect
    to the one-step lookahead values. The loop stops as soon as a round leaves
    the policy unchanged.

    Args:
        environment: object exposing `nS`, `nA` and the transition table `P`
            consumed by policy_evaluation and one_step_lookahead.
        discount_factor: weight applied to the value of the successor state.
        max_iterations: upper bound on the number of evaluate/improve rounds.

    Returns:
        (policy, V): `policy` is an (nS, nA) matrix of action probabilities
        (one-hot rows after the first improvement) and `V` is the value
        estimate from the most recent evaluation. If max_iterations is
        exhausted before the policy stabilises, the latest pair is returned.
    """
    policy = np.ones((environment.nS, environment.nA)) / environment.nA
    V = np.zeros(environment.nS)
    one_hot = np.eye(environment.nA)  # row k is the deterministic choice of action k

    evaluated_policies = 0

    for _ in range(int(max_iterations)):
        stable_policy = True
        V = policy_evaluation(policy, environment, discount_factor=discount_factor)
        evaluated_policies += 1

        for state in range(environment.nS):
            # Passed positionally so the call does not depend on the helper's
            # keyword name for the discount.
            action_values = one_step_lookahead(environment, state, V, discount_factor)
            greedy_row = one_hot[np.argmax(action_values)]

            # Compare whole rows rather than argmax(policy[state]): on a row
            # with tied probabilities (e.g. the initial uniform policy) argmax
            # returns index 0, which could wrongly report "no change".
            if not np.array_equal(policy[state], greedy_row):
                stable_policy = False

            policy[state] = greedy_row

        if stable_policy:
            print('Evaluated %d policies.' % evaluated_policies)
            return policy, V

    # Round limit reached: return the latest policy/value pair, not None.
    return policy, V
    

In [4]:
def value_iteration(environment, discount_factor=1.0, theta=1e-9, max_iterations=1e9):
    """
    Run value iteration and derive a greedy policy from the resulting values.

    Every sweep overwrites V[state] in place with the largest one-step
    lookahead value of that state. Sweeping stops when the biggest change in
    a sweep drops below `theta`, or after int(max_iterations) sweeps.

    Args:
        environment: object exposing `nS`, `nA` and whatever
            one_step_lookahead needs from it.
        discount_factor: weight applied to the value of the successor state.
        theta: convergence threshold on the per-sweep maximum value change.
        max_iterations: upper bound on the number of sweeps.

    Returns:
        (policy, V): `policy` is an (nS, nA) one-hot matrix selecting the
        greedy action per state; `V` is the final value estimate.
    """
    n_states = environment.nS
    V = np.zeros(n_states)

    sweep = 0
    sweep_limit = int(max_iterations)
    while sweep < sweep_limit:
        largest_change = 0

        for s in range(n_states):
            backed_up = np.max(one_step_lookahead(environment, s, V, discount_factor))
            change = abs(V[s] - backed_up)
            if change > largest_change:
                largest_change = change
            V[s] = backed_up

        if largest_change < theta:
            print('Value iteration converged at iteration #%d' % sweep)
            break

        sweep += 1

    # Extract the greedy (deterministic) policy from the final values.
    policy = np.zeros((n_states, environment.nA))
    for s in range(n_states):
        greedy_action = np.argmax(one_step_lookahead(environment, s, V, discount_factor))
        policy[s][greedy_action] = 1.0

    return policy, V
            


### PythonLearning1: 
NumPy arrays do not always behave like the row and column vectors of linear algebra: a 1-D array has only one axis, so transposing it has no visible effect.
    for eg: 
    1)
        import numpy as np
        a=np.array([1,2,3])
        print(a)    # printing A
        print(a.T)  #printing A transpose
        
    Output:
        [1 2 3]
        [1 2 3]
    
    2)
        import numpy as np
        a=np.array([1,2,3])[np.newaxis]
        print(a)    # printing A
        print(a.T)  #printing A transpose

    Output:
        [[1 2 3]]
        [[1]
         [2]
         [3]]
Plain Python lists have no built-in transpose operation, which is one reason we use NumPy. Adding an axis with `np.newaxis` turns the 1-D array into a 2-D row, so `.T` can turn it into a column.
For more detail on transposing NumPy arrays, see:

https://stackoverflow.com/questions/5954603/transposing-a-numpy-array


### PythonLearning2:
Consider the following loop in Python:

    for action in range(10):
        print(action)

   Output:
        0
        1
        2
        3
        4
        5
        6
        7
        8
        9
So `range(10)` starts at 0 and ends at 9: the stop value 10 itself is excluded.

### PythonLearning3 : 
The `enumerate` function pairs each element of an iterable with its index, counting from 0:

    seasons = ['Spring', 'Summer', 'Fall', 'Winter']
    list(enumerate(seasons))
    [(0, 'Spring'), (1, 'Summer'), (2, 'Fall'), (3, 'Winter')]
