In [10]:
from env import FootballSkillsEnv
from model import model
from policyIterationAlgo import policyIterationAlgo
from valueIterationAlgo import valueIterationAlgo
import numpy as np

'''
    Key Environment Methods to Use:
    - env.state_to_index(state_tuple): Converts (x, y, has_shot) tuple to integer index
    - env.index_to_state(index): Converts integer index back to (x, y, has_shot) tuple
    - env.get_transitions_at_time(state, action, time_step=None): Default method for accessing transitions.
    - env._is_terminal(state): Check if state is terminal (has_shot=True)
    - env._get_reward(ball_pos, action, player_pos): Get reward for transition
    - env.reset(seed=None): Reset environment to initial state, returns (observation, info)
    - env.step(action): Execute action, returns (obs, reward, done, truncated, info)
    - env.get_gif(policy, seed=20, filename="output.gif"): Generate GIF visualization 
      of policy execution from given seed
    
    Key Env Variables Notes:
    - env.observation_space.n: Total number of states (use env.grid_size**2 * 2)
    - env.action_space.n: Total number of actions (7 actions: 4 movement + 3 shooting)
    - env.grid_size: Total number of rows in the grid
'''
# Banner line: printed when this cell finishes executing.
print("Starting Environment")

Starting Environment


In [2]:
# Run policy iteration and unpack its five results. Going by the names these are
# the policy, its value function, and three counters (outer iterations, inner
# iterations, transition-function calls) that the summary line below reports.
(
    optimal_policy_pi,
    optimal_valueFn_pi,
    outer_num_iterations_pi,
    inner_num_iterations_pi,
    calls_to_getTransisions_fn_pi,
) = policyIterationAlgo(logEnabled=False)

print(
    f"The policy iteration converged after {outer_num_iterations_pi} outer_iterations, {inner_num_iterations_pi} inner_iterations and {calls_to_getTransisions_fn_pi} calls to getTransisions_fn"
)

Starting Policy Iteration Algorithm
Total number of possible states : 800
Total actions :  7
Shape of policy matrix (800,) and shape of valueFn (800,)
Errors in policy_state : 798
Errors in policy_state : 87
Errors in policy_state : 57
Errors in policy_state : 39
Errors in policy_state : 30
Errors in policy_state : 34
Errors in policy_state : 37
Errors in policy_state : 39
Errors in policy_state : 35
Errors in policy_state : 30
Errors in policy_state : 29
Errors in policy_state : 28
Errors in policy_state : 27
Errors in policy_state : 24
Errors in policy_state : 20
Errors in policy_state : 20
Errors in policy_state : 17
Errors in policy_state : 14
Errors in policy_state : 12
Errors in policy_state : 10
Errors in policy_state : 8
Errors in policy_state : 6
Errors in policy_state : 4
Errors in policy_state : 2
Errors in policy_state : 0
Count of total number of calls made to the  env.get_transitions_at_time is :  406400
Starting rollout from position: (0, 10, 0)

Reached terminal state.


In [3]:
# Run value iteration with its default arguments and unpack its five results.
# Going by the names these are the policy, its value function, and three counters
# (outer iterations, inner iterations, transition-function calls) reported below.
(
    optimal_policy_vi,
    optimal_valueFn_vi,
    outer_num_iterations_vi,
    inner_num_iterations_vi,
    calls_to_getTransisions_fn_vi,
) = valueIterationAlgo()

print(
    f"The value iteration converged after {outer_num_iterations_vi} outer_iterations, {inner_num_iterations_vi} inner_iterations and {calls_to_getTransisions_fn_vi} calls to getTransisions_fn"
)

Total number of possible states : 800
Total actions :  7
Shape of policy matrix (800,) and shape of valueFn (800,)
Count of total number of calls made to the  env.get_transitions_at_time is :  173600
Starting rollout from position: (0, 10, 0)

Reached terminal state.
Episode GIF saved to ./output_seeds/VIOutput.gif
The value iteration converged after 1 outer_iterations, 30 inner_iterations and 173600 calls to getTransisions_fn


In [None]:
# 2. 
# The policy iteration converged after 25 outer_iterations, 333 inner_iterations and 406400 calls to getTransisions_fn

# The value iteration converged after 1 outer_iterations, 30 inner_iterations and 173600 calls to getTransisions_fn

In [5]:
# 3. are the 2 policies identical :: optimal_policy_vi and optimal_policy_pi
def comparePolicies(policy1, policy2, model = model):
    """Print how many state indices the two policies disagree on.

    Args:
        policy1: Indexable policy; ``policy1[i]`` is read for every compared index.
        policy2: Second policy, indexed the same way as ``policy1``.
        model: Zero-argument callable whose result exposes ``maxStateSize``,
            the number of leading indices to compare.

    Returns:
        None. The mismatch count is printed, not returned.
    """
    state_count = model().maxStateSize

    # Tally one per index where the chosen actions differ.
    mismatches = sum(
        1 for idx in range(state_count) if policy1[idx] != policy2[idx]
    )

    print("Count of dissimilarities :", mismatches)

comparePolicies(optimal_policy_pi, optimal_policy_vi)

Count of dissimilarities : 0


In [6]:
def getEpisodeRewards(env, obs, policy):
    """Roll out ``policy`` from ``obs`` and return the summed (undiscounted) reward.

    Args:
        env: Environment exposing ``state_to_index(obs)`` and ``step(action)``;
            ``step`` must return a 5-tuple ``(obs, reward, done, truncated, info)``.
        obs: Observation to start from (as returned by ``env.reset``).
        policy: Indexable mapping from state index to action. An entry of ``-1``
            is treated as "no valid action" and ends the rollout early.

    Returns:
        The sum of the rewards collected until the episode ended.
    """
    episode_reward = 0
    while True:
        action = policy[env.state_to_index(obs)]

        if action == -1:
            print("No valid action found!")
            break

        obs, reward, done, truncated, _ = env.step(action)
        episode_reward += reward

        # Stop on truncation as well as termination: a time-limited episode may
        # never report done=True, and stepping past its end would loop forever.
        if done or truncated:
            break
    return episode_reward

def getEpisodesRewardMean(env, episodeCount, policy):
    """Evaluate ``policy`` over ``episodeCount`` episodes, each with a distinct seed.

    Args:
        env: Environment exposing ``reset(seed=...)`` returning ``(obs, info)``,
            plus whatever ``getEpisodeRewards`` needs for the rollout.
        episodeCount: Number of episodes to run.
        policy: Policy forwarded unchanged to ``getEpisodeRewards``.

    Returns:
        ``(mean_reward, variance_reward)`` computed with ``np.mean`` / ``np.var``
        over the per-episode reward totals.
    """
    # Independent randint(0, 30) draws can repeat a seed, so some "episodes" were
    # exact replays. Shuffle a pool of candidate seeds and take a prefix instead:
    # the pool keeps the original [0, 30) range and only widens when more than 30
    # episodes are requested.
    episode_count = max(episodeCount, 0)
    seed_pool = np.random.permutation(max(30, episode_count))
    # Hand env.reset plain Python ints, the same type the scalar randint call gave.
    episode_seeds = [int(seed) for seed in seed_pool[:episode_count]]

    total_rewards = []
    for my_seed in episode_seeds:
        obs, _ = env.reset(seed=my_seed)

        # Score one full rollout starting from the freshly reset observation.
        total_rewards.append(getEpisodeRewards(env, obs, policy))

    mean_reward = np.mean(total_rewards)
    variance_reward = np.var(total_rewards)

    return mean_reward, variance_reward

In [7]:
# 4. Evaluate the performance of each policy by running 20 episodes with different seeds
# Given the optimal policies for both VI and PI
def compareMeanAndVarianceOfRewards(policy1, policy2, env=FootballSkillsEnv, model=model):
    """Evaluate two policies on the same episodes and report reward mean/variance.

    Args:
        policy1: Policy reported under the "Policy Iteration" label.
        policy2: Policy reported under the "Value Iteration" label.
        env: Environment factory, called as ``env(render_mode='gif')``.
        model: Zero-argument callable whose result exposes ``episodeCount``,
            the number of evaluation episodes per policy.

    Returns:
        None. The statistics and the verdict are printed.
    """
    # Build the environment from the `env` argument; previously the parameter was
    # shadowed and FootballSkillsEnv was always constructed regardless of it.
    rollout_env = env(render_mode='gif')
    config = model()

    # getEpisodesRewardMean draws its episode seeds from NumPy's global RNG.
    # Rewind the RNG between the two evaluations so both policies are scored on
    # the same seeds; otherwise the comparison mixes policy and seed effects.
    rng_state = np.random.get_state()
    meanPI, variancePI = getEpisodesRewardMean(rollout_env, config.episodeCount, policy1)
    np.random.set_state(rng_state)
    meanVI, varianceVI = getEpisodesRewardMean(rollout_env, config.episodeCount, policy2)

    print("Policy Iteration :: Mean : ", meanPI, " Variance : ", variancePI)
    print("Value Iteration :: Mean : ", meanVI, " Variance : ", varianceVI)
    # Author's note: with a deterministic env the variance is expected to be 0
    # once the policy has converged.

    if(meanPI == meanVI and variancePI == varianceVI):
        print("mean and variance of policies are same")
    else:
        print("mean and variance of policies are not same")

compareMeanAndVarianceOfRewards(optimal_policy_pi, optimal_policy_vi)

Policy Iteration :: Mean :  52.0  Variance :  0.0
Value Iteration :: Mean :  52.0  Variance :  0.0
mean and variance of policies are same


In [11]:
discountFactors = [0.3, 0.5]

# Set the discount factor on the imported `model` class itself. The previous
# `model = model()` rebound the name to an instance, which (a) made this cell fail
# on a second run and (b) only changed a throwaway object that was never handed to
# the algorithms -- the logged iteration counts were identical to the default run.
# NOTE(review): this assumes the algorithms build their configuration via `model()`
# and that `discount_factor` is readable from the class. If `model.__init__`
# assigns it per instance, pass the value to the algorithms explicitly -- confirm.
_UNSET = object()
_saved_discount_factor = vars(model).get("discount_factor", _UNSET)

try:
    for discount_factor in discountFactors:
        model.discount_factor = discount_factor

        optimal_policy_pi, optimal_valueFn_pi,outer_num_iterations_pi, inner_num_iterations_pi, calls_to_getTransisions_fn_pi = policyIterationAlgo(logEnabled = False)
        print(f"The policy iteration converged after {outer_num_iterations_pi} outer_iterations, {inner_num_iterations_pi} inner_iterations and {calls_to_getTransisions_fn_pi} calls to getTransisions_fn")

        optimal_policy_vi, optimal_valueFn_vi, outer_num_iterations_vi, inner_num_iterations_vi, calls_to_getTransisions_fn_vi = valueIterationAlgo(logEnabled = False)
        print(f"The value iteration converged after {outer_num_iterations_vi} outer_iterations, {inner_num_iterations_vi} inner_iterations and {calls_to_getTransisions_fn_vi} calls to getTransisions_fn")

        comparePolicies(optimal_policy_pi, optimal_policy_vi)
        compareMeanAndVarianceOfRewards(optimal_policy_pi, optimal_policy_vi)
finally:
    # Put the original setting back so re-running earlier cells is unaffected.
    if _saved_discount_factor is _UNSET:
        if "discount_factor" in vars(model):
            del model.discount_factor
    else:
        model.discount_factor = _saved_discount_factor

Starting Policy Iteration Algorithm
Total number of possible states : 800
Total actions :  7
Shape of policy matrix (800,) and shape of valueFn (800,)
Errors in policy_state : 798
Errors in policy_state : 87
Errors in policy_state : 57
Errors in policy_state : 39
Errors in policy_state : 30
Errors in policy_state : 34
Errors in policy_state : 37
Errors in policy_state : 39
Errors in policy_state : 35
Errors in policy_state : 30
Errors in policy_state : 29
Errors in policy_state : 28
Errors in policy_state : 27
Errors in policy_state : 24
Errors in policy_state : 20
Errors in policy_state : 20
Errors in policy_state : 17
Errors in policy_state : 14
Errors in policy_state : 12
Errors in policy_state : 10
Errors in policy_state : 8
Errors in policy_state : 6
Errors in policy_state : 4
Errors in policy_state : 2
Errors in policy_state : 0
Count of total number of calls made to the  env.get_transitions_at_time is :  406400
Starting rollout from position: (0, 10, 0)

Reached terminal state.
