In [6]:
from env import FootballSkillsEnv
from model import model
from policyIterationAlgo import policyIterationAlgo
from valueIterationAlgo import valueIterationAlgo
import numpy as np

'''
    Key Environment Methods to Use:
    - env.state_to_index(state_tuple): Converts (x, y, has_shot) tuple to integer index
    - env.index_to_state(index): Converts integer index back to (x, y, has_shot) tuple
    - env.get_transitions_at_time(state, action, time_step=None): Default method for accessing transitions.
    - env._is_terminal(state): Check if state is terminal (has_shot=True)
    - env._get_reward(ball_pos, action, player_pos): Get reward for transition
    - env.reset(seed=None): Reset environment to initial state, returns (observation, info)
    - env.step(action): Execute action, returns (obs, reward, done, truncated, info)
    - env.get_gif(policy, seed=20, filename="output.gif"): Generate GIF visualization 
      of policy execution from given seed
    
    Key Env Variables Notes:
    - env.observation_space.n: Total number of states (use env.grid_size^2 * 2)
    - env.action_space.n: Total number of actions (7 actions: 4 movement + 3 shooting)
    - env.grid_size: Total number of rows in the grid
'''
# Banner so the captured cell output shows where the non-stationary experiments begin.
print("Starting Non-Stationary Environment")

Starting Non-Stationary Environment


In [7]:
# Run policy iteration with pitch degradation enabled (logging off) and keep
# the resulting policy, value function and the three cost counters it returns.
(
    optimal_policy_pi,
    optimal_valueFn_pi,
    outer_num_iterations_pi,
    inner_num_iterations_pi,
    calls_to_getTransisions_fn_pi,
) = policyIterationAlgo(logEnabled=False, degrade_pitch=True)

print(f"The policy iteration converged after {outer_num_iterations_pi} outer_iterations, {inner_num_iterations_pi} inner_iterations and {calls_to_getTransisions_fn_pi} calls to getTransisions_fn")

Starting Policy Iteration Algorithm
Total number of possible states : 800
Total actions :  7
Shape of policy matrix (40, 800) and shape of valueFn (41, 800)
Errors in policy_state : 27976
Errors in policy_state : 19954
Errors in policy_state : 10058
Errors in policy_state : 9558
Errors in policy_state : 4290
Errors in policy_state : 2194
Errors in policy_state : 770
Errors in policy_state : 164
Errors in policy_state : 80
Errors in policy_state : 94
Errors in policy_state : 72
Errors in policy_state : 80
Errors in policy_state : 64
Errors in policy_state : 58
Errors in policy_state : 22
Errors in policy_state : 6
Errors in policy_state : 0
Starting rollout from position: (0, 10, 0)

Reached terminal state.
Episode GIF saved to ./output_seeds/PIOutputNonStationary.gif
Count of total number of calls made to the  env.get_transitions_at_time is :  4896000
The policy iteration converged after 17 outer_iterations, 34 inner_iterations and 4896000 calls to getTransisions_fn


In [8]:
# 1. Value Iteration
# Run value iteration with pitch degradation enabled and keep the resulting
# policy, value function and the three cost counters it returns.
(
    optimal_policy_vi,
    optimal_valueFn_vi,
    outer_num_iterations_vi,
    inner_num_iterations_vi,
    calls_to_getTransisions_fn_vi,
) = valueIterationAlgo(degrade_pitch=True)

print(f"The value iteration converged after {outer_num_iterations_vi} outer_iterations, {inner_num_iterations_vi} inner_iterations and {calls_to_getTransisions_fn_vi} calls to getTransisions_fn")

Total number of possible states : 800
Total actions :  7
Shape of policy matrix (40, 800) and shape of valueFn (41, 800)
maxAbsDiff after iteration : 1 is 58.159054415340094
maxAbsDiff after iteration : 2 is 48.24391990791517
maxAbsDiff after iteration : 3 is 40.54074611531525
maxAbsDiff after iteration : 4 is 34.34997255728222
maxAbsDiff after iteration : 5 is 30.683464366389167
maxAbsDiff after iteration : 6 is 27.20732636057118
maxAbsDiff after iteration : 7 is 24.49930090017895
maxAbsDiff after iteration : 8 is 21.980702029483925
maxAbsDiff after iteration : 9 is 19.63736419354079
maxAbsDiff after iteration : 10 is 17.461641814066684
maxAbsDiff after iteration : 11 is 15.44768847508903
maxAbsDiff after iteration : 12 is 13.607094683275507
maxAbsDiff after iteration : 13 is 12.091565672317323
maxAbsDiff after iteration : 14 is 10.704881897902904
maxAbsDiff after iteration : 15 is 9.438049069973257
maxAbsDiff after iteration : 16 is 8.283154986323453
maxAbsDiff after iteration : 17 i

In [None]:
# Q2 

# The policy iteration converged after 17 outer_iterations, 34 inner_iterations and 4896000 calls to getTransisions_fn
# The value iteration converged after 1 outer_iterations, 41 inner_iterations and 9408000 calls to getTransisions_fn

In [11]:
# Q3 :: For comparison, implement the usual Value Iteration (without time as part of the state). This can be done by
# calling the get-transitions-at-time function without specifying the time step and maintaining a value function
# that depends only on the state. What do you observe?

# Same value-iteration routine, but with passTimeStamp=False so the time step
# is not handed to the transition lookup.
(
    optimal_policy_vi_ts_independent,
    optimal_valueFn_vi_ts_independent,
    outer_num_iterations_vi_ts_independent,
    inner_num_iterations_vi_ts_independent,
    calls_to_getTransisions_fn_vi_ts_independent,
) = valueIterationAlgo(degrade_pitch=True, passTimeStamp=False)

print(f"The value iteration converged after {outer_num_iterations_vi_ts_independent} outer_iterations, {inner_num_iterations_vi_ts_independent} inner_iterations and {calls_to_getTransisions_fn_vi_ts_independent} calls to getTransisions_fn")

Total number of possible states : 800
Total actions :  7
Shape of policy matrix (40, 800) and shape of valueFn (41, 800)
maxAbsDiff after iteration : 1 is 50.7
maxAbsDiff after iteration : 2 is 27.739999999999995
maxAbsDiff after iteration : 3 is 17.665249999999997
maxAbsDiff after iteration : 4 is 13.400933699999996
maxAbsDiff after iteration : 5 is 10.119844048999997
maxAbsDiff after iteration : 6 is 7.835763851439996
maxAbsDiff after iteration : 7 is 6.385366082079196
maxAbsDiff after iteration : 8 is 5.648465156826261
maxAbsDiff after iteration : 9 is 5.207060304803921
maxAbsDiff after iteration : 10 is 4.651942631606094
maxAbsDiff after iteration : 11 is 4.13356312585775
maxAbsDiff after iteration : 12 is 3.6367333065307963
maxAbsDiff after iteration : 13 is 3.2098650709601073
maxAbsDiff after iteration : 14 is 2.9810147762157144
maxAbsDiff after iteration : 15 is 2.7686709623298933
maxAbsDiff after iteration : 16 is 2.5537037041816673
maxAbsDiff after iteration : 17 is 2.31449884

In [14]:
# are the 2 policies identical :: optimal_policy_vi and optimal_policy_pi
def comparePolicies(policy1, policy2, model = model):
    """Print how many (time step, state) entries differ between two policies.

    Args:
        policy1: Policy indexable as ``policy1[time_step][state_index]``.
        policy2: Second policy with the same indexing.
        model: Zero-argument callable returning a config object that exposes
            ``non_stationary_horizon`` and ``maxStateSize``; these bound the
            time-step and state loops respectively.

    Returns:
        None. The mismatch count is only printed.
    """
    config = model()

    # Walk every (time step, state) pair and tally the disagreeing actions.
    mismatches = sum(
        1
        for step in range(config.non_stationary_horizon)
        for state in range(config.maxStateSize)
        if policy1[step][state] != policy2[step][state]
    )

    print("Count of dissimilarities :", mismatches)

# Alternative comparison (value iteration vs policy iteration), kept for reference:
# comparePolicies(optimal_policy_vi, optimal_policy_pi)
# Active comparison: VI run with the default time-stamp handling vs VI run with passTimeStamp=False.
comparePolicies(optimal_policy_vi, optimal_policy_vi_ts_independent)

Count of dissimilarities : 5454


In [None]:
# Q3
# 1. After implementing value iteration, we observed that both variants converged after about 40 iterations.
# This is expected, because the finite-horizon problem gives as many equations as there are unknowns:
# total_horizon * no_of_states equations.
# Because information can only move one time-layer per outer iteration, it takes:

# iteration 1 → makes V_{H-1} correct (uses V_H = 0),
# iteration 2 → makes V_{H-2} correct (now sees the correct V_{H-1}),
# …
# iteration H → finally makes V_0 correct.

# So with H=40, you naturally get ~40 iterations before delta hits the threshold.
# 2. The optimal policies obtained in the two cases had dissimilarities.
# Stationary VI is cheaper in state-space size, but will generally produce a (possibly) suboptimal policy in a non-stationary environment.
# Non-stationary VI produces better policies, but at an extra computational cost roughly proportional to H.

In [30]:
# Q4
def getEpisodeRewards(env, obs, policy, model):
    """Roll out ``policy`` in ``env`` starting from ``obs`` and sum the rewards.

    Args:
        env: Environment exposing ``state_to_index(obs)`` and ``step(action)``,
            where ``step`` returns ``(obs, reward, done, truncated, info)``.
        obs: Observation the rollout starts from.
        policy: Time-indexed policy; ``policy[t][state_index]`` is the action
            taken at step ``t``. An entry of ``-1`` is treated as "no valid
            action" and stops the rollout.
        model: Object whose ``non_stationary_horizon`` caps the number of steps.

    Returns:
        The sum of rewards collected until the episode is done or truncated,
        the horizon is reached, or a ``-1`` action is encountered.
    """
    episode_reward = 0

    # The policy is only defined for time steps below the horizon, so the
    # rollout never takes more than ``non_stationary_horizon`` steps.
    for timeStamp in range(model.non_stationary_horizon):
        state_index = env.state_to_index(obs)
        action = policy[timeStamp][state_index]

        if action == -1:
            print("No valid action found!")
            break

        obs, reward, done, truncated, _ = env.step(action)
        episode_reward += reward

        # The episode is over when it is terminated OR truncated; stepping on
        # after truncation would add rewards that do not belong to the episode.
        if done or truncated:
            break

    return episode_reward

def getEpisodesRewardMean(envClass, episodeCount, policy, model, seeds=None):
    """Evaluate ``policy`` over several seeded episodes.

    Args:
        envClass: Environment class; instantiated once per episode as
            ``envClass(render_mode=None, degrade_pitch=True)``.
        episodeCount: Number of seeds to draw when ``seeds`` is ``None``;
            ignored when ``seeds`` is supplied.
        policy: Time-indexed policy forwarded to ``getEpisodeRewards``.
        model: Config object forwarded to ``getEpisodeRewards``.
        seeds: Optional iterable of reset seeds, one episode per entry.

    Returns:
        ``(mean_reward, variance_reward)`` computed with ``np.mean`` and
        ``np.var`` over the per-episode total rewards.
    """
    if seeds is None:
        # Draw distinct seeds from [0, 30) whenever that is possible, so the
        # same episode is not evaluated twice; only fall back to sampling with
        # replacement when more than 30 episodes are requested.
        seeds = np.random.choice(30, size=episodeCount, replace=episodeCount > 30)

    total_rewards = []
    for seed in seeds:
        # A fresh environment per episode keeps the episodes independent.
        env = envClass(render_mode=None, degrade_pitch=True)
        obs, _ = env.reset(seed=int(seed))
        total_rewards.append(getEpisodeRewards(env, obs, policy, model))

    mean_reward = np.mean(total_rewards)
    variance_reward = np.var(total_rewards)

    return mean_reward, variance_reward

In [31]:
# 4. Evaluate the performance of each policy by running 20 episodes with different seeds
# Given the optimal policies for both VI and PI
def compareMeanAndVarianceOfRewards(
    policy1,
    policy2,
    envClass=FootballSkillsEnv,
    model=model,
    label1="Value Iteration passing time stamp",
    label2="Value Iteration without passing time stamp",
):
    """Evaluate two policies on the same seeds and print their reward statistics.

    Args:
        policy1: First time-indexed policy to evaluate.
        policy2: Second time-indexed policy to evaluate.
        envClass: Environment class forwarded to ``getEpisodesRewardMean``.
        model: Zero-argument callable returning a config object that exposes
            ``episodeCount`` (the number of evaluation episodes).
        label1: Text printed in front of the statistics of ``policy1``.
        label2: Text printed in front of the statistics of ``policy2``.

    Returns:
        None. Mean and variance of both policies are printed, followed by a
        line stating whether the two (mean, variance) pairs are equal.
    """
    config = model()
    episode_count = config.episodeCount

    # Use distinct seeds from [0, 30) whenever possible so that "N episodes
    # with different seeds" really evaluates N different episodes; sampling
    # with replacement is only used when more than 30 episodes are requested.
    seeds = np.random.choice(30, size=episode_count, replace=episode_count > 30)

    # Both policies are scored on the same seeds so the comparison is paired.
    mean1, variance1 = getEpisodesRewardMean(envClass, episode_count, policy1, config, seeds)
    mean2, variance2 = getEpisodesRewardMean(envClass, episode_count, policy2, config, seeds)

    print(f"{label1} :: Mean : ", mean1, " Variance : ", variance1)
    print(f"{label2} :: Mean : ", mean2, " Variance : ", variance2)
    # NOTE(review): the variance is taken across episodes with different seeds,
    # so it need not be 0 even if each individual rollout is deterministic
    # (presumably the seed changes the start state - confirm against the env).

    if mean1 == mean2 and variance1 == variance2:
        print("mean and variance of policies are same")
    else:
        print("mean and variance of policies are not same")

In [29]:
# Alternative comparison (value iteration vs policy iteration), kept for reference:
#compareMeanAndVarianceOfRewards(optimal_policy_vi, optimal_policy_pi)
# Active comparison: VI run with the default time-stamp handling vs VI run with passTimeStamp=False.
compareMeanAndVarianceOfRewards(optimal_policy_vi, optimal_policy_vi_ts_independent)

Value Iteration passing time stamp :: Mean :  53.8  Variance :  796.26
Value Iteration without passing time stamp :: Mean :  25.8  Variance :  2212.46
mean and variance of policies are not same
