In [2]:
import numpy as np
from scipy.stats import poisson

In [15]:
# Example 4.2 - Jack's Car Rental: problem constants.

# --- Inventory / state space -------------------------------------------------
MAX_CARS_LOC = 20                        # most cars a single location can hold
N_CARS_PER_LOC = MAX_CARS_LOC + 1        # possible car counts per location: 0..20
N_STATES = N_CARS_PER_LOC ** 2           # one state per (loc1 count, loc2 count) pair

# --- Actions -----------------------------------------------------------------
# Signed number of cars transferred overnight, -5..5 inclusive.
ACTIONS = np.arange(-5, 5 + 1)

# --- Rewards -----------------------------------------------------------------
CAR_RENT_COST = 10                       # amount credited per car rented out
# NOTE(review): stored as a negative number, so consumers must either add it
# to the reward or use its magnitude when subtracting a cost.
CAR_MOVE_COST = -2                       # per car moved

# --- Poisson rates (expected count per day) -----------------------------------
LAMBDA_RNT_LOC1 = 3                      # rental requests, location 1
LAMBDA_RNT_LOC2 = 4                      # rental requests, location 2
LAMBDA_RET_LOC1 = 3                      # returns, location 1
LAMBDA_RET_LOC2 = 2                      # returns, location 2

# --- Solver settings -----------------------------------------------------------
MAX_POISSON_OUTCOME = 10                 # largest Poisson outcome enumerated
GAMMA = 0.9                              # discount factor
THETA = 1e-4                             # convergence threshold

In [None]:
def _truncated_poisson_pmf(lam):
    """Poisson pmf over 0..MAX_POISSON_OUTCOME, renormalised to sum to 1."""
    pmf = poisson.pmf(np.arange(MAX_POISSON_OUTCOME + 1), lam)
    total = pmf.sum()
    # Guard against a fully underflowed pmf (absurdly large lam): leave zeros.
    return pmf / total if total > 0 else pmf


def _location_day_outcome(n_start, lam_rent, lam_ret):
    """Return (expected cars rented, end-of-day car-count distribution) for one location."""
    outcomes = np.arange(MAX_POISSON_OUTCOME + 1)
    p_req = _truncated_poisson_pmf(lam_rent)
    p_ret = _truncated_poisson_pmf(lam_ret)

    # Requests beyond the available inventory are lost.
    rented = np.minimum(n_start, outcomes)
    expected_rented = float(p_req @ rented)

    # End-of-day count for every (request, return) pair; rows = requests,
    # columns = returns. Returned cars above the lot capacity are dropped.
    end_count = np.minimum(
        MAX_CARS_LOC, (n_start - rented)[:, None] + outcomes[None, :]
    )
    end_dist = np.bincount(
        end_count.ravel(),
        weights=np.outer(p_req, p_ret).ravel(),
        minlength=N_CARS_PER_LOC,
    )
    return expected_rented, end_dist


def calculate_expected_val(state, action): #R(s, a) | P(s' |s, a)
    """Compute the expected one-day reward and next-state distribution.

    Parameters
    ----------
    state : int
        Flat state index ``n1 * N_CARS_PER_LOC + n2`` where ``n1``/``n2`` are
        the cars at locations 1 and 2 at the end of the previous day.
    action : int
        Cars moved overnight; positive moves cars from location 1 to
        location 2, negative the other way.

    Returns
    -------
    expected_total_reward : float
        Expected rental income minus the cost of moving ``abs(action)`` cars.
    next_state_prob : numpy.ndarray, shape (N_STATES,)
        Probability of each flat next-state index; sums to 1.

    Notes
    -----
    Requests and returns are Poisson variables truncated at
    ``MAX_POISSON_OUTCOME`` and renormalised, so the reward and the transition
    probabilities are taken over the same distribution. The two locations
    evolve independently once the move is done, so the joint next-state
    distribution is the outer product of the per-location distributions.
    """
    n1_end_prev, n2_end_prev = divmod(state, N_CARS_PER_LOC)

    # NOTE(review): each lot is clamped to [0, MAX_CARS_LOC] independently, so
    # an action that moves more cars than the source lot holds still credits
    # the destination with the full amount. Callers presumably only pass
    # feasible actions - confirm.
    n1_start_day = min(max(0, n1_end_prev - action), MAX_CARS_LOC)
    n2_start_day = min(max(0, n2_end_prev + action), MAX_CARS_LOC)

    # CAR_MOVE_COST is stored as a negative number; take its magnitude so the
    # cost is always subtracted regardless of the sign convention used.
    move_cost = abs(action) * abs(CAR_MOVE_COST)

    expected_rented_1, end_dist_1 = _location_day_outcome(
        n1_start_day, LAMBDA_RNT_LOC1, LAMBDA_RET_LOC1
    )
    expected_rented_2, end_dist_2 = _location_day_outcome(
        n2_start_day, LAMBDA_RNT_LOC2, LAMBDA_RET_LOC2
    )

    expected_total_reward = (
        (expected_rented_1 + expected_rented_2) * CAR_RENT_COST - move_cost
    )

    # Row-major flattening puts (n1, n2) at index n1 * N_CARS_PER_LOC + n2,
    # matching the encoding decoded by divmod above.
    next_state_prob = np.outer(end_dist_1, end_dist_2).ravel()

    return expected_total_reward, next_state_prob

