In [2]:
import numpy as np

# Constants
alpha = 0.17
n = 35
MAX_ITERATION = 1000
num_actions = 4 
num_states = 9

# Initialize Q-table
q_table = np.full((9, 9, 9, 9, num_actions), 1000)

# Initial inventory levels for all actors
inventory_levels = [12, 12, 12, 12]

# Provided data
customer_demand = [15,10,8,14,9,3,13,2,13,11,3,4,6,11,15,12,15,4,12,3,13,10,15,15,3,11,1,13,10,10,0,0,8,0,14]
lead_times = [2,0,2,4,4,4,0,2,4,1,1,0,0,1,1,0,1,1,2,1,1,1,4,2,2,1,4,3,4,1,4,0,3,3,4]




In [3]:
def code_state(inventory):
    if inventory < -6:
        return 1
    elif inventory < -3:
        return 2
    elif inventory < 0:
        return 3
    elif inventory < 3:
        return 4
    elif inventory < 6:
        return 5
    elif inventory < 10:
        return 6
    elif inventory < 15:
        return 7
    elif inventory < 20:
        return 8
    else:
        return 9

def calculate_reward(state):
    holding_costs = [max(0, inv) for inv in state]
    backorder_costs = [max(0, -inv) for inv in state]
    return -sum(holding_costs) - 2 * sum(backorder_costs)

def get_next_state_vector(current_state, action):
    new_inventories = [current_state[i] - customer_demand[t] + action[i] for i in range(4)]
    return new_inventories





In [4]:
Iteration = 0
t = 0

states_over_time = []

while Iteration <= MAX_ITERATION:
    S = inventory_levels.copy()
    coded_S = [code_state(s) for s in S]
    start_prob = 0.98 - (0.88 * Iteration / MAX_ITERATION)
    
    while t < n:
        states_over_time.append(coded_S.copy())
        
        prob_exploitation = start_prob - ((start_prob - 0.02) * t / n)
        if np.random.rand() < prob_exploitation:
            actions = [np.random.choice(num_actions) for _ in range(4)]
        else:
            actions = [np.argmax(q_table[coded_S[0], coded_S[1], coded_S[2], coded_S[3], :]) for _ in range(4)]
        
        reward = calculate_reward(S)
        next_state = get_next_state_vector(S, actions)
        coded_next_state = [code_state(s) for s in next_state]
        
        for i in range(4):
            q_table[coded_S[0], coded_S[1], coded_S[2], coded_S[3], actions[i]] += alpha * (reward + np.max(q_table[coded_next_state[0], coded_next_state[1], coded_next_state[2], coded_next_state[3], :]) - q_table[coded_S[0], coded_S[1], coded_S[2], coded_S[3], actions[i]])
        
        S = next_state
        coded_S = coded_next_state
        t += 1
    
    t = 0
    Iteration += 1

policies = []
for t in range(n):
    state_t = states_over_time[t]
    policy_t = [np.argmax(q_table[state_t[0], state_t[1], state_t[2], state_t[3], i]) for i in range(4)]
    policies.append(policy_t)

for t, policy_t in enumerate(policies):
    print(f"Period {t}: Retailer: {policy_t[0]}, Distributor: {policy_t[1]}, Manufacturer: {policy_t[2]}, Supplier: {policy_t[3]}")

Period 0: Retailer: 0, Distributor: 0, Manufacturer: 0, Supplier: 0
Period 1: Retailer: 0, Distributor: 0, Manufacturer: 0, Supplier: 0
Period 2: Retailer: 0, Distributor: 0, Manufacturer: 0, Supplier: 0
Period 3: Retailer: 0, Distributor: 0, Manufacturer: 0, Supplier: 0
Period 4: Retailer: 0, Distributor: 0, Manufacturer: 0, Supplier: 0
Period 5: Retailer: 0, Distributor: 0, Manufacturer: 0, Supplier: 0
Period 6: Retailer: 0, Distributor: 0, Manufacturer: 0, Supplier: 0
Period 7: Retailer: 0, Distributor: 0, Manufacturer: 0, Supplier: 0
Period 8: Retailer: 0, Distributor: 0, Manufacturer: 0, Supplier: 0
Period 9: Retailer: 0, Distributor: 0, Manufacturer: 0, Supplier: 0
Period 10: Retailer: 0, Distributor: 0, Manufacturer: 0, Supplier: 0
Period 11: Retailer: 0, Distributor: 0, Manufacturer: 0, Supplier: 0
Period 12: Retailer: 0, Distributor: 0, Manufacturer: 0, Supplier: 0
Period 13: Retailer: 0, Distributor: 0, Manufacturer: 0, Supplier: 0
Period 14: Retailer: 0, Distributor: 0, Manu