In [47]:
import numpy as np
import pandas as pd

In [48]:
# Read the stacked transition table (as a raw array) and the trajectory dataset
P_df = pd.read_csv("./dataset/P.csv", index_col=0).values
ts_df = pd.read_csv("./dataset/generated_data/ts_df.csv")


# Cut the stacked array into three consecutive 4-row blocks, one per action:
# rows 0-3 -> P[0], rows 4-7 -> P[1], rows 8-11 -> P[2]
# (presumably each block is a 4x4 state-to-next-state matrix; verify against P.csv)
P = [P_df[start:start + 4, :] for start in range(0, 12, 4)]

In [49]:
# Define reward function
def Reward(a, b, mode):
    """Build the 4x3 reward table indexed as R[state, action].

    Args:
        a: Amount subtracted from every entry of action column 1.
        b: Amount subtracted from every entry of action column 2.
        mode: "reward" adds +1 to every action in state 0; any other value
            adds -1 to every action in state 3 instead.

    Returns:
        A float ndarray of shape (4, 3).
    """
    # Per-state signal: +1 on state 0 ("reward") or -1 on state 3 (otherwise).
    state_signal = np.zeros(4)
    if mode == "reward":
        state_signal[0] = 1.0
    else:
        state_signal[3] = -1.0

    # Per-action cost: action 0 is free, actions 1 and 2 cost a and b.
    action_cost = np.array([0.0, a, b])

    # Broadcast (4, 1) - (1, 3) -> (4, 3).
    return state_signal[:, None] - action_cost[None, :]

In [50]:
# ------------------------
# Value Iteration with Q(s,a)
# ------------------------
def value_iteration_with_q(P, R, gamma=0.95, theta=1e-4, max_iter=1000):
    """Run synchronous value iteration on the state-action value table Q(s, a).

    Each sweep applies the Bellman optimality backup
    ``Q[s, a] = R[s, a] + gamma * sum_s' P[a][s][s'] * max_a' Q[s', a']``
    to every (s, a) pair, using the Q table from the previous sweep.

    Args:
        P: Transition model indexed as ``P[a][s]``, giving a length-n_states
            vector of next-state probabilities for action ``a`` in state ``s``.
        R: Reward table of shape (n_states, n_actions), indexed ``R[s, a]``.
            The number of states and actions is taken from this shape.
        gamma: Discount factor.
        theta: Stop once the largest absolute change in Q over a sweep is
            below this threshold.
        max_iter: Upper bound on the number of sweeps. If it is reached the
            current (possibly unconverged) estimate is returned.

    Returns:
        Tuple ``(Q, V, policy)`` where ``Q`` has shape (n_states, n_actions),
        ``V = max_a Q[s, a]`` and ``policy = argmax_a Q[s, a]``.
    """
    R = np.asarray(R, dtype=float)
    n_states, n_actions = R.shape
    Q = np.zeros((n_states, n_actions))  # Q(s, a)

    for _ in range(max_iter):
        # The greedy state values depend only on the previous sweep's Q, so
        # compute them once per sweep rather than once per (s, a) pair.
        V_prev = np.max(Q, axis=1)
        Q_new = np.empty_like(Q)
        delta = 0
        for s in range(n_states):
            for a in range(n_actions):
                Q_new[s, a] = R[s, a] + gamma * np.dot(P[a][s], V_prev)
                delta = max(delta, abs(Q_new[s, a] - Q[s, a]))
        Q = Q_new
        if delta < theta:
            break

    V = np.max(Q, axis=1)
    policy = np.argmax(Q, axis=1)
    return Q, V, policy

In [51]:
def evaluate_policy(policy, R, ts_df):
    """Average the reward a policy accumulates over the trajectories in ts_df.

    The rows of ``ts_df`` are walked in order. For each row the state is read
    from the ``cluster`` column, the action is looked up in ``policy`` and
    ``R[state, action]`` is added to the running total. A row whose
    ``cluster_n`` value is NaN closes the current trajectory.

    Args:
        policy: Sequence mapping a state index to an action index.
        R: Reward table indexed as ``R[state, action]``.
        ts_df: DataFrame with a ``cluster`` column (state per row) and a
            ``cluster_n`` column (NaN marks the last row of a trajectory).

    Returns:
        Mean accumulated reward per trajectory, or NaN when ``ts_df``
        contains no rows. The mean is also printed.
    """
    rewards = []
    current_reward = 0
    # Tracks whether rows have been accumulated since the last trajectory end.
    # Testing the reward's sign instead would drop a trailing trajectory whose
    # total is zero or negative (e.g. every trajectory in a penalty-only table).
    steps_pending = False

    for cluster, cluster_n in zip(ts_df['cluster'], ts_df['cluster_n']):
        s = int(cluster)
        a = policy[s]
        current_reward += R[s, a]
        steps_pending = True

        # A missing next state marks the end of this trajectory
        if np.isnan(cluster_n):
            rewards.append(current_reward)
            current_reward = 0
            steps_pending = False

    # Keep a final trajectory that was not closed by a NaN marker
    if steps_pending:
        rewards.append(current_reward)

    # Avoid np.mean's RuntimeWarning on an empty list; NaN is still returned
    mean_reward = np.mean(rewards) if rewards else float("nan")
    print(f"Mean Accumulated Reward per Trajectory: {mean_reward:.3f}")
    return mean_reward


In [52]:
# ------------------------
# Run for both modes
# ------------------------

# Cost coefficients passed to Reward (subtracted from action columns 1 and 2)
a = 0.01
b = 0.025

# Discount factor used by value iteration
gamma = 0.95

# Per-mode outputs, keyed by mode name
results = dict()

for mode in ("reward", "penalty"):
    # Build the reward table, solve for the greedy policy, then score that
    # policy against the recorded trajectories.
    R = Reward(a, b, mode)
    Q, V, policy = value_iteration_with_q(P, R, gamma=gamma)
    mean_reward = evaluate_policy(policy, R, ts_df)

    results[mode] = dict(Q=Q, V=V, policy=policy, mean_reward=mean_reward)

Mean Accumulated Reward per Trajectory: 1.758
Mean Accumulated Reward per Trajectory: -1.647


In [53]:
# ------------------------
# Print Results
# ------------------------
# Report the stored Q table, state values, greedy policy and mean reward per mode
for mode in ("reward", "penalty"):
    summary = results[mode]
    print(f"\n====== {mode.upper()} MODE ======")
    # sep="\n" puts the label and the rounded array on separate lines
    print("Q-function (state-action values):", np.round(summary["Q"], 3), sep="\n")
    print("V-function (state values):", np.round(summary["V"], 3), sep="\n")
    print("Policy (state → best action):")
    for s in range(4):
        print(f"  State {s} → Action {summary['policy'][s]}")
    print(f"Mean Reward: {summary['mean_reward']:.3f}")


Q-function (state-action values):
[[7.556 7.529 7.597]
 [6.364 6.371 6.258]
 [6.389 6.38  6.427]
 [6.33  6.311 6.322]]
V-function (state values):
[7.597 6.371 6.427 6.33 ]
Policy (state → best action):
  State 0 → Action 2
  State 1 → Action 1
  State 2 → Action 2
  State 3 → Action 0
Mean Reward: 1.758

Q-function (state-action values):
[[-3.69  -3.73  -3.685]
 [-3.776 -3.763 -3.852]
 [-3.694 -3.758 -3.748]
 [-4.899 -4.906 -5.006]]
V-function (state values):
[-3.685 -3.763 -3.694 -4.899]
Policy (state → best action):
  State 0 → Action 2
  State 1 → Action 1
  State 2 → Action 0
  State 3 → Action 0
Mean Reward: -1.647
