In [72]:
import gymnasium as gym
import numpy as np
import random
import matplotlib.pyplot as plt
from sympy.strategies.branch import do_one

n_actions = 2
learning_rate = 0.1
discount_factor = 0.99
epsilon = 1
epochs = 1000

theta_bins = [-12, -6, -1, 0, 1, 6, 12]  # θ: 0, ±1, ±6, ±12
x_bins = [-2.4, -0.8, 0.8, 2.4]  # x: ±0.8, ±2.4 
theta_dot_bins = [-np.inf, -50, 50, np.inf]  # θ̇: ±50, inf
x_dot_bins = [-np.inf, -0.5, 0.5, np.inf]  # ẋ: ±0.5, inf

# Q table
Q = np.zeros((len(theta_bins)-1, len(x_bins)-1, len(theta_dot_bins)-1,
              len(x_dot_bins)-1, n_actions))

In [78]:
def discretize_state(state):
    theta, x, theta_dot, x_dot = state
    theta_idx = np.digitize(theta, theta_bins) - 1
    x_idx = np.digitize(x, x_bins) - 1
    theta_dot_idx = np.digitize(theta_dot, theta_dot_bins) - 1
    x_dot_idx = np.digitize(x_dot, x_dot_bins) - 1
    
    return theta_idx, x_idx, theta_dot_idx, x_dot_idx

def isTerminal(state):
    theta, x, theta_dot, x_dot = state
    if abs(theta) > 12 or abs(x) > 12 or abs(theta_dot) > 50 or abs(x_dot) > 0.5:
        return True
    else:
        return False

def train():
    rewards = []
    for episode in range(epochs):
        state,_ = env.reset()
        state = discretize_state(state)
        done = False
        total_reward = 0

        while not done:
            # epsilon-greedy
            if random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()  
            else:
                action = np.argmax(Q[state])
            
            force = 10 if action == 1 else -10
            next_state, reward, _, _, _ = env.step(action)
            print(next_state)
            next_state = discretize_state(next_state)
            done = isTerminal(next_state)
            print(next_state)
            print(f"State: {next_state}, Done: {done}")

            
            # update Q
            Q[state][action] = Q[state][action] + learning_rate * (
                reward + discount_factor * np.max(Q[next_state]) - Q[state][action]
            )

            state = next_state
            total_reward += reward

        rewards.append(total_reward)

        if (episode + 1) % 100 == 0:
            print(f"Episode {episode+1}/{epochs}, Total reward: {total_reward}")

    return rewards


def test():
    state,_ = env.reset()
    if isinstance(state, dict):
        state = state['state']
    state = discretize_state(state)
    done = False
    total_reward = 0

    while not done:
        action = np.argmax(Q[state])  
        force = 10 if action == 1 else -10
        next_state, reward, done, _, _ = env.step(action)
        if isinstance(next_state, dict):
            next_state = next_state['state']
        next_state = discretize_state(next_state)
        state = next_state
        total_reward += reward

    print(f"Test Total Reward: {total_reward}")

In [81]:
env = gym.make('CartPole-v0')
rewards = train()
test()

[ 0.03797851  0.15521954  0.03466388 -0.2383551 ]
(3, 1, 1, 1)
State: (3, 1, 1, 1), Done: True
[-0.00555813 -0.210916    0.0429128   0.28776333]
(2, 1, 1, 1)
State: (2, 1, 1, 1), Done: True
[ 0.0014359   0.19307685 -0.04166295 -0.353933  ]
(3, 1, 1, 1)
State: (3, 1, 1, 1), Done: True
[-0.01639628 -0.1898988  -0.04704295  0.3024738 ]
(2, 1, 1, 1)
State: (2, 1, 1, 1), Done: True
[ 0.00512977  0.22553292 -0.0324299  -0.26334247]
(3, 1, 1, 1)
State: (3, 1, 1, 1), Done: True
[-0.02466192 -0.1551399  -0.02728353  0.31659913]
(2, 1, 1, 1)
State: (2, 1, 1, 1), Done: True
[ 0.02995795 -0.2313473   0.0164676   0.28677163]
(3, 1, 1, 1)
State: (3, 1, 1, 1), Done: True
[ 0.00225388 -0.21430466  0.00646736  0.29172486]
(3, 1, 1, 1)
State: (3, 1, 1, 1), Done: True
[ 0.02417525  0.18286827  0.02772548 -0.3136677 ]
(3, 1, 1, 1)
State: (3, 1, 1, 1), Done: True
[ 0.01443711  0.22514297  0.03436741 -0.30409533]
(3, 1, 1, 1)
State: (3, 1, 1, 1), Done: True
[-0.02501059  0.21076605 -0.03522937 -0.27261177]


In [54]:
print(env.observation_space)
env.close()

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)


In [4]:
plt.plot(rewards)
plt.xlabel('Episodes')
plt.ylabel('Reward')
plt.title('Q-Learning Performance on CartPole')
plt.show()

NameError: name 'rewards' is not defined