In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
class CartPoleEnv:
    def __init__(self):
        # State: [x, x_velocity, theta, theta_velocity]
        # Action: 0 left, 1 right
        self.state = np.random.uniform(low=-0.05, high=0.05, size=(4,))
        self.g = 9.8
        self.m = 0.1
        self.M = 1.0
        self.L = 0.5
        self.dt = 0.02
        
    def step(self, action):
        x, x_velocity, theta, theta_velocity = self.state
        
        force = 20.0 if action == 1 else -20.0
        
        cos_theta = np.cos(theta)
        sin_theta = np.sin(theta)
        total_mass = self.M + self.m
        pole_mass_length = self.m * self.L
        
        temp = (force + pole_mass_length * theta_velocity ** 2 * sin_theta) / total_mass
        theta_acc = (self.g * sin_theta - cos_theta * temp) / (self.L * (4 / 3 - self.m * cos_theta ** 2 / total_mass))
        x_acc = temp - pole_mass_length * theta_acc * cos_theta / total_mass
        
        x += x_velocity * self.dt
        x_velocity += x_acc * self.dt
        theta += theta_velocity * self.dt
        theta_velocity += theta_acc * self.dt
        
        lose = x < -2.4 or x > 2.4 or theta < -np.pi / 15 or theta > np.pi / 15
        reward = 1.0 if not lose else 0.0
        
        self.state = np.array([x, x_velocity, theta, theta_velocity])
        
        return self.state, reward, lose

    def reset(self):
        self.state = np.random.uniform(low=-0.05, high=0.05, size=(4,))
        return self.state


In [3]:
class QLearningAgent:
    def __init__(self, action_space, state_space, alpha=0.1, gamma=0.99, epsilon=0.1):
        self.action_space = action_space
        self.state_space = state_space
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.q_table = np.zeros(state_space + (action_space,))
        self.state_bins = [
            np.linspace(-2.4, 2.4, 10), # x position
            np.linspace(-3.0, 3.0, 10), # x velocity
            np.linspace(-np.pi / 15, np.pi / 15, 10), # theta
            np.linspace(-3.0, 3.0, 10), # theta velocity
        ]

    def discretize_state(self, state):
        # Discretize the continuous state space into bins
        discretized = []
        for i in range(len(state)):
            discretized.append(np.digitize(state[i], self.state_bins[i]) - 1)
        
        return tuple(discretized)

    def choose_action(self, state):
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.action_space)
        else:
            return np.argmax(self.q_table[state])

    def update_q_table(self, state, action, reward, next_state):
        best_next_action = np.argmax(self.q_table[next_state])
        td_target = reward + self.gamma * self.q_table[next_state + (best_next_action,)]
        td_error = td_target - self.q_table[state + (action,)]
        self.q_table[state + (action,)] += self.alpha * td_error


In [4]:
def train(agent, env, episodes, max_timesteps=200):
    rewards = []
    for episode in range(episodes):
        state = env.reset()
        state = agent.discretize_state(state)
        total_reward = 0
        
        for t in range(max_timesteps):
            action = agent.choose_action(state)
            next_state, reward, lose = env.step(action)
            next_state = agent.discretize_state(next_state)
            
            agent.update_q_table(state, action, reward, next_state)
            
            state = next_state
            total_reward += reward
            
            if lose: break
        
        rewards.append(total_reward)
        if episode % 500 == 0 or episode == episodes-1:
            print(f"Episode {episode}, Total Reward: {total_reward}")
    
    return rewards


In [5]:
env = CartPoleEnv()
agent = QLearningAgent(action_space=2, state_space=(10, 10, 10, 10))
rewards = train(agent, env, 10000)

Episode 0, Total Reward: 6.0
Episode 500, Total Reward: 8.0
Episode 1000, Total Reward: 38.0
Episode 1500, Total Reward: 75.0
Episode 2000, Total Reward: 122.0
Episode 2500, Total Reward: 117.0
Episode 3000, Total Reward: 75.0
Episode 3500, Total Reward: 200.0
Episode 4000, Total Reward: 172.0
Episode 4500, Total Reward: 90.0
Episode 5000, Total Reward: 143.0
Episode 5500, Total Reward: 61.0
Episode 6000, Total Reward: 126.0
Episode 6500, Total Reward: 200.0
Episode 7000, Total Reward: 200.0
Episode 7500, Total Reward: 200.0
Episode 8000, Total Reward: 200.0
Episode 8500, Total Reward: 200.0
Episode 9000, Total Reward: 158.0
Episode 9500, Total Reward: 200.0
Episode 9999, Total Reward: 200.0


In [6]:
def render(env):
    x, x_dot, theta, theta_dot = env.state
    
    cart_width = 0.4
    cart_height = 0.2
    pole_length = 0.5
    
    fig, ax = plt.subplots()
    
    ax.set_xlim(-2.5, 2.5)
    ax.set_ylim(-0.5, 1)
    
    ax.add_patch(plt.Rectangle((-cart_width / 2 + x, -cart_height / 2), cart_width, cart_height, color='blue'))
    
    pole_x = x + pole_length * np.sin(theta)
    pole_y = pole_length * np.cos(theta)
    ax.plot([x, pole_x], [0, pole_y], lw=5, color='red')

    plt.pause(0.02)

    plt.clf()

def evaluate(agent, env, episodes, is_render):
    for episode in range(episodes):
        state = env.reset()
        state = agent.discretize_state(state)
        lose = False
        total_reward = 0
        
        while not lose:
            action = np.argmax(agent.q_table[state])
            state, reward, lose = env.step(action)
            state = agent.discretize_state(state)
            total_reward += reward
            
            if is_render: render(env)
        
        print(f"Episode {episode}, Total Reward: {total_reward}")


evaluate(agent, env, 10, False)

Episode 0, Total Reward: 335.0
Episode 1, Total Reward: 4611.0
Episode 2, Total Reward: 629.0
Episode 3, Total Reward: 575.0
Episode 4, Total Reward: 22240.0
Episode 5, Total Reward: 5387.0
Episode 6, Total Reward: 636.0
Episode 7, Total Reward: 3175.0
Episode 8, Total Reward: 7030.0
Episode 9, Total Reward: 11094.0
