In [9]:
import sys

import tensorflow.keras
import pandas as pd
import sklearn as sk
import scipy as sp
import tensorflow as tf
import platform
import numpy as np
import numpy.random as random

import gym
from PyQt5.QtCore.QProcess import state

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Input

print(f"Python Platform: {platform.platform()}")
print(f"Tensor Flow Version: {tf.__version__}")
print(f"Keras Version: {tensorflow.keras.__version__}")
print(f"gymnasium Version: {gymnasium.__version__}")
print()
print(f"Python {sys.version}")
print(f"Pandas {pd.__version__}")
print(f"Scikit-Learn {sk.__version__}")
print(f"SciPy {sp.__version__}")
print(f"numpy {np.__version__}")
gpu = len(tf.config.list_physical_devices('GPU'))>0
print("GPU is", "available" if gpu else "NOT AVAILABLE")

Python Platform: macOS-15.1.1-arm64-arm-64bit
Tensor Flow Version: 2.16.2
Keras Version: 3.6.0
gymnasium Version: 0.28.1

Python 3.9.19 (main, May  6 2024, 14:39:30) 
[Clang 14.0.6 ]
Pandas 2.0.3
Scikit-Learn 1.0.2
SciPy 1.10.1
numpy 1.24.0
GPU is available


In [2]:
import gym
import numpy as np
import random
import matplotlib.pyplot as plt

env = gym.make('CartPole-v1')

n_actions = 2
n_states = 24  
learning_rate = 0.1
discount_factor = 0.99
epsilon = 1
epochs = 1000

theta_bins = [-12, -6, -1, 0, 1, 6, 12]  # θ: 0, ±1, ±6, ±12
x_bins = [-2.4, -0.8, 0.8, 2.4]  # x: ±0.8, ±2.4 
theta_dot_bins = [-50, 50]  # θ̇: ±50, inf
x_dot_bins = [-0.5, 0.5]  # ẋ: ±0.5, inf

# Q table
Q = np.zeros((len(theta_bins)-1, len(x_bins)-1, len(theta_dot_bins)-1,
              len(x_dot_bins)-1, n_actions))

def discretize_state(state):
    theta, x, theta_dot, x_dot = state
    theta_idx = np.digitize(theta, theta_bins) - 1
    x_idx = np.digitize(x, x_bins) - 1
    theta_dot_idx = np.digitize(theta_dot, theta_dot_bins) - 1
    x_dot_idx = np.digitize(x_dot, x_dot_bins) - 1
    
    return (theta_idx, x_idx, theta_dot_idx, x_dot_idx)

def train():
    rewards = []
    for episode in range(epochs):
        state,_ = env.reset()
        if isinstance(state, dict):
            state = state['state']
        state = discretize_state(state)
        done = False
        total_reward = 0

        while not done:
            # epsilon-greedy
            if random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()  
            else:
                action = np.argmax(Q[state])
            
            force = 10 if action == 1 else -10
            next_state, reward, done, _, _ = env.step(force)
            if isinstance(next_state, dict):
                next_state = next_state['state']
            next_state = discretize_state(next_state)

            # update Q
            Q[state][action] = Q[state][action] + learning_rate * (
                reward + discount_factor * np.max(Q[next_state]) - Q[state][action]
            )

            state = next_state
            total_reward += reward

        rewards.append(total_reward)

        if (episode + 1) % 100 == 0:
            print(f"Episode {episode+1}/{epochs}, Total reward: {total_reward}")

    return rewards

rewards = train()

plt.plot(rewards)
plt.xlabel('Episodes')
plt.ylabel('Reward')
plt.title('Q-Learning Performance on CartPole')
plt.show()

def test():
    state,_ = env.reset()
    if isinstance(state, dict):
        state = state['state']
    state = discretize_state(state)
    done = False
    total_reward = 0

    while not done:
        action = np.argmax(Q[state])  
        force = 10 if action == 1 else -10
        next_state, reward, done, _, _ = env.step(force)
        next_state, reward, done, _, _ = env.step(action)
        if isinstance(next_state, dict):
            next_state = next_state['state']
        next_state = discretize_state(next_state)
        state = next_state
        total_reward += reward

    print(f"Test Total Reward: {total_reward}")

test()

AssertionError: -10 (<class 'int'>) invalid