Documentation\
[Basic Usage - Gymnasium Documentation](https://gymnasium.farama.org/content/basic_usage/)\
[Cart Pole - Gymnasium Documentation](https://gymnasium.farama.org/environments/classic_control/cart_pole/#cart-pole)\
[Cart Pole Control Environment in OpenAI Gym (Gymnasium)](https://aleksandarhaber.com/cart-pole-control-environment-in-openai-gym-gymnasium-introduction-to-openai-gym/)\
[Cartpole with Q-Learning Algorithm](https://aleksandarhaber.com/q-learning-in-python-with-tests-in-cart-pole-openai-gym-environment-reinforcement-learning-tutorial/)


In [1]:
# dependencies
!pip install gymnasium 
!pip install --upgrade setuptools
!pip install ez_setup
!pip install tensorflow
!pip install matplotlib
!pip install pygame
!pip install tqdm



In [47]:
# imports
from tqdm import tqdm
import pygame
import gymnasium as gym
import tensorflow as tf
import numpy as np
import random
import matplotlib
import math
import glob
import io
import base64
from IPython import display
import time

# display plots below code
%matplotlib inline

# Set error logging level to 'ERROR'
# logger.set_level(40)

In [61]:
# generate our environment
env = gym.make("CartPole-v1", render_mode="human")
# env = gym.make("CartPole-v1")

# basic info
print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)
print(env.metadata)
print(env.render_mode)
print(env.reward_range)
print(env.spec)

Discrete(2)
Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)
[4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]
[-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]
{'render_modes': ['human', 'rgb_array'], 'render_fps': 50}
human
(-inf, inf)
EnvSpec(id='CartPole-v1', entry_point='gymnasium.envs.classic_control.cartpole:CartPoleEnv', reward_threshold=475.0, nondeterministic=False, max_episode_steps=500, order_enforce=True, autoreset=False, disable_env_checker=False, apply_api_compatibility=False, kwargs={'render_mode': 'human'}, namespace=None, name='CartPole', version=1, additional_wrappers=(), vector_entry_point='gymnasium.envs.classic_control.cartpole:CartPoleVectorEnv')


running one step

In [71]:
# running one step
env = gym.make("CartPole-v1", render_mode="human")
observation, info = env.reset() # initial state
# observation: cart position, cart velocity, pole angle, pole angular velocity

STEPS = 1

for _ in (range(STEPS)):
    action = env.action_space.sample()  # agent policy that uses the observation and info
    print(f"action: {action}") # 0: push cart left, 1: push cart right
    observation, reward, terminated, truncated, info = env.step(action)
    print(f"cart position: {observation[0]}")
    print(f"cart velocity: {observation[1]}")
    print(f"pole angle: {observation[2]}")
    print(f"pole angular velocity: {observation[3]}")
    print(f"reward: {reward}")
    print(f"terminated: {terminated}")
    print(f"truncated: {truncated}")
env.close()

action: 1
cart position: -0.01957808807492256
cart velocity: 0.2389117181301117
pole angle: -0.022795235738158226
pole angular velocity: -0.30435463786125183
reward: 1.0
terminated: False
truncated: False


running one episode

In [72]:
EPISODE_COUNT = 1
STEP_COUNT = 500

for episode in range(EPISODE_COUNT):
    observation, info = env.reset() # initial state
    total_reward = []
    total_step = 0
    for step in range(STEP_COUNT):
        random_action = env.action_space.sample()
        observation, reward, terminated, truncated, info = env.step(action)
        total_step += 1
        total_reward.append(reward)
        print(f"episode {episode}, step {step}, action: {action}, observation: {observation}")
        if terminated or truncated:
            print(f"Summary: total step: {total_step}, total reward: {np.sum(total_reward)}")
            observation, info = env.reset()
            break
env.close()

error: display Surface quit

running 10 episodes

In [57]:
EPISODE_COUNT = 10
STEP_COUNT = 500
env = gym.make("CartPole-v1", render_mode="human")

for episode in range(EPISODE_COUNT):
    observation, info = env.reset() # initial state
    total_reward = []
    total_step = 0
    env.render()
    for step in range(STEP_COUNT):
        random_action = env.action_space.sample()
        observation, reward, terminated, truncated, info = env.step(action)
        total_step += 1
        total_reward.append(reward)
        print(f"episode {episode}, step {step}, action: {action}, observation: {observation}")
        time.sleep(0.05)
        if terminated or truncated:
            print(f"Summary: total step: {total_step}, total reward: {np.sum(total_reward)}")
            observation, info = env.reset()
            time.sleep(1)
            break
env.close()

episode 0, step 0, action: 0, observation: [-0.04537159 -0.23059562 -0.04624301  0.24467452]
episode 0, step 1, action: 0, observation: [-0.0499835  -0.4250276  -0.04134952  0.52242017]
episode 0, step 2, action: 0, observation: [-0.05848405 -0.6195439  -0.03090111  0.8017919 ]
episode 0, step 3, action: 0, observation: [-0.07087493 -0.8142288  -0.01486528  1.0845963 ]
episode 0, step 4, action: 0, observation: [-0.08715951 -1.0091515   0.00682665  1.3725778 ]
episode 0, step 5, action: 0, observation: [-0.10734253 -1.2043581   0.03427821  1.667388  ]
episode 0, step 6, action: 0, observation: [-0.1314297  -1.3998615   0.06762596  1.9705466 ]
episode 0, step 7, action: 0, observation: [-0.15942693 -1.595629    0.1070369   2.2833934 ]
episode 0, step 8, action: 0, observation: [-0.19133951 -1.7915672   0.15270476  2.6070275 ]
episode 0, step 9, action: 0, observation: [-0.22717085 -1.987505    0.20484532  2.942235  ]
episode 0, step 10, action: 0, observation: [-0.26692095 -2.1831727   