In [1]:
#!pip install gym
#!pip install pygame
#!pip install gymnasium

In [2]:
import numpy as np

def generate_possible_states(bins):
    """
    Generate all possible states in the environment by discretizing the state space.
    """
    num_bins = [len(b) + 1 for b in bins]  # Number of bins for each dimension
    possible_states = {}

    for i in bins[0]:
        for j in bins[1]:
            for k in bins[2]:
                for l in bins[3]:
                    state = (i, j, k, l)
                    possible_states[state] = -1  # Initialize with None action
    return possible_states

def find_nearest_state(observation):
    """
    Find the nearest state in the possible states map based on the given observation.
    """
    min_distance = float('inf')
    nearest_state = None

    for state in possible_states.keys():
        distance = sum((observation[i] - state[i]) ** 2 for i in range(len(observation)))
        if distance < min_distance:
            min_distance = distance
            nearest_state = state

    return nearest_state

def store_best_action(state, action):
    """
    Store best action for a given state.
    """
    possible_states[find_nearest_state(state)] = action

# Define number of bins for each dimension of the observation space
# Adjust these values based on the range of observations you observe during training/testing
bins = [
    np.linspace(-4.8000002e+00, 4.8000002e+00, 50),  # Cart position
    np.linspace(-3.4028235e+38, 3.4028235e+38, 50),  # Cart velocity
    np.linspace(-4.1887903e-01, 4.1887903e-01, 50),  # Pole angle
    np.linspace(-3.4028235e+38, 3.4028235e+38, 50)  # Pole velocity at tip
]

# Generate all possible states
possible_states = generate_possible_states(bins)

# Example of storing best action for a state
# Suppose 'state' is the current state tuple and 'best_action' is the best action for that state
# store_best_action(state, best_action)

# Accessing the best action for a given state
# Suppose 'state' is the state tuple for which you want to access the best action
# best_action = possible_states[state]


In [3]:
import gymnasium as gym
import numpy as np

env = gym.make('CartPole-v1', render_mode="human")
obs, inf = env.reset()

num_episodes = 25

def run_episode(env, possible_states, Q):
    episode = []
    state, _ = env.reset()
    done = False

    while not done:
        discretized_state = find_nearest_state(state)#discretize(state, bins)
        action = possible_states[discretized_state]
        if action == -1:
            action = env.action_space.sample()  # Choose a random action if the state has not been visited yet
        next_state, reward, done, _, _ = env.step(action)
        episode.append((discretized_state, action, reward))
        state = next_state

    return episode

def update_action_values(episode, Q, returns_sum, returns_count):
    G = 0
    for t in reversed(range(len(episode))):
        state, action, reward = episode[t]
        G += reward
        if (state, action) not in [(x[0], x[1]) for x in episode[:t]]:
            returns_sum[state][action] += G
            returns_count[state][action] += 1
            Q[state][action] = returns_sum[state][action] / returns_count[state][action]

def improve_policy(Q, possible_states):
    for state, actions in Q.items():
        if state in possible_states:
            possible_states[state] = np.argmax(actions)

# Initialize action-value function Q(s, a) and returns counters
Q = {state: [0] * env.action_space.n for state in possible_states.keys()}
returns_sum = {state: [0] * env.action_space.n for state in possible_states.keys()}
returns_count = {state: [0] * env.action_space.n for state in possible_states.keys()}

# Run episodes and update Q-values
for ep in range(num_episodes):
    print(f"Episode number is {ep+1}")

    episode = run_episode(env, possible_states, Q)
    update_action_values(episode, Q, returns_sum, returns_count)

# Improve policy based on Q-values
improve_policy(Q, possible_states)

env.close()

Episode number is 1
Episode number is 2
Episode number is 3
Episode number is 4
Episode number is 5
Episode number is 6
Episode number is 7
Episode number is 8
Episode number is 9
Episode number is 10
Episode number is 11
Episode number is 12
Episode number is 13
Episode number is 14
Episode number is 15
Episode number is 16
Episode number is 17
Episode number is 18
Episode number is 19
Episode number is 20
Episode number is 21
Episode number is 22
Episode number is 23
Episode number is 24
Episode number is 25


In [4]:
import gymnasium as gym
import numpy as np

env = gym.make('CartPole-v1', render_mode="human")
obs, inf = env.reset()
env.render()

total_reward = 0

while True:
    discretized_state = find_nearest_state(obs)#discretize(state, bins)
    action = possible_states[discretized_state]
    if action == -1:
        action = env.action_space.sample()  # Choose a random action if the state has not been visited yet
    obs, reward, terminated, truncated, info = env.step(action)
    total_reward += reward
    env.render()
    if terminated or truncated:
        print(f"Pole angle at episode ends: {np.degrees(obs[2])}", end="\n")
        break
print(f"reward in this episode: {total_reward}", end="\n")

env.close()

Pole angle at episode ends: -12.503689765930176
reward in this episode: 10.0


In [5]:
discretized_state = find_nearest_state(obs)
discretized_state

(-4.8000002, -6.944537755102045e+36, -0.41887903, -6.944537755102045e+36)

In [6]:
Q[(0.0, 0.0, -0.209439515, 0.0)]

KeyError: (0.0, 0.0, -0.209439515, 0.0)

In [None]:
returns_sum[(0.0, 0.0, -0.209439515, 0.0)]

: 

In [None]:
possible_states[(0.0, 0.0, -0.209439515, 0.0)]

: 