In [1]:
import gymnasium as gym
import sklearn
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Small hand-written MDP with three states and three actions.
#
# `transition_probabilities[state][action]` is the probability distribution
# over the next state (one entry per state), or None when that action is not
# listed for the state in `possible_actions`.
transition_probabilities = [
    [  # state 0
        [0.7, 0.3, 0.0],
        [1.0, 0.0, 0.0],
        [0.8, 0.2, 0.0],
    ],
    [  # state 1
        [0.0, 1.0, 0.0],
        None,
        [0.0, 0.0, 1.0],
    ],
    [  # state 2
        None,
        [0.8, 0.1, 0.1],
        None,
    ],
]

# `rewards[state][action][next_state]` is the reward paid for that transition.
rewards = [
    [  # state 0
        [10, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
    ],
    [  # state 1
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, -50],
    ],
    [  # state 2
        [0, 0, 0],
        [40, 0, 0],
        [0, 0, 0],
    ],
]

# `possible_actions[state]` lists the action indices available in that state.
possible_actions = [
    [0, 1, 2],
    [0, 2],
    [1],
]

In [3]:
# Environment the DQN is trained on.
env = gym.make("CartPole-v1")

# Network dimensions.
n_outputs = 2  # one output per action; also the action count used by epsilon_greedy_policy
input_shape = [4]  # presumably the size of a CartPole observation -- TODO confirm

# Two hidden ELU layers followed by a linear output layer (no activation),
# so the outputs can be read as unbounded Q-value estimates.
model = keras.Sequential()
model.add(keras.layers.Dense(32, activation="elu", input_shape=input_shape))
model.add(keras.layers.Dense(32, activation="elu"))
model.add(keras.layers.Dense(n_outputs))

In [4]:
# Seed NumPy's global RNG so the random exploration below is repeatable.
np.random.seed(42)

# Start every (state, action) entry at -inf so that a max over a row ignores
# actions that are not available, then reset the available ones to 0.
Q_values = np.full(shape=(3, 3), fill_value=-np.inf)
for state, allowed_actions in enumerate(possible_actions):
    # `allowed_actions` is a list, so this assigns several columns at once.
    Q_values[state, allowed_actions] = 0.0

In [5]:
def step(state, action):
    """Simulate one transition of the MDP.

    Looks up the next-state distribution for (`state`, `action`) in
    `transition_probabilities`, samples the next state from [0, 1, 2] with
    NumPy's global RNG, and reads the matching entry of `rewards`.

    Returns:
        A `(next_state, reward)` tuple.
    """
    outcome_probs = transition_probabilities[state][action]
    landed_in = np.random.choice([0, 1, 2], p=outcome_probs)
    return landed_in, rewards[state][action][landed_in]

def exploration_policy(state):
    """Return an action drawn uniformly at random from `possible_actions[state]`."""
    allowed = possible_actions[state]
    return np.random.choice(allowed)

In [6]:
# Q-learning hyperparameters and starting point.
alpha0 = 0.05  # initial learning rate
decay = 0.05  # learning rate decay
gamma = 0.9  # discount factor
state = 0  # initial state

# Tabular Q-learning: act with the random exploration policy, but update each
# entry toward the value of the best action in the next state.
for iteration in range(10000):
    action = exploration_policy(state)
    next_state, reward = step(state, action)
    # Greedy value of the next state; -inf entries (unavailable actions)
    # never win the max.
    next_value = np.max(Q_values[next_state])
    # Learning rate shrinks hyperbolically with the iteration count.
    alpha = alpha0 / (1 + iteration * decay)
    # Running average:
    #   Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * max_a' Q(s', a'))
    # The old estimate must be weighted by (1 - alpha); weighting it by alpha
    # would discard almost all of it on every step.
    Q_values[state, action] *= 1 - alpha
    Q_values[state, action] += alpha * (reward + gamma * next_value)
    state = next_state

In [7]:
def epsilon_greedy_policy(state, epsilon=0):
    """Choose an action index with an epsilon-greedy rule.

    With probability `epsilon`, return a random integer in [0, n_outputs).
    Otherwise feed `state` (with a leading batch axis added) to `model` and
    return the index of its largest prediction.

    `state` must support `state[np.newaxis]` (e.g. a NumPy array).
    """
    explore = np.random.rand() < epsilon
    if explore:
        return np.random.randint(n_outputs)

    # The model predicts on batches, so wrap the single state and take the
    # single row of predictions back out.
    batch_of_one = state[np.newaxis]
    q_estimates: np.ndarray = model.predict(batch_of_one, verbose=0)[0]
    return q_estimates.argmax()

In [None]:
from collections import deque

# Experience store shared by play_one_step (writer) and sample_experiences
# (reader). It is bounded, so once 2000 items are held each append drops the
# oldest one.
replay_buffer = deque([], maxlen=2000)

In [None]:
def sample_experiences(batch_size):
    """Draw a random mini-batch of experiences from `replay_buffer`.

    Indices are drawn independently with NumPy's global RNG, so the same
    experience may appear more than once in a batch.

    Args:
        batch_size: number of experiences to draw.

    Returns:
        A list of six NumPy arrays, one per experience field, each with
        `batch_size` entries along its first axis, in the order
        [states, actions, rewards, next_states, terminateds, truncateds].
    """
    indices = np.random.randint(len(replay_buffer), size=batch_size)
    batch = [replay_buffer[index] for index in indices]
    # Build one array per field: np.array(...) wraps only the inner list, and
    # the outer comprehension iterates over the six fields.
    return [
        np.array([experience[field_index] for experience in batch])
        for field_index in range(6)
    ]  # [states, actions, rewards, next_states, terminateds, truncateds]

In [None]:
def play_one_step(env: gym.Env, state, epsilon):
    """Act once in `env` and record the transition in `replay_buffer`.

    The action comes from `epsilon_greedy_policy(state, epsilon)`. The stored
    experience is the tuple
    (state, action, reward, next_state, terminated, truncated).

    Returns:
        The five values produced by `env.step`:
        (next_state, reward, terminated, truncated, info).
    """
    chosen_action = epsilon_greedy_policy(state, epsilon)
    outcome = env.step(chosen_action)
    next_state, reward, terminated, truncated, info = outcome
    experience = (state, chosen_action, reward, next_state, terminated, truncated)
    replay_buffer.append(experience)
    return next_state, reward, terminated, truncated, info

In [None]:
# Seed the environment, NumPy's global RNG and TensorFlow before training.
env.reset(seed=42)
np.random.seed(42)
tf.random.set_seed(42)
# NOTE(review): this rebinds `rewards`, the name that earlier held the MDP
# reward table indexed by `step`. Re-running the tabular Q-learning cell after
# this one would index into this list instead of the table.
# Presumably collects per-episode results for the training loop -- TODO confirm.
rewards = []
# Presumably the best episode score seen so far -- TODO confirm against the loop.
best_score = 0

In [None]:
# DQN training configuration.
discount_factor = 0.95
batch_size = 32

# Loss function and optimizer objects for the training step defined below.
loss_fn = tf.keras.losses.mean_squared_error
optimizer = tf.keras.optimizers.Nadam(learning_rate=0.01)

def training_step(batch_size):
    (states, actions, rewards, next_states, terminateds, 
     truncateds) = sample_experiences(batch_size)
    next_Q_values = model.predict(next_state, verbose=0)
    