In [None]:
import numpy as np

In [6]:
class Environment:
    """Tabular environment described by a transition tensor and a reward matrix.

    ``transitions[s, a, s']`` is the weight of landing in ``s'`` after taking
    action ``a`` in state ``s``; ``rewards[s, a]`` is the reward returned for
    that (state, action) pair.  Both start out as all-zero arrays and are
    meant to be filled in with ``set_transitions`` / ``set_rewards``.
    """

    def __init__(self, num_states, num_actions, terminal_states, stochastic=False):
        """Create an environment with empty (all-zero) dynamics.

        Args:
            num_states: Number of discrete states.
            num_actions: Number of discrete actions.
            terminal_states: Collection of state indices that end an episode.
            stochastic: If True, ``transition`` samples the next state from
                ``transitions[state, action, :]``; otherwise it takes the
                entry with the largest weight.
        """
        self.num_states = num_states
        self.num_actions = num_actions
        self.terminal_states = terminal_states
        self.stochastic = stochastic
        self.transitions = np.zeros((num_states, num_actions, num_states), dtype=float)
        self.rewards = np.zeros((num_states, num_actions), dtype=float)

    def set_transitions(self, transition_matrix):
        """Install the transition tensor.

        Args:
            transition_matrix: Array-like of shape
                ``(num_states, num_actions, num_states)``.

        Raises:
            ValueError: If the shape does not match the environment.
        """
        # asarray does not copy an existing ndarray, so the caller's array is
        # still shared with the environment exactly as before.
        transition_matrix = np.asarray(transition_matrix)
        expected_shape = (self.num_states, self.num_actions, self.num_states)
        if transition_matrix.shape != expected_shape:
            raise ValueError(
                f"transition_matrix must have shape {expected_shape}, "
                f"got {transition_matrix.shape}"
            )
        self.transitions = transition_matrix

    def set_rewards(self, reward_matrix):
        """Install the reward matrix.

        Args:
            reward_matrix: Array-like of shape ``(num_states, num_actions)``.

        Raises:
            ValueError: If the shape does not match the environment.
        """
        reward_matrix = np.asarray(reward_matrix)
        expected_shape = (self.num_states, self.num_actions)
        if reward_matrix.shape != expected_shape:
            raise ValueError(
                f"reward_matrix must have shape {expected_shape}, "
                f"got {reward_matrix.shape}"
            )
        self.rewards = reward_matrix

    def transition(self, state, action):
        """Take ``action`` in ``state`` and return ``(next_state, reward)``.

        In stochastic mode the next state is sampled with
        ``transitions[state, action, :]`` as the probability vector, so that
        row must be a valid distribution.  In deterministic mode the state
        with the largest weight is chosen.
        """
        if self.stochastic:
            probabilities = self.transitions[state, action, :]
            next_state = np.random.choice(self.num_states, p=probabilities)
        else:
            # NOTE: argmax of an all-zero row is 0, so a (state, action) pair
            # that was never filled in leads to state 0.
            next_state = self.transitions[state, action, :].argmax()

        return next_state, self.rewards[state, action]

class QLearning:
    """Tabular Q-learning agent using an epsilon-greedy behaviour policy.

    The agent only needs ``env`` to expose ``num_states``, ``num_actions``,
    ``terminal_states`` and a ``transition(state, action)`` method returning
    ``(next_state, reward)``.
    """

    def __init__(self, env, alpha=0.1, gamma=0.99, epsilon=0.1, num_episodes=1000):
        """Store the hyper-parameters and allocate an all-zero Q-table.

        Args:
            env: Environment to learn in.
            alpha: Step size applied to the TD error.
            gamma: Discount factor applied to the bootstrapped value.
            epsilon: Probability of picking a uniformly random action.
            num_episodes: Number of episodes run by ``train``.
        """
        self.env = env
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.num_episodes = num_episodes
        self.q_table = np.zeros((self.env.num_states, self.env.num_actions), dtype=float)

    def train(self):
        """Run ``num_episodes`` episodes of Q-learning, updating ``q_table`` in place.

        Each episode starts in a uniformly random non-terminal state and ends
        when a terminal state is reached.  After every episode every tenth
        row of the Q-table is printed.

        Raises:
            ValueError: If the environment has no non-terminal state to start from.
        """
        # Both are loop-invariant, so build them once instead of every episode.
        terminal_states = set(self.env.terminal_states)
        start_states = [s for s in range(self.env.num_states) if s not in terminal_states]
        if not start_states:
            raise ValueError(
                "Cannot train: every state is terminal, so there is no valid start state."
            )

        for episode in range(self.num_episodes):
            state = np.random.choice(start_states)
            done = False

            while not done:
                # Epsilon-greedy action selection.
                if np.random.rand() < self.epsilon:
                    action = np.random.choice(range(self.env.num_actions))
                else:
                    action = np.argmax(self.q_table[state, :])

                next_state, reward = self.env.transition(state, action)
                done = next_state in terminal_states

                # A terminal state has no future return, so do not bootstrap
                # from whatever happens to be stored in its Q-table row.
                q_next = 0.0 if done else np.max(self.q_table[next_state, :])
                td_target = reward + self.gamma * q_next
                td_error = td_target - self.q_table[state, action]

                self.q_table[state, action] += self.alpha * td_error

                state = next_state

            print(f"Episode {episode + 1} - Q-table:")
            print(self.q_table[::10, :])

### Create the environment and Q-learning instances for a 10×10 gridworld

In [8]:
# Create the environment
num_states = 100
num_actions = 4
terminal_states = [0, 99]
stochastic = False   # set True for stochastic case/multi state transition
                        # False for deterministic case/single state transition

# States are the cells of a square grid numbered row by row, so the side
# length follows from num_states instead of being hard-coded below.
grid_side = int(round(np.sqrt(num_states)))
if grid_side * grid_side != num_states:
    raise ValueError(f"num_states must be a perfect square, got {num_states}")

env = Environment(num_states, num_actions, terminal_states, stochastic=stochastic)

# Define the transition matrix and reward matrix
transition_matrix = np.zeros((num_states, num_actions, num_states), dtype=float)
reward_matrix = np.zeros((num_states, num_actions), dtype=float)

for s in range(num_states):
    row, col = divmod(s, grid_side)
    for a in range(num_actions):
        if s in terminal_states:
            # Terminal states are absorbing and keep a reward of 0.
            transition_matrix[s, a, s] = 1.0
            continue

        # A move that would leave the grid keeps the agent where it is.
        if a == 0: # Move up
            next_state = s - grid_side if row > 0 else s
        elif a == 1: # Move right
            next_state = s + 1 if col != grid_side - 1 else s
        elif a == 2: # Move down
            next_state = s + grid_side if row < grid_side - 1 else s
        elif a == 3: # Move left
            next_state = s - 1 if col != 0 else s
        else:
            # Only the four compass moves are defined for this grid.
            raise ValueError(f"No move is defined for action {a}")

        transition_matrix[s, a, next_state] = 1.0
        reward_matrix[s, a] = -1.0  # every step from a non-terminal state costs 1

env.set_transitions(transition_matrix)
env.set_rewards(reward_matrix)

# Create the Q-learning agent and train it
alpha = 0.1
gamma = 0.99
epsilon = 0.1
num_episodes = 10

q_learning = QLearning(env, alpha=alpha, gamma=gamma, epsilon=epsilon, num_episodes=num_episodes)
q_learning.train()


Episode 1 - Q-table:
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
Episode 2 - Q-table:
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
Episode 3 - Q-table:
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
Episode 4 - Q-table:
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
Episode 5 - Q-table:
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
Episode 6 - Q-table:
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]