In [73]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
import pickle

In [74]:
pos_space = np.linspace(-2.4, 2.4, 10)
vel_space = np.linspace(-3.4028235e+38, 3.4028235e+38, 11)
ang_space = np.linspace(-.2095, .2095, 10)
ang_vel_space = np.linspace(-3.4028235e+38, 3.4028235e+38, 11)
#pos_space = np.linspace(-2.4, 2.4, 12)
#vel_space = np.linspace(-4, 4, 12)
#ang_space = np.linspace(-.2095, .2095, 12)
#ang_vel_space = np.linspace(-4, 4, 12)

In [75]:
class MonteCarloAgent:
    def __init__(self, num_actions, gamma=0.9, epsilon=0.1):
        self.num_actions = num_actions
        self.gamma = gamma
        self.epsilon = epsilon
        self.Q = {}
        self.N = {}
        self.returns = {} 

    def select_action(self, state):
        if np.random.rand() < self.epsilon:
            
            return np.random.choice(self.num_actions)
        else:
            if state in self.Q:
                return np.argmax(self.Q[state])
            else:
                return np.random.choice(self.num_actions)

    def update(self, episode):
        G = 0  
        for t in reversed(range(len(episode))):
            state, action, reward = episode[t]
            G = self.gamma * G + reward 
            if (state, action) not in [(pair[0], pair[1]) for pair in episode[:t]]:
                
                if (state, action) in self.returns:
                    self.returns[(state, action)].append(G)
                else:
                    self.returns[(state, action)] = [G]
                if state not in self.Q:
                    self.Q.update({state: [0,0]})
                    self.N.update({state: [0,0]})
                self.Q[state][action] = np.mean(self.returns[(state, action)])  
                self.N[state][action] += 1  


    def train(self, env, num_episodes):
        for eps in range(num_episodes):
            episode = []
            state, _ = env.reset()
            done = False
            episode_reward = 0
            while not done:
                state = (np.digitize(state[0], pos_space), np.digitize(state[1], vel_space), np.digitize(state[2], ang_space), np.digitize(state[3], ang_vel_space))
                action = self.select_action(state)
                next_state, reward, done, _, _ = env.step(action)
                episode_reward += reward
                episode.append((state, action, reward))
                state = next_state
            print("Reward of episode #{} : {:.2f}".format(eps, episode_reward))
            self.update(episode)

env = gym.make('CartPole-v1')
num_actions = env.action_space.n

state, _ = env.reset()

agent = MonteCarloAgent(num_actions)
agent.train(env, num_episodes=10000)
env.close()


Reward of episode #0 : 17.00
Reward of episode #1 : 56.00
Reward of episode #2 : 12.00
Reward of episode #3 : 72.00
Reward of episode #4 : 17.00
Reward of episode #5 : 55.00
Reward of episode #6 : 41.00
Reward of episode #7 : 17.00
Reward of episode #8 : 125.00
Reward of episode #9 : 10.00
Reward of episode #10 : 85.00
Reward of episode #11 : 89.00
Reward of episode #12 : 62.00
Reward of episode #13 : 119.00
Reward of episode #14 : 82.00
Reward of episode #15 : 123.00
Reward of episode #16 : 174.00
Reward of episode #17 : 82.00
Reward of episode #18 : 63.00
Reward of episode #19 : 59.00
Reward of episode #20 : 132.00
Reward of episode #21 : 197.00
Reward of episode #22 : 383.00
Reward of episode #23 : 103.00
Reward of episode #24 : 68.00
Reward of episode #25 : 129.00
Reward of episode #26 : 142.00
Reward of episode #27 : 143.00
Reward of episode #28 : 90.00
Reward of episode #29 : 196.00
Reward of episode #30 : 73.00
Reward of episode #31 : 150.00
Reward of episode #32 : 170.00
Reward

In [76]:
def test_agent(agent, env, num_episodes=100):
    total_rewards = 0
    for _ in range(num_episodes):
        state, _ = env.reset()
        done = False
        episode_reward = 0
        while not done:
            state = (np.digitize(state[0], pos_space), np.digitize(state[1], vel_space), np.digitize(state[2], ang_space), np.digitize(state[3], ang_vel_space))
            action = agent.select_action(state)
            next_state, reward, done, _, _ = env.step(action)
            episode_reward += reward
            state = next_state
        total_rewards += episode_reward
    average_reward = total_rewards / num_episodes
    print("Average reward over {} episodes: {:.2f}".format(num_episodes, average_reward))


env = gym.make('CartPole-v1',render_mode='human')
test_agent(agent, env)
env.close()


Average reward over 100 episodes: 203.95


# q-value algorithm


In [77]:
def Initialize_q_matrix(pos_space,vel_space,ang_space,ang_vel_space,env):
    q_matrix = np.zeros((len(pos_space)+1, len(vel_space)+1, len(ang_space)+1, len(ang_vel_space)+1, env.action_space.n))
    return q_matrix

In [81]:
def train_q_learning():
    env = gym.make('CartPole-v1')
    q_matrix=Initialize_q_matrix(pos_space,vel_space,ang_space,ang_vel_space,env)
    
    learning_rate = 0.1 
    discount_factor = 0.99 

    epsilon = 1         
    epsilon_decay_rate = 0.00001 
    rng = np.random.default_rng()   

    rewards_per_episode = []

    i = 0

    # for i in range(episodes):
    while(True):

        state = env.reset()[0]      
        state_p = np.digitize(state[0], pos_space)
        state_v = np.digitize(state[1], vel_space)
        state_a = np.digitize(state[2], ang_space)
        state_av = np.digitize(state[3], ang_vel_space)

        terminated = False          # True when reached goal

        rewards=0

        while(not terminated):

            if rng.random() < epsilon:
                # Choose random action  (0=go left, 1=go right)
                action = env.action_space.sample()
            else:
                action = np.argmax(q_matrix[state_p, state_v, state_a, state_av, :])

            new_state,reward,terminated,_,_ = env.step(action)
            new_state_p = np.digitize(new_state[0], pos_space)
            new_state_v = np.digitize(new_state[1], vel_space)
            new_state_a = np.digitize(new_state[2], ang_space)
            new_state_av= np.digitize(new_state[3], ang_vel_space)

            q_matrix[state_p, state_v, state_a, state_av, action] = q_matrix[state_p, state_v, state_a, state_av, action] + learning_rate * (
                    reward + discount_factor*np.max(q_matrix[new_state_p, new_state_v, new_state_a, new_state_av,:]) - q_matrix[state_p, state_v, state_a, state_av, action])
            state = new_state
            state_p = new_state_p
            state_v = new_state_v
            state_a = new_state_a
            state_av= new_state_av

            rewards+=reward


        rewards_per_episode.append(rewards)
        mean_rewards = np.mean(rewards_per_episode[len(rewards_per_episode)-100:])

        if  i%100==0:
            print(f'episode:{i} Epsilon: {epsilon:0.2f}  mean rewards {mean_rewards:0.1f}')

        if mean_rewards>=250:
            print("Mission done")
            break

        epsilon = max(epsilon - epsilon_decay_rate, 0)

        i+=1


    env.close()

    
    f = open('cartpole7.pkl','wb')
    pickle.dump(q_matrix, f)
    f.close()



    # run(is_training=True, render=False)

    #run(is_training=False, render=True)

In [79]:
def test(file_name):
    env = gym.make('CartPole-v1', render_mode='human')
    f = open(file_name, 'rb')
    q = pickle.load(f)
    f.close()
    terminated = False  


    state = env.reset()[0]      
    state_p = np.digitize(state[0], pos_space)
    state_v = np.digitize(state[1], vel_space)
    state_a = np.digitize(state[2], ang_space)
    state_av = np.digitize(state[3], ang_vel_space)
    rewards=0

    while(not terminated):


        action = np.argmax(q[state_p, state_v, state_a, state_av, :])

        new_state,reward,terminated,_,_ = env.step(action)
        new_state_p = np.digitize(new_state[0], pos_space)
        new_state_v = np.digitize(new_state[1], vel_space)
        new_state_a = np.digitize(new_state[2], ang_space)
        new_state_av= np.digitize(new_state[3], ang_vel_space)

        state = new_state
        state_p = new_state_p
        state_v = new_state_v
        state_a = new_state_a
        state_av= new_state_av

        rewards+=reward

            #print(reward)
        print(f'Rewards: {rewards}')




    

    env.close()

In [82]:
train_q_learning()

episode:0 Epsilon: 1.00  mean rewards 15.0
episode:100 Epsilon: 1.00  mean rewards 22.1
episode:200 Epsilon: 1.00  mean rewards 24.4
episode:300 Epsilon: 1.00  mean rewards 22.1
episode:400 Epsilon: 1.00  mean rewards 22.8
episode:500 Epsilon: 1.00  mean rewards 22.6
episode:600 Epsilon: 0.99  mean rewards 22.5
episode:700 Epsilon: 0.99  mean rewards 24.2
episode:800 Epsilon: 0.99  mean rewards 21.2
episode:900 Epsilon: 0.99  mean rewards 21.3
episode:1000 Epsilon: 0.99  mean rewards 22.1
episode:1100 Epsilon: 0.99  mean rewards 22.2
episode:1200 Epsilon: 0.99  mean rewards 23.3
episode:1300 Epsilon: 0.99  mean rewards 21.5
episode:1400 Epsilon: 0.99  mean rewards 22.9
episode:1500 Epsilon: 0.99  mean rewards 25.4
episode:1600 Epsilon: 0.98  mean rewards 20.9
episode:1700 Epsilon: 0.98  mean rewards 22.9
episode:1800 Epsilon: 0.98  mean rewards 21.8
episode:1900 Epsilon: 0.98  mean rewards 22.9
episode:2000 Epsilon: 0.98  mean rewards 23.8
episode:2100 Epsilon: 0.98  mean rewards 22.7


In [84]:
test('cartpole6.pkl')

Rewards: 1.0
Rewards: 2.0
Rewards: 3.0
Rewards: 4.0
Rewards: 5.0
Rewards: 6.0
Rewards: 7.0
Rewards: 8.0
Rewards: 9.0
Rewards: 10.0
Rewards: 11.0
Rewards: 12.0
Rewards: 13.0
Rewards: 14.0
Rewards: 15.0
Rewards: 16.0
Rewards: 17.0
Rewards: 18.0
Rewards: 19.0
Rewards: 20.0
Rewards: 21.0
Rewards: 22.0
Rewards: 23.0
Rewards: 24.0
Rewards: 25.0
Rewards: 26.0
Rewards: 27.0
Rewards: 28.0
Rewards: 29.0
Rewards: 30.0
Rewards: 31.0
Rewards: 32.0
Rewards: 33.0
Rewards: 34.0
Rewards: 35.0
Rewards: 36.0
Rewards: 37.0
Rewards: 38.0
Rewards: 39.0
Rewards: 40.0
Rewards: 41.0
Rewards: 42.0
Rewards: 43.0
Rewards: 44.0
Rewards: 45.0
Rewards: 46.0
Rewards: 47.0
Rewards: 48.0
Rewards: 49.0
Rewards: 50.0
Rewards: 51.0
Rewards: 52.0
Rewards: 53.0
Rewards: 54.0
Rewards: 55.0
Rewards: 56.0
Rewards: 57.0
Rewards: 58.0
Rewards: 59.0
Rewards: 60.0
Rewards: 61.0
Rewards: 62.0
Rewards: 63.0
Rewards: 64.0
Rewards: 65.0
Rewards: 66.0
Rewards: 67.0
Rewards: 68.0
Rewards: 69.0
Rewards: 70.0
Rewards: 71.0
Rewards: 72.0
R

## SARSA algo


In [85]:
def sarsa_train():

    env = gym.make('CartPole-v1')


   
    sarsa_table = np.zeros((len(pos_space)+1, len(vel_space)+1, len(ang_space)+1, len(ang_vel_space)+1, env.action_space.n)) 


    learning_rate_a = 0.1 
    discount_factor_g = 0.99 

    epsilon = 1         
    epsilon_decay_rate = 0.00001 
    rng = np.random.default_rng()   

    rewards_per_episode = []

    i = 0

    # for i in range(episodes):
    while(True):

        state = env.reset()[0]      
        state_p = np.digitize(state[0], pos_space)
        state_v = np.digitize(state[1], vel_space)
        state_a = np.digitize(state[2], ang_space)
        state_av = np.digitize(state[3], ang_vel_space)

        terminated = False          # True when reached goal

        rewards=0
        if  rng.random() < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(sarsa_table[state_p, state_v, state_a, state_av, :])
        while(not terminated):



            new_state,reward,terminated,_,_ = env.step(action)
            new_state_p = np.digitize(new_state[0], pos_space)
            new_state_v = np.digitize(new_state[1], vel_space)
            new_state_a = np.digitize(new_state[2], ang_space)
            new_state_av= np.digitize(new_state[3], ang_vel_space)
            if rng.random() < epsilon:
                next_action = env.action_space.sample()
            else:
                next_action = np.argmax(sarsa_table[new_state_p, new_state_v, new_state_a, new_state_av, :])

            sarsa_table[state_p, state_v, state_a, state_av, action] = sarsa_table[state_p, state_v, state_a, state_av, action] + learning_rate_a * (
                    reward + discount_factor_g*(sarsa_table[new_state_p, new_state_v, new_state_a, new_state_av,next_action]) - sarsa_table[state_p, state_v, state_a, state_av, action])

            state = new_state
            state_p = new_state_p
            state_v = new_state_v
            state_a = new_state_a
            state_av= new_state_av
            action = next_action
            rewards+=reward


        rewards_per_episode.append(rewards)
        mean_rewards = np.mean(rewards_per_episode[len(rewards_per_episode)-100:])

        if i%100==0:
            print(f'Episode: {i}  Epsilon: {epsilon:0.2f}  Mean Rewards {mean_rewards:0.1f}')

        if mean_rewards>250:
            print("Mission completed")
            break

        epsilon = max(epsilon - epsilon_decay_rate, 0)

        i+=1

    env.close()



    f = open('cartpole_sarsa3.pkl','wb')
    pickle.dump(sarsa_table, f)
    f.close()



    # run(is_training=True, render=False)

    #run(is_training=False, render=True)

In [86]:
sarsa_train()

Episode: 0  Epsilon: 1.00  Mean Rewards 10.0
Episode: 100  Epsilon: 1.00  Mean Rewards 22.2
Episode: 200  Epsilon: 1.00  Mean Rewards 21.6
Episode: 300  Epsilon: 1.00  Mean Rewards 22.9
Episode: 400  Epsilon: 1.00  Mean Rewards 21.5
Episode: 500  Epsilon: 1.00  Mean Rewards 25.3
Episode: 600  Epsilon: 0.99  Mean Rewards 20.9
Episode: 700  Epsilon: 0.99  Mean Rewards 24.1
Episode: 800  Epsilon: 0.99  Mean Rewards 25.1
Episode: 900  Epsilon: 0.99  Mean Rewards 24.6
Episode: 1000  Epsilon: 0.99  Mean Rewards 23.3
Episode: 1100  Epsilon: 0.99  Mean Rewards 21.3
Episode: 1200  Epsilon: 0.99  Mean Rewards 24.0
Episode: 1300  Epsilon: 0.99  Mean Rewards 22.5
Episode: 1400  Epsilon: 0.99  Mean Rewards 22.2
Episode: 1500  Epsilon: 0.99  Mean Rewards 21.0
Episode: 1600  Epsilon: 0.98  Mean Rewards 23.2
Episode: 1700  Epsilon: 0.98  Mean Rewards 22.1
Episode: 1800  Epsilon: 0.98  Mean Rewards 22.5
Episode: 1900  Epsilon: 0.98  Mean Rewards 21.3
Episode: 2000  Epsilon: 0.98  Mean Rewards 23.3
Epis

In [87]:
test('cartpole_sarsa3.pkl')

Rewards: 1.0
Rewards: 2.0
Rewards: 3.0
Rewards: 4.0
Rewards: 5.0
Rewards: 6.0
Rewards: 7.0
Rewards: 8.0
Rewards: 9.0
Rewards: 10.0
Rewards: 11.0
Rewards: 12.0
Rewards: 13.0
Rewards: 14.0
Rewards: 15.0
Rewards: 16.0
Rewards: 17.0
Rewards: 18.0
Rewards: 19.0
Rewards: 20.0
Rewards: 21.0
Rewards: 22.0
Rewards: 23.0
Rewards: 24.0
Rewards: 25.0
Rewards: 26.0
Rewards: 27.0
Rewards: 28.0
Rewards: 29.0
Rewards: 30.0
Rewards: 31.0
Rewards: 32.0
Rewards: 33.0
Rewards: 34.0
Rewards: 35.0
Rewards: 36.0
Rewards: 37.0
Rewards: 38.0
Rewards: 39.0
Rewards: 40.0
Rewards: 41.0
Rewards: 42.0
Rewards: 43.0
Rewards: 44.0
Rewards: 45.0
Rewards: 46.0
Rewards: 47.0
Rewards: 48.0
Rewards: 49.0
Rewards: 50.0
Rewards: 51.0
Rewards: 52.0
Rewards: 53.0
Rewards: 54.0
Rewards: 55.0
Rewards: 56.0
Rewards: 57.0
Rewards: 58.0
Rewards: 59.0
Rewards: 60.0
Rewards: 61.0
Rewards: 62.0
Rewards: 63.0
Rewards: 64.0
Rewards: 65.0
Rewards: 66.0
Rewards: 67.0
Rewards: 68.0
Rewards: 69.0
Rewards: 70.0
Rewards: 71.0
Rewards: 72.0
R