In [5]:
import numpy as np 
import gym   
import random
from collections import defaultdict
import matplotlib.pyplot as plt

env = gym.make("Taxi-v3")
env.render()

+---------+
|[34;1mR[0m: | :[43m [0m:[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+



In [6]:
action_size = env.action_space.n
print("Action Size: ", action_size)

state_size = env.observation_space.n
print("State Size: ", state_size)

Action Size:  6
State Size:  500


In [15]:
class Sarsa_Agent():
    
    def __init__(self, gamma = 0.95, learning_rate = 10, epsilon = 0.002, nepisodes = 10000):
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.nepisodes = nepisodes
        self.Q = defaultdict(lambda: np.zeros(env.action_space.n)) 

    def greedy_policy(self, state):
        return np.argmax(self.Q[state])

    def epsilon_greedy_policy(self, state):
        action = 0
        if np.random.uniform() < self.epsilon:
            action = np.random.choice(env.action_space.n)
        else:
            action = self.greedy_policy(state)
        return action

    def onpolicy_control(self):
        reslist = []
        for episode in range(self.nepisodes):
            state = env.reset()
            done = False
            action = self.epsilon_greedy_policy(state)
            while not done:
                next_state, reward, done, info = env.step(action)
                next_action = self.epsilon_greedy_policy(next_state)
                self.Q[state][action] = self.Q[state][action] + self.learning_rate*(reward + self.gamma*self.Q[next_state][next_action] - self.Q[state][action])
                state = next_state
                action = next_action
            res = self.test_policy(200)
            reslist.append(res)
            if episode % 100 == 0:
                print(f'Episode: {episode} Success%: {res}')
            if res>70:
                print(f'Solved! Episode: {episode} Success%: {res}')
                return self.Q
            
            
        fig = plt.figure()
        ax = fig.add_subplot(111)
        plt.plot(np.arange(len(reslist)), reslist)
        plt.ylabel('Success')
        plt.xlabel('Episode #')
        plt.show()
        
        score = sum(reslist)/test_episodes
        print("Score: " + str(score))
        
        return self.Q
    
    def test_policy(self, n):
        success = 0
        for episode in range(n):
            state = env.reset()
            done = False
            while not done:
                action = self.greedy_policy(state)
                state, reward, done, info = env.step(action)
            if reward == 1:
                success += 1
        return success/n*100

In [None]:
a = Sarsa_Agent()
Q = a.onpolicy_control()
a.test_policy(100)

Episode: 0 Success%: 0.0


  self.Q[state][action] = self.Q[state][action] + self.learning_rate*(reward + self.gamma*self.Q[next_state][next_action] - self.Q[state][action])
  self.Q[state][action] = self.Q[state][action] + self.learning_rate*(reward + self.gamma*self.Q[next_state][next_action] - self.Q[state][action])


Episode: 100 Success%: 0.0
Episode: 200 Success%: 0.0
Episode: 300 Success%: 0.0
Episode: 400 Success%: 0.0
Episode: 500 Success%: 0.0
Episode: 600 Success%: 0.0
Episode: 700 Success%: 0.0
Episode: 800 Success%: 0.0
Episode: 900 Success%: 0.0
Episode: 1000 Success%: 0.0
Episode: 1100 Success%: 0.0
Episode: 1200 Success%: 0.0
Episode: 1300 Success%: 0.0
Episode: 1400 Success%: 0.0
Episode: 1500 Success%: 0.0
Episode: 1600 Success%: 0.0
