In [1]:
import numpy as np 
import gym   
import random
from collections import defaultdict

env = gym.make("Taxi-v3")
env.render()

+---------+
|R: | : :G|
| : | : : |
| :[43m [0m: : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+



In [2]:
action_size = env.action_space.n
print("Action Size: ", action_size)

state_size = env.observation_space.n
print("State Size: ", state_size)

Action Size:  6
State Size:  500


In [9]:
class Q_Learning():
    
    def __init__(self, gamma = 0.95, learning_rate = 1, epsilon = 0.0001, nepisodes = 10000):
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.nepisodes = nepisodes
        self.Q = np.zeros((state_size, action_size))

    def greedy_policy(self, state):
        return np.argmax(self.Q[state])

    def epsilon_greedy_policy(self, state):
        action = 0
        if np.random.uniform() < self.epsilon:
            action = np.random.choice(env.action_space.n)
        else:
            action = self.greedy_policy(state)
        return action

    def offpolicy_control(self, test_episodes):
        
        for episode in range(self.nepisodes):
            state = env.reset()
            done = False
            total_rewards = 0
            
            while not done:
                action = self.epsilon_greedy_policy(state)
                next_state, reward, done, info = env.step(action)
                self.Q[state][action] = self.Q[state][action] + self.learning_rate*(reward + self.gamma*np.max(self.Q[next_state]) - self.Q[state][action])
                state = next_state
            res = self.test_policy(test_episodes)
            if episode % 100 == 0:
                print(f'Episode: {episode} Success%: {res}')
            if res>70:
                print(f'Solved! Episode: {episode} Success%: {res}')
                return self.Q
        
        score = res/test_episodes
        print("Score: " + str(score))
        
        return self.Q
    
    def test_policy(self, n):
        
        success = 0
        for episode in range(n):
            state = env.reset()
            done = False
            while not done:
                action = self.greedy_policy(state)
                state, reward, done, info = env.step(action)
            if reward == 1:
                success += 1
        
        return success/n*100

In [None]:
a = Q_Learning()
Q = a.offpolicy_control(200)

Episode: 0 Success%: 0.0
Episode: 100 Success%: 0.0
Episode: 200 Success%: 0.0
Episode: 300 Success%: 0.0
Episode: 400 Success%: 0.0
