In [1]:
import numpy as np 
import gym   
import random
from collections import defaultdict

env = gym.make("Taxi-v3")
env.render()

+---------+
|[43mR[0m: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+



In [2]:
action_size = env.action_space.n
print("Action Size: ", action_size)

state_size = env.observation_space.n
print("State Size: ", state_size)

Action Size:  6
State Size:  500


# Q - Learning

In [6]:
class Q_Learning():
    
    def __init__(self, gamma = 0.618, learning_rate = 0.9, epsilon = 1, nepisodes = 50000):
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.nepisodes = nepisodes
        self.Q = np.zeros((state_size, action_size))

    def greedy_policy(self, state):
        return np.argmax(self.Q[state])

    def epsilon_greedy_policy(self, state):
        action = 0
        if np.random.uniform() < self.epsilon:
            action = np.random.choice(env.action_space.n)
        else:
            action = self.greedy_policy(state)
        return action

    def offpolicy_control(self, test_episodes):
        for episode in range(self.nepisodes):
            state = env.reset()
            done = False
            total_rewards = 0
            
            while not done:
                action = self.epsilon_greedy_policy(state)
                next_state, reward, done, info = env.step(action)
                self.Q[state][action] = self.Q[state][action] + self.learning_rate*(reward + self.gamma*np.max(self.Q[next_state]) - self.Q[state][action])
                state = next_state
            res = self.test_policy(test_episodes)
            if episode % 100 == 0:
                print(f'Episode: {episode} Success%: {res}')
            if res>70:
                print(f'Solved! Episode: {episode} Success%: {res}')
                return self.Q
        
        score = res/test_episodes
        print("Score: " + str(score))
        
        return self.Q
    
    def test_policy(self, n):
        
        success = 0
        for episode in range(n):
            state = env.reset()
            done = False
            while not done:
                action = self.greedy_policy(state)
                state, reward, done, info = env.step(action)
            if reward == 1:
                success += 1
        
        return success/n*100

In [None]:
a = Q_Learning()
Q = a.offpolicy_control(200)

Episode: 0 Success%: 0.0
Episode: 100 Success%: 0.0
Episode: 200 Success%: 0.0
Episode: 300 Success%: 0.0
Episode: 400 Success%: 0.0
Episode: 500 Success%: 0.0
Episode: 600 Success%: 0.0
Episode: 700 Success%: 0.0
Episode: 800 Success%: 0.0
Episode: 900 Success%: 0.0
Episode: 1000 Success%: 0.0
Episode: 1100 Success%: 0.0
Episode: 1200 Success%: 0.0
Episode: 1300 Success%: 0.0
Episode: 1400 Success%: 0.0
Episode: 1500 Success%: 0.0
Episode: 1600 Success%: 0.0
Episode: 1700 Success%: 0.0
Episode: 1800 Success%: 0.0
Episode: 1900 Success%: 0.0
Episode: 2000 Success%: 0.0
Episode: 2100 Success%: 0.0
Episode: 2200 Success%: 0.0
Episode: 2300 Success%: 0.0
Episode: 2400 Success%: 0.0
Episode: 2500 Success%: 0.0
Episode: 2600 Success%: 0.0
Episode: 2700 Success%: 0.0
Episode: 2800 Success%: 0.0
Episode: 2900 Success%: 0.0
Episode: 3000 Success%: 0.0
Episode: 3100 Success%: 0.0
Episode: 3200 Success%: 0.0
Episode: 3300 Success%: 0.0
Episode: 3400 Success%: 0.0
Episode: 3500 Success%: 0.0
Epis

Episode: 28700 Success%: 0.0
Episode: 28800 Success%: 0.0
Episode: 28900 Success%: 0.0
Episode: 29000 Success%: 0.0
Episode: 29100 Success%: 0.0
Episode: 29200 Success%: 0.0
Episode: 29300 Success%: 0.0
Episode: 29400 Success%: 0.0
Episode: 29500 Success%: 0.0
Episode: 29600 Success%: 0.0
Episode: 29700 Success%: 0.0
Episode: 29800 Success%: 0.0
Episode: 29900 Success%: 0.0
Episode: 30000 Success%: 0.0
Episode: 30100 Success%: 0.0
Episode: 30200 Success%: 0.0
Episode: 30300 Success%: 0.0
Episode: 30400 Success%: 0.0
Episode: 30500 Success%: 0.0
Episode: 30600 Success%: 0.0
Episode: 30700 Success%: 0.0
Episode: 30800 Success%: 0.0
Episode: 30900 Success%: 0.0
Episode: 31000 Success%: 0.0
Episode: 31100 Success%: 0.0
Episode: 31200 Success%: 0.0
Episode: 31300 Success%: 0.0
Episode: 31400 Success%: 0.0
Episode: 31500 Success%: 0.0
Episode: 31600 Success%: 0.0
Episode: 31700 Success%: 0.0
Episode: 31800 Success%: 0.0
Episode: 31900 Success%: 0.0
Episode: 32000 Success%: 0.0
Episode: 32100

In [None]:
a.test_policy(100)