In [5]:
import random
import logging

import numpy as np
import gym

from tqdm import tqdm

In [27]:
###############################################################################
class QLearner:
    """Tabular Q-Learning agent for OpenAI Gym environments.

    The environment must expose discrete spaces, i.e. both
    ``env.observation_space.n`` and ``env.action_space.n`` must exist, and
    follow the classic Gym protocol where ``reset()`` returns a state and
    ``step(action)`` returns ``(next_state, reward, done, info)``.

    Attributes:
        env: The environment the agent interacts with.
        alpha: Learning rate used in the Q update.
        gamma: Discount factor applied to the best next-state value.
        epsilon: Probability-like threshold for taking a random action.
        name: Optional label for the agent ('' when not given).
        verbose: When true, the constructor prints ``name``.
        num_actions: Number of discrete actions.
        num_states: Number of discrete states.
        Q: Array of shape ``(num_states, num_actions)`` holding action values.
        training_episode_total_rewards: Summed reward of every trained episode,
            in training order.
    """

    def __init__(self, env, alpha, gamma, epsilon, name=None, verbose=False):
        """Store the hyper-parameters and allocate a zeroed Q table.

        Args:
            env: Gym-style environment with discrete observation/action spaces.
            alpha: Learning rate.
            gamma: Discount factor.
            epsilon: Exploration threshold used by ``choose_action``.
            name: Optional label for the agent.
            verbose: If true, print ``name`` once construction is finished.
        """
        self.env = env
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.name = name if name else ''
        self.verbose = verbose

        self.num_actions = env.action_space.n
        self.num_states = env.observation_space.n
        # One row per state, one column per action. The columns must match the
        # action count, otherwise argmax over a row can yield an invalid action.
        self.Q = np.zeros((self.num_states, self.num_actions))

        self.training_episode_total_rewards = []

        if self.verbose:
            print(f'{self.name}')

    def choose_action(self, state):
        """Pick an action with an epsilon-greedy strategy.

        Args:
            state: Index of the current state (row of the Q table).

        Returns:
            The greedy action (argmax of the state's Q row) when the random
            draw exceeds ``epsilon``; otherwise an action sampled from the
            environment's action space.
        """
        rand = random.uniform(0, 1)

        if rand > self.epsilon:
            action = np.argmax(self.Q[state, :])
        else:
            action = self.env.action_space.sample()

        return action

    def learn(self, s, a, r, s_):
        """Update the Q table in place for one observed transition.

        Q[s, a] = Q[s, a] + alpha * (r[s, a] + gamma * max(Q[s_, a_]) - Q[s, a])

        Args:
            s: State the action was taken in.
            a: Action that was taken.
            r: Reward received for the transition.
            s_: State reached after the action.
        """
        best_next_value = np.max(self.Q[s_, :])
        td_error = r + self.gamma * best_next_value - self.Q[s, a]
        self.Q[s, a] += self.alpha * td_error

    def train(self, n_episodes):
        """Run ``n_episodes`` episodes, learning after every step.

        The total reward of each episode is appended to
        ``training_episode_total_rewards``. The episode index is printed every
        1000 episodes as a coarse progress indicator.

        Args:
            n_episodes: Number of episodes to run.
        """
        for episode in range(n_episodes):
            # Use the agent's own environment rather than a module-level global
            # so the learner works regardless of what the notebook has in scope.
            state = self.env.reset()
            done = False
            episode_total_reward = 0

            if episode % 1000 == 0:
                print(episode)

            while not done:
                action = self.choose_action(state)
                state_, reward, done, info = self.env.step(action)

                self.learn(state, action, reward, state_)

                state = state_
                episode_total_reward += reward

            self.training_episode_total_rewards.append(episode_total_reward)
            
            
        
        

In [25]:
# Create the Taxi environment by its registered id and render its current state.
# NOTE(review): 'Taxi-v2' is only registered in older gym releases; newer
# versions ship 'Taxi-v3' instead — confirm against the installed gym version.
env = gym.make('Taxi-v2')
env.render()

+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[34;1mY[0m| :[43m [0m|[35mB[0m: |
+---------+



In [29]:
# Build a Q-learning agent on the Taxi environment with learning rate 0.5,
# discount 0.9 and exploration threshold 0.5. No name is passed, so with
# verbose=True the constructor prints an empty line.
q_learner = QLearner(env=env, alpha=0.5, gamma=0.9, epsilon=0.5, verbose=True)
# Training is currently disabled; uncomment to run 10000 episodes.
# q_learner.train(10000)


