In [1]:
from mdp import *
import numpy as np
import matplotlib.pyplot as plt

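### Approximate Q-learning: linear Q-function

Instead of storing one value per (state, action) pair in a table, the Q-function here is a weighted sum of hand-engineered features: $Q(s, a) = w \cdot f(s, a)$.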

In [6]:
# defining a linear Q function class
class LinearQ(QFunction):
    """Q-function computed as the dot product of a weight vector and features.

    The weight vector holds one entry per (action, feature) pair, as reported
    by the feature extractor, and starts at zero. A per-state value table
    ``V`` is kept alongside it and is refreshed only when
    ``update_V_from_Q`` is called.
    """

    def __init__(self, mdp, features):
        """Store the MDP and the feature extractor, then zero weights and V.

        Args:
            mdp: environment object; this class reads ``mdp.states`` and calls
                ``mdp.get_actions(state)``.
            features: extractor providing ``get_num_actions()``,
                ``get_num_features()`` and ``extract(state, action)``.
        """
        self.mdp = mdp
        self.features = features

        # one weight per (action, feature) combination, all starting at zero
        weight_count = self.features.get_num_actions() * self.features.get_num_features()
        self.weights = np.zeros(weight_count)

        # value table: every state listed by the MDP starts at 0.0
        self.V = {s: 0.0 for s in self.mdp.states}

    def update(self, state, action, old, delta):
        """Move the weights by ``delta`` times the features of (state, action).

        ``old`` is accepted but not used by this implementation.
        """
        phi = np.array(self.features.extract(state, action))
        step = delta * phi
        # rebind rather than modify in place, so a new array is produced each time
        self.weights = self.weights + step

    def evaluate(self, state, action):
        """Return the dot product of the (state, action) features and the weights."""
        phi = np.array(self.features.extract(state, action))
        return np.dot(phi, self.weights)

    def evaluate_V(self, state):
        """Look up the stored value of ``state`` (KeyError if it is not in V)."""
        return self.V[state]

    def update_V_from_Q(self):
        """Set V[state] to the largest Q-value over that state's actions."""
        for s in self.mdp.states:
            available = self.mdp.get_actions(s)
            # max() raises ValueError if a state has no actions
            self.V[s] = max(self.evaluate(s, a) for a in available)



# defining a feature extractor class for gridworld problem (hand-engineered features)
class GridWorldFeatures:
    """Hand-engineered feature extractor for the grid-world problem.

    Produces one slot of ``num_features`` values per action returned by
    ``mdp.get_actions()``; only the slot of the queried action can be non-zero.
    """

    def __init__(self, mdp):
        """Keep a reference to the MDP and fix the per-action feature count."""
        self.mdp = mdp
        self.num_features = 3

    def get_num_features(self):
        """Return the number of features stored per action."""
        return self.num_features

    def get_num_actions(self):
        """Return how many actions ``mdp.get_actions()`` reports."""
        return len(self.mdp.get_actions())

    def extract(self, state, action):
        """Build the flat feature vector for a (state, action) pair.

        With state ``(x, y)``, goal ``(xg, yg)`` and ``e = 0.01``, the three
        values written into the slot of ``action`` are:
            1) (x + e) / (xg + e)                      x position relative to goal x
            2) (y + e) / (yg + e)                      y position relative to goal y
            3) (|xg - x| + |yg - y| + e) / (xg + yg + e)   scaled manhattan distance

        Every other slot is zero, and so is the slot of ``action`` itself when
        ``state`` equals ``mdp.exit``.

        Returns:
            A list of floats, one block per action in ``mdp.get_actions()`` order.
        """
        goal_x, goal_y = self.mdp.goal
        pos_x, pos_y = state
        eps = 0.01  # keeps the denominators away from zero

        vector = []
        for candidate in self.mdp.get_actions():
            if candidate == action and state != self.mdp.exit:
                manhattan = abs(goal_x - pos_x) + abs(goal_y - pos_y)
                block = [
                    (pos_x + eps) / (goal_x + eps),
                    (pos_y + eps) / (goal_y + eps),
                    (manhattan + eps) / (goal_x + goal_y + eps),
                ]
            else:
                block = [0.0] * self.num_features
            vector.extend(block)

        return vector



In [7]:
# grid-world MDP with discount factor 0.9 and withQTable switched off
gw = GridWorld(discount_factor=0.9, withQTable=False)

# hand-engineered feature extractor bound to that MDP
features = GridWorldFeatures(mdp=gw)

# linear Q-function built on the MDP and its feature extractor
qfunction = LinearQ(mdp=gw, features=features)

# Q-learner driving the linear Q-function (epsilon = 0.1, alpha = 0.1)
QL = QLearner(gw, qfunction, epsilon=0.1, alpha=0.1)


In [9]:
# run 100 training episodes and keep whatever train() returns
episode_rewards = QL.train(episodes=100)

# ask the Q-function for a policy over the grid world
pi = qfunction.extract_policy(gw)

# print the policy as a grid, largest y first; cells absent from pi show 'None'
print("-----------------------")
for y in reversed(range(gw.height)):
    row = ""
    for x in range(gw.width):
        cell = pi[(x, y)] if (x, y) in pi else 'None'
        # each cell is left-aligned in 6 columns and followed by one space
        row += f"{cell:<6} "
    print(row)
print("-----------------------")

#plot_rewards(episode_rewards, window_size=100)

Episode# 0, length: 23, accumulated reward: -0.09847709021836118
-----------------------
-0.00 -0.00 -0.01 -0.61 
 0.00  0.00 -0.01 -0.58 
 0.00 -0.00 -0.01 -0.01 
-----------------------
Episode# 1, length: 327, accumulated reward: -1.2107600349290264e-15
-----------------------
 0.00 -0.00 -0.00 -0.67 
 0.00  0.00 -0.00 -0.63 
 0.00  0.00 -0.00 -0.00 
-----------------------
Episode# 2, length: 174, accumulated reward: -1.2132607080039802e-08
-----------------------
 0.00 -0.00 -0.00 -0.73 
 0.00  0.00 -0.00 -0.68 
 0.00 -0.00 -0.00 -0.00 
-----------------------
Episode# 3, length: 18, accumulated reward: -0.16677181699666577
-----------------------
 0.00 -0.00 -0.00 -0.78 
 0.00  0.00 -0.00 -0.72 
 0.00 -0.00 -0.00 -0.00 
-----------------------
Episode# 4, length: 191, accumulated reward: -2.023376927644849e-09
-----------------------
 0.00  0.00 -0.00 -0.82 
 0.00  0.00 -0.00 -0.76 
 0.00  0.00 -0.00 -0.00 
-----------------------
Episode# 5, length: 70, accumulated reward: -0.00