In [9]:
import numpy as np
import random as rd

In [49]:
class env:
    def __init__(self):
        self.nS = 3
        self.nA = 3
        self.reward = np.matrix([[1,0,0],[0.5,0,0.5],[0.5,0.5,0]])
        self.P = {0: np.matrix([[0.8,0.1,0.1],[0.5,0,0.5],[0.2,0.4,0.4]]), 1: np.matrix([[0.3,0.3,0.4],[0.2,0.1,0.7],[0.4,0.4,0.2]]), 2: np.matrix([[0.1,0.5,0.4],[0.3,0.5,0.2],[0,0.1,0.9]])}
    
    def offline_set(self, N, k):
        #we use T_i = k in this setting
        offline_set = {}
        for i in range(N):
            result = []
            for j in range(k):
                state = rd.randint(0,self.nS-1)
                action = rd.randint(0,self.nA-1)
                reward = self.reward[state, action]
                result.append(state)
                result.append(action)
                result.append(reward)
            result.append(rd.randint(0,self.nS-1))
            offline_set[i] = result
        return offline_set
        

In [95]:
#set up the environment
env1 = env()
#create offline dataset
data = env1.offline_set(50,100)

In [96]:
def Fitted_Q_iteration(data, nS, nA, N,theta=0.00001, discount_factor=0.5):
    """
    Find nearly-optimal policy via FQI, details can be found in chapter.
    
    Args:
        data: A dictionary, with key represent the index of trajectory, each trajectory is a list;
        nS: The number of states in the environment;
        nA: The number of actions in the environment;
        N: The number of trajectories collected;
        theta: We stop iteration once our Q function change is less than theta for all states.
        discount_factor: Gamma discount factor.
    
    Returns:
        A tuple (policy, Q) of the nearly-optimal policy and the corresponding estimated Q function.
    """
    Q_1 = np.zeros([nS,nA])
    Q = []
    Q.append(Q_1)
    
    for i in range(N):
        index = 0
        Q_2 = np.zeros([nS,nA])
        for j in range(len(data[i])//3):
            Q_2[data[i][index],data[i][index+1]] = data[i][index+2] + discount_factor * Q[len(Q)-1][data[i][index+3],np.argmax(Q[len(Q)-1][data[i][index+3],:])]
            index += 3
            Q.append(Q_2)
    
    Q_f = Q[len(Q)-1]
    
    policy = np.zeros([nS, nA])
    for s in range(nS):
        best_action = np.argmax(Q_f[s,:])
        policy[s, best_action] = 1.0
    
    return policy, Q_f
    
    

In [97]:
opt_policy, Q = Fitted_Q_iteration(data = data, nS = env1.nS, nA = env1.nA, N = 10)

In [98]:
opt_policy

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.]])

In [99]:
Q

array([[1.70699044, 0.67674761, 0.58199044],
       [1.20699044, 0.66398088, 1.35349522],
       [1.12369402, 1.35349522, 0.57796175]])