In [1]:
import numpy as np
import random as rd

In [2]:
class env:
    def __init__(self):
        self.nS = 3
        self.nA = 3
        self.reward = np.matrix([[1,0,0],[0.5,0,0.5],[0.5,0.5,0]])
        self.P = {0: np.matrix([[0.8,0.1,0.1],[0.5,0,0.5],[0.2,0.4,0.4]]), 1: np.matrix([[0.3,0.3,0.4],[0.2,0.1,0.7],[0.4,0.4,0.2]]), 2: np.matrix([[0.1,0.5,0.4],[0.3,0.5,0.2],[0,0.1,0.9]])}
    
    def offline_set(self, N, k):
        #we use T_i = k in this setting
        offline_set = {}
        for i in range(N):
            result = []
            for j in range(k):
                state = rd.randint(0,self.nS-1)
                action = rd.randint(0,self.nA-1)
                reward = self.reward[state, action]
                result.append(state)
                result.append(action)
                result.append(reward)
            result.append(rd.randint(0,self.nS-1))
            offline_set[i] = result
        return offline_set
        

We can find the offline dataset following a behavior policy that can be described as a [S,A] matrix as
[[1/3, 1/3, 1/3],
[1/3, 1/3, 1/3],
[1/3, 1/3, 1/3]].

In [24]:
def TD_batch(data, N, env, alpha, discount_factor=0.5, theta=0.00001):
    """
    TD learning (batch version) Algorithm.
    
    Args:
        data: Offline dataset;
        N: The number of trajectories;
        env: The environment;
        alpha: Learning rate;
        theta: We stop evaluation once our value function change is less than theta for all states;
        discount_factor: Gamma discount factor.
        
    Returns:
        V is the value function corresponding to the behavior policy.
        
    """
    V = np.zeros(env.nS)
    i = 0
    while True:
        delta = 0
        index = 0
        for j in range(len(data[i])//3):
            V[data[i][index]] += alpha * (data[i][index+2] + discount_factor * V[data[i][index+3]] - V[data[i][index]])
            delta = max(delta, alpha * (data[i][index+2] + discount_factor * V[data[i][index+3]] - V[data[i][index]]))
            index += 3
        i += 1
        if i == N-1:
            break
        elif delta <= theta:
            break
        return V
        

In [3]:
def policy_eval(policy, env, discount_factor=0.5, theta=0.00001):
    """
    Evaluate a policy given an environment and a full description of the environment's dynamics.
    
    Args:
        policy: [S, A] shaped matrix representing the policy.
        env: 
            env.P[s] is a matrix of with (i,j) represent P(j|s,i).
            env.nS is a number of states in the environment. 
            env.nA is a number of actions in the environment.
        theta: We stop evaluation once our value function change is less than theta for all states.
        discount_factor: Gamma discount factor.
    
    Returns:
        Vector of length env.nS representing the value function.
    """
    V = np.zeros(env.nS)
    while True:
        delta = 0
        for s in range(env.nS):
            v = 0
            for a, action_prob in enumerate(policy[s]):
                for j in range(env.nS):
                    v += action_prob * env.P[s][a,j] * (env.reward[s,a] + discount_factor * V[j])
            # How much our value function changed (across any states)
            delta = max(delta, np.abs(v - V[s]))
            V[s] = v
        # Stop evaluating once our value function change is below a threshold
        if delta < theta:
            break
    return np.array(V)

In [4]:
env1 = env()
b_policy = np.array([[1/3,1/3,1/3],[1/3,1/3,1/3],[1/3,1/3,1/3]])
V = policy_eval(b_policy, env1)

In [5]:
V

array([0.66666109, 0.66666297, 0.66666377])

In [81]:
data = env1.offline_set(10000,200)
V = TD_batch(data, 10000, env1, alpha = 0.1)

In [82]:
V

array([0.59077384, 0.68979004, 0.5832737 ])