## Preparing the Environment

In [20]:
from __future__ import print_function
import numpy as np

from ifqi import envs
from ifqi.evaluation import evaluation
from ifqi.algorithms.fqi.FQI import FQI

from scipy.optimize import curve_fit

In [22]:
# Instantiate the 1-D LQG environment and ask the library for the widths of
# the state, action and reward parts of a transition row.
mdp = envs.LQG1D()
state_dim, action_dim, reward_dim = envs.get_space_info(mdp)

# Index of the first reward column: rewards follow the state and action columns.
reward_idx = state_dim + action_dim

# Finite action set handed to FQI; the grid is finer around zero.
discrete_actions = np.array([
    -8, -7, -6, -5, -4, -3,
    -2.5, -2, -1.5, -1,
    -.75, -.5, -.25, 0, .25, .5, .75,
    1, 1.5, 2, 2.5,
    3, 4, 5, 6, 7, 8,
])

# Roll out 1000 episodes; every row of `dataset` is one recorded transition.
dataset = evaluation.collect_episodes(mdp, n_episodes=1000)

# Regression targets: the (first) reward column.
r = dataset[:, reward_idx]

# Regression inputs: everything before the reward, followed by everything
# after it except the very last column.
# NOTE(review): presumably this yields (state, action, next state, absorbing)
# -- confirm against the column layout produced by collect_episodes.
sast = np.concatenate(
    (dataset[:, :reward_idx], dataset[:, reward_idx + reward_dim:-1]),
    axis=1,
)

## Learning with FQI and `curve_fit`

In [56]:
class Regressor:
    """Two-parameter Q-function model fitted with ``scipy.optimize.curve_fit``.

    The model is ``Q(s, a) = b - (a - k * s) ** 2``, where ``s`` is read from
    column 0 and ``a`` from column 1 of the input matrix.

    Attributes:
        b: additive offset of the model (its maximum value).
        k: linear gain; the model is maximal where ``a == k * s``.
    """
    # NOTE: on Python 2 a class statement without an explicit base creates an
    # old-style class, for which __slots__ is not enforced. The declaration is
    # kept unchanged so callers see the same attribute behaviour as before.
    __slots__ = ('b', 'k')

    def __init__(self, b=0, k=0):
        """Store the initial parameter values (both default to 0)."""
        self.b = b
        self.k = k

    def fit(self, X, y):
        """Estimate ``b`` and ``k`` by non-linear least squares.

        Args:
            X: 2-D array-like whose column 0 is ``s`` and column 1 is ``a``.
            y: target values, one per row of ``X``.

        Returns:
            This instance, so that calls can be chained.

        Note:
            ``curve_fit`` is called without ``p0``, so the optimisation starts
            from its default initial guess rather than from the current
            ``b`` / ``k``.
        """
        # The covariance estimate returned by curve_fit is not needed here.
        (self.b, self.k), _ = curve_fit(self.Q, X, y)
        return self

    def Q(self, sa, b, k):
        """Evaluate the model for explicit parameters ``b`` and ``k``.

        Args:
            sa: array-like of shape (n, >=2), or a single ``[s, a]`` sample.
            b: offset parameter.
            k: gain parameter.

        Returns:
            1-D array with one value per row of ``sa``.
        """
        # Coerce lists / single samples so that column indexing below works
        # for the same inputs that fit() (through curve_fit) already accepts.
        sa = np.atleast_2d(sa)
        return b - (sa[:, 1] - k * sa[:, 0]) ** 2

    def predict(self, X):
        """Evaluate the model on ``X`` using the currently stored ``b`` and ``k``."""
        return self.Q(X, self.b, self.k)


regressor = Regressor()

In [57]:
# Build the FQI learner around the curve_fit-based `regressor` defined above.
# The discount factor and the horizon are taken from the MDP itself, and the
# learner is restricted to the finite `discrete_actions` set.
# scaled=False / features=None / verbose=False are passed through as given;
# NOTE(review): presumably they disable input scaling, feature expansion and
# logging -- confirm against the FQI constructor.
fqi = FQI(estimator=regressor,
          state_dim=state_dim,
          action_dim=action_dim,
          discrete_actions=discrete_actions,
          gamma=mdp.gamma,
          horizon=mdp.horizon,
          scaled=False,
          features=None,
          verbose=False)

In [58]:
# Fit FQI on the transition matrix `sast` and the reward vector `r` that were
# sliced out of the collected dataset.
fqi.fit(sast, r)

In [72]:
# Five starting states laid out as a (5, 1) column: one state per row.
initial_states = np.array([[1], [2], [5], [7], [10]])

In [73]:
# Evaluate the policy learned by FQI, starting from the states in
# `initial_states`, and print whatever evaluate_policy returns.
# NOTE(review): the printed result is a 4-tuple; the meaning of each field is
# defined by evaluate_policy -- confirm before interpreting the numbers.
values = evaluation.evaluate_policy(mdp, fqi, initial_states=initial_states)
print(values)

(-9275429.4350569006, 151094.93579911912, 100.0, 0.0)


## Trying to Evaluate the Optimal Policy

In [52]:
# Keep the [0][0] entry of the value returned by computeOptimalK as the gain.
# NOTE(review): presumably the method returns a 1x1 gain matrix for this 1-D
# problem -- confirm against the LQG1D implementation.
K = mdp.computeOptimalK()[0][0]
# Bare expression so the notebook displays the gain.
K

-0.61525124566301148

In [70]:
class OptimalPolicy:
    """Linear-gain policy projected onto a finite set of actions.

    For a state ``s`` the policy computes ``gain * s`` and returns the entry of
    the action set closest to it (first entry on ties, as ``argmin`` does).

    Args:
        gain: linear feedback gain. When ``None`` (default) the notebook-level
            ``K`` is looked up each time an action is drawn.
        actions: 1-D array-like of admissible actions. When ``None`` (default)
            the notebook-level ``discrete_actions`` is looked up each time an
            action is drawn.
        verbose: when true (default) every drawn action is printed together
            with the state it was drawn for.
    """

    def __init__(self, gain=None, actions=None, verbose=True):
        self.gain = gain
        self.actions = actions
        self.verbose = verbose

    def draw_action(self, states, absorbing, evaluation=False):
        """Return the admissible action closest to ``gain * states``.

        Args:
            states: current state; the subtraction below broadcasts it against
                the 1-D action set, so a scalar or a one-element array works.
                NOTE(review): multi-element states are not handled -- confirm
                that callers never pass them.
            absorbing: unused; accepted so the call signature stays the same.
            evaluation: unused; accepted so the call signature stays the same.

        Returns:
            The selected element of the action set.
        """
        # Fall back to the notebook globals only when nothing was injected, so
        # that a bare OptimalPolicy() keeps behaving exactly as before.
        gain = K if self.gain is None else self.gain
        if self.actions is None:
            actions = discrete_actions
        else:
            actions = np.asarray(self.actions)

        i = np.abs(actions - gain * states).argmin()
        if self.verbose:
            print("states: {} action: {}".format(states, actions[i]))
        return actions[i]


optimalP = OptimalPolicy()


In [74]:
# Evaluate the hand-written `optimalP` policy from the same initial states
# and print whatever evaluate_policy returns.
# While `optimalP` is evaluated, its draw_action prints one line per call.
values = evaluation.evaluate_policy(mdp, optimalP, initial_states=initial_states)
print(values)

states: [1] action: -0.5
states: [ 0.32472139] action: -0.25
states: [ 0.14214703] action: 0.0
states: [ 0.15411965] action: 0.0
states: [ 0.1393763] action: 0.0
states: [ 0.03213931] action: 0.0
states: [-0.04679476] action: 0.0
states: [-0.07813658] action: 0.0
states: [-0.1748237] action: 0.0
states: [-0.07231758] action: 0.0
states: [ 0.12716277] action: 0.0
states: [ 0.17728669] action: 0.0
states: [ 0.16302205] action: 0.0
states: [ 0.18884852] action: 0.0
states: [ 0.17621097] action: 0.0
states: [ 0.0029495] action: 0.0
states: [ 0.04509096] action: 0.0
states: [ 0.21153789] action: -0.25
states: [-0.25968819] action: 0.25
states: [ 0.10232925] action: 0.0
states: [ 0.07298774] action: 0.0
states: [ 0.00166169] action: 0.0
states: [ 0.07476873] action: 0.0
states: [-0.0484426] action: 0.0
states: [-0.075783] action: 0.0
states: [-0.1259018] action: 0.0
states: [-0.09048142] action: 0.0
states: [-0.12078195] action: 0.0
states: [-0.04905816] action: 0.0
states: [-0.00873777] act