In [1]:
import gym 
import numpy as np
from sklearn.metrics.pairwise import rbf_kernel

# Instantiate the CartPole-v1 Gym environment.
env = gym.make("CartPole-v1")

In [2]:
def act(env, epsilon, obs, X, alpha, explore):
    """Pick an action epsilon-greedily from the kernel Q-estimate.

    When ``explore`` is truthy, one uniform draw is compared with ``epsilon``
    and, if it is smaller, a random action is sampled from
    ``env.action_space``. Otherwise every discrete action ``i`` is scored by
    ``_predict`` on ``obs`` concatenated with ``actions[i]`` and the index of
    the largest score is returned.

    Reads the module-level ``actions`` table (and, through ``_predict``, the
    module-level ``kernel``), so both must exist before this is called.
    """
    # Short-circuit: no random number is drawn when exploration is disabled.
    take_random = explore and np.random.random_sample() < epsilon
    if take_random:
        return env.action_space.sample()

    q_values = []
    for idx in range(env.action_space.n):
        features = np.concatenate([obs, actions[idx]], -1).reshape(1, -1)
        q_values.append(_predict(features, X, alpha))
    return np.argmax(q_values)

def _predict(x, X, alpha):
    """Evaluate the kernel expansion at ``x``.

    Computes ``alpha.T @ kernel(X, x)``: the kernel between the stored inputs
    ``X`` and the query ``x``, weighted by the coefficients ``alpha``.
    ``kernel`` is read from module scope.
    """
    weights_t = alpha.T
    cross_kernel = kernel(X, x)
    return weights_t @ cross_kernel

def _inc_dim_v(v):
    return np.pad(v, ((0,1),(0,0)))

def _inc_dim_m(m):
    return np.pad(m, ((0,1),(0,1)))


In [28]:
# Online kernel Q-learning on CartPole.
#
# Q(s, a) is represented as alpha.T @ kernel(X, [s, one_hot(a)]). After every
# environment step the visited (state, action) pair is appended to X, and
# alpha / C are grown by one entry and corrected with a rank-one term.
#
# NOTE: X, alpha and C grow by one row per environment step and are never
# pruned, so the per-step cost increases steadily over a run.

# --- configuration ---------------------------------------------------------
env = gym.make("CartPole-v1")
epsilon = 0.9                  # exploration probability, decayed once per episode
actions = [[1, 0], [0, 1]]     # one-hot action encoding appended to the observation
learning_rate = 0.1            # NOTE(review): not read anywhere in this cell
gamma = 0.99                   # discount factor applied to the bootstrap term
sigma_0 = 1                    # enters the update only as sigma_0**2 in the denominator
n_eps = 1000

kernel = rbf_kernel

# --- initial model -----------------------------------------------------------
# The model needs at least one stored input before kernel(X, x) can be
# evaluated, so it is seeded with one (observation, random action) pair.
# NOTE(review): the unit starting values of alpha and C are arbitrary.
obs = env.reset()
action = env.action_space.sample()
x = np.concatenate([obs, actions[action]], -1).reshape(1, -1)
X = x
alpha = np.array([[1]])
C = np.array([[1]])
e = np.array([[1]])            # column vector whose only non-zero entry is the last one

# --- training ----------------------------------------------------------------
for i in range(n_eps):

    obs = env.reset()
    # Pick the first action for the state we are actually in, instead of
    # reusing the action chosen for the previous episode's terminal state.
    action = act(env, epsilon, obs, X, alpha, explore=True)

    episode_returns = 0

    done = False
    while not done:

        # Features of the pair being updated: the state *before* the step
        # together with the action taken in it.
        x = np.concatenate([obs, actions[action]], -1).reshape(1, -1)

        next_obs, reward, done, _ = env.step(action)

        # Bootstrap value: best predicted Q over all actions in the next state.
        Q_values = [
            _predict(np.concatenate([next_obs, actions[a]], -1).reshape(1, -1), X, alpha)
            for a in range(env.action_space.n)
        ]
        Q_max = np.max(Q_values)

        k = kernel(x, x)
        kk = kernel(X, x)

        # Current prediction and variance term at the point being inserted.
        mew = alpha.T @ kk
        sigma = k + kk.T @ C @ kk
        Q_prev = mew.item()

        denom = sigma_0**2 + sigma
        r = -1 / denom
        # TD error at x, scaled by the same denominator; (1 - done) removes
        # the bootstrap term on terminal transitions.
        y = (reward + gamma * (1 - done) * Q_max - Q_prev) / denom

        # Grow e by one leading zero so its single 1 stays in the last slot.
        e = np.vstack([[0], e])
        s = _inc_dim_v(C @ kk) + e
        C = _inc_dim_m(C) + r * (s @ s.T)
        alpha = _inc_dim_v(alpha) + y * s

        X = np.vstack([X, x])

        obs = next_obs
        if not done:
            action = act(env, epsilon, obs, X, alpha, explore=True)

        episode_returns += reward

    epsilon = epsilon*0.99

    print(f"Episode: {i}, Return: {episode_returns}, Epsilon: {epsilon}")

           




Episode: 0, Return: 31.0, Epsilon: 0.891
Episode: 1, Return: 12.0, Epsilon: 0.88209
Episode: 2, Return: 21.0, Epsilon: 0.8732691
Episode: 3, Return: 12.0, Epsilon: 0.8645364090000001
Episode: 4, Return: 49.0, Epsilon: 0.85589104491
Episode: 5, Return: 76.0, Epsilon: 0.8473321344609
Episode: 6, Return: 14.0, Epsilon: 0.8388588131162911
Episode: 7, Return: 26.0, Epsilon: 0.8304702249851281
Episode: 8, Return: 70.0, Epsilon: 0.8221655227352769
Episode: 9, Return: 23.0, Epsilon: 0.8139438675079241
Episode: 10, Return: 48.0, Epsilon: 0.8058044288328449
Episode: 11, Return: 18.0, Epsilon: 0.7977463845445164
Episode: 12, Return: 33.0, Epsilon: 0.7897689206990712
Episode: 13, Return: 15.0, Epsilon: 0.7818712314920805
Episode: 14, Return: 37.0, Epsilon: 0.7740525191771597
Episode: 15, Return: 26.0, Epsilon: 0.7663119939853881
Episode: 16, Return: 27.0, Epsilon: 0.7586488740455342
Episode: 17, Return: 29.0, Epsilon: 0.7510623853050789
Episode: 18, Return: 15.0, Epsilon: 0.7435517614520281
Episod

KeyboardInterrupt: 

In [58]:
# Scratch check: a single all-zero row with one column per observation
# dimension plus one column per discrete action.
np.zeros((1, env.observation_space.shape[0] + env.action_space.n))

array([[0., 0., 0., 0., 0., 0.]])