In [91]:
import time

import gym
import numpy as np
import torch
from sklearn.exceptions import NotFittedError
from sklearn.tree import DecisionTreeRegressor
from sklearn.utils.validation import check_is_fitted
from tqdm import tqdm

from custom_envs.windy_gridworld import WindyGridworldEnv
from function_approximators.replay import ReplayBuffer

# Single environment instance shared by the training, evaluation and scratch cells below.
env = WindyGridworldEnv()

In [156]:

def act(env, model, s, epsilon, explore):
    """Choose an action for state ``s`` with an epsilon-greedy policy.

    Args:
        env: Environment exposing ``action_space.sample()`` and ``action_space.n``.
        model: Regressor whose ``predict`` accepts rows of ``[*state, action]``.
        s: Current state as a 1-D array-like.
        epsilon: Probability of taking a uniformly random action when exploring.
        explore: When False the random branch is skipped and the action is greedy.

    Returns:
        The sampled action when exploring (or when the model is not fitted yet),
        otherwise the index of the action with the highest predicted value.
    """
    if explore and np.random.random_sample() < epsilon:
        return env.action_space.sample()

    n_actions = env.action_space.n
    # Score the *current* state (the function argument, not a notebook global)
    # against every action in a single predict call.
    state = np.asarray(s, dtype=np.float32).reshape(1, -1)
    action_column = np.arange(n_actions, dtype=np.float32).reshape(-1, 1)
    candidates = np.concatenate([np.repeat(state, n_actions, axis=0), action_column], axis=-1)

    try:
        q_values = np.asarray(model.predict(candidates), dtype=np.float64)
    except NotFittedError:
        # Nothing has been learned yet, so there is no greedy action to take.
        return env.action_space.sample()

    # A model fitted on 2-D targets predicts several columns per row; collapse
    # them so the argmax is always a valid action index.
    q_per_action = q_values.reshape(n_actions, -1).mean(axis=1)
    return int(np.argmax(q_per_action))

def update(env, model, batch, gamma):
    """Refit ``model`` on one-step Q-learning targets computed from ``batch``.

    The target for each transition is ``r + gamma * (1 - done) * max_a Q(s', a)``.
    If the model has not been fitted yet, the targets fall back to the
    immediate rewards.

    Args:
        env: Environment exposing ``action_space.n``.
        model: Regressor with ``predict`` and ``fit``; it is refitted in place.
        batch: Object with ``states``, ``actions``, ``next_states``, ``rewards``
            and ``done`` fields, each with one row per transition (``actions``,
            ``rewards`` and ``done`` are single-column, as pushed by
            ``play_episode``).
        gamma: Discount factor applied to the bootstrapped value.
    """
    states = np.asarray(batch.states, dtype=np.float32)
    actions = np.asarray(batch.actions, dtype=np.float32)
    next_states = np.asarray(batch.next_states, dtype=np.float32)
    batch_len = len(states)
    # Flatten the (B, 1) reward/done columns: left as columns they broadcast
    # against the (B,) max over actions and produce a (B, B) target matrix.
    rewards = np.asarray(batch.rewards, dtype=np.float64).reshape(-1)
    done = np.asarray(batch.done, dtype=np.float64).reshape(-1)

    inputs = np.concatenate([states, actions], axis=-1)

    try:
        per_action_q = []
        for action in range(env.action_space.n):
            action_column = np.full((batch_len, 1), action, dtype=np.float32)
            next_inputs = np.concatenate([next_states, action_column], axis=-1)
            predicted = np.asarray(model.predict(next_inputs), dtype=np.float64)
            # Collapse extra output columns from a model fitted on 2-D targets.
            per_action_q.append(predicted.reshape(batch_len, -1).mean(axis=1))
        targets = rewards + gamma * (1.0 - done) * np.max(per_action_q, axis=0)
    except NotFittedError:
        # First update: no value estimates exist yet, so regress on rewards alone.
        targets = rewards

    model.fit(inputs, targets)


def play_episode(env, model, replay_buffer, batch_size, gamma, epsilon, explore, train, episode_length):
    """Run one episode and return ``(steps taken, summed reward)``.

    The episode ends when the environment reports ``done`` or after
    ``episode_length`` steps, whichever comes first. When ``train`` is truthy,
    every transition is pushed to ``replay_buffer`` and, once the buffer holds
    at least ``batch_size`` items, the model is updated on a sampled batch
    after each step.
    """
    state = env.reset()
    steps_taken = 0
    total_reward = 0
    finished = False

    while not finished:
        action = act(env, model, state, epsilon, explore=explore)
        next_state, reward, finished, _ = env.step(action)

        if train:
            # Store the transition as float32 arrays; scalars become length-1 arrays.
            transition = (
                np.array(state, dtype=np.float32),
                np.array([action], dtype=np.float32),
                np.array(next_state, dtype=np.float32),
                np.array([reward], dtype=np.float32),
                np.array([finished], dtype=np.float32),
            )
            replay_buffer.push(*transition)

            if len(replay_buffer) >= batch_size:
                update(env, model, replay_buffer.sample(batch_size), gamma)

        steps_taken += 1
        total_reward += reward

        # Step cap reached: stop without advancing the state.
        if steps_taken == episode_length:
            break
        state = next_state

    return steps_taken, total_reward


In [157]:
# Q-function approximator: a regression tree that update() refits on every sampled batch.
model = DecisionTreeRegressor()
# Replay buffer built with 1000 as its only argument (presumably the capacity — confirm against ReplayBuffer).
replay_buffer = ReplayBuffer(1000)


In [160]:
# Training configuration.
max_timesteps = 20000   # total environment steps to train for
timesteps_elapsed = 0
train = True            # whether training episodes fill the buffer and update the model
episode_length = 100    # per-episode step cap
eval_freq = 1000        # evaluate each time this many more steps have elapsed
eval_episodes = 5       # greedy episodes averaged per evaluation
gamma = 0.99
epsilon = 0.1
batch_size = 128

with tqdm(total=max_timesteps) as pbar:

    while timesteps_elapsed < max_timesteps:
        # Honour the `train` flag from the configuration above instead of a hard-coded True.
        episode_timesteps, _ = play_episode(env, model, replay_buffer, batch_size=batch_size, gamma=gamma, epsilon=epsilon,
                                            explore=True, train=train, episode_length=episode_length)
        timesteps_elapsed += episode_timesteps
        pbar.update(episode_timesteps)

        # True when the episode that just finished crossed a multiple of eval_freq.
        if timesteps_elapsed % eval_freq < episode_timesteps:
            eval_returns = 0
            for _ in range(eval_episodes):
                # Evaluation is greedy and never touches the buffer or the model.
                _ , episode_return = play_episode(env, model, replay_buffer, batch_size=batch_size, gamma=gamma, epsilon=epsilon,
                                                explore=False, train=False, episode_length=episode_length)
                eval_returns += episode_return / eval_episodes

            pbar.write(f"Evaluation at timestep {timesteps_elapsed} returned a mean returns of {eval_returns}")
            pbar.write(f"Epsilon = {epsilon}")


















  6%|▌         | 1100/20000 [00:01<00:36, 523.29it/s]Evaluation at timestep 1000 returned a mean returns of -100.0
Epsilon = 0.1
 10%|█         | 2100/20000 [00:03<00:31, 576.28it/s]Evaluation at timestep 2000 returned a mean returns of -100.0
Epsilon = 0.1
 16%|█▌        | 3100/20000 [00:04<00:28, 595.35it/s]Evaluation at timestep 3000 returned a mean returns of -100.0
Epsilon = 0.1
 21%|██        | 4162/20000 [00:05<00:27, 579.77it/s]Evaluation at timestep 4062 returned a mean returns of -100.0
Epsilon = 0.1
 26%|██▌       | 5132/20000 [00:07<00:24, 609.04it/s]Evaluation at timestep 5032 returned a mean returns of -100.0
Epsilon = 0.1
 31%|███       | 6132/20000 [00:08<00:23, 590.12it/s]Evaluation at timestep 6032 returned a mean returns of -100.0
Epsilon = 0.1
 36%|███▌      | 7132/20000 [00:10<00:22, 564.28it/s]Evaluation at timestep 7032 returned a mean returns of -100.0
Epsilon = 0.1
 41%|████      | 8132/20000 [00:11<00:20, 590.96it/s]Evaluation at timestep 8032 returned a mean 

In [108]:
# Scratch check of the bootstrapped-target computation on a small batch.
batch = replay_buffer.sample(32)
inputs = np.concatenate([batch.states, batch.actions], -1)
outputs = batch.rewards
# Fit on the immediate rewards only, so that predict() can be called below.
model.fit(inputs, outputs)

# One array of predictions per action index 0..3, with that index appended to each next state.
preds = []
for i in range(4):
    next_inputs = np.concatenate([batch.next_states, np.zeros((batch.actions.size()[0],1))+i], -1)
    preds.append(model.predict(next_inputs))
# NOTE(review): the displayed result is 2-D, which suggests batch.rewards is a column
# that broadcasts against the 1-D max over actions — confirm the shapes.
a = batch.rewards + gamma*np.max(preds, 0)
a


tensor([[-1.9900, -1.9900, -1.9900,  ..., -1.9900, -1.9900, -1.9900],
        [-1.9900, -1.9900, -1.9900,  ..., -1.9900, -1.9900, -1.9900],
        [-1.9900, -1.9900, -1.9900,  ..., -1.9900, -1.9900, -1.9900],
        ...,
        [-1.9900, -1.9900, -1.9900,  ..., -1.9900, -1.9900, -1.9900],
        [-1.9900, -1.9900, -1.9900,  ..., -1.9900, -1.9900, -1.9900],
        [-1.9900, -1.9900, -1.9900,  ..., -1.9900, -1.9900, -1.9900]],
       dtype=torch.float64)

In [155]:
a=1
# Step the shared env with action 1; this leaves s_next, r and done as notebook globals.
s_next, r, done, _ = env.step(1)
# Model prediction for s_next paired with each action index 0..3.
Q = [model.predict(np.concatenate([s_next, [i]],-1).reshape(1,-1)) for i in range(4)]
np.argmax(Q)
# model.predict(l)


3

In [149]:
# batch = replay_buffer.sample(4)
# inputs = np.concatenate([batch.states, batch.actions], -1)
# Relies on `inputs` left in the kernel by an earlier cell (the lines above are commented out).
model.predict(inputs)


array([-1., -1., -1., -1.])

In [148]:
# Refit on immediate rewards only, using `inputs` and `batch` left over from earlier cells.
model.fit(inputs, batch.rewards)

DecisionTreeRegressor()