In [1]:
import random

import numpy as np

import torch
import torch.nn as nn

import matplotlib.pyplot as plt

from pyrl.environments import BetDiceRolling
from pyrl.agents import Agent
from pyrl.exp import Experiment

from pyrl.tools import  ContinousState, DiscreteAction
from pyrl.approximate.algorithms import deep_q_learning


from typing import List
from tqdm.notebook import tqdm

In [2]:
def random_policy(state: ContinousState, available_actions: List[DiscreteAction]) -> DiscreteAction:
    """Pick one of the currently available actions uniformly at random.

    Args:
        state: Current environment state; this policy never looks at it.
        available_actions: Candidate actions to sample from.

    Returns:
        A single element of ``available_actions`` drawn with ``random.choice``.
    """
    chosen = random.choice(available_actions)
    return chosen

def policy(state: ContinousState, available_actions: List[DiscreteAction]) -> DiscreteAction:
    """Constant policy: ignore both arguments and always return the same action.

    Args:
        state: Current environment state; unused.
        available_actions: Actions offered by the environment; unused.

    Returns:
        A freshly constructed ``DiscreteAction("2", 2, 2)`` on every call.
    """
    fixed_action = DiscreteAction("2", 2, 2)
    return fixed_action

In [3]:
# Seed every RNG this notebook imports so that Restart & Run All is repeatable:
# `random` drives random_policy and torch drives the initial network weights.
# NOTE(review): if BetDiceRolling draws from its own private generator it will
# need to be seeded separately - confirm against the pyrl implementation.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# NOTE(review): the meaning of the argument 10 is not visible here - confirm
# against BetDiceRolling's signature.
environment = BetDiceRolling(10)

# The experiment wraps the environment and is used below to run exploration.
exp = Experiment(environment)

In [4]:
# Explore with the random policy to collect transitions; entries of `history`
# are indexed further down as (state, action, reward, new_state).
agent = Agent(random_policy)

# 1_000 matches the 1000 iterations reported by the recorded progress bar.
history = exp.explore(agent, 1_000)

100%|██████████| 1000/1000 [00:00<00:00, 2184.50it/s]


In [5]:
class Q(nn.Module):
    """Small fully connected network mapping a state vector to one score per action.

    Architecture: ``Linear(sdim, 128) -> ReLU -> Linear(128, adim) -> Softmax``,
    with the softmax taken over the last (action) axis so the scores of each
    state are non-negative and sum to 1.
    """

    def __init__(self, sdim: int, adim: int) -> None:
        """Build the layers.

        Args:
            sdim: Number of input features per state.
            adim: Number of outputs, one per action.
        """
        super().__init__()

        self.linear = nn.Sequential(
            nn.Linear(sdim, 128),
            nn.ReLU(),
            nn.Linear(128, adim),
            # Normalise over the last axis instead of a hard-coded axis 1:
            # identical for (batch, sdim) input, but also works for a single
            # unbatched state of shape (sdim,), which dim=1 rejected with
            # "Dimension out of range".
            # NOTE(review): a softmax head bounds every output to [0, 1]; if
            # this network is meant to regress Q-values for deep Q-learning,
            # raw linear outputs are the usual choice - confirm intent.
            nn.Softmax(dim=-1)
        )

    def forward(self, x):
        """Run the network.

        Args:
            x: Float tensor whose last axis has ``sdim`` features; any number
                of leading batch axes (including none) is accepted.

        Returns:
            Tensor with the same leading axes and a last axis of size ``adim``.
        """
        return self.linear(x)

In [6]:
# Q(sdim, adim): one input feature per state, four outputs.
q = Q(1, 4)

In [7]:
# Take the state of the first recorded transition (a NumPy array) and turn it
# into a float32 tensor in a single step.
x = torch.from_numpy(history[0][0]).float()

In [8]:
# Add a leading batch axis (shape (1, n)) before the forward pass; the bare
# expression is what the cell displays.
q(x.reshape(1, -1))

tensor([[0.0013, 0.7402, 0.1127, 0.1458]], grad_fn=<SoftmaxBackward0>)

In [13]:
# Split the explored transitions into column-wise training tensors.
# Each transition is indexed as (state, action, reward, new_state).
#
# States are NumPy arrays, so they are stacked with NumPy first: calling
# torch.tensor() directly on a list of ndarrays takes the slow per-element
# path and emits a UserWarning. States and rewards are cast to float32 so they
# match the dtype of the network parameters (the single-sample check earlier
# in the notebook needed an explicit .float() for the same reason).
states = torch.as_tensor(
    np.asarray([transition[0] for transition in history]), dtype=torch.float32
)
# Action ids keep the dtype torch infers for them; shape (N, 1).
actions = torch.tensor([transition[1].number for transition in history]).reshape(-1, 1)
# Rewards as a float32 column; shape (N, 1).
rewards = torch.as_tensor(
    np.asarray([transition[2] for transition in history]), dtype=torch.float32
).reshape(-1, 1)
new_states = torch.as_tensor(
    np.asarray([transition[3] for transition in history]), dtype=torch.float32
)