In [None]:
# 4.1 Study I

import gym
from EduSim.Envs.TMS import TMSEnv, tms_train_eval
from EduSim import MetaAgent

# Keyword options forwarded to the "TMS-v1" environment constructor;
# the seed is pinned so repeated runs receive the same seed value.
env_options = dict(
    name="binary",
    mode="no_measurement_error",
    seed=10,
)
env: TMSEnv = gym.make("TMS-v1", **env_options)

# Bare trailing expression: the notebook renders the environment's repr.
env

In [None]:
# Display the environment's action space (the object later handed to the agent).
env.action_space

In [None]:
import mxnet.ndarray as nd
import numpy as np
from EduRec.RLA.DQN import DQNet, get_loss, fit_f, net_init
from EduRec.meta.ReplayBuffer import CircularReplayBuffer as ReplayBuffer
from longling.ML.MxnetHelper import get_trainer
from longling.ML.MxnetHelper.utils import Configuration

class Agent(MetaAgent):
    """Value-network agent that always picks the action with the highest predicted value.

    Each observed step is stored in a replay buffer as
    ``[observation, action_idx, reward]`` and, every ``tune_freq`` observed
    steps, the value network is fitted on a batch sampled from that buffer.

    Parameters
    ----------
    action_space:
        Indexable collection of actions. ``action_space.shape[0]`` is passed to
        ``DQNet`` and ``action_space[i]`` is returned by :meth:`step`.
    seed:
        Seed forwarded to the replay buffer and to the agent's own
        ``numpy.random.RandomState``.
    learning_rate:
        Learning rate written into the optimizer parameters of the value net.
    tune_freq:
        Number of observed steps between two calls to :meth:`tune`.
    batch_size:
        Number of transitions sampled from the replay buffer per :meth:`tune`.

    Notes
    -----
    NOTE(review): with ``tune_freq < batch_size`` the first :meth:`tune` call
    may request more samples than the buffer holds -- confirm how
    ``ReplayBuffer.sample`` behaves in that case before lowering ``tune_freq``.
    """

    def __init__(self, action_space, seed=None, learning_rate=0.01, tune_freq=64, batch_size=64):
        super(Agent, self).__init__()
        self.action_space = action_space

        # Per-step state shared between begin_episode(), step() and observe().
        self.observation = None
        self.action_idx = None
        self.reward = None

        # Value network and its training configuration.
        self.value_net_cfg = Configuration()
        self.value_net_cfg.optimizer_params["learning_rate"] = learning_rate
        self.value_net = DQNet(self.action_space.shape[0])
        net_init(self.value_net, self.value_net_cfg)

        # Replay / tuning schedule: tune() fires once the counter reaches tune_freq.
        self.value_net_tune_freq = tune_freq
        self.value_net_tune_cnt = 0
        self.value_net_replay_buffer = ReplayBuffer(seed=seed)
        self.value_net_cfg.batch_size = batch_size

        self.value_net_loss = get_loss()
        self.value_net_trainer = get_trainer(
            self.value_net,
            self.value_net_cfg.optimizer,
            self.value_net_cfg.optimizer_params,
            self.value_net_cfg.lr_params,
        )
        # Seeded RNG owned by the agent; not referenced by any method defined in this class.
        self._random_state = np.random.RandomState(seed)

    def begin_episode(self, learner_profile, *args, **kwargs):
        """Start an episode by taking the initial observation from ``learner_profile``.

        ``learner_profile`` must unpack into exactly two items; the first is
        ignored and the second becomes the current observation.
        """
        _, observation = learner_profile
        self.observation = observation

    def end_episode(self, observation, reward, done, info):
        """Episode-end hook; intentionally a no-op for this agent."""
        pass

    def observe(self, observation, reward, done, info):
        """Record the outcome of the last action and advance to ``observation``.

        The stored transition pairs the *previous* observation with the last
        chosen action index and the received reward; the new observation is not
        stored in the buffer. ``done`` and ``info`` are not used.
        """
        self.reward = reward
        self.value_net_replay_buffer.add(
            [self.observation, self.action_idx, self.reward]
        )
        self.value_net_tune_cnt += 1
        if self.value_net_tune_cnt >= self.value_net_tune_freq:
            self.tune()
            # Subtract rather than reset so the schedule stays aligned to tune_freq.
            self.value_net_tune_cnt -= self.value_net_tune_freq
        self.observation = observation

    def step(self):
        """Choose an action: the argmax of the value net output for the current observation.

        The chosen index is remembered in ``self.action_idx`` so that
        :meth:`observe` can store it, and the matching entry of
        ``self.action_space`` is returned.
        """
        # Wrap the single observation in a list to form a batch of one.
        net_input = nd.array([self.observation], self.value_net_cfg.ctx)
        action_idx = int(self.value_net(net_input).argmax().asscalar())
        self.action_idx = action_idx
        return self.action_space[action_idx]

    def tune(self, *args, **kwargs):
        """Fit the value network on one batch sampled from the replay buffer.

        Extra positional and keyword arguments are accepted but ignored.
        """
        batch_size = self.value_net_cfg.batch_size
        # Transpose the sampled [observation, action, reward] records into three columns.
        observation, action, reward = zip(*self.value_net_replay_buffer.sample(batch_size))
        observation = nd.array(observation, dtype="float32")
        action = nd.array(action, dtype="int")
        reward = nd.array(reward, dtype="float32")
        fit_f(
            self.value_net, (observation, action, reward), self.value_net_loss, self.value_net_trainer,
            batch_size=batch_size,
            ctx=self.value_net_cfg.ctx
        )

# Build the agent over the environment's action space, passing seed=10.
agent = Agent(env.action_space, seed=10)
# Bare trailing expression: the notebook renders the agent's repr.
agent

In [None]:
from longling.ML.toolkit.monitor import MovingLoss
from EduSim.utils import ConsoleProgressMonitor, EMAValue

# Trackers handed to both the progress monitor and the training loop:
# one for the per-episode "Reward", one for the value-network losses.
episode_values = EMAValue(["Reward"])
loss_monitor = MovingLoss(agent.value_net_loss)
values = {
    "Episode": episode_values,
    "Net": loss_monitor.losses,
}

# Names shown by the monitor, grouped the same way as `values`.
indexes = {
    "Episode": ["Reward"],
    "Net": list(agent.value_net_loss),
}
max_episode_num = 10_000  # 100 * 100, 4.1.2

monitor = ConsoleProgressMonitor(
    indexes=indexes, values=values, total=max_episode_num, player_type="episode"
)

from longling import set_logging_info
set_logging_info()

# Options for the train/eval run; the episode budget matches the monitor's total.
train_eval_options = dict(
    max_steps=2,
    max_episode_num=max_episode_num,  # 4.1.2
    level="summary",
    board_dir="./tms_binary",
    monitor=monitor,
    values=values,
)
tms_train_eval(agent, env, **train_eval_options)