<a href="https://colab.research.google.com/github/toanpt74/Workflow/blob/main/train_baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch

from job_env import job_shop_env
from Baseline_Q.dqn import dqn, replay_buffer
import torch.optim as optim
from utils import v_wrap
import torch.nn as nn

def training(buffer, batch_size, model, optimizer, gamma, loss_fn):
    """Run one Q-learning optimisation step on a mini-batch from ``buffer``.

    The target for each sampled transition is
    ``reward + gamma * max_a Q(next_observation, a) * (1 - done)``, and the
    prediction is the Q-value of the action that was actually taken.

    Args:
        buffer: Object whose ``sample(batch_size)`` returns the 5-tuple
            ``(observation, action, reward, next_observation, done)``.
        batch_size: Number of transitions requested from ``buffer``.
        model: Q-network module; it is called on a batch of observations and
            its output is indexed along dimension 1 by ``action``.
        optimizer: Optimizer whose ``zero_grad()`` / ``step()`` are invoked
            around the backward pass.
        gamma: Discount factor applied to the bootstrapped next-state value.
        loss_fn: Callable ``loss_fn(prediction, target)`` returning a scalar
            tensor that supports ``backward()``.
    """
    observation, action, reward, next_observation, done = buffer.sample(batch_size)

    observation = torch.FloatTensor(observation)
    action = torch.LongTensor(action)
    reward = torch.FloatTensor(reward)
    next_observation = torch.FloatTensor(next_observation)
    done = torch.FloatTensor(done)

    # Call the module itself rather than ``forward`` so that any registered
    # forward hooks are honoured.
    q_values = model(observation)
    q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)

    # The bootstrap target is treated as a constant: compute it without
    # recording an autograd graph instead of detaching afterwards.
    with torch.no_grad():
        next_q_value = model(next_observation).max(1)[0]
        # (1 - done) zeroes the bootstrap term for terminal transitions.
        expected_q_value = reward + next_q_value * (1 - done) * gamma

    loss = loss_fn(q_value, expected_q_value)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
def training_baseline(args):
    """Train the baseline DQN agent on the job-shop environment.

    For each episode the environment is reset, the agent interacts with it
    for at most ``args.num_steps + 1`` steps (or until the environment
    reports ``done`` / the per-episode step cap is hit), every transition is
    stored in a replay buffer, and one optimisation step is run per
    environment step once the buffer holds enough samples. A summary line is
    printed whenever an episode finishes.

    Args:
        args: Namespace providing ``seed``, ``lr``, ``episode``,
            ``num_steps``, ``max_episode_length`` and ``gamma``.
    """
    torch.manual_seed(args.seed)

    env = job_shop_env()

    model = dqn(env.state_dim, env.action_dim)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    model.train()
    loss_fn = nn.MSELoss()

    capacity = 100000
    exploration = 100000
    batch_size = 16
    buffer = replay_buffer(capacity)

    # NOTE(review): epsilon is only reported in the log line below; nothing
    # visible here decays it or passes it to the policy.
    epsilon = 0.9
    # Exponential moving average of episode returns; None until the first
    # episode finishes.
    weight_reward = None
    # NOTE(review): nothing in this function appends to complete_jobs, so the
    # env.update(...) branch below never runs as written - confirm intent.
    complete_jobs = []

    for episode in range(args.episode):
        obs = env.reset()
        if complete_jobs:
            env.update([n for m in complete_jobs for n in m])

        reward_total = 0
        # Reset per episode; otherwise the step cap, once reached, would mark
        # every subsequent step of every later episode as done.
        episode_length = 0

        for _ in range(args.num_steps + 1):
            episode_length += 1

            # NOTE(review): get_action() is called without the current
            # observation or epsilon - verify against dqn.get_action's
            # signature.
            action = model.get_action()
            next_obs, reward, done, _done_job, _done_expert, _job_start_time = env.step(
                action.view(-1,).numpy()
            )
            done = done or episode_length >= args.max_episode_length
            buffer.store(obs, action, reward, next_obs, done)
            reward_total += reward
            obs = next_obs

            # ">=" rather than ">": capacity equals the warm-up threshold, so
            # a bounded buffer could never exceed it and training would never
            # start.
            if len(buffer) >= exploration:
                training(
                    buffer=buffer,
                    batch_size=batch_size,
                    model=model,
                    optimizer=optimizer,
                    gamma=args.gamma,
                    loss_fn=loss_fn,
                )

            if done:
                # Compare against None explicitly so a running average of
                # exactly 0.0 is not mistaken for "not yet initialised".
                if weight_reward is None:
                    weight_reward = reward_total
                else:
                    weight_reward = 0.99 * weight_reward + 0.01 * reward_total
                # Format the string before printing (print() returns None),
                # and report the episode index under the "episode" label.
                print('episode: {} reward: {} epsilon: {:.5f} weight_reward: {:.5f}'.format(
                    episode + 1, reward_total, epsilon, weight_reward))
                # Stop stepping a finished environment; the next episode
                # starts with a fresh reset.
                break



