<a href="https://colab.research.google.com/github/toanpt74/Workflow/blob/main/trainer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

from job_env import job_shop_env
from agent import Agent, ActorCritic
from utils import v_wrap, plot_learning_curve
import csv


def _write_schedule_csv(path, complete_jobs, expert_complete_job, complete_job_start_time):
    """Write one CSV row per completed job to ``path``.

    The three list arguments are parallel lists of per-episode sequences.
    For every position ``[i][j]`` a row ``[job + 1, expert + 1, start_time]``
    is written.

    The file is opened with ``newline=''`` as the ``csv`` module requires, so
    the writer alone controls line endings and no blank line appears between
    rows on platforms that translate newlines.
    """
    with open(path, 'w', newline='') as f:
        writer = csv.writer(f)
        for i, jobs in enumerate(complete_jobs):
            for j, job in enumerate(jobs):
                writer.writerow(
                    [job + 1, expert_complete_job[i][j] + 1, complete_job_start_time[i][j]])


def train1(args):
    """Train an ``ActorCritic`` model on ``job_shop_env`` with an actor-critic loss.

    Reads from ``args``: ``seed``, ``lr``, ``episode``, ``num_steps``,
    ``max_episode_length``, ``gamma``, ``gae_lambda``, ``entropy_coef``,
    ``value_loss_coef`` and ``max_grad_norm``.

    Side effects: prints progress, writes ``submit_<n>.csv`` snapshots and a
    final ``submit.csv``, calls ``model.save_checkpoint()`` and plots the
    sampled rewards to ``./jss_a2c.png``. Returns ``None``.
    """
    torch.manual_seed(args.seed)

    env = job_shop_env()
    model = ActorCritic(env.state_dim, env.action_dim)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    model.train()

    state = v_wrap(env.reset())
    done = True
    action_dim = env.expert

    figure_file = './jss_a2c.png'

    # Unique-job counts that trigger CSV output.
    # NOTE(review): 8840 is presumably the total number of jobs -- confirm.
    snapshot_threshold = 8800
    all_jobs_count = 8840

    episode_length = 0
    complete_jobs = []
    expert_complete_job = []
    complete_job_start_time = []
    update_list = []
    score_history = []
    for episode in range(args.episode):
        # hx/cx are handed to the model together with every state; they are
        # zeroed whenever the previous rollout ended an episode.
        if done:
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()
        if complete_jobs:
            # Flatten every job finished so far and hand it to the env.
            update_list = [n for m in complete_jobs for n in m]
            env.update(update_list)

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for _ in range(args.num_steps + 1):
            episode_length += 1

            action, log_prob, entropy, value = model.choose_action((state, (hx, cx)), action_dim)

            # Keep only the log-probabilities of the chosen actions.
            log_prob = log_prob.gather(1, action)[0]

            state, reward, done, done_job, done_expert, job_start_time, total_time = env.step(
                action.view(-1, ).numpy())

            done = done or episode_length >= args.max_episode_length
            ## reward shaping: clip to [-1, 1]
            reward = max(min(reward, 1), -1)
            if episode_length % 20 == 0:
                print('reward: ', reward, 'total time: ', total_time)
                score_history.append(reward)

            if done:
                complete_jobs.append(done_job)
                expert_complete_job.append(done_expert)
                complete_job_start_time.append(job_start_time)
                print('Current episode:', episode)
                episode_length = 0
                state = env.reset()

            state = v_wrap(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)
            entropies.append(entropy)
            if done:
                break

        # update_list is only rebuilt at the top of an episode, so this count
        # does not yet include jobs completed during the rollout above.
        unique_done = len(set(update_list))

        if unique_done > snapshot_threshold:
            ## write results into the csv file
            _write_schedule_csv('submit_{}.csv'.format(unique_done),
                                complete_jobs, expert_complete_job, complete_job_start_time)

        if episode == args.episode - 1 or unique_done == all_jobs_count:
            model.save_checkpoint()
            ## write results into the csv file
            _write_schedule_csv('submit.csv',
                                complete_jobs, expert_complete_job, complete_job_start_time)
            break

        # Bootstrap the return from the critic when the rollout was cut
        # before the episode finished.
        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((state.unsqueeze(0), (hx, cx)))
            R = value.detach()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * values[i + 1] - values[i]
            gae = gae * args.gamma * args.gae_lambda + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * gae.detach() - args.entropy_coef * entropies[i]

        optimizer.zero_grad()

        # The combined loss is not reduced to a scalar, so an explicit
        # all-ones gradient is supplied to backward().
        (policy_loss + args.value_loss_coef * value_loss).backward(torch.ones_like(policy_loss))
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        optimizer.step()
        print('para updated')

    x = [i + 1 for i in range(len(score_history))]
    plot_learning_curve(x, score_history, figure_file)

# score_history = []
def _dump_schedule_rows(path, complete_jobs, expert_complete_job, complete_job_start_time):
    """Write one CSV row per completed job to ``path``.

    The three list arguments are parallel lists of per-batch sequences. For
    every position ``[i][j]`` a row ``[job + 1, expert + 1, start_time]`` is
    written.

    The file is opened with ``newline=''`` as the ``csv`` module requires, so
    the writer alone controls line endings and no blank line appears between
    rows on platforms that translate newlines.
    """
    with open(path, 'w', newline='') as f:
        writer = csv.writer(f)
        for i, jobs in enumerate(complete_jobs):
            for j, job in enumerate(jobs):
                writer.writerow(
                    [job + 1, expert_complete_job[i][j] + 1, complete_job_start_time[i][j]])


def train(args):
    """Train the PPO ``Agent`` on ``job_shop_env``.

    Reads from ``args``: ``seed``, ``episode``, ``num_steps`` and
    ``max_episode_length``.

    Side effects: prints progress, calls ``model.learn()`` every 25 environment
    steps, writes ``submit_<n>.csv`` snapshots and a final ``submit_ppo.csv``,
    calls ``model.save_models()`` and plots the sampled rewards to
    ``./jss_ppo.png``. Returns ``None``.
    """
    torch.manual_seed(args.seed)

    # Pin GPU 0 only when CUDA exists so the function also runs on CPU-only hosts.
    if torch.cuda.is_available():
        torch.cuda.set_device(0)

    env = job_shop_env()

    batch_size = 2
    n_epochs = 4
    alpha = 0.0003
    beta = 0.001
    model = Agent(n_actions=env.action_dim, batch_size=batch_size,
                  alpha=alpha, n_epochs=n_epochs,
                  input_dims=env.state_dim, beta=beta)

    figure_file = './jss_ppo.png'

    # Reset once before reading env.expert, as the original code did.
    state = env.reset()
    action_dim = env.expert

    # Unique-job counts that trigger CSV output.
    # NOTE(review): 8840 is presumably the total number of jobs -- confirm.
    snapshot_threshold = 8800
    all_jobs_count = 8840

    total_steps = 0  # steps across all episodes; drives the learn cadence
    t_reward = 0.0
    total_time = 0

    complete_jobs = []
    expert_complete_job = []
    complete_job_start_time = []
    update_list = []
    score_history = []
    time_history = []
    for episode in range(args.episode):
        state = env.reset()
        state = v_wrap(state)
        # Per-episode counter: without this reset every episode after the
        # first max_episode_length steps would be cut off after one step.
        episode_length = 0
        if complete_jobs:
            # Flatten every job recorded so far and hand it to the env.
            update_list = [n for m in complete_jobs for n in m]
            env.update(update_list)

        for _ in range(args.num_steps + 1):
            total_steps += 1
            episode_length += 1

            action, prob, value = model.choose_action(state, action_dim)
            # gather() needs the index on the same device as prob; following
            # prob's device works on both GPU and CPU.
            action = action.to(prob.device)
            prob = prob.gather(1, action)[0]

            # total_time is reported exactly as the env returns it (it used to
            # be added to itself, which doubled every reported value).
            state, reward, done, done_job, done_expert, job_start_time, total_time = env.step(
                action.cpu().view(-1,).numpy())

            done = done or episode_length >= args.max_episode_length
            ## reward shaping: clip to [-1, 1]
            t_reward = max(min(reward, 1), -1)
            # NOTE(review): `state` is already the post-step observation here,
            # not the one the action was chosen from -- confirm this is what
            # Agent.remember expects.
            model.remember(state, action, prob, value, t_reward, done)

            if total_steps % 25 == 0:
                model.learn()
                print('para update-', t_reward,'total time-', total_time)
                score_history.append(t_reward)
                time_history.append(total_time)

                complete_jobs.append(done_job)
                expert_complete_job.append(done_expert)
                complete_job_start_time.append(job_start_time)
                print('Complete these jobs with 100 iterations:')
                print(complete_jobs)

            if done:
                break

        # update_list is only rebuilt at the top of an episode, so this count
        # does not yet include jobs recorded during the rollout above.
        unique_done = len(set(update_list))

        if unique_done > snapshot_threshold:
            ## write results into the csv file
            _dump_schedule_rows('submit_{}.csv'.format(unique_done),
                                complete_jobs, expert_complete_job, complete_job_start_time)

        if episode == args.episode - 1 or unique_done == all_jobs_count:
            ## write results into the csv file
            _dump_schedule_rows('submit_ppo.csv',
                                complete_jobs, expert_complete_job, complete_job_start_time)
            break

        if t_reward >= 0.55 or episode >= 148:
            model.save_models()
        print('episode:', episode, 'reward:', t_reward, 'total time:', total_time)

    x = [i + 1 for i in range(len(score_history))]
    plot_learning_curve(x, score_history, figure_file)

# print(score_history)



