In [1]:
import numpy as np
import os
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import math
import random

# --- Experiment configuration -------------------------------------------------
# Inclusive bounds passed to random.randint() in SortingEnv.reset() to pick the
# length of each episode's list.
MIN_LIST_LEN = 6
MAX_LIST_LEN = 6
# Stored as SortingEnv.max_steps. It bounds the number of environment steps and,
# in train(), also the number of tokens in the episode's token sequence.
MAX_STEPS = 120
# Reward returned by SortingEnv.step() for a swap that leaves the list sorted.
SUCCESS_REWARD = 10.0
# Exploration schedule used in train(): the per-step probability of flattening
# the policy is EPS_END + (EPS_START - EPS_END) * exp(-episode / EPS_DECAY).
EPS_START = 0.5
EPS_END = 0.05
EPS_DECAY = 1000
# Discount factor applied when train() turns per-step rewards into returns.
GAMMA = 0.98
# Number of episodes train() iterates over.
NUM_EPISODES = 100000
# train() writes a checkpoint every EPISODES_SAVE episodes (episode 0 excluded).
EPISODES_SAVE = 1000
# Folder handed to save_checkpoint() for every checkpoint file.
OUTPUT_DIR = 'datasets/rl_sort_transformer_easy/list6_transformer3_128_gamma098_step120'

# Token vocabulary, listed in id order:
#   action tokens, index tokens '0'..'7', comparison results, then the
#   'len1'..'len8' tokens that announce the list length at episode start.
vocab = {
    token: token_id
    for token_id, token in enumerate(
        ['Comparison', 'Swap']
        + [str(position) for position in range(8)]
        + ['less', 'equal', 'more']
        + [f'len{size}' for size in range(1, 9)]
    )
}
# Reverse lookup: token id -> token string.
inv_vocab = dict(zip(vocab.values(), vocab.keys()))

# Use the CUDA device when torch reports one as available, otherwise the CPU.
# The chosen device is printed once so it shows up in the run log.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define the environment
class SortingEnv:
    """Token-driven environment in which an agent sorts a hidden integer list.

    The agent never sees the list values. It can only issue two actions,
    expressed as lists of vocabulary token ids:

    * ``[Comparison, i, j]`` -- compares ``list[i]`` with ``list[j]``, remembers
      the pair ``(i, j)`` and answers with the ``less``/``equal``/``more`` token.
    * ``[Swap]`` -- swaps the pair remembered by the most recent comparison.

    Rewards:

    * comparison: -0.01 (``less``/``more``) or -0.02 (``equal``);
    * swap that leaves the list sorted: ``SUCCESS_REWARD`` and the episode ends;
    * swap that removes an inversion: +0.1; swap that creates one: -0.1;
      swap of equal values (or of an index with itself): -0.01;
    * any illegal action (unknown token, malformed comparison, out-of-range
      index, swap without a preceding comparison): -1.0 and the episode ends.

    The episode also ends once ``max_steps`` legal steps have been taken.
    """

    def __init__(self):
        self.max_steps = MAX_STEPS

    def reset(self):
        """Start a new episode with a fresh, unsorted random list.

        Returns:
            tuple: ``(token_id, values)`` where ``token_id`` is the vocabulary id
            of the ``'len<N>'`` token for the new list length and ``values`` is a
            copy of the list.

        Raises:
            ValueError: if the drawn length is below 2. Such a list is always
                sorted, so the "redraw until unsorted" loop would never finish.
        """
        self.length = random.randint(MIN_LIST_LEN, MAX_LIST_LEN)
        if self.length < 2:
            raise ValueError(
                f"list length must be at least 2 to be unsorted, got {self.length}"
            )
        self.list = [random.randint(1, 100) for _ in range(self.length)]
        # Redraw until the list actually needs sorting.
        while self.list == sorted(self.list):
            self.list = [random.randint(1, 100) for _ in range(self.length)]
        self.indices = None
        self.current_step = 0
        self.done = False
        initial_token = 'len{}'.format(self.length)
        return vocab[initial_token], self.list.copy()

    def get_list(self):
        """Return the live internal list (not a copy)."""
        return self.list

    def get_list_len(self):
        """Return the number of elements in the current list."""
        return len(self.list)

    def step(self, action_tokens):
        """Apply one action and report its outcome.

        Args:
            action_tokens: list of vocabulary ids; either
                ``[Comparison, index1, index2]`` or ``[Swap]``.

        Returns:
            tuple: ``(response_token, reward, done, values)``. ``response_token``
            is the id of ``less``/``equal``/``more`` after a legal comparison and
            ``None`` otherwise; ``values`` is a copy of the list after the action.
        """
        # An empty token list is treated like any other unknown action.
        action = action_tokens[0] if action_tokens else None
        if action == vocab['Comparison']:
            return self._compare(action_tokens)
        if action == vocab['Swap']:
            return self._swap()
        # Unknown action: terminal penalty; it still counts as a step.
        self.done = True
        return self._finish_step(None, -1.0)

    def _abort(self):
        """End the episode with the illegal-action penalty (step is not counted)."""
        self.done = True
        return None, -1.0, self.done, self.list.copy()

    def _finish_step(self, response_token, reward):
        """Count a step, enforce the step limit and build the result tuple."""
        self.current_step += 1
        if self.current_step >= self.max_steps:
            self.done = True
        return response_token, reward, self.done, self.list.copy()

    def _compare(self, action_tokens):
        """Handle ``[Comparison, index1, index2]`` and remember the compared pair."""
        if len(action_tokens) != 3:
            return self._abort()
        # Index tokens are laid out contiguously starting at the '0' token.
        index1 = action_tokens[1] - vocab['0']
        index2 = action_tokens[2] - vocab['0']
        if not (0 <= index1 < self.length and 0 <= index2 < self.length):
            return self._abort()
        self.indices = (index1, index2)
        left, right = self.list[index1], self.list[index2]
        if left < right:
            response_token, reward = vocab['less'], -0.01
        elif left == right:
            response_token, reward = vocab['equal'], -0.02
        else:
            response_token, reward = vocab['more'], -0.01
        return self._finish_step(response_token, reward)

    def _swap(self):
        """Handle ``[Swap]``: exchange the pair from the last comparison."""
        if self.indices is None:
            return self._abort()
        index1, index2 = self.indices
        # Judge the swap by the values at the lower/higher position *before*
        # swapping, regardless of the order in which the indices were given.
        low, high = min(index1, index2), max(index1, index2)
        before_low, before_high = self.list[low], self.list[high]
        self.list[index1], self.list[index2] = self.list[index2], self.list[index1]
        if self.list == sorted(self.list):
            reward = SUCCESS_REWARD
            self.done = True
        elif before_low > before_high:
            reward = 0.1   # the pair was inverted; the swap fixed it
        elif before_low < before_high:
            reward = -0.1  # the pair was in order; the swap broke it
        else:
            reward = -0.01  # equal values or same index: nothing changed
        # A new comparison is required before the next swap.
        self.indices = None
        return self._finish_step(None, reward)


Using device: cuda


In [2]:
# Positional Encoding for Transformer
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a sequence-first tensor.

    Even feature columns hold ``sin(pos * w_k)`` and odd columns hold
    ``cos(pos * w_k)`` with ``w_k = 10000 ** (-2k / d_model)``. The table is
    stored as the buffer ``pe`` with shape ``(max_len, 1, d_model)`` so that it
    broadcasts over the batch dimension.

    Args:
        d_model (int): Feature size of the inputs (odd values are supported).
        max_len (int): Longest sequence length the table covers.
        dropout (float): Dropout probability applied after adding the encoding.
    """

    def __init__(self, d_model, max_len=256, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)  # (max_len, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)  # (max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)  # Even indices
        # For odd d_model there is one fewer odd column than even column,
        # so drop the surplus frequency instead of failing on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])  # Odd indices
        self.register_buffer('pe', pe.unsqueeze(1))  # (max_len, 1, d_model)

    def forward(self, x):
        """Return ``dropout(x + pe[:seq_len])`` for ``x`` of shape (seq_len, batch, d_model).

        Raises:
            ValueError: if the sequence is longer than the precomputed table.
        """
        seq_len = x.size(0)
        if seq_len > self.pe.size(0):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.pe.size(0)}"
            )
        x = x + self.pe[:seq_len]
        return self.dropout(x)

# Transformer Model
class TransformerModel(nn.Module):
    """Token embedding -> positional encoding -> Transformer encoder -> vocab logits.

    Args:
        vocab_size (int): Number of distinct token ids (embedding rows and
            output logits).
        d_model (int): Embedding / hidden width.
        nhead (int): Attention heads per encoder layer.
        num_layers (int): Number of stacked encoder layers.
    """

    def __init__(self, vocab_size, d_model=128, nhead=8, num_layers=3):
        super().__init__()
        self.model_type = 'Transformer'
        self.d_model = d_model

        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model, nhead),
            num_layers,
        )
        self.decoder = nn.Linear(d_model, vocab_size)

        self.init_weights()

    def init_weights(self):
        """Re-initialise embedding and output weights uniformly in [-0.1, 0.1]; zero the output bias."""
        bound = 0.1
        nn.init.uniform_(self.embedding.weight, -bound, bound)
        nn.init.uniform_(self.decoder.weight, -bound, bound)
        nn.init.zeros_(self.decoder.bias)

    def forward(self, src):
        """Map token ids to per-position vocabulary logits.

        Args:
            src: integer tensor of token ids; the first dimension is the
                sequence position (the positional encoding is sliced by it).

        Returns:
            Tensor shaped like ``src`` with a trailing ``vocab_size`` dimension.
        """
        # Scale embeddings by sqrt(d_model) before adding positional encodings.
        scaled = self.embedding(src) * math.sqrt(self.d_model)
        encoded = self.transformer_encoder(self.pos_encoder(scaled))
        return self.decoder(encoded)

def decode(input_tokens, inv_vocab):
    """Render a sequence of token ids as space-separated token names.

    Args:
        input_tokens: iterable of token ids.
        inv_vocab: mapping from token id to token name.

    Returns:
        str: the token names joined by single spaces ('' for no tokens).
    """
    names = (inv_vocab[token_id] for token_id in input_tokens)
    return ' '.join(names)


def save_checkpoint(model, optimizer, episode, folder, filename):
    """
    Save the model and optimizer state to ``folder/filename``.

    The target directory is created when it does not exist yet. An empty
    ``folder`` saves into the current working directory.

    Args:
        model (nn.Module): The model to save.
        optimizer (torch.optim.Optimizer): The optimizer whose state to save.
        episode (int): The current episode number (stored in the checkpoint).
        folder (str): Directory in which to place the checkpoint.
        filename (str): File name of the checkpoint inside ``folder``.
    """
    filepath = os.path.join(folder, filename)
    # Ensure the directory exists. dirname is '' when no folder is given, and
    # os.makedirs('') raises, so only create a directory when there is one.
    directory = os.path.dirname(filepath)
    if directory:
        os.makedirs(directory, exist_ok=True)
    # Save the checkpoint
    torch.save({
        'episode': episode,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, filepath)
    print(f"Checkpoint saved at episode {episode} to {filepath}")

def load_checkpoint(filepath, model, optimizer):
    """
    Restore model and optimizer state from a checkpoint file, if it exists.

    Args:
        filepath (str): Location of the checkpoint file.
        model (nn.Module): Model that receives the stored ``model_state_dict``.
        optimizer (torch.optim.Optimizer): Optimizer that receives the stored
            ``optimizer_state_dict``.

    Returns:
        int: The episode number stored in the checkpoint, or 0 when no file
        exists at ``filepath`` (model and optimizer are left untouched).
    """
    # Guard clause: nothing to restore.
    if not os.path.isfile(filepath):
        print(f"No checkpoint found at {filepath}, starting from scratch.")
        return 0

    # Tensors are mapped onto the module-level `device`.
    # NOTE(review): torch.load unpickles the file -- only load checkpoints
    # from a trusted source.
    stored = torch.load(filepath, map_location=device)
    model.load_state_dict(stored['model_state_dict'])
    optimizer.load_state_dict(stored['optimizer_state_dict'])
    episode = stored['episode']
    print(f"Checkpoint loaded from {filepath}, resuming from episode {episode}")
    return episode

In [None]:
# Training Loop
def train(verbose: bool = False) -> None:
    """Train the transformer policy on SortingEnv with a policy-gradient loss.

    Each episode the model generates one token at a time. A small state machine
    ('expect_action' -> 'expect_index1' -> 'expect_index2' -> 'expect_action')
    restricts which tokens may be sampled, assembles complete actions and
    forwards them to the environment. After the episode the loss
    ``-sum(log_prob_t * discounted_return_t)`` is back-propagated and one Adam
    step is taken. A checkpoint is written every EPISODES_SAVE episodes.

    Args:
        verbose: when True, print the decoded token sequence, the current list
            and each environment reward while the episode runs.
    """
    vocab_size = len(vocab)
    model = TransformerModel(vocab_size).to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-4)
    # Optionally, load a checkpoint
    # load_checkpoint("path_to_checkpoint", model, optimizer)

    # Running statistics since the last checkpoint; only episodes that reach
    # the optimizer step are counted (see the bottom of the episode loop).
    episode_cnt = 0
    total_reward = 0.0
    for episode in range(NUM_EPISODES):
        t1 = time.time()
        model.train()  # Set model to training mode
        env = SortingEnv()
        initial_token_id, current_list = env.reset()
        # The token history fed to the model starts with the 'len<N>' token.
        input_tokens = [initial_token_id]
        # One entry per sampled token; the two lists must stay aligned.
        log_probs = []
        rewards = []
        state = 'expect_action'
        done = False

        # Note: the cap compares the *token* count against env.max_steps.
        while not done and len(input_tokens) < env.max_steps:
            if verbose:
                print(decode(input_tokens, inv_vocab))
                print(env.get_list())
            # Prepare input tensor
            input_seq = torch.tensor(input_tokens, dtype=torch.long, device=device).unsqueeze(1)  # (seq_len, batch_size)
            # Get model output
            with torch.autocast(device_type=device.type, dtype=torch.bfloat16):
                output = model(input_seq)  # (seq_len, batch_size, vocab_size)
                # Get logits for the last token
                logits = output[-1, 0, :]  # (vocab_size)

                # Every `break` below leaves the loop before a log-prob is
                # recorded, so log_probs and rewards remain the same length.
                # Check for NaNs in logits
                if torch.isnan(logits).any():
                    print(f"Episode {episode}, NaNs in logits before masking.")
                    break

                # Get valid tokens based on state
                def get_valid_tokens(state):
                    """Return the token ids that may be sampled in `state`."""
                    action_tokens = [vocab['Comparison'], vocab['Swap']]
                    index_tokens = [vocab[str(i)] for i in range(env.length)]
                    if state == 'expect_action':
                        return action_tokens
                    elif state == 'expect_index1':
                        # The last index is excluded so that a larger second
                        # index always exists.
                        return index_tokens[:-1]
                    elif state == 'expect_index2':
                        # input_tokens[-1] is the first index just sampled;
                        # only strictly larger indices are allowed.
                        return [x for x in index_tokens if x > input_tokens[-1]]
                    else:
                        # Handle unexpected states by defaulting to expect_action
                        return action_tokens

                valid_token_ids = get_valid_tokens(state)

                # Ensure valid_token_ids are within the vocab range
                if any(idx >= vocab_size or idx < 0 for idx in valid_token_ids):
                    print(f"Episode {episode}, invalid indices in valid_token_ids: {valid_token_ids}")
                    break

                # Mask invalid tokens
                mask_value = -1e9  # Use a large negative value instead of -inf
                mask = torch.full_like(logits, mask_value).to(device)
                mask[valid_token_ids] = 0
                masked_logits = logits + mask

                # Exploration: with probability eps_threshold (decaying with the
                # episode number) the logits are divided by 4, which flattens
                # the distribution over the valid tokens.
                eps_threshold = EPS_END + (EPS_START - EPS_END) * np.exp(-1.0 * episode / EPS_DECAY)
                if random.random() < eps_threshold:
                    masked_logits = masked_logits / 4

                # Check for NaNs in masked_logits
                if torch.isnan(masked_logits).any():
                    print(f"Episode {episode}, NaNs in masked_logits after masking.")
                    break

                # Compute probabilities
                probs = F.softmax(masked_logits, dim=0)

                # Check for NaNs in probs
                if torch.isnan(probs).any():
                    print(f"Episode {episode}, NaNs in probs after softmax.")
                    break

                try:
                    m = torch.distributions.Categorical(probs)
                    action_token = m.sample()
                    log_prob = m.log_prob(action_token)
                except ValueError as e:
                    print(f"Episode {episode}, error in sampling action: {e}")
                    break

            log_probs.append(log_prob)
            input_tokens.append(action_token.item())

            # Advance the state machine with the sampled token. Tokens that do
            # not complete an action keep the default reward of 0.0.
            action = action_token.item()
            reward = 0.0
            if state == 'expect_action':
                if action == vocab['Comparison']:
                    state = 'expect_index1'
                elif action == vocab['Swap']:
                    if env.indices is None:
                        # Swap without a preceding comparison: penalise and end
                        # the episode here; `continue` re-checks the loop
                        # condition, which is now false because done is True.
                        reward = -1.0
                        rewards.append(reward)
                        done = True
                        continue
                    action_tokens = [vocab['Swap']]
                    response_token, reward, done, current_list = env.step(action_tokens)
                    if verbose:
                        print("Reward:", reward)
                    state = 'expect_action'
                else:
                    reward = -1.0
                    done = True
            elif state == 'expect_index1':
                index1_token = action_token
                state = 'expect_index2'
            elif state == 'expect_index2':
                index2_token = action_token
                action_tokens = [vocab['Comparison'], index1_token.item(), index2_token.item()]
                response_token, reward, done, current_list = env.step(action_tokens)
                if verbose:
                    print("Reward:", reward)
                # The environment's answer (less/equal/more) becomes part of
                # the token history the model sees on the next step.
                if response_token is not None:
                    input_tokens.append(response_token)
                state = 'expect_action'
            else:
                reward = -1.0
                done = True

            rewards.append(reward)

        # Save checkpoint. The file name carries the average total reward of
        # the episodes trained since the previous checkpoint; the counters are
        # then reset. This runs before the current episode's optimizer step.
        if episode > 0 and episode % EPISODES_SAVE == 0:
            avg_reward = total_reward / (episode_cnt + 0.00001)
            episode_cnt = 0
            total_reward = 0.0
            save_checkpoint(model, optimizer, episode, OUTPUT_DIR, f"ckpt_{episode}_{avg_reward:.4f}.pth")

        assert len(log_probs) == len(rewards), "log_probs and returns have different sizes!"

        if len(log_probs) == 0:
            continue  # Skip if no actions were taken

        # Compute returns and loss within autocast
        with torch.autocast(device_type=device.type, dtype=torch.bfloat16):
            # Discounted returns, accumulated backwards:
            # R_t = r_t + GAMMA * R_{t+1}
            returns = []
            R = 0
            gamma = GAMMA
            for r in rewards[::-1]:
                R = r + gamma * R
                returns.insert(0, R)
            returns = torch.tensor(returns).to(device)

            # Check for NaNs in returns
            if torch.isnan(returns).any():
                print(f"Episode {episode}, NaNs in returns.")
                continue

            # Policy-gradient loss: -sum_t log_prob_t * R_t
            loss = 0
            for log_prob, R in zip(log_probs, returns):
                loss -= log_prob * R

            # Check for NaNs in loss
            if torch.isnan(loss):
                print(f"Episode {episode}, NaN in loss.")
                continue

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
        optimizer.step()

        # Only episodes that completed an optimizer step feed the running
        # average used for the checkpoint file name.
        episode_cnt += 1
        total_reward += sum(rewards)
        t2 = time.time()
        # `episode % 1 == 0` is always true: every episode is logged.
        if episode % 1 == 0:
            print(f"Episode {episode}, loss: {loss.item():.4f}, total reward: {sum(rewards):.4f}, {t2-t1} sec")

# Entry point: start training with per-step debug printing disabled.
if __name__ == "__main__":
    train(verbose=False)




Episode 0, loss: -6.3606, total reward: -0.9200, 0.09358549118041992 sec
Episode 1, loss: -0.3700, total reward: -1.0000, 0.004981279373168945 sec
Episode 2, loss: -3.3361, total reward: -0.9100, 0.008436203002929688 sec
Episode 3, loss: -10.5306, total reward: -0.8400, 0.020661592483520508 sec
Episode 4, loss: -30.1035, total reward: -0.9800, 0.07024240493774414 sec
Episode 5, loss: -47.4489, total reward: -1.2500, 0.0953364372253418 sec
Episode 6, loss: -8.9686, total reward: -0.9400, 0.0214388370513916 sec
Episode 7, loss: -37.4582, total reward: -1.2200, 0.07979321479797363 sec
Episode 8, loss: -12.2345, total reward: -0.3900, 0.10324811935424805 sec
Episode 9, loss: 0.0541, total reward: -0.0900, 0.10496091842651367 sec
Episode 10, loss: -9.2524, total reward: -0.3900, 0.10399293899536133 sec
Episode 11, loss: 0.2073, total reward: -0.0900, 0.10507798194885254 sec
Episode 12, loss: -14.3039, total reward: -0.5800, 0.1052086353302002 sec
Episode 13, loss: -7.2800, total reward: -0.

Episode 111, loss: 2.8904, total reward: 0.2300, 0.10985112190246582 sec
Episode 112, loss: -1.9336, total reward: -1.0000, 0.008538007736206055 sec
Episode 113, loss: -6.0208, total reward: -0.1800, 0.09936189651489258 sec
Episode 114, loss: 114.4847, total reward: 10.0400, 0.027384042739868164 sec
Episode 115, loss: -5.1915, total reward: -0.9200, 0.012547731399536133 sec
Episode 116, loss: -2.1030, total reward: 0.0300, 0.0980064868927002 sec
Episode 117, loss: -6.3912, total reward: -0.2400, 0.10170364379882812 sec
Episode 118, loss: -30.8365, total reward: -0.8000, 0.0777590274810791 sec
Episode 119, loss: -2.1235, total reward: -1.0000, 0.00784444808959961 sec
Episode 120, loss: -3.6183, total reward: -0.1900, 0.0955955982208252 sec
Episode 121, loss: -0.9042, total reward: 0.2200, 0.1038045883178711 sec
Episode 122, loss: 102.5285, total reward: 10.0300, 0.0315401554107666 sec
Episode 123, loss: 2.7702, total reward: 0.1200, 0.09867453575134277 sec
Episode 124, loss: 289.6273, t

Episode 222, loss: 0.1402, total reward: 0.0100, 0.0991981029510498 sec
Episode 223, loss: -1.9693, total reward: -0.0900, 0.10426020622253418 sec
Episode 224, loss: -1.2167, total reward: 0.1200, 0.1056373119354248 sec
Episode 225, loss: 3.3047, total reward: 0.2200, 0.10426044464111328 sec
Episode 226, loss: 6.8127, total reward: 0.3200, 0.1103215217590332 sec
Episode 227, loss: -10.2798, total reward: -0.6600, 0.03155875205993652 sec
Episode 228, loss: -17.1201, total reward: -0.7000, 0.0405881404876709 sec
Episode 229, loss: -2.8495, total reward: 0.1100, 0.10246753692626953 sec
Episode 230, loss: -3.9173, total reward: 0.0100, 0.10557794570922852 sec
Episode 231, loss: -1.6699, total reward: 0.0100, 0.10772919654846191 sec
Episode 232, loss: -28.0476, total reward: -0.8600, 0.10210657119750977 sec
Episode 233, loss: -2.0835, total reward: 0.0100, 0.10726141929626465 sec
Episode 234, loss: 0.0429, total reward: 0.1200, 0.10942912101745605 sec
Episode 235, loss: 8.1881, total reward

Episode 333, loss: -3.3140, total reward: -0.0500, 0.10777044296264648 sec
Episode 334, loss: 5.1703, total reward: 0.3200, 0.11042022705078125 sec
Episode 335, loss: 2.0383, total reward: 0.2200, 0.10749244689941406 sec
Episode 336, loss: 274.0443, total reward: 10.2500, 0.09708118438720703 sec
Episode 337, loss: -2.5608, total reward: 0.1900, 0.1080939769744873 sec
Episode 338, loss: -3.0078, total reward: -0.1500, 0.10808801651000977 sec
Episode 339, loss: 4.3027, total reward: 0.3200, 0.10934591293334961 sec
Episode 340, loss: 2.3482, total reward: 0.2200, 0.10942435264587402 sec
Episode 341, loss: 3.0560, total reward: 0.1200, 0.10946202278137207 sec
Episode 342, loss: 323.6197, total reward: 10.2400, 0.09818053245544434 sec
Episode 343, loss: -1.3477, total reward: 0.1200, 0.10454845428466797 sec
Episode 344, loss: 0.9591, total reward: 0.2200, 0.10493898391723633 sec
Episode 345, loss: -0.9734, total reward: 0.1100, 0.10500693321228027 sec
Episode 346, loss: 245.2849, total rewa

Episode 444, loss: 289.6429, total reward: 10.4600, 0.09267640113830566 sec
Episode 445, loss: 9.8128, total reward: 0.7300, 0.10517096519470215 sec
Episode 446, loss: -8.3573, total reward: -0.9300, 0.019533634185791016 sec
Episode 447, loss: 206.2925, total reward: 10.0800, 0.04352092742919922 sec
Episode 448, loss: -14.2189, total reward: -0.9000, 0.03864121437072754 sec
Episode 449, loss: 5.2139, total reward: 0.4300, 0.10043740272521973 sec
Episode 450, loss: 0.0143, total reward: 0.1800, 0.10495758056640625 sec
Episode 451, loss: 239.0198, total reward: 10.1500, 0.059479713439941406 sec
Episode 452, loss: -1.3462, total reward: 0.0100, 0.0997781753540039 sec
Episode 453, loss: -0.1692, total reward: 0.1900, 0.10534024238586426 sec
Episode 454, loss: 4.4467, total reward: 0.1900, 0.10422348976135254 sec
Episode 455, loss: 1.7856, total reward: 0.1100, 0.10454273223876953 sec
Episode 456, loss: 0.3896, total reward: 0.2200, 0.10446453094482422 sec
Episode 457, loss: 2.5987, total r

Episode 555, loss: 267.5339, total reward: 10.2900, 0.08388161659240723 sec
Episode 556, loss: 298.9124, total reward: 10.3200, 0.10611414909362793 sec
Episode 557, loss: -2.7607, total reward: -1.0000, 0.008674383163452148 sec
Episode 558, loss: -3.4673, total reward: -0.0900, 0.09900331497192383 sec
Episode 559, loss: 35.7736, total reward: 10.0800, 0.014661073684692383 sec
Episode 560, loss: 0.7024, total reward: 0.2200, 0.10371184349060059 sec
Episode 561, loss: 207.3350, total reward: 10.1700, 0.08896803855895996 sec
Episode 562, loss: 267.4049, total reward: 10.6800, 0.09260344505310059 sec
Episode 563, loss: 270.0898, total reward: 10.2800, 0.08347749710083008 sec
Episode 564, loss: -4.2045, total reward: -0.1900, 0.10263466835021973 sec
Episode 565, loss: 262.8794, total reward: 10.0500, 0.0910654067993164 sec
Episode 566, loss: 245.2501, total reward: 10.2700, 0.08010482788085938 sec
Episode 567, loss: 96.0365, total reward: 10.0500, 0.024423599243164062 sec
Episode 568, loss:

Episode 665, loss: 6.5375, total reward: 0.5200, 0.10580301284790039 sec
Episode 666, loss: 1.4165, total reward: 0.1100, 0.10582923889160156 sec
Episode 667, loss: 1.2091, total reward: 0.0700, 0.10514354705810547 sec
Episode 668, loss: 276.2431, total reward: 10.2300, 0.06735587120056152 sec
Episode 669, loss: 7.8003, total reward: 0.5300, 0.10303807258605957 sec
Episode 670, loss: -0.0886, total reward: 0.2200, 0.10448718070983887 sec
Episode 671, loss: 4.7771, total reward: 0.4200, 0.10875558853149414 sec
Episode 672, loss: -0.0161, total reward: 0.2000, 0.10478448867797852 sec
Episode 673, loss: 5.4142, total reward: 0.4300, 0.10626506805419922 sec
Episode 674, loss: 108.0175, total reward: 10.2300, 0.03449583053588867 sec
Episode 675, loss: 273.7797, total reward: 10.4300, 0.09372925758361816 sec
Episode 676, loss: 11.7590, total reward: 9.9900, 0.011299610137939453 sec
Episode 677, loss: 4.7077, total reward: 0.5200, 0.09841609001159668 sec
Episode 678, loss: 113.0600, total rew

Episode 776, loss: 244.9840, total reward: 10.3300, 0.09866023063659668 sec
Episode 777, loss: 107.6514, total reward: 10.2300, 0.03245353698730469 sec
Episode 778, loss: 276.2293, total reward: 10.6600, 0.09178924560546875 sec
Episode 779, loss: -23.2982, total reward: -0.6200, 0.08450698852539062 sec
Episode 780, loss: 4.1818, total reward: 0.2200, 0.10560154914855957 sec
Episode 781, loss: 2.5145, total reward: 0.2200, 0.11565518379211426 sec
Episode 782, loss: 9.8032, total reward: 0.7300, 0.11258554458618164 sec
Episode 783, loss: 3.0775, total reward: 0.3900, 0.10826659202575684 sec
Episode 784, loss: 196.1696, total reward: 10.1800, 0.05345797538757324 sec
Episode 785, loss: -2.3578, total reward: -0.1900, 0.09989023208618164 sec
Episode 786, loss: 274.1151, total reward: 10.6000, 0.08055281639099121 sec
Episode 787, loss: 275.3550, total reward: 10.4400, 0.09678196907043457 sec
Episode 788, loss: 186.0164, total reward: 10.0700, 0.051082611083984375 sec
Episode 789, loss: 290.1

Episode 886, loss: 5.5462, total reward: 0.5200, 0.10518383979797363 sec
Episode 887, loss: -3.7709, total reward: -0.1900, 0.10747981071472168 sec
Episode 888, loss: 9.7698, total reward: 0.5200, 0.11177301406860352 sec
Episode 889, loss: 226.2519, total reward: 10.3100, 0.06275510787963867 sec
Episode 890, loss: 10.2284, total reward: 0.6300, 0.10303878784179688 sec
Episode 891, loss: 5.4080, total reward: 0.4200, 0.10473132133483887 sec
Episode 892, loss: 0.7172, total reward: 0.3200, 0.10468411445617676 sec
Episode 893, loss: 2.9679, total reward: 0.3200, 0.10415816307067871 sec
Episode 894, loss: 263.2272, total reward: 10.2900, 0.08102965354919434 sec
Episode 895, loss: 7.9761, total reward: 0.5200, 0.103851318359375 sec
Episode 896, loss: 214.3889, total reward: 10.0600, 0.05483579635620117 sec
Episode 897, loss: 236.9657, total reward: 10.0600, 0.05166482925415039 sec
Episode 898, loss: 271.9011, total reward: 10.0300, 0.06081581115722656 sec
Episode 899, loss: 15.3182, total r

Episode 996, loss: 295.3682, total reward: 10.1300, 0.09733796119689941 sec
Episode 997, loss: 2.8330, total reward: 0.1100, 0.1043393611907959 sec
Episode 998, loss: 3.4869, total reward: 0.3200, 0.10442781448364258 sec
Episode 999, loss: 237.2422, total reward: 10.4400, 0.06767773628234863 sec
Checkpoint saved at episode 1000 to datasets/rl_sort_transformer_easy/list6_transformer3_128_gamma098_step120/ckpt_1000_2.9626.pth
Episode 1000, loss: 183.2238, total reward: 10.0600, 0.07148909568786621 sec
Episode 1001, loss: -0.8712, total reward: 0.1100, 0.1026153564453125 sec
Episode 1002, loss: 202.7870, total reward: 10.0700, 0.0785977840423584 sec
Episode 1003, loss: -7.6418, total reward: -0.9200, 0.014617204666137695 sec
Episode 1004, loss: 3.7483, total reward: 0.3200, 0.09922099113464355 sec
Episode 1005, loss: 239.8435, total reward: 10.1900, 0.08419513702392578 sec
Episode 1006, loss: 2.2831, total reward: 0.2200, 0.10448813438415527 sec
Episode 1007, loss: 283.1508, total reward:

Episode 1103, loss: 244.3289, total reward: 10.4900, 0.08324599266052246 sec
Episode 1104, loss: 5.2875, total reward: 0.3200, 0.11381149291992188 sec
Episode 1105, loss: 186.8265, total reward: 10.2100, 0.04044151306152344 sec
Episode 1106, loss: 288.0971, total reward: 10.3600, 0.08909773826599121 sec
Episode 1107, loss: 15.9316, total reward: 0.5300, 0.10968732833862305 sec
Episode 1108, loss: 9.4333, total reward: 0.6300, 0.10549545288085938 sec
Episode 1109, loss: 256.5051, total reward: 10.4300, 0.06932687759399414 sec
Episode 1110, loss: 249.0145, total reward: 10.3300, 0.06606030464172363 sec
Episode 1111, loss: 4.3311, total reward: 0.4200, 0.10164451599121094 sec
Episode 1112, loss: 198.3367, total reward: 10.1800, 0.04954099655151367 sec
Episode 1113, loss: 205.1470, total reward: 10.0700, 0.048441410064697266 sec
Episode 1114, loss: 10.4669, total reward: 0.7300, 0.10136270523071289 sec
Episode 1115, loss: 10.1797, total reward: 0.6300, 0.10489106178283691 sec
Episode 1116,

Episode 1212, loss: 5.8505, total reward: 0.6300, 0.10468840599060059 sec
Episode 1213, loss: 162.6654, total reward: 10.2000, 0.04057955741882324 sec
Episode 1214, loss: 275.8019, total reward: 9.9300, 0.09192013740539551 sec
Episode 1215, loss: 3.8554, total reward: 0.4200, 0.1033170223236084 sec
Episode 1216, loss: 184.3380, total reward: 10.1600, 0.05534100532531738 sec
Episode 1217, loss: 135.2648, total reward: 10.0400, 0.026253461837768555 sec
Episode 1218, loss: 1.4692, total reward: 0.2200, 0.09913969039916992 sec
Episode 1219, loss: 280.9834, total reward: 10.2800, 0.08340334892272949 sec
Episode 1220, loss: 4.8341, total reward: 0.3200, 0.10242533683776855 sec
Episode 1221, loss: 3.2694, total reward: 0.3200, 0.1045994758605957 sec
Episode 1222, loss: 12.6846, total reward: 0.5900, 0.10544323921203613 sec
Episode 1223, loss: 125.4113, total reward: 10.2300, 0.03400230407714844 sec
Episode 1224, loss: 300.4237, total reward: 9.9200, 0.09455347061157227 sec
Episode 1225, loss:

Episode 1320, loss: 262.3848, total reward: 10.2400, 0.08577966690063477 sec
Episode 1321, loss: 211.7526, total reward: 10.0700, 0.050383567810058594 sec
Episode 1322, loss: 134.0734, total reward: 10.1200, 0.033060312271118164 sec
Episode 1323, loss: -9.4998, total reward: -0.7400, 0.020089149475097656 sec
Episode 1324, loss: 209.7041, total reward: 9.9800, 0.04374837875366211 sec
Episode 1325, loss: 13.0075, total reward: 0.9300, 0.1037909984588623 sec
Episode 1326, loss: 266.3623, total reward: 10.3500, 0.06125140190124512 sec
Episode 1327, loss: 268.3427, total reward: 10.4000, 0.07782864570617676 sec
Episode 1328, loss: 237.6614, total reward: 10.0900, 0.06723976135253906 sec
Episode 1329, loss: 273.5825, total reward: 9.8700, 0.08227181434631348 sec
Episode 1330, loss: 186.5364, total reward: 10.3800, 0.050591230392456055 sec
Episode 1331, loss: -2.7440, total reward: 0.2200, 0.1053323745727539 sec
Episode 1332, loss: 252.6108, total reward: 10.7700, 0.09549260139465332 sec
Epis

Episode 1429, loss: 6.4340, total reward: 0.3200, 0.10139608383178711 sec
Episode 1430, loss: 6.1315, total reward: 0.5200, 0.10532522201538086 sec
Episode 1431, loss: 3.6024, total reward: 0.3200, 0.10448694229125977 sec
Episode 1432, loss: 196.2226, total reward: 10.1600, 0.05596208572387695 sec
Episode 1433, loss: 219.8933, total reward: 10.2000, 0.06957459449768066 sec
Episode 1434, loss: 189.8944, total reward: 10.4000, 0.07766270637512207 sec
Episode 1435, loss: 247.6347, total reward: 10.7800, 0.08699727058410645 sec
Episode 1436, loss: 283.5415, total reward: 10.1500, 0.09084868431091309 sec
Episode 1437, loss: 288.7383, total reward: 10.3700, 0.08710813522338867 sec
Episode 1438, loss: -2.8745, total reward: 0.0100, 0.10214424133300781 sec
Episode 1439, loss: 18.2363, total reward: 9.9900, 0.012037277221679688 sec
Episode 1440, loss: 297.9097, total reward: 10.1300, 0.09221172332763672 sec
Episode 1441, loss: 237.2328, total reward: 10.2300, 0.06822538375854492 sec
Episode 144

Episode 1537, loss: 244.3103, total reward: 10.7400, 0.10395956039428711 sec
Episode 1538, loss: 115.1498, total reward: 10.4100, 0.04228496551513672 sec
Episode 1539, loss: 238.8505, total reward: 10.4800, 0.0839087963104248 sec
Episode 1540, loss: 275.6027, total reward: 10.2800, 0.08233880996704102 sec
Episode 1541, loss: 143.0930, total reward: 10.4800, 0.05095648765563965 sec
Episode 1542, loss: 8.0403, total reward: 0.8300, 0.10147738456726074 sec
Episode 1543, loss: -1.5908, total reward: 0.1100, 0.10384488105773926 sec
Episode 1544, loss: 228.0151, total reward: 10.3600, 0.09102702140808105 sec
Episode 1545, loss: 220.8137, total reward: 10.4800, 0.08963227272033691 sec
Episode 1546, loss: -2.8989, total reward: 0.0100, 0.10600996017456055 sec
Episode 1547, loss: 130.8002, total reward: 10.0000, 0.04179239273071289 sec
Episode 1548, loss: 6.2160, total reward: 0.6300, 0.1081228256225586 sec
Episode 1549, loss: 9.8438, total reward: 0.6300, 0.10872745513916016 sec
Episode 1550, 

Episode 1645, loss: 261.3153, total reward: 10.3300, 0.0984196662902832 sec
Episode 1646, loss: 260.6298, total reward: 10.3500, 0.09371304512023926 sec
Episode 1647, loss: 223.4357, total reward: 10.1000, 0.0778498649597168 sec
Episode 1648, loss: -1.1862, total reward: 0.0100, 0.10447287559509277 sec
Episode 1649, loss: 231.9679, total reward: 10.1300, 0.08068418502807617 sec
Episode 1650, loss: 238.9209, total reward: 10.3700, 0.08842253684997559 sec
Episode 1651, loss: 7.7990, total reward: 0.6300, 0.10791015625 sec
Episode 1652, loss: 226.8503, total reward: 10.4000, 0.08078837394714355 sec
Episode 1653, loss: 177.5055, total reward: 10.2300, 0.06846117973327637 sec
Episode 1654, loss: 0.9968, total reward: 0.1200, 0.10501527786254883 sec
Episode 1655, loss: 259.2998, total reward: 10.5200, 0.0744013786315918 sec
Episode 1656, loss: 10.0896, total reward: 0.8300, 0.10943937301635742 sec
Episode 1657, loss: -0.9464, total reward: 0.1100, 0.10506749153137207 sec
Episode 1658, loss: 

Episode 1754, loss: 232.4231, total reward: 10.8400, 0.10451865196228027 sec
Episode 1755, loss: 175.4187, total reward: 10.1600, 0.06321287155151367 sec
Episode 1756, loss: 278.5382, total reward: 10.4300, 0.10436058044433594 sec
Episode 1757, loss: 230.4176, total reward: 10.6100, 0.0750875473022461 sec
Episode 1758, loss: 180.1840, total reward: 10.6500, 0.06267189979553223 sec
Episode 1759, loss: 196.5158, total reward: 10.2800, 0.048024654388427734 sec
Episode 1760, loss: 213.7385, total reward: 10.6700, 0.08745551109313965 sec
Episode 1761, loss: 172.8917, total reward: 10.0800, 0.046747446060180664 sec
Episode 1762, loss: 226.8204, total reward: 10.4000, 0.07654190063476562 sec
Episode 1763, loss: 86.3956, total reward: 10.1400, 0.02786707878112793 sec
Episode 1764, loss: 194.2098, total reward: 10.2300, 0.06148576736450195 sec
Episode 1765, loss: 232.1831, total reward: 10.2500, 0.057239532470703125 sec
Episode 1766, loss: 245.6886, total reward: 10.6000, 0.07681560516357422 se

Episode 1862, loss: 5.7146, total reward: 0.5200, 0.10405230522155762 sec
Episode 1863, loss: 12.5394, total reward: 0.7300, 0.10959792137145996 sec
Episode 1864, loss: 3.4381, total reward: 0.4200, 0.10711097717285156 sec
Episode 1865, loss: 105.4269, total reward: 10.3500, 0.030783891677856445 sec
Episode 1866, loss: 0.7646, total reward: 0.1100, 0.1005864143371582 sec
Episode 1867, loss: 11.5596, total reward: 0.7300, 0.10528063774108887 sec
Episode 1868, loss: 223.6147, total reward: 10.0300, 0.06379342079162598 sec
Episode 1869, loss: 108.8861, total reward: 10.0300, 0.029592514038085938 sec
Episode 1870, loss: 221.2978, total reward: 10.3900, 0.07669258117675781 sec
Episode 1871, loss: 195.8822, total reward: 10.2100, 0.07090020179748535 sec
Episode 1872, loss: 306.9703, total reward: 10.6300, 0.10197687149047852 sec
Episode 1873, loss: 230.2083, total reward: 10.1400, 0.06131124496459961 sec
Episode 1874, loss: 2.5991, total reward: 0.4200, 0.10098671913146973 sec
Episode 1875, 

Episode 1971, loss: 4.4759, total reward: 0.6300, 0.10298323631286621 sec
Episode 1972, loss: 9.4697, total reward: 0.6300, 0.10981440544128418 sec
Episode 1973, loss: 6.9079, total reward: 0.5200, 0.10880780220031738 sec
Episode 1974, loss: 6.5771, total reward: 0.5200, 0.10889840126037598 sec
Episode 1975, loss: 183.2369, total reward: 10.3800, 0.05228471755981445 sec
Episode 1976, loss: 98.1290, total reward: 10.3300, 0.03294563293457031 sec
Episode 1977, loss: 3.0979, total reward: 0.3200, 0.10163497924804688 sec
Episode 1978, loss: 9.3366, total reward: 0.7300, 0.10708355903625488 sec
Episode 1979, loss: 4.9125, total reward: 0.2200, 0.10732030868530273 sec
Episode 1980, loss: 184.2297, total reward: 10.0600, 0.06263351440429688 sec
Episode 1981, loss: 238.1104, total reward: 10.4400, 0.09803056716918945 sec
Episode 1982, loss: 90.4532, total reward: 10.0400, 0.027290821075439453 sec
Episode 1983, loss: 243.4995, total reward: 10.4200, 0.06912469863891602 sec
Episode 1984, loss: -

Episode 2078, loss: 5.5660, total reward: 0.4200, 0.1044015884399414 sec
Episode 2079, loss: 3.1122, total reward: 0.2200, 0.1040809154510498 sec
Episode 2080, loss: 167.0978, total reward: 10.1900, 0.04608869552612305 sec
Episode 2081, loss: 185.0874, total reward: 10.2100, 0.0696723461151123 sec
Episode 2082, loss: 226.6784, total reward: 10.6900, 0.0827779769897461 sec
Episode 2083, loss: 239.0253, total reward: 10.2400, 0.09479904174804688 sec
Episode 2084, loss: 12.4086, total reward: 0.9300, 0.10643577575683594 sec
Episode 2085, loss: 211.7805, total reward: 10.3500, 0.0631248950958252 sec
Episode 2086, loss: 11.3753, total reward: 1.0400, 0.1034994125366211 sec
Episode 2087, loss: 175.6170, total reward: 10.0600, 0.054791927337646484 sec
Episode 2088, loss: 4.6053, total reward: 0.3200, 0.10068798065185547 sec
Episode 2089, loss: 248.5956, total reward: 10.5600, 0.09323835372924805 sec
Episode 2090, loss: 138.2581, total reward: 10.3100, 0.04041719436645508 sec
Episode 2091, los

Episode 2186, loss: 224.4594, total reward: 10.3700, 0.08782291412353516 sec
Episode 2187, loss: -3.9568, total reward: -1.0000, 0.008559703826904297 sec
Episode 2188, loss: 229.5261, total reward: 10.4800, 0.0787806510925293 sec
Episode 2189, loss: 262.3729, total reward: 10.3300, 0.09920954704284668 sec
Episode 2190, loss: 224.2157, total reward: 10.6700, 0.09085226058959961 sec
Episode 2191, loss: 215.2408, total reward: 10.4400, 0.09738016128540039 sec
Episode 2192, loss: 85.8935, total reward: 10.1300, 0.02920675277709961 sec
Episode 2193, loss: 255.2655, total reward: 10.4800, 0.08054280281066895 sec
Episode 2194, loss: 254.5618, total reward: 10.3800, 0.08298921585083008 sec
Episode 2195, loss: 143.2203, total reward: 10.0200, 0.04979705810546875 sec
Episode 2196, loss: 11.5001, total reward: 0.8300, 0.1009218692779541 sec
Episode 2197, loss: 2.9362, total reward: 0.2200, 0.10398221015930176 sec
Episode 2198, loss: 242.1826, total reward: 10.3900, 0.08241415023803711 sec
Episode

Episode 2296, loss: 9.8117, total reward: 0.4200, 0.10147905349731445 sec
Episode 2297, loss: 8.7886, total reward: 0.7300, 0.10777568817138672 sec
Episode 2298, loss: 204.7625, total reward: 10.5500, 0.10003876686096191 sec
Episode 2299, loss: 228.4029, total reward: 10.3300, 0.10417485237121582 sec
Episode 2300, loss: 138.1941, total reward: 10.4700, 0.0564422607421875 sec
Episode 2301, loss: 197.0887, total reward: 10.3800, 0.08423614501953125 sec
Episode 2302, loss: 4.4016, total reward: 0.6000, 0.10658407211303711 sec
Episode 2303, loss: 183.1486, total reward: 10.6100, 0.07738375663757324 sec
Episode 2304, loss: 0.9129, total reward: 0.1100, 0.10291647911071777 sec
Episode 2305, loss: 111.7636, total reward: 10.0800, 0.03911280632019043 sec
Episode 2306, loss: 211.1957, total reward: 9.9800, 0.07531189918518066 sec
Episode 2307, loss: 227.0200, total reward: 10.6600, 0.09228110313415527 sec
Episode 2308, loss: 256.9274, total reward: 10.7800, 0.08857440948486328 sec
Episode 2309,

Episode 2404, loss: 6.9854, total reward: 0.5200, 0.10462570190429688 sec
Episode 2405, loss: 194.4331, total reward: 10.2300, 0.06728339195251465 sec
Episode 2406, loss: 3.6247, total reward: 0.6300, 0.10353660583496094 sec
Episode 2407, loss: 133.4809, total reward: 10.1900, 0.04641127586364746 sec
Episode 2408, loss: 185.7176, total reward: 10.3600, 0.05457925796508789 sec
Episode 2409, loss: 193.3446, total reward: 10.1400, 0.05926656723022461 sec
Episode 2410, loss: 159.6549, total reward: 9.9600, 0.0503849983215332 sec
Episode 2411, loss: 4.7516, total reward: 0.5200, 0.10171198844909668 sec
Episode 2412, loss: 200.6710, total reward: 10.3000, 0.07735586166381836 sec
Episode 2413, loss: 184.5523, total reward: 10.3700, 0.05249476432800293 sec
Episode 2414, loss: 192.8006, total reward: 10.5800, 0.08342480659484863 sec
Episode 2415, loss: -3.9117, total reward: -0.0900, 0.1020350456237793 sec
Episode 2416, loss: 263.8426, total reward: 10.3100, 0.0740361213684082 sec
Episode 2417,

Episode 2513, loss: 232.0824, total reward: 10.4500, 0.09926843643188477 sec
Episode 2514, loss: 185.5372, total reward: 10.1800, 0.049648284912109375 sec
Episode 2515, loss: 197.6792, total reward: 10.4000, 0.07717657089233398 sec
Episode 2516, loss: 217.1958, total reward: 9.9000, 0.07207107543945312 sec
Episode 2517, loss: -1.1548, total reward: 0.1100, 0.11108803749084473 sec
Episode 2518, loss: -0.2058, total reward: 0.1100, 0.10803747177124023 sec
Episode 2519, loss: 156.0915, total reward: 10.2500, 0.060132503509521484 sec
Episode 2520, loss: 0.9674, total reward: 0.2200, 0.10087990760803223 sec
Episode 2521, loss: 1.4612, total reward: 0.1100, 0.10390353202819824 sec
Episode 2522, loss: 2.0754, total reward: 0.3200, 0.1047213077545166 sec
Episode 2523, loss: 3.1712, total reward: 0.1100, 0.10566091537475586 sec
Episode 2524, loss: 264.4638, total reward: 10.6300, 0.10515403747558594 sec
Episode 2525, loss: 251.8540, total reward: 10.3700, 0.08835911750793457 sec
Episode 2526, l

Episode 2622, loss: 241.4633, total reward: 10.1800, 0.07953429222106934 sec
Episode 2623, loss: 183.8013, total reward: 10.4100, 0.07363748550415039 sec
Episode 2624, loss: 7.1086, total reward: 0.8300, 0.10288429260253906 sec
Episode 2625, loss: 246.4629, total reward: 10.2400, 0.0966482162475586 sec
Episode 2626, loss: 154.7469, total reward: 9.9600, 0.05294919013977051 sec
Episode 2627, loss: 211.8220, total reward: 10.0100, 0.06232476234436035 sec
Episode 2628, loss: 202.2910, total reward: 10.3400, 0.09497380256652832 sec
Episode 2629, loss: 237.8332, total reward: 10.4700, 0.08911728858947754 sec
Episode 2630, loss: 251.3912, total reward: 10.1500, 0.09151387214660645 sec
Episode 2631, loss: 180.3651, total reward: 10.2300, 0.06571006774902344 sec
Episode 2632, loss: 199.8808, total reward: 10.2700, 0.08466839790344238 sec
Episode 2633, loss: -3.0148, total reward: -0.0200, 0.10223197937011719 sec
Episode 2634, loss: 2.2219, total reward: 0.2200, 0.10904884338378906 sec
Episode 

Episode 2731, loss: 4.8387, total reward: 0.4200, 0.10483837127685547 sec
Episode 2732, loss: 2.3743, total reward: 0.2200, 0.10782361030578613 sec
Episode 2733, loss: 170.8519, total reward: 10.3400, 0.06613349914550781 sec
Episode 2734, loss: 5.3806, total reward: 0.3200, 0.10436558723449707 sec
Episode 2735, loss: 173.2424, total reward: 10.6100, 0.07850766181945801 sec
Episode 2736, loss: 155.3457, total reward: 10.2500, 0.06876325607299805 sec
Episode 2737, loss: 105.9849, total reward: 10.4000, 0.04355597496032715 sec
Episode 2738, loss: 213.8627, total reward: 10.3400, 0.09592390060424805 sec
Episode 2739, loss: 229.0013, total reward: 10.5100, 0.07483220100402832 sec
Episode 2740, loss: 2.9660, total reward: 0.4200, 0.10350203514099121 sec
Episode 2741, loss: -3.9951, total reward: -1.0000, 0.009029626846313477 sec
Episode 2742, loss: 199.5470, total reward: 10.4300, 0.06276655197143555 sec
Episode 2743, loss: 1.6045, total reward: 0.2200, 0.10064053535461426 sec
Episode 2744, 

Episode 2839, loss: 3.0122, total reward: 0.4200, 0.11063194274902344 sec
Episode 2840, loss: 105.6794, total reward: 10.2100, 0.04095578193664551 sec
Episode 2841, loss: 3.1991, total reward: 0.2200, 0.09952545166015625 sec
Episode 2842, loss: 197.8234, total reward: 10.3700, 0.0877077579498291 sec
Episode 2843, loss: 210.3647, total reward: 10.5200, 0.0723104476928711 sec
Episode 2844, loss: 206.6783, total reward: 10.2800, 0.08134865760803223 sec
Episode 2845, loss: 5.9145, total reward: 0.3200, 0.10330009460449219 sec
Episode 2846, loss: 185.7578, total reward: 10.2500, 0.05951738357543945 sec
Episode 2847, loss: 199.8244, total reward: 10.6100, 0.07408523559570312 sec
Episode 2848, loss: 5.2496, total reward: 0.5200, 0.10333418846130371 sec
Episode 2849, loss: 10.4921, total reward: 0.7300, 0.10600876808166504 sec
Episode 2850, loss: 130.5022, total reward: 10.0900, 0.045496225357055664 sec
Episode 2851, loss: 6.0550, total reward: 0.5200, 0.10142970085144043 sec
Episode 2852, los

Episode 2947, loss: 1.3787, total reward: 0.4200, 0.1017920970916748 sec
Episode 2948, loss: 169.8135, total reward: 10.2800, 0.049890995025634766 sec
Episode 2949, loss: 199.5264, total reward: 10.2400, 0.09260702133178711 sec
Episode 2950, loss: -0.6684, total reward: 0.1100, 0.1036989688873291 sec
Episode 2951, loss: -16.5181, total reward: -0.6000, 0.07924461364746094 sec
Episode 2952, loss: 227.1981, total reward: 10.2400, 0.0892181396484375 sec
Episode 2953, loss: 6.1093, total reward: 0.4300, 0.10444474220275879 sec
Episode 2954, loss: 184.8198, total reward: 10.3300, 0.06768107414245605 sec
Episode 2955, loss: 226.8829, total reward: 10.2700, 0.08691787719726562 sec
Episode 2956, loss: 153.8558, total reward: 10.3500, 0.060453176498413086 sec
Episode 2957, loss: 252.5345, total reward: 10.1500, 0.0891885757446289 sec
Episode 2958, loss: 197.4700, total reward: 10.1800, 0.08224892616271973 sec
Episode 2959, loss: 167.3709, total reward: 10.4900, 0.0482025146484375 sec
Episode 29

Episode 3054, loss: 211.0962, total reward: 10.5700, 0.09419059753417969 sec
Episode 3055, loss: 3.7557, total reward: 0.2200, 0.10697627067565918 sec
Episode 3056, loss: 189.7061, total reward: 10.2600, 0.09288930892944336 sec
Episode 3057, loss: 162.1729, total reward: 10.4300, 0.0692451000213623 sec
Episode 3058, loss: 148.9729, total reward: 10.2300, 0.06718206405639648 sec
Episode 3059, loss: 222.3666, total reward: 10.0900, 0.09280562400817871 sec
Episode 3060, loss: 180.5234, total reward: 10.1100, 0.07251715660095215 sec
Episode 3061, loss: -1.7010, total reward: 0.0100, 0.10929560661315918 sec
Episode 3062, loss: 189.5269, total reward: 10.2500, 0.08909940719604492 sec
Episode 3063, loss: 4.2644, total reward: 0.3200, 0.10629391670227051 sec
Episode 3064, loss: 246.8814, total reward: 10.2400, 0.10084366798400879 sec
Episode 3065, loss: 1.0541, total reward: 0.3200, 0.10434961318969727 sec
Episode 3066, loss: 171.4392, total reward: 10.3800, 0.08573460578918457 sec
Episode 306

Episode 3163, loss: 194.9928, total reward: 10.3700, 0.07591414451599121 sec
Episode 3164, loss: 184.2975, total reward: 10.6200, 0.0725259780883789 sec
Episode 3165, loss: 169.2945, total reward: 10.5500, 0.06062150001525879 sec
Episode 3166, loss: 8.9646, total reward: 0.6300, 0.10211825370788574 sec
Episode 3167, loss: -2.3302, total reward: 0.1100, 0.10364890098571777 sec
Episode 3168, loss: 218.7013, total reward: 10.7300, 0.10855889320373535 sec
Episode 3169, loss: 6.1657, total reward: 0.4200, 0.10832548141479492 sec
Episode 3170, loss: 152.1751, total reward: 10.4400, 0.06751585006713867 sec
Episode 3171, loss: 69.3687, total reward: 10.1600, 0.022042512893676758 sec
Episode 3172, loss: 143.8674, total reward: 10.5700, 0.05282282829284668 sec
Episode 3173, loss: 157.3825, total reward: 10.2400, 0.06531858444213867 sec
Episode 3174, loss: 179.1009, total reward: 10.7000, 0.08171534538269043 sec
Episode 3175, loss: 5.7364, total reward: 0.6300, 0.10667634010314941 sec
Episode 317

Episode 3272, loss: 3.9340, total reward: 0.5300, 0.10452914237976074 sec
Episode 3273, loss: 3.0602, total reward: 0.3200, 0.10432720184326172 sec
Episode 3274, loss: 228.4427, total reward: 10.5800, 0.08733510971069336 sec
Episode 3275, loss: 99.3315, total reward: 10.1200, 0.035050392150878906 sec
Episode 3276, loss: 3.6456, total reward: 0.4200, 0.0993192195892334 sec
Episode 3277, loss: 10.8054, total reward: 0.6300, 0.10494232177734375 sec
Episode 3278, loss: 12.1270, total reward: 0.6300, 0.10577869415283203 sec
Episode 3279, loss: 5.7352, total reward: 0.5200, 0.10611772537231445 sec
Episode 3280, loss: 127.3893, total reward: 10.1900, 0.04661297798156738 sec
Episode 3281, loss: 3.2679, total reward: 0.2200, 0.10019040107727051 sec
Episode 3282, loss: 194.0321, total reward: 10.7700, 0.09250617027282715 sec
Episode 3283, loss: 137.8585, total reward: 10.2200, 0.03643393516540527 sec
Episode 3284, loss: 5.1609, total reward: 0.5200, 0.1000831127166748 sec
Episode 3285, loss: 1.5

Episode 3380, loss: 5.9664, total reward: 0.4200, 0.10162544250488281 sec
Episode 3381, loss: 133.3600, total reward: 10.7000, 0.04799461364746094 sec
Episode 3382, loss: 182.4396, total reward: 10.2900, 0.07670354843139648 sec
Episode 3383, loss: 205.4784, total reward: 10.5500, 0.09433794021606445 sec
Episode 3384, loss: 215.7177, total reward: 10.3200, 0.10418510437011719 sec
Episode 3385, loss: 252.2979, total reward: 10.6000, 0.08093142509460449 sec
Episode 3386, loss: 4.8244, total reward: 0.5200, 0.10405325889587402 sec
Episode 3387, loss: 8.7296, total reward: 0.7300, 0.1065816879272461 sec
Episode 3388, loss: 163.5694, total reward: 10.2500, 0.06022191047668457 sec
Episode 3389, loss: 210.6421, total reward: 10.3900, 0.07825756072998047 sec
Episode 3390, loss: 102.8102, total reward: 10.2200, 0.03558802604675293 sec
Episode 3391, loss: 180.5288, total reward: 10.6900, 0.07972455024719238 sec
Episode 3392, loss: 180.9041, total reward: 10.1200, 0.06830811500549316 sec
Episode 3

Episode 3488, loss: 3.4510, total reward: 0.3200, 0.10929703712463379 sec
Episode 3489, loss: 199.0506, total reward: 10.1700, 0.08956170082092285 sec
Episode 3490, loss: -0.6171, total reward: 0.2200, 0.10670971870422363 sec
Episode 3491, loss: 238.8881, total reward: 10.4600, 0.09398150444030762 sec
Episode 3492, loss: 5.1733, total reward: 0.4200, 0.1064140796661377 sec
Episode 3493, loss: 224.0412, total reward: 10.7800, 0.09857177734375 sec
Episode 3494, loss: 215.0941, total reward: 10.3300, 0.10359430313110352 sec
Episode 3495, loss: 198.3176, total reward: 10.6100, 0.07408857345581055 sec
Episode 3496, loss: 225.0914, total reward: 10.2500, 0.09261131286621094 sec
Episode 3497, loss: 206.7592, total reward: 10.6700, 0.09705090522766113 sec
Episode 3498, loss: 4.4115, total reward: 0.4200, 0.1049034595489502 sec
Episode 3499, loss: 187.7836, total reward: 10.6400, 0.06794953346252441 sec
Episode 3500, loss: 192.9774, total reward: 10.3100, 0.07215547561645508 sec
Episode 3501, l

Episode 3596, loss: 3.9027, total reward: 0.4200, 0.11233043670654297 sec
Episode 3597, loss: 9.2873, total reward: 0.4200, 0.11251688003540039 sec
Episode 3598, loss: 171.3715, total reward: 10.4900, 0.07738208770751953 sec
Episode 3599, loss: 163.8076, total reward: 10.2200, 0.07181882858276367 sec
Episode 3600, loss: 8.2169, total reward: 0.7300, 0.10540246963500977 sec
Episode 3601, loss: 160.5509, total reward: 10.2100, 0.040303707122802734 sec
Episode 3602, loss: -1.1804, total reward: 0.0600, 0.09905004501342773 sec
Episode 3603, loss: 7.6923, total reward: 0.5200, 0.10535812377929688 sec
Episode 3604, loss: 187.7291, total reward: 10.5200, 0.09630203247070312 sec
Episode 3605, loss: 206.3093, total reward: 10.3600, 0.09119939804077148 sec
Episode 3606, loss: 105.8742, total reward: 10.1900, 0.045397281646728516 sec
Episode 3607, loss: 6.2929, total reward: 0.6300, 0.10078668594360352 sec
Episode 3608, loss: 6.0392, total reward: 0.6300, 0.1052558422088623 sec
Episode 3609, loss

Episode 3705, loss: 2.0088, total reward: 0.4200, 0.09647536277770996 sec
Episode 3706, loss: 9.0550, total reward: 0.8300, 0.10126137733459473 sec
Episode 3707, loss: 1.8476, total reward: 0.2900, 0.09880805015563965 sec
Episode 3708, loss: 252.0136, total reward: 10.3100, 0.09834885597229004 sec
Episode 3709, loss: 180.8436, total reward: 10.5200, 0.0691835880279541 sec
Episode 3710, loss: -2.0642, total reward: 0.3200, 0.10081148147583008 sec
Episode 3711, loss: 238.3596, total reward: 10.4300, 0.09630060195922852 sec
Episode 3712, loss: 209.1168, total reward: 10.0300, 0.07352924346923828 sec
Episode 3713, loss: 202.2330, total reward: 10.3900, 0.07613849639892578 sec
Episode 3714, loss: 122.1028, total reward: 10.1200, 0.03204679489135742 sec
Episode 3715, loss: 5.1816, total reward: 0.5200, 0.09523200988769531 sec
Episode 3716, loss: 167.0712, total reward: 10.1500, 0.0545806884765625 sec
Episode 3717, loss: 208.3132, total reward: 10.6600, 0.08637166023254395 sec
Episode 3718, l

Episode 3815, loss: 2.9304, total reward: 0.3200, 0.09686613082885742 sec
Episode 3816, loss: 90.3329, total reward: 10.4100, 0.0381312370300293 sec
Episode 3817, loss: 178.7836, total reward: 10.4200, 0.06253862380981445 sec
Episode 3818, loss: 1.9175, total reward: 0.4200, 0.09571242332458496 sec
Episode 3819, loss: 4.3829, total reward: 0.4100, 0.09840607643127441 sec
Episode 3820, loss: 182.5044, total reward: 10.5400, 0.0950009822845459 sec
Episode 3821, loss: 6.1477, total reward: 0.4200, 0.11040067672729492 sec
Episode 3822, loss: -2.5258, total reward: 0.0100, 0.09859180450439453 sec
Episode 3823, loss: 198.9640, total reward: 10.1200, 0.09724259376525879 sec
Episode 3824, loss: 205.9813, total reward: 10.4400, 0.09341859817504883 sec
Episode 3825, loss: 4.6186, total reward: 0.4200, 0.09860825538635254 sec
Episode 3826, loss: 3.8097, total reward: 0.3200, 0.10286688804626465 sec
Episode 3827, loss: -12.4350, total reward: -0.7000, 0.04130244255065918 sec
Episode 3828, loss: -0

Episode 3924, loss: 193.3627, total reward: 10.5700, 0.08358287811279297 sec
Episode 3925, loss: 193.8236, total reward: 10.6800, 0.08291196823120117 sec
Episode 3926, loss: 167.3044, total reward: 10.5400, 0.0933845043182373 sec
Episode 3927, loss: 155.3835, total reward: 10.5000, 0.0865781307220459 sec
Episode 3928, loss: 4.7379, total reward: 0.6300, 0.09823036193847656 sec
Episode 3929, loss: 178.1147, total reward: 10.7000, 0.07664227485656738 sec
Episode 3930, loss: 162.5475, total reward: 10.2000, 0.07037639617919922 sec
Episode 3931, loss: 204.7122, total reward: 10.5500, 0.08914923667907715 sec
Episode 3932, loss: 1.1770, total reward: 0.3700, 0.09902286529541016 sec
Episode 3933, loss: 0.6305, total reward: 0.3200, 0.10030579566955566 sec
Episode 3934, loss: 179.6869, total reward: 10.5300, 0.09859824180603027 sec
Episode 3935, loss: 197.9001, total reward: 10.5500, 0.09161996841430664 sec
Episode 3936, loss: 154.5603, total reward: 10.3900, 0.045007944107055664 sec
Episode 3

Episode 4032, loss: 2.9205, total reward: 0.6300, 0.09637045860290527 sec
Episode 4033, loss: 163.1954, total reward: 10.3400, 0.05942535400390625 sec
Episode 4034, loss: 171.9208, total reward: 10.6000, 0.07349205017089844 sec
Episode 4035, loss: -0.1691, total reward: 0.1100, 0.09730982780456543 sec
Episode 4036, loss: -0.6738, total reward: 0.1100, 0.09921526908874512 sec
Episode 4037, loss: 172.9331, total reward: 10.2100, 0.0684974193572998 sec
Episode 4038, loss: 220.1146, total reward: 10.0200, 0.09557509422302246 sec
Episode 4039, loss: 6.7266, total reward: 0.5200, 0.09995126724243164 sec
Episode 4040, loss: 1.2004, total reward: 0.3200, 0.10212111473083496 sec
Episode 4041, loss: 129.4723, total reward: 10.2200, 0.06585884094238281 sec
Episode 4042, loss: 156.7087, total reward: 10.4000, 0.07246661186218262 sec
Episode 4043, loss: 240.4431, total reward: 10.4900, 0.0761723518371582 sec
Episode 4044, loss: 221.0027, total reward: 10.5300, 0.0967254638671875 sec
Episode 4045, l

Episode 4142, loss: 191.5558, total reward: 10.3200, 0.0977632999420166 sec
Episode 4143, loss: 2.4241, total reward: 0.3200, 0.09853553771972656 sec
Episode 4144, loss: 44.3717, total reward: 9.9500, 0.02258610725402832 sec
Episode 4145, loss: 0.2854, total reward: -0.0900, 0.09189200401306152 sec
Episode 4146, loss: 219.0234, total reward: 10.6300, 0.09788942337036133 sec
Episode 4147, loss: 129.9440, total reward: 10.1400, 0.05795121192932129 sec
Episode 4148, loss: 4.8226, total reward: 0.4200, 0.09600067138671875 sec
Episode 4149, loss: 169.2558, total reward: 10.2700, 0.049918413162231445 sec
Episode 4150, loss: 3.8887, total reward: 0.5200, 0.09639859199523926 sec
Episode 4151, loss: 167.1951, total reward: 10.3800, 0.07969522476196289 sec
Episode 4152, loss: 174.2681, total reward: 10.3400, 0.09381699562072754 sec
Episode 4153, loss: 107.0061, total reward: 10.4700, 0.05162239074707031 sec
Episode 4154, loss: 5.2413, total reward: 0.5200, 0.09626126289367676 sec
Episode 4155, l

Episode 4252, loss: 55.1305, total reward: 10.3300, 0.03008294105529785 sec
Episode 4253, loss: 192.0854, total reward: 10.4300, 0.0910491943359375 sec
Episode 4254, loss: 89.1605, total reward: 10.2400, 0.027583837509155273 sec
Episode 4255, loss: 217.0595, total reward: 10.2600, 0.07921695709228516 sec
Episode 4256, loss: 233.6208, total reward: 10.3600, 0.08481073379516602 sec
Episode 4257, loss: 149.4716, total reward: 10.2300, 0.061130523681640625 sec
Episode 4258, loss: 125.9351, total reward: 10.4600, 0.05274081230163574 sec
Episode 4259, loss: 175.7997, total reward: 10.4800, 0.07734560966491699 sec
Episode 4260, loss: 204.2391, total reward: 10.2300, 0.06090092658996582 sec
Episode 4261, loss: 175.9359, total reward: 10.4600, 0.07786083221435547 sec
Episode 4262, loss: 174.2689, total reward: 10.3900, 0.07797503471374512 sec
Episode 4263, loss: 204.0076, total reward: 10.4500, 0.08816313743591309 sec
Episode 4264, loss: -2.3644, total reward: 0.0100, 0.09705638885498047 sec
Ep

Episode 4361, loss: 112.4778, total reward: 10.2600, 0.04994463920593262 sec
Episode 4362, loss: 142.1087, total reward: 10.3300, 0.05975508689880371 sec
Episode 4363, loss: 6.0126, total reward: 0.5200, 0.09785628318786621 sec
Episode 4364, loss: -0.7874, total reward: 0.2200, 0.09830522537231445 sec
Episode 4365, loss: 91.1395, total reward: 10.1300, 0.030254125595092773 sec
Episode 4366, loss: 134.3863, total reward: 10.5200, 0.060750484466552734 sec
Episode 4367, loss: 3.1603, total reward: 0.2200, 0.09554243087768555 sec
Episode 4368, loss: 105.1457, total reward: 10.2000, 0.03958725929260254 sec
Episode 4369, loss: 38.4677, total reward: 10.2400, 0.024840354919433594 sec
Episode 4370, loss: 152.8832, total reward: 10.5000, 0.06996846199035645 sec
Episode 4371, loss: 129.4800, total reward: 10.1200, 0.06195831298828125 sec
Episode 4372, loss: 0.6226, total reward: -0.0900, 0.09548807144165039 sec
Episode 4373, loss: 3.3314, total reward: 0.6300, 0.09918069839477539 sec
Episode 437

Episode 4471, loss: 185.0926, total reward: 10.5100, 0.0691065788269043 sec
Episode 4472, loss: 130.0090, total reward: 10.4600, 0.052756547927856445 sec
Episode 4473, loss: 248.5240, total reward: 10.7600, 0.0875544548034668 sec
Episode 4474, loss: 196.1731, total reward: 10.1600, 0.08315110206604004 sec
Episode 4475, loss: 3.7988, total reward: 0.3200, 0.09773612022399902 sec
Episode 4476, loss: 167.9945, total reward: 10.3300, 0.06340575218200684 sec
Episode 4477, loss: 232.0796, total reward: 10.5600, 0.08594369888305664 sec
Episode 4478, loss: 191.2326, total reward: 10.5600, 0.08739209175109863 sec
Episode 4479, loss: 6.4099, total reward: 0.6300, 0.09966182708740234 sec
Episode 4480, loss: 6.1662, total reward: 0.8300, 0.10052657127380371 sec
Episode 4481, loss: 157.4177, total reward: 10.5300, 0.06570553779602051 sec
Episode 4482, loss: 101.8717, total reward: 10.3800, 0.046142578125 sec
Episode 4483, loss: 195.0903, total reward: 10.4400, 0.08989739418029785 sec
Episode 4484, 

Episode 4579, loss: 4.0983, total reward: 0.3200, 0.097747802734375 sec
Episode 4580, loss: 4.4027, total reward: 0.4200, 0.09888887405395508 sec
Episode 4581, loss: 143.1021, total reward: 10.2200, 0.06612062454223633 sec
Episode 4582, loss: 154.3934, total reward: 10.0900, 0.07117319107055664 sec
Episode 4583, loss: 169.5339, total reward: 10.2500, 0.08623647689819336 sec
Episode 4584, loss: 210.7049, total reward: 10.5600, 0.08913302421569824 sec
Episode 4585, loss: 110.5433, total reward: 10.4500, 0.058267831802368164 sec
Episode 4586, loss: 165.9526, total reward: 10.2100, 0.06535792350769043 sec
Episode 4587, loss: 159.8624, total reward: 10.0500, 0.08382320404052734 sec
Episode 4588, loss: 6.1292, total reward: 0.6300, 0.09823942184448242 sec
Episode 4589, loss: 4.7882, total reward: 0.4000, 0.09894776344299316 sec
Episode 4590, loss: 93.5426, total reward: 10.0600, 0.07577729225158691 sec
Episode 4591, loss: 156.5936, total reward: 10.4800, 0.07765841484069824 sec
Episode 4592,

Episode 4688, loss: 1.0518, total reward: 0.4000, 0.09819364547729492 sec
Episode 4689, loss: 0.1319, total reward: 0.2200, 0.09834623336791992 sec
Episode 4690, loss: 1.6813, total reward: 0.2200, 0.09891748428344727 sec
Episode 4691, loss: 144.2585, total reward: 10.1900, 0.07454276084899902 sec
Episode 4692, loss: 3.6750, total reward: 0.4200, 0.09736132621765137 sec
Episode 4693, loss: 1.5875, total reward: 0.3200, 0.09908699989318848 sec
Episode 4694, loss: 1.6528, total reward: 0.2200, 0.09846735000610352 sec
Episode 4695, loss: 0.9630, total reward: 0.3200, 0.09883761405944824 sec
Episode 4696, loss: 19.3429, total reward: 10.0700, 0.018357515335083008 sec
Episode 4697, loss: 170.9427, total reward: 10.3600, 0.07933735847473145 sec
Episode 4698, loss: 2.3637, total reward: 0.5200, 0.0982520580291748 sec
Episode 4699, loss: 26.5681, total reward: 10.0600, 0.020822525024414062 sec
Episode 4700, loss: 157.9324, total reward: 10.3400, 0.08626461029052734 sec
Episode 4701, loss: 7.74

Episode 4797, loss: 109.3511, total reward: 10.1000, 0.06254315376281738 sec
Episode 4798, loss: 76.2217, total reward: 10.5700, 0.050199031829833984 sec
Episode 4799, loss: 143.0191, total reward: 10.3400, 0.05691337585449219 sec
Episode 4800, loss: 6.5826, total reward: 0.6200, 0.09630584716796875 sec
Episode 4801, loss: 133.9697, total reward: 10.3400, 0.05932116508483887 sec
Episode 4802, loss: 145.4473, total reward: 10.4600, 0.08372306823730469 sec
Episode 4803, loss: 64.1859, total reward: 10.3500, 0.05597329139709473 sec
Episode 4804, loss: 111.9706, total reward: 10.6200, 0.0660703182220459 sec
Episode 4805, loss: 128.4474, total reward: 10.4400, 0.09241080284118652 sec
Episode 4806, loss: 94.7950, total reward: 10.5700, 0.053287506103515625 sec
Episode 4807, loss: 1.0414, total reward: 0.2200, 0.09504222869873047 sec
Episode 4808, loss: 138.9887, total reward: 10.6600, 0.08903288841247559 sec
Episode 4809, loss: 5.9086, total reward: 0.6300, 0.0988616943359375 sec
Episode 481

Episode 4907, loss: 152.5558, total reward: 10.4800, 0.07764744758605957 sec
Episode 4908, loss: 3.6767, total reward: 0.6300, 0.09833955764770508 sec
Episode 4909, loss: 89.8008, total reward: 10.3400, 0.05968594551086426 sec
Episode 4910, loss: 151.9837, total reward: 10.3500, 0.08609461784362793 sec
Episode 4911, loss: 182.1214, total reward: 9.9600, 0.08081674575805664 sec
Episode 4912, loss: 206.9860, total reward: 10.1000, 0.0707406997680664 sec
Episode 4913, loss: 72.1840, total reward: 10.2700, 0.048656463623046875 sec
Episode 4914, loss: 106.2877, total reward: 10.5200, 0.06473469734191895 sec
Episode 4915, loss: 6.0762, total reward: 0.4200, 0.09697580337524414 sec
Episode 4916, loss: 109.0087, total reward: 10.7000, 0.07627344131469727 sec
Episode 4917, loss: 143.3575, total reward: 10.4500, 0.08808016777038574 sec
Episode 4918, loss: 2.4455, total reward: 0.4200, 0.09800457954406738 sec
Episode 4919, loss: 1.5299, total reward: 0.7300, 0.09971356391906738 sec
Episode 4920, 

Episode 5015, loss: 2.8113, total reward: 0.4200, 0.09838461875915527 sec
Episode 5016, loss: 176.1065, total reward: 10.2200, 0.06605386734008789 sec
Episode 5017, loss: 142.0144, total reward: 10.2000, 0.07028627395629883 sec
Episode 5018, loss: 90.7309, total reward: 10.2500, 0.08583641052246094 sec
Episode 5019, loss: 109.3042, total reward: 10.6700, 0.08550786972045898 sec
Episode 5020, loss: 141.4100, total reward: 10.4100, 0.06937813758850098 sec
Episode 5021, loss: 109.9831, total reward: 10.8800, 0.08851432800292969 sec
Episode 5022, loss: 2.0070, total reward: 0.4200, 0.09755277633666992 sec
Episode 5023, loss: 63.9416, total reward: 10.2600, 0.053179264068603516 sec
Episode 5024, loss: 0.9599, total reward: 0.4200, 0.09550046920776367 sec
Episode 5025, loss: 3.4273, total reward: 0.4200, 0.0986471176147461 sec
Episode 5026, loss: 131.9896, total reward: 10.2700, 0.08183670043945312 sec
Episode 5027, loss: 168.1291, total reward: 10.4200, 0.0956571102142334 sec
Episode 5028, 

Episode 5124, loss: -0.2260, total reward: 0.1100, 0.09246349334716797 sec
Episode 5125, loss: 1.5870, total reward: 0.2200, 0.09763216972351074 sec
Episode 5126, loss: -0.7538, total reward: 0.2200, 0.09823036193847656 sec
Episode 5127, loss: 1.3718, total reward: 0.2200, 0.09888410568237305 sec
Episode 5128, loss: -0.8248, total reward: 0.1100, 0.09907126426696777 sec
Episode 5129, loss: 3.1468, total reward: 0.3200, 0.0984506607055664 sec
Episode 5130, loss: -1.9625, total reward: -0.0900, 0.09813666343688965 sec
Episode 5131, loss: -1.5818, total reward: -0.0900, 0.09775424003601074 sec
Episode 5132, loss: -0.3237, total reward: 0.3000, 0.10253119468688965 sec
Episode 5133, loss: 109.9655, total reward: 10.3000, 0.0725405216217041 sec
Episode 5134, loss: -0.5900, total reward: 0.1100, 0.09700489044189453 sec
Episode 5135, loss: 3.3314, total reward: 0.6300, 0.10011172294616699 sec
Episode 5136, loss: 2.4549, total reward: 0.4200, 0.09914588928222656 sec
Episode 5137, loss: 0.4685, 

Episode 5233, loss: 84.9894, total reward: 10.0600, 0.05096721649169922 sec
Episode 5234, loss: 140.9994, total reward: 10.3800, 0.07639503479003906 sec
Episode 5235, loss: 74.1407, total reward: 10.1700, 0.04802346229553223 sec
Episode 5236, loss: 49.6049, total reward: 10.2700, 0.04636335372924805 sec
Episode 5237, loss: 72.6013, total reward: 10.1600, 0.08042025566101074 sec
Episode 5238, loss: 100.3156, total reward: 10.2600, 0.08311796188354492 sec
Episode 5239, loss: 1.1583, total reward: 0.3200, 0.0975046157836914 sec
Episode 5240, loss: 142.7972, total reward: 10.4000, 0.07333588600158691 sec
Episode 5241, loss: 5.2633, total reward: 0.8300, 0.09826159477233887 sec
Episode 5242, loss: 2.9816, total reward: 0.6300, 0.10234236717224121 sec
Episode 5243, loss: 77.5412, total reward: 10.3500, 0.09041380882263184 sec
Episode 5244, loss: 87.2252, total reward: 9.9700, 0.0785984992980957 sec
Episode 5245, loss: 1.3422, total reward: 0.2800, 0.09732842445373535 sec
Episode 5246, loss: 

Episode 5343, loss: 4.0893, total reward: 0.3200, 0.09876322746276855 sec
Episode 5344, loss: 3.0579, total reward: 0.6300, 0.09984922409057617 sec
Episode 5345, loss: 104.5358, total reward: 10.1400, 0.06175947189331055 sec
Episode 5346, loss: 1.8016, total reward: 0.2200, 0.09552192687988281 sec
Episode 5347, loss: 4.1499, total reward: 0.4200, 0.09890556335449219 sec
Episode 5348, loss: -1.1363, total reward: -0.0900, 0.09757590293884277 sec
Episode 5349, loss: 107.6447, total reward: 10.3100, 0.03841686248779297 sec
Episode 5350, loss: 2.6547, total reward: 0.4200, 0.09343266487121582 sec
Episode 5351, loss: 32.5056, total reward: 10.3200, 0.034738779067993164 sec
Episode 5352, loss: -1.4546, total reward: 0.0100, 0.09198784828186035 sec
Episode 5353, loss: -0.8330, total reward: 0.0100, 0.09704256057739258 sec
Episode 5354, loss: -10.8412, total reward: -0.8500, 0.023834228515625 sec
Episode 5355, loss: 151.0362, total reward: 10.2600, 0.04780101776123047 sec
Episode 5356, loss: 1

Episode 5454, loss: 1.0776, total reward: 0.3200, 0.09878802299499512 sec
Episode 5455, loss: 1.6531, total reward: 0.5200, 0.10022306442260742 sec
Episode 5456, loss: 6.2025, total reward: 0.8300, 0.1003570556640625 sec
Episode 5457, loss: 0.9722, total reward: 0.4900, 0.10014009475708008 sec
Episode 5458, loss: 67.0297, total reward: 10.5800, 0.08234024047851562 sec
Episode 5459, loss: 0.0144, total reward: 0.2200, 0.10215997695922852 sec
Episode 5460, loss: 3.1694, total reward: 0.4200, 0.09876704216003418 sec
Episode 5461, loss: 4.7182, total reward: 0.5200, 0.09975528717041016 sec
Episode 5462, loss: -0.1199, total reward: -0.0900, 0.0980527400970459 sec
Episode 5463, loss: 81.8660, total reward: 9.9400, 0.08861947059631348 sec
Episode 5464, loss: -0.3425, total reward: 0.0200, 0.09831833839416504 sec
Episode 5465, loss: 78.2947, total reward: 10.3900, 0.07702231407165527 sec
Episode 5466, loss: 1.8457, total reward: 0.5200, 0.09901952743530273 sec
Episode 5467, loss: 53.0515, tot

Episode 5565, loss: 1.0159, total reward: 0.3200, 0.10021543502807617 sec
Episode 5566, loss: 2.5250, total reward: 0.7300, 0.10129904747009277 sec
Episode 5567, loss: 0.6899, total reward: 0.2200, 0.09925127029418945 sec
Episode 5568, loss: 86.5492, total reward: 10.4600, 0.09126138687133789 sec
Episode 5569, loss: 94.7380, total reward: 10.2200, 0.09681320190429688 sec
Episode 5570, loss: 2.1809, total reward: 0.4200, 0.09870028495788574 sec
Episode 5571, loss: 63.4458, total reward: 10.1400, 0.0278627872467041 sec
Episode 5572, loss: 0.9672, total reward: 0.2200, 0.09260368347167969 sec
Episode 5573, loss: 1.0005, total reward: 0.3200, 0.09907913208007812 sec
Episode 5574, loss: 0.6171, total reward: 0.2200, 0.09842944145202637 sec
Episode 5575, loss: 39.5482, total reward: 10.3200, 0.03533220291137695 sec
Episode 5576, loss: 80.3071, total reward: 10.3300, 0.09072732925415039 sec
Episode 5577, loss: 110.8722, total reward: 10.3900, 0.07713556289672852 sec
Episode 5578, loss: -0.432

Episode 5675, loss: 3.7049, total reward: 0.4200, 0.09908485412597656 sec
Episode 5676, loss: 2.8431, total reward: 0.4200, 0.10217738151550293 sec
Episode 5677, loss: 1.8809, total reward: 0.4200, 0.09880924224853516 sec
Episode 5678, loss: 3.1279, total reward: 0.6300, 0.09994220733642578 sec
Episode 5679, loss: 126.8444, total reward: 10.0000, 0.07016205787658691 sec
Episode 5680, loss: 3.9842, total reward: 0.5200, 0.09756803512573242 sec
Episode 5681, loss: 2.0894, total reward: 0.2200, 0.09936881065368652 sec
Episode 5682, loss: 0.0694, total reward: 0.1100, 0.09896421432495117 sec
Episode 5683, loss: 2.4051, total reward: 0.4200, 0.09937477111816406 sec
Episode 5684, loss: 1.6202, total reward: 0.5200, 0.09991240501403809 sec
Episode 5685, loss: 175.9305, total reward: 10.7500, 0.0942387580871582 sec
Episode 5686, loss: 3.1714, total reward: 0.4200, 0.09895062446594238 sec
Episode 5687, loss: 82.5043, total reward: 9.9900, 0.07290053367614746 sec
Episode 5688, loss: 3.2805, tota

Episode 5786, loss: 114.6961, total reward: 9.9200, 0.09019184112548828 sec
Episode 5787, loss: 110.9949, total reward: 10.2500, 0.08753442764282227 sec
Episode 5788, loss: 3.0388, total reward: 0.3000, 0.09829354286193848 sec
Episode 5789, loss: 0.6455, total reward: 0.1100, 0.09936380386352539 sec
Episode 5790, loss: 42.9886, total reward: 10.2100, 0.03460574150085449 sec
Episode 5791, loss: -0.0242, total reward: 0.3200, 0.09294319152832031 sec
Episode 5792, loss: 2.5131, total reward: 0.3200, 0.09797048568725586 sec
Episode 5793, loss: 4.0978, total reward: 0.5200, 0.09942483901977539 sec
Episode 5794, loss: 1.8134, total reward: 0.2200, 0.09935522079467773 sec
Episode 5795, loss: 0.8783, total reward: 0.4200, 0.0991668701171875 sec
Episode 5796, loss: 115.8918, total reward: 10.3800, 0.08011603355407715 sec
Episode 5797, loss: 2.1224, total reward: 0.5200, 0.10318565368652344 sec
Episode 5798, loss: 83.5201, total reward: 10.2400, 0.05985403060913086 sec
Episode 5799, loss: 60.532

Episode 5897, loss: 73.6276, total reward: 10.5200, 0.0690145492553711 sec
Episode 5898, loss: -0.4640, total reward: 0.3200, 0.0961446762084961 sec
Episode 5899, loss: 1.7753, total reward: 0.4600, 0.09980344772338867 sec
Episode 5900, loss: 138.6143, total reward: 10.7300, 0.09988141059875488 sec
Episode 5901, loss: 103.8087, total reward: 10.6000, 0.07609963417053223 sec
Episode 5902, loss: 145.2061, total reward: 10.1700, 0.07911419868469238 sec
Episode 5903, loss: 140.1925, total reward: 10.8000, 0.07663726806640625 sec
Episode 5904, loss: 96.3031, total reward: 10.7000, 0.07501411437988281 sec
Episode 5905, loss: 0.0886, total reward: 0.0100, 0.09646391868591309 sec
Episode 5906, loss: 84.0000, total reward: 10.0400, 0.06300520896911621 sec
Episode 5907, loss: 120.0996, total reward: 10.4400, 0.09108495712280273 sec
Episode 5908, loss: 2.4593, total reward: 0.4200, 0.09919500350952148 sec
Episode 5909, loss: 44.2118, total reward: 9.9900, 0.03988027572631836 sec
Episode 5910, los

Episode 6005, loss: 112.3565, total reward: 10.2700, 0.08234310150146484 sec
Episode 6006, loss: 102.2014, total reward: 10.3400, 0.09113001823425293 sec
Episode 6007, loss: 2.4348, total reward: 0.2200, 0.10042977333068848 sec
Episode 6008, loss: 51.5337, total reward: 10.0300, 0.029501914978027344 sec
Episode 6009, loss: 4.6641, total reward: 0.7300, 0.09432697296142578 sec
Episode 6010, loss: 6.6796, total reward: 0.7300, 0.09935259819030762 sec
Episode 6011, loss: 142.5426, total reward: 10.9400, 0.09822916984558105 sec
Episode 6012, loss: 105.8277, total reward: 10.3200, 0.06611156463623047 sec
Episode 6013, loss: 27.7365, total reward: 10.3200, 0.032906532287597656 sec
Episode 6014, loss: 84.0699, total reward: 9.8000, 0.06478571891784668 sec
Episode 6015, loss: 1.7343, total reward: 0.2200, 0.09531760215759277 sec
Episode 6016, loss: 90.2246, total reward: 10.1200, 0.0638883113861084 sec
Episode 6017, loss: 86.8861, total reward: 10.5100, 0.06836962699890137 sec
Episode 6018, lo

Episode 6114, loss: 69.3137, total reward: 10.1400, 0.05785059928894043 sec
Episode 6115, loss: 1.9562, total reward: 0.4000, 0.09552407264709473 sec
Episode 6116, loss: 95.0505, total reward: 10.1600, 0.08386111259460449 sec
Episode 6117, loss: 3.7493, total reward: 0.5200, 0.09814071655273438 sec
Episode 6118, loss: 98.3960, total reward: 10.4000, 0.07637453079223633 sec
Episode 6119, loss: 112.2086, total reward: 10.0700, 0.07809567451477051 sec
Episode 6120, loss: 82.6210, total reward: 10.0500, 0.05295062065124512 sec
Episode 6121, loss: 149.4598, total reward: 10.2000, 0.06808710098266602 sec
Episode 6122, loss: 3.0462, total reward: 0.5200, 0.09737467765808105 sec
Episode 6123, loss: 2.2427, total reward: 0.5200, 0.0992424488067627 sec
Episode 6124, loss: 100.1086, total reward: 10.1500, 0.08716273307800293 sec
Episode 6125, loss: 2.8501, total reward: 0.6300, 0.09886598587036133 sec
Episode 6126, loss: 41.0340, total reward: 10.1400, 0.0277864933013916 sec
Episode 6127, loss: 1

Episode 6223, loss: 110.9668, total reward: 10.6100, 0.07248139381408691 sec
Episode 6224, loss: 174.7817, total reward: 10.8100, 0.07265233993530273 sec
Episode 6225, loss: 127.1926, total reward: 10.6300, 0.09599924087524414 sec
Episode 6226, loss: 3.3687, total reward: 0.5200, 0.10059571266174316 sec
Episode 6227, loss: 6.1097, total reward: 0.8300, 0.10061383247375488 sec
Episode 6228, loss: 98.8009, total reward: 10.3700, 0.07997751235961914 sec
Episode 6229, loss: -10.8887, total reward: -0.8600, 0.028821945190429688 sec
Episode 6230, loss: 83.2468, total reward: 10.3700, 0.04053354263305664 sec
Episode 6231, loss: 50.3343, total reward: 10.0000, 0.03420209884643555 sec
Episode 6232, loss: 3.8748, total reward: 0.4200, 0.09373021125793457 sec
Episode 6233, loss: 81.9820, total reward: 10.1300, 0.029262304306030273 sec
Episode 6234, loss: 184.3558, total reward: 10.1900, 0.06878256797790527 sec
Episode 6235, loss: 153.0694, total reward: 10.1500, 0.05255389213562012 sec
Episode 62

Episode 6331, loss: 99.5626, total reward: 10.6300, 0.06472039222717285 sec
Episode 6332, loss: 4.0758, total reward: 0.5200, 0.09698009490966797 sec
Episode 6333, loss: 2.3158, total reward: 0.3200, 0.09882354736328125 sec
Episode 6334, loss: 5.7709, total reward: 0.7300, 0.10033416748046875 sec
Episode 6335, loss: 63.8225, total reward: 10.1000, 0.0396270751953125 sec
Episode 6336, loss: 144.1561, total reward: 10.3600, 0.08151912689208984 sec
Episode 6337, loss: 132.5282, total reward: 10.3800, 0.0787205696105957 sec
Episode 6338, loss: 129.9028, total reward: 10.6700, 0.08420920372009277 sec
Episode 6339, loss: 132.6920, total reward: 10.1800, 0.0677802562713623 sec
Episode 6340, loss: 184.7729, total reward: 10.2500, 0.053887367248535156 sec
Episode 6341, loss: 129.6461, total reward: 10.4100, 0.06755399703979492 sec
Episode 6342, loss: 1.5289, total reward: 0.2200, 0.09601402282714844 sec
Episode 6343, loss: 2.0212, total reward: 0.2200, 0.09843277931213379 sec
Episode 6344, loss

Episode 6441, loss: 136.5186, total reward: 10.4500, 0.08389902114868164 sec
Episode 6442, loss: 3.9916, total reward: 0.6300, 0.09794139862060547 sec
Episode 6443, loss: -0.0884, total reward: 0.2200, 0.09804463386535645 sec
Episode 6444, loss: 2.7273, total reward: 0.4200, 0.09844851493835449 sec
Episode 6445, loss: 83.9941, total reward: 10.0900, 0.04214119911193848 sec
Episode 6446, loss: 156.7153, total reward: 10.2600, 0.08082246780395508 sec
Episode 6447, loss: 118.7962, total reward: 10.0400, 0.05628061294555664 sec
Episode 6448, loss: -1.8648, total reward: 0.0800, 0.09714126586914062 sec
Episode 6449, loss: 121.1615, total reward: 10.1300, 0.06084156036376953 sec
Episode 6450, loss: 137.4369, total reward: 10.4800, 0.046456336975097656 sec
Episode 6451, loss: 88.7014, total reward: 10.1000, 0.04854464530944824 sec
Episode 6452, loss: 214.5831, total reward: 9.9300, 0.08722043037414551 sec
Episode 6453, loss: 55.2476, total reward: 10.3300, 0.031102657318115234 sec
Episode 645

Episode 6551, loss: 0.0320, total reward: 0.2200, 0.09719657897949219 sec
Episode 6552, loss: 0.3694, total reward: 0.2200, 0.09927606582641602 sec
Episode 6553, loss: 132.7508, total reward: 10.2600, 0.05075693130493164 sec
Episode 6554, loss: 3.8071, total reward: 0.5200, 0.09613299369812012 sec
Episode 6555, loss: 99.5503, total reward: 10.3600, 0.05100226402282715 sec
Episode 6556, loss: 1.9917, total reward: 0.5200, 0.09610390663146973 sec
Episode 6557, loss: 92.6836, total reward: 10.2600, 0.0497744083404541 sec
Episode 6558, loss: 0.7086, total reward: 0.3100, 0.09461760520935059 sec
Episode 6559, loss: 3.4455, total reward: 0.4200, 0.09865140914916992 sec
Episode 6560, loss: -0.1765, total reward: 0.0900, 0.09843897819519043 sec
Episode 6561, loss: 1.2297, total reward: 0.2200, 0.09952211380004883 sec
Episode 6562, loss: 3.3482, total reward: 0.6300, 0.0999917984008789 sec
Episode 6563, loss: -2.0998, total reward: 0.0100, 0.09794783592224121 sec
Episode 6564, loss: 67.8760, to

Episode 6660, loss: 4.1115, total reward: 0.4200, 0.09647607803344727 sec
Episode 6661, loss: 138.0486, total reward: 10.7400, 0.0963752269744873 sec
Episode 6662, loss: 112.3396, total reward: 10.2600, 0.08467578887939453 sec
Episode 6663, loss: 3.6534, total reward: 0.4200, 0.09742450714111328 sec
Episode 6664, loss: 4.2098, total reward: 0.5200, 0.10057759284973145 sec
Episode 6665, loss: 2.8774, total reward: 0.4200, 0.10333418846130371 sec
Episode 6666, loss: 5.2985, total reward: 0.5200, 0.10041666030883789 sec
Episode 6667, loss: 92.9227, total reward: 10.2900, 0.07626795768737793 sec
Episode 6668, loss: 82.8081, total reward: 10.7000, 0.07579326629638672 sec
Episode 6669, loss: 132.2926, total reward: 10.1900, 0.07277274131774902 sec
Episode 6670, loss: 142.3281, total reward: 10.5000, 0.07287383079528809 sec
Episode 6671, loss: 4.0402, total reward: 0.3200, 0.0962221622467041 sec
Episode 6672, loss: 82.3289, total reward: 10.2200, 0.03405404090881348 sec
Episode 6673, loss: 2.

Episode 6770, loss: 132.1129, total reward: 10.6800, 0.08106112480163574 sec
Episode 6771, loss: 6.2410, total reward: 0.6300, 0.10246658325195312 sec
Episode 6772, loss: 126.0679, total reward: 10.2900, 0.07528567314147949 sec
Episode 6773, loss: 91.3201, total reward: 10.5800, 0.08058309555053711 sec
Episode 6774, loss: 80.1643, total reward: 10.1900, 0.041632890701293945 sec
Episode 6775, loss: 4.3742, total reward: 0.5200, 0.09914708137512207 sec
Episode 6776, loss: 69.7325, total reward: 10.4400, 0.060396671295166016 sec
Episode 6777, loss: 135.2004, total reward: 10.4000, 0.07135391235351562 sec
Episode 6778, loss: 39.5760, total reward: 10.3400, 0.027422189712524414 sec
Episode 6779, loss: 3.4966, total reward: 0.6300, 0.09418821334838867 sec
Episode 6780, loss: 79.0568, total reward: 10.1700, 0.04778242111206055 sec
Episode 6781, loss: 161.8360, total reward: 10.3600, 0.07625579833984375 sec
Episode 6782, loss: 43.0900, total reward: 10.4200, 0.03414154052734375 sec
Episode 678

Episode 6880, loss: 2.7543, total reward: 0.4200, 0.0942373275756836 sec
Episode 6881, loss: 97.6377, total reward: 10.1200, 0.06689858436584473 sec
Episode 6882, loss: 31.9232, total reward: 10.3100, 0.03589773178100586 sec
Episode 6883, loss: 89.3825, total reward: 10.4100, 0.08822107315063477 sec
Episode 6884, loss: 67.3399, total reward: 10.2900, 0.07455086708068848 sec
Episode 6885, loss: 112.7746, total reward: 10.3800, 0.07832646369934082 sec
Episode 6886, loss: 113.2238, total reward: 10.2900, 0.09368777275085449 sec
Episode 6887, loss: 100.7289, total reward: 10.9400, 0.09898662567138672 sec
Episode 6888, loss: 117.9379, total reward: 10.5600, 0.05640816688537598 sec
Episode 6889, loss: 4.0331, total reward: 0.5200, 0.09781122207641602 sec
Episode 6890, loss: 63.8182, total reward: 10.2100, 0.0677802562713623 sec
Episode 6891, loss: 96.2208, total reward: 10.2400, 0.08922934532165527 sec
Episode 6892, loss: 91.9002, total reward: 10.3000, 0.07155156135559082 sec
Episode 6893, 

Episode 6989, loss: 98.3443, total reward: 10.1800, 0.04642319679260254 sec
Episode 6990, loss: 3.5694, total reward: 0.3200, 0.09428143501281738 sec
Episode 6991, loss: 2.8953, total reward: 0.6300, 0.09953641891479492 sec
Episode 6992, loss: 19.7228, total reward: 9.9600, 0.019248485565185547 sec
Episode 6993, loss: 2.8153, total reward: 0.4200, 0.09258198738098145 sec
Episode 6994, loss: 1.6203, total reward: 0.1700, 0.09741592407226562 sec
Episode 6995, loss: 74.3230, total reward: 10.1900, 0.07411694526672363 sec
Episode 6996, loss: -0.3871, total reward: 0.0100, 0.09530115127563477 sec
Episode 6997, loss: 3.6993, total reward: 0.3200, 0.09822678565979004 sec
Episode 6998, loss: -0.5047, total reward: 0.0100, 0.09717512130737305 sec
Episode 6999, loss: -0.3008, total reward: 0.1100, 0.09948515892028809 sec
Checkpoint saved at episode 7000 to datasets/rl_sort_transformer_easy/list6_transformer3_128_gamma098_step120/ckpt_7000_6.7261.pth
Episode 7000, loss: 0.1341, total reward: 0.42

Episode 7097, loss: 0.1179, total reward: 0.2200, 0.09616827964782715 sec
Episode 7098, loss: 72.1650, total reward: 10.3200, 0.0660710334777832 sec
Episode 7099, loss: 142.1372, total reward: 10.4000, 0.0716862678527832 sec
Episode 7100, loss: 66.8540, total reward: 10.5200, 0.06844162940979004 sec
Episode 7101, loss: 114.0363, total reward: 10.2300, 0.09266996383666992 sec
Episode 7102, loss: 0.6192, total reward: 0.3300, 0.0991964340209961 sec
Episode 7103, loss: 73.6756, total reward: 10.3800, 0.08014345169067383 sec
Episode 7104, loss: 1.6751, total reward: 0.5200, 0.09820055961608887 sec
Episode 7105, loss: 61.3623, total reward: 10.3300, 0.06421804428100586 sec
Episode 7106, loss: 42.7212, total reward: 10.5500, 0.0569150447845459 sec
Episode 7107, loss: 54.2207, total reward: 10.1400, 0.05676436424255371 sec
Episode 7108, loss: 79.3456, total reward: 10.5400, 0.059334754943847656 sec
Episode 7109, loss: 61.8146, total reward: 10.4300, 0.0940866470336914 sec
Episode 7110, loss: 

Episode 7207, loss: 1.2161, total reward: 0.2200, 0.09871935844421387 sec
Episode 7208, loss: 104.3002, total reward: 10.1700, 0.08133816719055176 sec
Episode 7209, loss: 3.0804, total reward: 0.4200, 0.09805536270141602 sec
Episode 7210, loss: 60.8471, total reward: 10.2200, 0.06548929214477539 sec
Episode 7211, loss: 1.4539, total reward: 0.5200, 0.098358154296875 sec
Episode 7212, loss: 1.8524, total reward: 0.5200, 0.10054707527160645 sec
Episode 7213, loss: 100.0807, total reward: 10.0800, 0.07695817947387695 sec
Episode 7214, loss: 147.0303, total reward: 10.5200, 0.06736397743225098 sec
Episode 7215, loss: 72.6587, total reward: 10.5600, 0.05450081825256348 sec
Episode 7216, loss: 93.4657, total reward: 10.7800, 0.08039999008178711 sec
Episode 7217, loss: 1.2911, total reward: 0.3200, 0.09792351722717285 sec
Episode 7218, loss: -0.1909, total reward: -0.0600, 0.09793448448181152 sec
Episode 7219, loss: 86.9884, total reward: 10.4900, 0.07813453674316406 sec
Episode 7220, loss: 1

Episode 7317, loss: 0.6705, total reward: 0.1100, 0.09518241882324219 sec
Episode 7318, loss: 27.3610, total reward: 10.2500, 0.02492213249206543 sec
Episode 7319, loss: 40.7140, total reward: 10.5200, 0.06314563751220703 sec
Episode 7320, loss: 1.0992, total reward: 0.3200, 0.09580588340759277 sec
Episode 7321, loss: 135.7061, total reward: 10.4000, 0.07298636436462402 sec
Episode 7322, loss: 2.4103, total reward: 0.3200, 0.09727287292480469 sec
Episode 7323, loss: 80.6180, total reward: 10.3100, 0.06923890113830566 sec
Episode 7324, loss: 2.5226, total reward: 0.4200, 0.09722185134887695 sec
Episode 7325, loss: 70.8264, total reward: 10.4100, 0.07007336616516113 sec
Episode 7326, loss: 1.4615, total reward: 0.4200, 0.09703373908996582 sec
Episode 7327, loss: 106.3548, total reward: 10.7000, 0.07662367820739746 sec
Episode 7328, loss: 2.0826, total reward: 0.5200, 0.09908270835876465 sec
Episode 7329, loss: 84.6342, total reward: 10.2900, 0.07559084892272949 sec
Episode 7330, loss: -0

Episode 7426, loss: 1.7326, total reward: 0.2200, 0.09282612800598145 sec
Episode 7427, loss: 1.6946, total reward: 0.4200, 0.09821510314941406 sec
Episode 7428, loss: 62.1633, total reward: 10.0800, 0.044750213623046875 sec
Episode 7429, loss: 1.4067, total reward: 0.2200, 0.09408068656921387 sec
Episode 7430, loss: 24.8542, total reward: 10.0000, 0.03940582275390625 sec
Episode 7431, loss: 2.9052, total reward: 0.6300, 0.09502339363098145 sec
Episode 7432, loss: 119.8083, total reward: 10.3300, 0.09545183181762695 sec
Episode 7433, loss: -0.1388, total reward: 0.0100, 0.0973355770111084 sec
Episode 7434, loss: 2.4459, total reward: 0.8300, 0.10053348541259766 sec
Episode 7435, loss: 85.1566, total reward: 10.5300, 0.09783673286437988 sec
Episode 7436, loss: 78.3293, total reward: 10.3200, 0.0669102668762207 sec
Episode 7437, loss: 61.7344, total reward: 10.3500, 0.055265188217163086 sec
Episode 7438, loss: 1.1715, total reward: 0.2200, 0.09601521492004395 sec
Episode 7439, loss: -0.5

Episode 7536, loss: 78.4895, total reward: 9.9000, 0.0691673755645752 sec
Episode 7537, loss: 0.2975, total reward: 0.4200, 0.09588861465454102 sec
Episode 7538, loss: 1.6254, total reward: 0.5200, 0.1039729118347168 sec
Episode 7539, loss: 73.1243, total reward: 10.0400, 0.08937740325927734 sec
Episode 7540, loss: 88.3224, total reward: 10.3200, 0.09823822975158691 sec
Episode 7541, loss: 1.4333, total reward: 0.6300, 0.09996914863586426 sec
Episode 7542, loss: 58.9310, total reward: 9.9700, 0.047736406326293945 sec
Episode 7543, loss: 40.6056, total reward: 10.2100, 0.0340268611907959 sec
Episode 7544, loss: -0.7128, total reward: 0.1100, 0.09456038475036621 sec
Episode 7545, loss: 0.4262, total reward: 0.5200, 0.09897994995117188 sec
Episode 7546, loss: 0.6792, total reward: 0.1100, 0.0986785888671875 sec
Episode 7547, loss: 79.4228, total reward: 10.4100, 0.07045698165893555 sec
Episode 7548, loss: 101.3208, total reward: 10.2300, 0.09281134605407715 sec
Episode 7549, loss: 52.8018

Episode 7646, loss: 0.2032, total reward: 0.1100, 0.09912705421447754 sec
Episode 7647, loss: -0.2775, total reward: -0.0900, 0.09815239906311035 sec
Episode 7648, loss: -0.3441, total reward: 0.0100, 0.0980069637298584 sec
Episode 7649, loss: 28.4898, total reward: 10.2400, 0.059745073318481445 sec
Episode 7650, loss: 0.3486, total reward: 0.1100, 0.14393091201782227 sec
Episode 7651, loss: 0.5137, total reward: 0.1100, 0.09889721870422363 sec
Episode 7652, loss: 0.3628, total reward: 0.1100, 0.09892702102661133 sec
Episode 7653, loss: 59.1023, total reward: 10.0800, 0.07714271545410156 sec
Episode 7654, loss: -0.4643, total reward: 0.0100, 0.10129475593566895 sec
Episode 7655, loss: -0.4225, total reward: 0.1100, 0.09910774230957031 sec
Episode 7656, loss: 0.8050, total reward: 0.3200, 0.09901618957519531 sec
Episode 7657, loss: -0.4878, total reward: 0.0100, 0.09862923622131348 sec
Episode 7658, loss: 0.3031, total reward: 0.1100, 0.09932851791381836 sec
Episode 7659, loss: 85.8980,

Episode 7756, loss: 30.9101, total reward: 10.5700, 0.051726579666137695 sec
Episode 7757, loss: 0.0330, total reward: 0.3200, 0.09438824653625488 sec
Episode 7758, loss: 82.6479, total reward: 10.2200, 0.09752917289733887 sec
Episode 7759, loss: 37.2583, total reward: 10.4400, 0.061205148696899414 sec
Episode 7760, loss: 4.2510, total reward: 0.5200, 0.09715032577514648 sec
Episode 7761, loss: 2.2505, total reward: 0.6300, 0.09950137138366699 sec
Episode 7762, loss: 93.9987, total reward: 10.5800, 0.08208441734313965 sec
Episode 7763, loss: 71.3920, total reward: 10.5100, 0.07079267501831055 sec
Episode 7764, loss: 54.8135, total reward: 10.3300, 0.061568260192871094 sec
Episode 7765, loss: 1.5491, total reward: 0.2200, 0.09580183029174805 sec
Episode 7766, loss: 26.8688, total reward: 9.9300, 0.022429227828979492 sec
Episode 7767, loss: 63.8102, total reward: 10.6000, 0.06975674629211426 sec
Episode 7768, loss: 127.6932, total reward: 10.6800, 0.08266663551330566 sec
Episode 7769, lo

Episode 7865, loss: 0.7793, total reward: 0.1000, 0.09536051750183105 sec
Episode 7866, loss: 77.8744, total reward: 10.2200, 0.09686136245727539 sec
Episode 7867, loss: 145.3443, total reward: 10.2200, 0.033678293228149414 sec
Episode 7868, loss: -0.7846, total reward: 0.2200, 0.0930783748626709 sec
Episode 7869, loss: 87.5526, total reward: 10.2000, 0.07064318656921387 sec
Episode 7870, loss: 79.7971, total reward: 10.3100, 0.0671839714050293 sec
Episode 7871, loss: 58.5117, total reward: 10.4800, 0.0784752368927002 sec
Episode 7872, loss: 3.0099, total reward: 0.5200, 0.09813523292541504 sec
Episode 7873, loss: 63.3130, total reward: 10.3200, 0.06833267211914062 sec
Episode 7874, loss: 0.5967, total reward: 0.2200, 0.0959169864654541 sec
Episode 7875, loss: 29.7157, total reward: 10.2400, 0.02826094627380371 sec
Episode 7876, loss: 103.5728, total reward: 10.5300, 0.06022310256958008 sec
Episode 7877, loss: 109.1612, total reward: 10.5900, 0.07613420486450195 sec
Episode 7878, loss:

Episode 7976, loss: 0.4970, total reward: 0.3200, 0.09874439239501953 sec
Episode 7977, loss: 72.7338, total reward: 10.4200, 0.06937789916992188 sec
Episode 7978, loss: 62.8753, total reward: 10.0300, 0.09055614471435547 sec
Episode 7979, loss: 82.1555, total reward: 10.2000, 0.07071590423583984 sec
Episode 7980, loss: 58.1614, total reward: 10.5100, 0.038719892501831055 sec
Episode 7981, loss: 0.8704, total reward: 0.2200, 0.09361100196838379 sec
Episode 7982, loss: 98.4232, total reward: 10.5400, 0.061827898025512695 sec
Episode 7983, loss: 96.3620, total reward: 10.6500, 0.08890247344970703 sec
Episode 7984, loss: 1.1867, total reward: 0.4200, 0.09828996658325195 sec
Episode 7985, loss: 92.9121, total reward: 10.6200, 0.06897354125976562 sec
Episode 7986, loss: 1.5476, total reward: 0.5200, 0.0981283187866211 sec
Episode 7987, loss: 0.3455, total reward: 0.4200, 0.09850263595581055 sec
Episode 7988, loss: 3.7490, total reward: 0.3200, 0.09890174865722656 sec
Episode 7989, loss: 82.

Episode 8084, loss: 106.0920, total reward: 10.2800, 0.0769655704498291 sec
Episode 8085, loss: 42.6717, total reward: 10.4800, 0.04700517654418945 sec
Episode 8086, loss: 6.8778, total reward: 0.7300, 0.09627151489257812 sec
Episode 8087, loss: 14.2674, total reward: 10.3300, 0.03160858154296875 sec
Episode 8088, loss: 11.2169, total reward: 10.1500, 0.020325899124145508 sec
Episode 8089, loss: 149.0002, total reward: 10.3400, 0.08632397651672363 sec
Episode 8090, loss: 95.8585, total reward: 10.4500, 0.08889436721801758 sec
Episode 8091, loss: 1.9416, total reward: 0.5700, 0.09867095947265625 sec
Episode 8092, loss: 56.7321, total reward: 10.4800, 0.08049297332763672 sec
Episode 8093, loss: -0.8662, total reward: -0.0900, 0.09656977653503418 sec
Episode 8094, loss: 40.3730, total reward: 10.2700, 0.050165414810180664 sec
Episode 8095, loss: 113.9075, total reward: 10.3400, 0.08880114555358887 sec
Episode 8096, loss: 3.0597, total reward: 0.4200, 0.10212540626525879 sec
Episode 8097, 

Episode 8193, loss: 53.7299, total reward: 10.4400, 0.06010079383850098 sec
Episode 8194, loss: 45.3270, total reward: 10.0000, 0.06632876396179199 sec
Episode 8195, loss: 50.2062, total reward: 10.3100, 0.06433415412902832 sec
Episode 8196, loss: 54.8216, total reward: 10.5400, 0.09134650230407715 sec
Episode 8197, loss: 141.8170, total reward: 10.5000, 0.07381105422973633 sec
Episode 8198, loss: 77.2022, total reward: 10.2500, 0.0972743034362793 sec
Episode 8199, loss: 76.5100, total reward: 10.2600, 0.05280590057373047 sec
Episode 8200, loss: 1.5630, total reward: 0.7300, 0.09618139266967773 sec
Episode 8201, loss: 67.6150, total reward: 10.4000, 0.07490968704223633 sec
Episode 8202, loss: 40.3633, total reward: 10.3400, 0.05958366394042969 sec
Episode 8203, loss: -0.2485, total reward: -0.0900, 0.09698224067687988 sec
Episode 8204, loss: 78.8257, total reward: 10.3300, 0.09487247467041016 sec
Episode 8205, loss: 0.0110, total reward: 0.2200, 0.09841561317443848 sec
Episode 8206, lo

Episode 8303, loss: 56.6661, total reward: 10.7500, 0.09261226654052734 sec
Episode 8304, loss: 2.6002, total reward: 0.4200, 0.09804272651672363 sec
Episode 8305, loss: 92.2930, total reward: 10.7500, 0.09368896484375 sec
Episode 8306, loss: 52.5106, total reward: 10.3800, 0.07315444946289062 sec
Episode 8307, loss: 2.6852, total reward: 0.5000, 0.09851861000061035 sec
Episode 8308, loss: 56.2038, total reward: 10.4800, 0.0810244083404541 sec
Episode 8309, loss: 82.6639, total reward: 10.3800, 0.0788564682006836 sec
Episode 8310, loss: 37.4767, total reward: 10.5000, 0.07333922386169434 sec
Episode 8311, loss: 15.2920, total reward: 9.9500, 0.020779132843017578 sec
Episode 8312, loss: 51.2299, total reward: 10.1700, 0.04401135444641113 sec
Episode 8313, loss: 80.2266, total reward: 10.2800, 0.07410359382629395 sec
Episode 8314, loss: 64.8906, total reward: 10.5300, 0.08586740493774414 sec
Episode 8315, loss: 67.9604, total reward: 10.1400, 0.05672955513000488 sec
Episode 8316, loss: 1

Episode 8413, loss: 76.9939, total reward: 10.3700, 0.049840450286865234 sec
Episode 8414, loss: 55.3388, total reward: 10.3600, 0.08196330070495605 sec
Episode 8415, loss: 154.8517, total reward: 10.7500, 0.08418679237365723 sec
Episode 8416, loss: 4.0941, total reward: 0.4200, 0.09753537178039551 sec
Episode 8417, loss: 35.6440, total reward: 10.4700, 0.05186796188354492 sec
Episode 8418, loss: 93.1661, total reward: 10.6400, 0.09461665153503418 sec
Episode 8419, loss: 60.3852, total reward: 10.8000, 0.07754945755004883 sec
Episode 8420, loss: 12.6504, total reward: 9.8800, 0.03005075454711914 sec
Episode 8421, loss: 90.7606, total reward: 10.3200, 0.0936741828918457 sec
Episode 8422, loss: 1.8610, total reward: 0.4200, 0.09868621826171875 sec
Episode 8423, loss: 0.0732, total reward: 0.1100, 0.09857773780822754 sec
Episode 8424, loss: 37.5242, total reward: 10.4700, 0.05208420753479004 sec
Episode 8425, loss: 36.5906, total reward: 10.6400, 0.06078505516052246 sec
Episode 8426, loss

Episode 8522, loss: 2.4523, total reward: 0.3200, 0.09811663627624512 sec
Episode 8523, loss: 53.0730, total reward: 10.1000, 0.03598666191101074 sec
Episode 8524, loss: 1.1835, total reward: 0.4200, 0.09458613395690918 sec
Episode 8525, loss: 71.9736, total reward: 10.2200, 0.06471133232116699 sec
Episode 8526, loss: 2.1605, total reward: 0.3200, 0.09676623344421387 sec
Episode 8527, loss: 3.6323, total reward: 0.6300, 0.09946990013122559 sec
Episode 8528, loss: -0.7470, total reward: -0.0900, 0.09746670722961426 sec
Episode 8529, loss: 150.7183, total reward: 10.0600, 0.08332300186157227 sec
Episode 8530, loss: 131.4487, total reward: 10.2400, 0.0853722095489502 sec
Episode 8531, loss: 78.4570, total reward: 10.5000, 0.0738365650177002 sec
Episode 8532, loss: 67.7071, total reward: 10.1900, 0.0424957275390625 sec
Episode 8533, loss: 70.1678, total reward: 10.4600, 0.052663326263427734 sec
Episode 8534, loss: 102.4158, total reward: 10.1800, 0.04467940330505371 sec
Episode 8535, loss:

Episode 8631, loss: 101.3889, total reward: 10.4500, 0.059914588928222656 sec
Episode 8632, loss: 0.5990, total reward: 0.3200, 0.09542512893676758 sec
Episode 8633, loss: 3.8655, total reward: 0.4200, 0.09853339195251465 sec
Episode 8634, loss: 144.3776, total reward: 10.5600, 0.08816099166870117 sec
Episode 8635, loss: 151.0183, total reward: 10.1300, 0.09256243705749512 sec
Episode 8636, loss: 0.2506, total reward: 0.2200, 0.09816336631774902 sec
Episode 8637, loss: 128.2788, total reward: 10.5600, 0.0564427375793457 sec
Episode 8638, loss: 177.4194, total reward: 10.1500, 0.08395266532897949 sec
Episode 8639, loss: 56.9784, total reward: 10.4100, 0.03805685043334961 sec
Episode 8640, loss: 2.9594, total reward: 0.4200, 0.09351277351379395 sec
Episode 8641, loss: 92.6325, total reward: 10.5800, 0.04910731315612793 sec
Episode 8642, loss: 0.5452, total reward: 0.2200, 0.09421753883361816 sec
Episode 8643, loss: 149.4991, total reward: 10.3600, 0.08533263206481934 sec
Episode 8644, lo

Episode 8740, loss: 69.0080, total reward: 10.4500, 0.09009194374084473 sec
Episode 8741, loss: 128.4767, total reward: 10.4800, 0.08035659790039062 sec
Episode 8742, loss: 94.3516, total reward: 10.7700, 0.08570623397827148 sec
Episode 8743, loss: 112.0444, total reward: 10.1600, 0.07367992401123047 sec
Episode 8744, loss: 81.7338, total reward: 10.2500, 0.0864717960357666 sec
Episode 8745, loss: 0.4780, total reward: 0.1100, 0.09774374961853027 sec
Episode 8746, loss: 107.4774, total reward: 10.4500, 0.09657692909240723 sec
Episode 8747, loss: 0.2957, total reward: 0.2200, 0.09833049774169922 sec
Episode 8748, loss: 3.0117, total reward: 0.2900, 0.09902644157409668 sec
Episode 8749, loss: 0.4209, total reward: 0.2200, 0.09921765327453613 sec
Episode 8750, loss: 55.8483, total reward: 10.2800, 0.07932090759277344 sec
Episode 8751, loss: 126.6503, total reward: 10.4200, 0.06633520126342773 sec
Episode 8752, loss: 143.6343, total reward: 10.3800, 0.07879400253295898 sec
Episode 8753, lo

Episode 8849, loss: 117.7209, total reward: 10.1200, 0.09121346473693848 sec
Episode 8850, loss: -1.1764, total reward: 0.1100, 0.09776592254638672 sec
Episode 8851, loss: 113.0715, total reward: 10.0500, 0.08580517768859863 sec
Episode 8852, loss: 1.6000, total reward: 0.3200, 0.09795069694519043 sec
Episode 8853, loss: 113.0122, total reward: 10.5000, 0.07607007026672363 sec
Episode 8854, loss: 125.2215, total reward: 10.0900, 0.0725250244140625 sec
Episode 8855, loss: 170.9623, total reward: 10.1700, 0.07979536056518555 sec
Episode 8856, loss: 1.4335, total reward: 0.4200, 0.09717583656311035 sec
Episode 8857, loss: 1.3742, total reward: 0.3900, 0.0986185073852539 sec
Episode 8858, loss: 1.2765, total reward: 0.5200, 0.10017538070678711 sec
Episode 8859, loss: 128.4129, total reward: 10.2800, 0.07969999313354492 sec
Episode 8860, loss: 69.0237, total reward: 10.4000, 0.040972232818603516 sec
Episode 8861, loss: 144.9751, total reward: 10.3200, 0.09466004371643066 sec
Episode 8862, l

Episode 8958, loss: 0.2228, total reward: 0.0200, 0.09889769554138184 sec
Episode 8959, loss: -1.8671, total reward: 0.0100, 0.09845304489135742 sec
Episode 8960, loss: 1.0477, total reward: 0.3200, 0.0987389087677002 sec
Episode 8961, loss: 135.8581, total reward: 10.3500, 0.09175992012023926 sec
Episode 8962, loss: 57.7577, total reward: 10.2000, 0.040377140045166016 sec
Episode 8963, loss: 82.2419, total reward: 10.1500, 0.05127429962158203 sec
Episode 8964, loss: 4.3550, total reward: 0.3200, 0.09719276428222656 sec
Episode 8965, loss: 93.0634, total reward: 10.2700, 0.051026105880737305 sec
Episode 8966, loss: 40.2974, total reward: 10.2600, 0.01968240737915039 sec
Episode 8967, loss: 107.4739, total reward: 10.3800, 0.04344654083251953 sec
Episode 8968, loss: -1.1685, total reward: 0.0300, 0.09582400321960449 sec
Episode 8969, loss: 0.6634, total reward: 0.1100, 0.09802675247192383 sec
Episode 8970, loss: 1.1524, total reward: 0.3200, 0.0986795425415039 sec
Episode 8971, loss: 47

Episode 9065, loss: 88.6026, total reward: 10.4700, 0.08055329322814941 sec
Episode 9066, loss: 2.2245, total reward: 0.3200, 0.09680962562561035 sec
Episode 9067, loss: 100.2422, total reward: 10.4300, 0.09635639190673828 sec
Episode 9068, loss: -1.1826, total reward: -0.0900, 0.09761786460876465 sec
Episode 9069, loss: 111.1357, total reward: 10.3300, 0.09604859352111816 sec
Episode 9070, loss: 120.0863, total reward: 10.1000, 0.07016277313232422 sec
Episode 9071, loss: 2.5613, total reward: 0.5200, 0.09836435317993164 sec
Episode 9072, loss: 140.8225, total reward: 10.4000, 0.07354998588562012 sec
Episode 9073, loss: 35.0650, total reward: 10.0200, 0.030762434005737305 sec
Episode 9074, loss: 101.3530, total reward: 10.6300, 0.062166452407836914 sec
Episode 9075, loss: 0.1151, total reward: 0.3200, 0.09639883041381836 sec
Episode 9076, loss: 3.9788, total reward: 0.3200, 0.10004973411560059 sec
Episode 9077, loss: -0.1608, total reward: 0.2200, 0.09857511520385742 sec
Episode 9078, 

Episode 9175, loss: 123.1722, total reward: 10.3600, 0.053743839263916016 sec
Episode 9176, loss: 140.5811, total reward: 10.4500, 0.055100440979003906 sec
Episode 9177, loss: 80.8363, total reward: 10.4600, 0.05221986770629883 sec
Episode 9178, loss: 63.3753, total reward: 10.3900, 0.04150748252868652 sec
Episode 9179, loss: 109.8993, total reward: 10.7900, 0.07698702812194824 sec
Episode 9180, loss: 100.4136, total reward: 10.6800, 0.08107757568359375 sec
Episode 9181, loss: -1.4098, total reward: 0.0100, 0.09631919860839844 sec
Episode 9182, loss: 76.8641, total reward: 10.2200, 0.03410458564758301 sec
Episode 9183, loss: 143.3116, total reward: 10.3600, 0.04981732368469238 sec
Episode 9184, loss: 89.0651, total reward: 10.2500, 0.0846858024597168 sec
Episode 9185, loss: 76.6161, total reward: 9.8900, 0.0385584831237793 sec
Episode 9186, loss: 125.3135, total reward: 10.2400, 0.05499076843261719 sec
Episode 9187, loss: 1.6699, total reward: 0.5200, 0.09652829170227051 sec
Episode 91