In [1]:
import numpy as np
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import math
import random

# --- Environment -------------------------------------------------------------
# Inclusive bounds for the list length drawn by SortingEnv.reset().
MIN_LIST_LEN = MAX_LIST_LEN = 6
# Stored as SortingEnv.max_steps; the training loop also compares it against
# the length of the generated token sequence.
MAX_STEPS = 120
# Reward given for the swap that leaves the list sorted.
SUCCESS_REWARD = 10.0

# --- Exploration schedule ----------------------------------------------------
# eps = EPS_END + (EPS_START - EPS_END) * exp(-episode / EPS_DECAY)
EPS_START = 0.5
EPS_END = 0.05
EPS_DECAY = 1000

# --- Training ----------------------------------------------------------------
# Discount factor applied when turning per-token rewards into returns.
GAMMA = 0.98
# Number of training episodes.
NUM_EPISODES = 100_000
# A checkpoint is written every EPISODES_SAVE episodes.
EPISODES_SAVE = 1_000
# Folder that receives the checkpoints.
OUTPUT_DIR = 'datasets/rl_sort_transformer_easy/list6_transformer3_128_gamma098_step120'

# Token vocabulary (token string -> integer id).
# Ids are assigned in order: the two action words, the index digits '0'..'7',
# the three comparison outcomes, then the list-length markers 'len1'..'len8'.
vocab = {
    token: token_id
    for token_id, token in enumerate(
        ['Comparison', 'Swap']
        + [str(digit) for digit in range(8)]
        + ['less', 'equal', 'more']
        + ['len{}'.format(size) for size in range(1, 9)]
    )
}
# Reverse lookup (integer id -> token string).
inv_vocab = dict(zip(vocab.values(), vocab.keys()))

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Define the environment
class SortingEnv:
    """Token-driven environment in which an agent sorts a list of integers.

    The agent issues ``Comparison i j`` actions, answered with a
    'less' / 'equal' / 'more' token, and ``Swap`` actions, which exchange the
    most recently compared pair of positions. List values are not encoded as
    tokens. An episode ends when a swap leaves the list sorted, when a
    malformed action is issued, or when ``max_steps`` steps have been counted.
    """

    def __init__(self):
        self.max_steps = MAX_STEPS

    def _random_list(self):
        """Draw ``self.length`` integers uniformly from 1..100 (inclusive)."""
        return [random.randint(1, 100) for _ in range(self.length)]

    def _reject(self):
        """End the episode for a malformed action; no step is counted."""
        self.done = True
        return None, -1.0, self.done, self.list.copy()

    def reset(self):
        """Start a new episode with a fresh, unsorted random list.

        Returns:
            tuple: ``(initial_token_id, list_copy)`` where ``initial_token_id``
            is the vocabulary id of the ``'len<N>'`` token for the drawn length.

        Raises:
            ValueError: if the drawn length is below 2. Such a list is always
                sorted, so the redraw loop below could never terminate.
        """
        self.length = random.randint(MIN_LIST_LEN, MAX_LIST_LEN)
        if self.length < 2:
            raise ValueError(
                "list length must be at least 2 to produce an unsorted list, got {}".format(self.length)
            )
        self.list = self._random_list()
        # Redraw until the list actually needs sorting.
        while self.list == sorted(self.list):
            self.list = self._random_list()
        self.indices = None  # pair set by the last Comparison, consumed by Swap
        self.current_step = 0
        self.done = False
        initial_token = 'len{}'.format(self.length)
        return vocab[initial_token], self.list.copy()

    def get_list(self):
        """Return the internal list itself (not a copy)."""
        return self.list

    def get_list_len(self):
        """Return the number of elements in the current list."""
        return len(self.list)

    def step(self, action_tokens):
        """Apply one action and report its outcome.

        Args:
            action_tokens: sequence of vocabulary ids, either
                ``[Comparison, index1, index2]`` or ``[Swap]``.

        Returns:
            tuple: ``(response_token, reward, done, list_copy)``.
            ``response_token`` is the id of 'less' / 'equal' / 'more' after a
            valid Comparison and ``None`` otherwise.

        Rewards:
            * malformed action (empty, wrong arity, index out of range, Swap
              without a preceding Comparison, unknown action): -1.0, episode ends
            * Comparison: -0.01, or -0.02 when the two values are equal
            * Swap that leaves the list sorted: ``SUCCESS_REWARD``, episode ends
            * Swap that puts an out-of-order pair in order: +0.1
            * Swap that puts an in-order pair out of order: -0.1
            * Swap of equal values: -0.01
        """
        # An empty action used to raise IndexError; treat it like any other
        # malformed action instead.
        if len(action_tokens) == 0:
            return self._reject()

        action = action_tokens[0]
        reward = -0.01  # default penalty
        response_token = None

        if action == vocab['Comparison']:
            if len(action_tokens) != 3:
                return self._reject()
            index1 = action_tokens[1] - vocab['0']
            index2 = action_tokens[2] - vocab['0']
            if not (0 <= index1 < self.length and 0 <= index2 < self.length):
                return self._reject()
            self.indices = (index1, index2)
            if self.list[index1] < self.list[index2]:
                response_token = vocab['less']
                reward = -0.01
            elif self.list[index1] == self.list[index2]:
                response_token = vocab['equal']
                reward = -0.02
            else:
                response_token = vocab['more']
                reward = -0.01
        elif action == vocab['Swap']:
            if self.indices is None:
                return self._reject()
            index1, index2 = self.indices
            # Look at the pair in positional order so that "left > right"
            # means the pair was out of order before the swap.
            low, high = min(index1, index2), max(index1, index2)
            left_before, right_before = self.list[low], self.list[high]
            self.list[index1], self.list[index2] = self.list[index2], self.list[index1]
            if self.list == sorted(self.list):
                reward = SUCCESS_REWARD
                self.done = True
            elif left_before > right_before:
                reward = 0.1
            elif left_before < right_before:
                reward = -0.1
            else:
                reward = -0.01
            # A Swap consumes the compared pair; another Comparison is needed first.
            self.indices = None
        else:
            reward = -1.0
            self.done = True

        self.current_step += 1
        if self.current_step >= self.max_steps:
            self.done = True
        return response_token, reward, self.done, self.list.copy()


Using device: cuda


In [2]:
# Positional Encoding for Transformer
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to sequence-first embeddings.

    Args:
        d_model (int): embedding width. Both even and odd widths are supported.
        max_len (int): number of positions in the precomputed table.
        dropout (float): dropout probability applied after the addition.

    The table is stored in the ``pe`` buffer with shape
    ``(max_len, 1, d_model)``, so it broadcasts over the second dimension of
    the input to ``forward``.
    """

    def __init__(self, d_model, max_len=256, dropout=0.1):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)  # (max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)  # (max_len, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)  # Even indices
        # With an odd d_model there is one fewer odd column than even column,
        # so trim the frequency vector to avoid a shape mismatch.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])  # Odd indices
        pe = pe.unsqueeze(1)  # (max_len, 1, d_model)
        # Buffer: follows .to(device) and is saved in state_dict, but is not trained.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the first ``x.size(0)`` table rows to ``x`` and apply dropout.

        ``x`` is indexed by position along dimension 0; only ``max_len``
        positions are available in the table.
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

# Transformer Model
class TransformerModel(nn.Module):
    """Transformer encoder that maps a token-id sequence to per-position vocabulary logits.

    Args:
        vocab_size (int): number of distinct token ids (embedding rows and output logits).
        d_model (int): embedding / hidden width.
        nhead (int): number of attention heads per encoder layer.
        num_layers (int): number of stacked encoder layers.
    """

    def __init__(self, vocab_size, d_model=128, nhead=8, num_layers=3):
        super().__init__()
        self.model_type = 'Transformer'
        self.d_model = d_model

        # Pipeline: embedding -> positional encoding -> encoder stack -> linear head.
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model, nhead),
            num_layers,
        )
        self.decoder = nn.Linear(d_model, vocab_size)

        self.init_weights()

    def init_weights(self):
        """Re-initialise the embedding and output head: uniform weights in [-0.1, 0.1], zero bias."""
        bound = 0.1
        nn.init.uniform_(self.embedding.weight, -bound, bound)
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -bound, bound)

    def forward(self, src):
        """Return logits with one vocabulary-sized vector per input position.

        ``src`` holds integer token ids; the embeddings are scaled by
        ``sqrt(d_model)`` before the positional encoding is added.
        """
        scaled = self.embedding(src) * math.sqrt(self.d_model)
        encoded = self.transformer_encoder(self.pos_encoder(scaled))
        return self.decoder(encoded)

def decode(input_tokens, inv_vocab):
    """Render a sequence of token ids as a space-separated string of token names.

    Args:
        input_tokens: iterable of token ids.
        inv_vocab: mapping from token id to token string.
    """
    words = []
    for token_id in input_tokens:
        words.append(inv_vocab[token_id])
    return ' '.join(words)


def save_checkpoint(model, optimizer, episode, folder, filename):
    """
    Save the model and optimizer state to ``folder/filename``.

    Args:
        model (nn.Module): The model to save.
        optimizer (torch.optim.Optimizer): The optimizer whose state to save.
        episode (int): The current episode number.
        folder (str): Directory that receives the checkpoint. It is created if
            missing; an empty string saves relative to the current directory.
        filename (str): File name of the checkpoint, joined onto ``folder``.
    """
    filepath = os.path.join(folder, filename)
    # Ensure the directory exists. os.makedirs('') raises, so skip the call
    # when the path has no directory component.
    directory = os.path.dirname(filepath)
    if directory:
        os.makedirs(directory, exist_ok=True)
    # Save the checkpoint
    torch.save({
        'episode': episode,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, filepath)
    print(f"Checkpoint saved at episode {episode} to {filepath}")

def load_checkpoint(filepath, model, optimizer):
    """
    Restore model and optimizer state from a checkpoint file, if it exists.

    Args:
        filepath (str): Location of the checkpoint file.
        model (nn.Module): Model that receives the stored ``model_state_dict``.
        optimizer (torch.optim.Optimizer): Optimizer that receives the stored
            ``optimizer_state_dict``.

    Returns:
        int: The episode number stored in the checkpoint, or 0 when no file
        exists at ``filepath``.
    """
    # Nothing to restore: report it and start from episode 0.
    if not os.path.isfile(filepath):
        print(f"No checkpoint found at {filepath}, starting from scratch.")
        return 0

    # Tensors are mapped onto the module-level `device` while loading.
    state = torch.load(filepath, map_location=device)
    model.load_state_dict(state['model_state_dict'])
    optimizer.load_state_dict(state['optimizer_state_dict'])
    resume_episode = state['episode']
    print(f"Checkpoint loaded from {filepath}, resuming from episode {resume_episode}")
    return resume_episode

In [None]:
# Training Loop
def _valid_token_ids(state, list_length, last_token):
    """Return the token ids the policy may emit in the given decoding state."""
    if state == 'expect_index1':
        # Every position except the last, so a larger second index always exists.
        return [vocab[str(i)] for i in range(list_length - 1)]
    if state == 'expect_index2':
        # Only positions after the first index, which is the token just emitted.
        return [vocab[str(i)] for i in range(list_length) if vocab[str(i)] > last_token]
    # 'expect_action' and any unexpected state fall back to the action words.
    return [vocab['Comparison'], vocab['Swap']]


def _discounted_returns(rewards, gamma):
    """Return the discounted return R_t = r_t + gamma * R_{t+1} for every step."""
    returns = []
    running = 0.0
    for reward in reversed(rewards):
        running = reward + gamma * running
        returns.append(running)
    returns.reverse()
    return returns


def train(verbose=False, log_every=1):
    """Train the transformer policy on the sorting task with a policy-gradient loss.

    Each episode decodes tokens one at a time. Tokens that are not valid in
    the current decoding state are masked out, complete actions are sent to
    ``SortingEnv``, and the comparison response token (if any) is appended to
    the context. After the episode the loss ``-sum(log_prob_t * return_t)`` is
    minimised with Adam, using gradient-norm clipping at 1.0. A checkpoint is
    written every ``EPISODES_SAVE`` episodes.

    Args:
        verbose (bool): print the token context, the list and the environment
            rewards at every decoding step.
        log_every (int): print a summary line every ``log_every`` episodes;
            0 or None disables the summary lines.

    Returns:
        TransformerModel: the model after the last episode.
    """
    # NOTE(review): anomaly detection is a global autograd debugging switch
    # and slows the backward pass; it is kept enabled to preserve the
    # existing behaviour.
    torch.autograd.set_detect_anomaly(True)
    vocab_size = len(vocab)
    model = TransformerModel(vocab_size).to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-4)
    # To resume a previous run, call load_checkpoint(path, model, optimizer) here.

    mask_value = -1e9  # large negative value instead of -inf
    episode_cnt = 0
    total_reward = 0.0
    for episode in range(NUM_EPISODES):
        model.train()  # dropout stays active while sampling
        env = SortingEnv()
        initial_token_id, _ = env.reset()
        input_tokens = [initial_token_id]
        log_probs = []
        rewards = []
        state = 'expect_action'
        first_index = None
        done = False
        # Exploration rate decays exponentially with the episode index.
        eps_threshold = EPS_END + (EPS_START - EPS_END) * np.exp(-1.0 * episode / EPS_DECAY)

        while not done and len(input_tokens) < env.max_steps:
            if verbose:
                print(decode(input_tokens, inv_vocab))
                print(env.get_list())
            # (seq_len, batch_size=1) -> logits for the last position only.
            input_seq = torch.tensor(input_tokens, dtype=torch.long, device=device).unsqueeze(1)
            logits = model(input_seq)[-1, 0, :]  # (vocab_size)

            if torch.isnan(logits).any():
                print(f"Episode {episode}, NaNs in logits before masking.")
                break

            valid_token_ids = _valid_token_ids(state, env.length, input_tokens[-1])
            if any(idx >= vocab_size or idx < 0 for idx in valid_token_ids):
                print(f"Episode {episode}, invalid indices in valid_token_ids: {valid_token_ids}")
                break

            # Mask invalid tokens.
            mask = torch.full_like(logits, mask_value)
            mask[valid_token_ids] = 0
            masked_logits = logits + mask

            # Exploration: with probability eps_threshold flatten the
            # distribution by dividing the logits by 4. Masked tokens keep a
            # hugely negative logit and stay effectively unreachable.
            if random.random() < eps_threshold:
                masked_logits = masked_logits / 4

            if torch.isnan(masked_logits).any():
                print(f"Episode {episode}, NaNs in masked_logits after masking.")
                break

            probs = F.softmax(masked_logits, dim=0)
            if torch.isnan(probs).any():
                print(f"Episode {episode}, NaNs in probs after softmax.")
                break

            try:
                m = torch.distributions.Categorical(probs)
                action_token = m.sample()
                log_prob = m.log_prob(action_token)
            except ValueError as e:
                print(f"Episode {episode}, error in sampling action: {e}")
                break

            log_probs.append(log_prob)
            action = action_token.item()
            input_tokens.append(action)

            # Exactly one reward is recorded per sampled token, so rewards
            # stays aligned with log_probs.
            reward = 0.0
            if state == 'expect_action':
                if action == vocab['Comparison']:
                    state = 'expect_index1'
                elif action == vocab['Swap']:
                    if env.indices is None:
                        # Can't perform Swap without a previous Comparison.
                        reward = -1.0
                        done = True
                    else:
                        _, reward, done, _ = env.step([vocab['Swap']])
                        if verbose:
                            print("Reward:", reward)
                else:
                    # Invalid action, end the episode.
                    reward = -1.0
                    done = True
            elif state == 'expect_index1':
                first_index = action
                state = 'expect_index2'
            elif state == 'expect_index2':
                response_token, reward, done, _ = env.step(
                    [vocab['Comparison'], first_index, action]
                )
                if verbose:
                    print("Reward:", reward)
                if response_token is not None:
                    input_tokens.append(response_token)
                state = 'expect_action'
            else:
                # Invalid state, end the episode.
                reward = -1.0
                done = True
            rewards.append(reward)

        # Save a checkpoint. The average covers the episodes that produced an
        # update since the previous checkpoint.
        if episode > 0 and episode % EPISODES_SAVE == 0:
            avg_reward = total_reward / (episode_cnt + 0.00001)
            episode_cnt = 0
            total_reward = 0.0
            save_checkpoint(model, optimizer, episode, OUTPUT_DIR, f"ckpt_{episode}_{avg_reward:.4f}.pth")

        assert len(log_probs) == len(rewards), "log_probs and returns have different sizes!"

        if not log_probs:
            continue  # Skip if no actions were taken

        returns = torch.tensor(_discounted_returns(rewards, GAMMA), device=device)
        if torch.isnan(returns).any():
            print(f"Episode {episode}, NaNs in returns.")
            continue

        # Policy-gradient loss: raise the log-probability of tokens followed
        # by a high return, lower it for tokens followed by a low one.
        loss = -(torch.stack(log_probs) * returns).sum()
        if torch.isnan(loss):
            print(f"Episode {episode}, NaN in loss.")
            continue

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
        optimizer.step()

        episode_reward = sum(rewards)
        episode_cnt += 1
        total_reward += episode_reward
        if log_every and episode % log_every == 0:
            print(f"Episode {episode}, loss: {loss.item():.4f}, total reward: {episode_reward:.4f}")

    return model

# Start training with per-step tracing disabled when this code runs as the main module.
if __name__ == "__main__":
    train(verbose=False)




Episode 0, loss: -0.3107, total reward: -1.0000
Episode 1, loss: -11.1479, total reward: -0.9300
Episode 2, loss: -4.6589, total reward: -1.1100
Episode 3, loss: -26.1625, total reward: -1.2300
Episode 4, loss: -1.8698, total reward: -1.0000
Episode 5, loss: -2.7978, total reward: -1.1100
Episode 6, loss: -4.2947, total reward: -0.9100
Episode 7, loss: -18.5414, total reward: -1.1500
Episode 8, loss: -3.9510, total reward: -0.1900
Episode 9, loss: -1.2770, total reward: -1.0000
Episode 10, loss: -9.7477, total reward: -0.3900
Episode 11, loss: -34.5162, total reward: -1.2100
Episode 12, loss: -6.1891, total reward: -0.2900
Episode 13, loss: -10.5901, total reward: -1.1400
Episode 14, loss: -10.0464, total reward: -0.4900
Episode 15, loss: -3.6458, total reward: -0.1900
Episode 16, loss: -22.5109, total reward: -1.0600
Episode 17, loss: -0.5602, total reward: 0.0100
Episode 18, loss: -3.3806, total reward: -0.1900
Episode 19, loss: -33.6686, total reward: -1.2800
Episode 20, loss: -3.48

Episode 167, loss: 1.3416, total reward: 0.0200
Episode 168, loss: -19.7596, total reward: -1.1300
Episode 169, loss: -24.5095, total reward: -0.8600
Episode 170, loss: 234.2079, total reward: 10.2300
Episode 171, loss: -5.9485, total reward: -0.2900
Episode 172, loss: -0.3839, total reward: 0.1200
Episode 173, loss: 3.2650, total reward: 0.1200
Episode 174, loss: -0.2433, total reward: 0.1300
Episode 175, loss: 1.2813, total reward: 0.1200
Episode 176, loss: 0.2047, total reward: 0.1100
Episode 177, loss: -0.2644, total reward: 0.0200
Episode 178, loss: -3.2346, total reward: 0.0200
Episode 179, loss: 8.3340, total reward: 0.5300
Episode 180, loss: -2.2372, total reward: -1.0000
Episode 181, loss: 6.4319, total reward: 0.3200
Episode 182, loss: -8.1026, total reward: -0.1800
Episode 183, loss: 5.9354, total reward: 0.4300
Episode 184, loss: -2.9575, total reward: -0.0900
Episode 185, loss: 198.7773, total reward: 10.0700
Episode 186, loss: -15.0174, total reward: -1.1400
Episode 187, 

Episode 333, loss: 1.8055, total reward: 0.4300
Episode 334, loss: 267.7041, total reward: 10.4200
Episode 335, loss: 2.1308, total reward: 0.2200
Episode 336, loss: -1.5896, total reward: 0.0200
Episode 337, loss: 3.2949, total reward: 0.2200
Episode 338, loss: 197.0166, total reward: 10.2800
Episode 339, loss: 0.2191, total reward: 0.2200
Episode 340, loss: 311.2283, total reward: 10.4600
Episode 341, loss: 6.3382, total reward: 0.5200
Episode 342, loss: 2.3915, total reward: 0.2000
Episode 343, loss: 265.6197, total reward: 10.3200
Episode 344, loss: 303.3691, total reward: 10.4100
Episode 345, loss: 5.1081, total reward: 0.1200
Episode 346, loss: -10.1673, total reward: -0.7400
Episode 347, loss: 287.4665, total reward: 10.4400
Episode 348, loss: 2.8181, total reward: 0.2200
Episode 349, loss: 276.8141, total reward: 10.0800
Episode 350, loss: 78.0819, total reward: 10.1500
Episode 351, loss: -9.0461, total reward: -0.8400
Episode 352, loss: 6.4302, total reward: 0.4200
Episode 353

Episode 500, loss: 0.9510, total reward: 0.3900
Episode 501, loss: 199.6430, total reward: 10.1800
Episode 502, loss: 6.4992, total reward: 0.3200
Episode 503, loss: 1.7478, total reward: 0.2200
Episode 504, loss: 125.6557, total reward: 10.0300
Episode 505, loss: 7.1281, total reward: 0.7300
Episode 506, loss: 231.2024, total reward: 10.2000
Episode 507, loss: 4.1961, total reward: 0.5300
Episode 508, loss: 153.9921, total reward: 9.9200
Episode 509, loss: 277.7294, total reward: 10.5400
Episode 510, loss: -16.8180, total reward: -0.6300
Episode 511, loss: 4.7742, total reward: 0.4200
Episode 512, loss: -9.5172, total reward: -0.7300
Episode 513, loss: 235.4372, total reward: 10.2100
Episode 514, loss: 191.9195, total reward: 10.4300
Episode 515, loss: -3.6121, total reward: -0.0900
Episode 516, loss: 101.7388, total reward: 10.3400
Episode 517, loss: 5.6329, total reward: 0.3200
Episode 518, loss: 8.4335, total reward: 0.6300
Episode 519, loss: -1.6697, total reward: 0.1100
Episode 5

Episode 665, loss: 3.3620, total reward: 0.2200
Episode 666, loss: 247.5488, total reward: 10.3900
Episode 667, loss: 7.0011, total reward: 0.4300
Episode 668, loss: 259.9960, total reward: 10.5100
Episode 669, loss: 171.7975, total reward: 10.4300
Episode 670, loss: 206.9852, total reward: 10.0400
Episode 671, loss: 8.3602, total reward: 0.5200
Episode 672, loss: 232.0054, total reward: 10.7900
Episode 673, loss: 1.6599, total reward: 0.2200
Episode 674, loss: 3.1306, total reward: 0.2200
Episode 675, loss: 126.2347, total reward: 10.0800
Episode 676, loss: 4.8927, total reward: 0.4300
Episode 677, loss: 4.8380, total reward: 0.3200
Episode 678, loss: 169.2645, total reward: 10.2800
Episode 679, loss: 1.6535, total reward: 0.2200
Episode 680, loss: -1.6785, total reward: -0.1100
Episode 681, loss: 213.0778, total reward: 10.3000
Episode 682, loss: 195.5930, total reward: 10.0300
Episode 683, loss: 4.0775, total reward: 0.2200
Episode 684, loss: 0.3866, total reward: 0.3200
Episode 685

Episode 830, loss: 10.6013, total reward: 0.8100
Episode 831, loss: 7.5030, total reward: 0.5200
Episode 832, loss: 156.3374, total reward: 10.5500
Episode 833, loss: -4.3950, total reward: -0.0900
Episode 834, loss: -2.4839, total reward: 0.0100
Episode 835, loss: 5.0364, total reward: 0.2200
Episode 836, loss: -29.0794, total reward: -0.9700
Episode 837, loss: 3.9965, total reward: 0.3200
Episode 838, loss: 135.8987, total reward: 10.1900
Episode 839, loss: 244.8810, total reward: 10.1400
Episode 840, loss: 206.2394, total reward: 10.4400
Episode 841, loss: 6.7782, total reward: 0.6300
Episode 842, loss: 212.8190, total reward: 10.1300
Episode 843, loss: -6.9455, total reward: -0.8300
Episode 844, loss: 4.1114, total reward: 0.2200
Episode 845, loss: 254.5581, total reward: 10.7900
Episode 846, loss: 5.7514, total reward: 0.5200
Episode 847, loss: 182.8127, total reward: 10.7100
Episode 848, loss: -16.9574, total reward: -0.6300
Episode 849, loss: 216.9240, total reward: 10.3100
Epis

Episode 996, loss: 263.5588, total reward: 10.4400
Episode 997, loss: 7.9482, total reward: 0.5100
Episode 998, loss: 233.4564, total reward: 10.0200
Episode 999, loss: 6.0307, total reward: 0.5200
Checkpoint saved at episode 1000 to datasets/rl_sort_transformer_easy/list6_transformer3_128_gamma098_step120/ckpt_1000_3.3660.pth
Episode 1000, loss: -3.0770, total reward: 0.0200
Episode 1001, loss: 248.4426, total reward: 10.3600
Episode 1002, loss: 3.8650, total reward: 0.5200
Episode 1003, loss: 247.6899, total reward: 10.5700
Episode 1004, loss: 324.3512, total reward: 10.4300
Episode 1005, loss: 177.4819, total reward: 10.1400
Episode 1006, loss: -3.6098, total reward: -1.0000
Episode 1007, loss: 2.7091, total reward: 0.4200
Episode 1008, loss: 69.4017, total reward: 10.1600
Episode 1009, loss: 0.5520, total reward: 0.1100
Episode 1010, loss: -1.3133, total reward: 0.0100
Episode 1011, loss: 3.9929, total reward: 0.2200
Episode 1012, loss: 176.4487, total reward: 10.2900
Episode 1013,

Episode 1155, loss: 184.5906, total reward: 10.5000
Episode 1156, loss: 189.5535, total reward: 10.3300
Episode 1157, loss: 3.4963, total reward: 0.4200
Episode 1158, loss: 2.0464, total reward: 0.1200
Episode 1159, loss: 1.4107, total reward: 0.2200
Episode 1160, loss: 125.0609, total reward: 10.5000
Episode 1161, loss: 149.5995, total reward: 9.9700
Episode 1162, loss: 6.8035, total reward: 0.4200
Episode 1163, loss: 266.3115, total reward: 10.2600
Episode 1164, loss: 8.9108, total reward: 0.8300
Episode 1165, loss: 5.2037, total reward: 0.5200
Episode 1166, loss: 173.4553, total reward: 10.3200
Episode 1167, loss: -13.5700, total reward: -0.6300
Episode 1168, loss: 223.3807, total reward: 10.4600
Episode 1169, loss: 214.1233, total reward: 10.0000
Episode 1170, loss: 210.7793, total reward: 9.9600
Episode 1171, loss: 232.5018, total reward: 10.1600
Episode 1172, loss: 10.1055, total reward: 0.8300
Episode 1173, loss: 8.6925, total reward: 9.9900
Episode 1174, loss: 3.3303, total rew

Episode 1316, loss: 6.7555, total reward: 0.4200
Episode 1317, loss: 33.9323, total reward: 9.9700
Episode 1318, loss: 223.4422, total reward: 10.2000
Episode 1319, loss: 225.5203, total reward: 10.5300
Episode 1320, loss: 6.3943, total reward: 0.6300
Episode 1321, loss: -13.2493, total reward: -0.7000
Episode 1322, loss: 5.4759, total reward: 0.5200
Episode 1323, loss: 157.6774, total reward: 10.3600
Episode 1324, loss: 2.0883, total reward: 0.2100
Episode 1325, loss: 225.8680, total reward: 10.3700
Episode 1326, loss: 5.6231, total reward: 0.6300
Episode 1327, loss: 234.5865, total reward: 10.2200
Episode 1328, loss: 0.6267, total reward: 0.1100
Episode 1329, loss: 229.7683, total reward: 10.6400
Episode 1330, loss: -0.1160, total reward: 0.0100
Episode 1331, loss: 122.2603, total reward: 10.2900
Episode 1332, loss: 230.5054, total reward: 10.4100
Episode 1333, loss: 6.8536, total reward: 0.5200
Episode 1334, loss: 90.6531, total reward: 10.3400
Episode 1335, loss: 5.0685, total rewa

Episode 1479, loss: 0.2037, total reward: 0.3200
Episode 1480, loss: 189.9727, total reward: 10.3000
Episode 1481, loss: 159.5829, total reward: 10.2300
Episode 1482, loss: 130.0887, total reward: 10.2000
Episode 1483, loss: 4.1119, total reward: 0.5200
Episode 1484, loss: 131.8557, total reward: 10.2100
Episode 1485, loss: 180.6296, total reward: 10.2100
Episode 1486, loss: 213.5903, total reward: 10.4500
Episode 1487, loss: 242.8848, total reward: 10.4300
Episode 1488, loss: 121.0483, total reward: 10.2900
Episode 1489, loss: 200.8717, total reward: 10.4400
Episode 1490, loss: 223.1963, total reward: 10.8400
Episode 1491, loss: 85.2045, total reward: 10.3000
Episode 1492, loss: 169.0115, total reward: 10.2700
Episode 1493, loss: -16.2938, total reward: -0.3500
Episode 1494, loss: 5.4117, total reward: 0.4200
Episode 1495, loss: 192.7494, total reward: 10.2800
Episode 1496, loss: 213.3878, total reward: 10.3100
Episode 1497, loss: 144.8111, total reward: 10.4500
Episode 1498, loss: 4.

Episode 1640, loss: 1.3509, total reward: 0.2200
Episode 1641, loss: 2.1345, total reward: 0.1100
Episode 1642, loss: 5.2091, total reward: 0.5200
Episode 1643, loss: 152.6971, total reward: 10.3000
Episode 1644, loss: 7.4975, total reward: 0.7300
Episode 1645, loss: 179.4634, total reward: 10.3300
Episode 1646, loss: 3.5844, total reward: 0.3200
Episode 1647, loss: 3.5800, total reward: 0.3200
Episode 1648, loss: 67.3710, total reward: 10.3300
Episode 1649, loss: -0.0351, total reward: 0.2200
Episode 1650, loss: 0.6996, total reward: 0.3200
Episode 1651, loss: 1.0860, total reward: 0.1500
Episode 1652, loss: 202.4940, total reward: 10.5500
Episode 1653, loss: 2.1953, total reward: 0.2200
Episode 1654, loss: 6.2272, total reward: 0.5200
Episode 1655, loss: 6.4087, total reward: 0.6300
Episode 1656, loss: -7.5813, total reward: -0.6400
Episode 1657, loss: -1.1602, total reward: -0.0900
Episode 1658, loss: 217.0460, total reward: 10.3000
Episode 1659, loss: 113.2428, total reward: 10.000

Episode 1803, loss: 105.2754, total reward: 10.2700
Episode 1804, loss: -1.8026, total reward: -0.1600
Episode 1805, loss: 3.7203, total reward: 0.3200
Episode 1806, loss: 149.7538, total reward: 9.9200
Episode 1807, loss: 73.3670, total reward: 9.8800
Episode 1808, loss: 138.7368, total reward: 10.5300
Episode 1809, loss: 0.2412, total reward: 0.0100
Episode 1810, loss: 2.5158, total reward: 0.1100
Episode 1811, loss: 2.1057, total reward: 0.4300
Episode 1812, loss: 159.2133, total reward: 9.9300
Episode 1813, loss: 2.1421, total reward: 0.2200
Episode 1814, loss: -7.8223, total reward: -0.9900
Episode 1815, loss: 2.2306, total reward: 0.2200
Episode 1816, loss: 1.4262, total reward: 0.0100
Episode 1817, loss: -0.8388, total reward: -0.0900
Episode 1818, loss: 1.3168, total reward: 0.0200
Episode 1819, loss: -0.1504, total reward: 0.0200
Episode 1820, loss: 3.9598, total reward: 0.4200
Episode 1821, loss: 0.1976, total reward: 0.0100
Episode 1822, loss: 1.5258, total reward: 0.1100
Ep

Episode 1967, loss: 136.3271, total reward: 10.1700
Episode 1968, loss: 2.0477, total reward: 0.2200
Episode 1969, loss: 167.7652, total reward: 10.4000
Episode 1970, loss: 2.5158, total reward: 0.2000
Episode 1971, loss: 153.4196, total reward: 10.2400
Episode 1972, loss: 9.2622, total reward: 0.6300
Episode 1973, loss: 224.6312, total reward: 10.1000
Episode 1974, loss: 0.0881, total reward: 0.3200
Episode 1975, loss: 5.3525, total reward: 0.4200
Episode 1976, loss: 205.1950, total reward: 10.3500
Episode 1977, loss: 190.6081, total reward: 10.1900
Episode 1978, loss: -0.5302, total reward: 0.2200
Episode 1979, loss: 210.2292, total reward: 10.6600
Episode 1980, loss: 210.2441, total reward: 10.4700
Episode 1981, loss: 146.2922, total reward: 10.5800
Episode 1982, loss: 2.8294, total reward: 0.2200
Episode 1983, loss: 149.6741, total reward: 10.3500
Episode 1984, loss: 198.0084, total reward: 10.2500
Episode 1985, loss: 3.5751, total reward: 0.5200
Episode 1986, loss: 200.9892, total

Episode 2126, loss: 8.4847, total reward: 0.7300
Episode 2127, loss: 167.6365, total reward: 10.2200
Episode 2128, loss: 179.5921, total reward: 10.8000
Episode 2129, loss: 127.6542, total reward: 10.2500
Episode 2130, loss: 129.1665, total reward: 10.3600
Episode 2131, loss: 172.1887, total reward: 10.1600
Episode 2132, loss: -1.4456, total reward: 0.1100
Episode 2133, loss: 166.8190, total reward: 10.3100
Episode 2134, loss: -2.0778, total reward: -0.0100
Episode 2135, loss: 143.5915, total reward: 10.0700
Episode 2136, loss: 202.3814, total reward: 10.0800
Episode 2137, loss: 0.1028, total reward: 0.1100
Episode 2138, loss: 1.3379, total reward: 0.3200
Episode 2139, loss: 155.7300, total reward: 10.2900
Episode 2140, loss: 4.4834, total reward: 0.4200
Episode 2141, loss: 3.0686, total reward: 0.2200
Episode 2142, loss: 157.3430, total reward: 10.0700
Episode 2143, loss: 99.9688, total reward: 10.1400
Episode 2144, loss: 186.0880, total reward: 9.9500
Episode 2145, loss: 1.6369, tota

Episode 2289, loss: 4.5629, total reward: 0.5200
Episode 2290, loss: 6.4348, total reward: 0.7300
Episode 2291, loss: 114.9052, total reward: 10.0700
Episode 2292, loss: 190.1797, total reward: 10.4600
Episode 2293, loss: 187.9806, total reward: 10.5700
Episode 2294, loss: 0.1156, total reward: 0.1100
Episode 2295, loss: 5.2570, total reward: 0.6300
Episode 2296, loss: -1.3087, total reward: -0.0900
Episode 2297, loss: 146.1110, total reward: 10.6500
Episode 2298, loss: -0.7939, total reward: 0.0100
Episode 2299, loss: 74.3152, total reward: 10.0900
Episode 2300, loss: 4.0865, total reward: 0.5200
Episode 2301, loss: 5.1830, total reward: 0.3200
Episode 2302, loss: 178.7226, total reward: 10.8000
Episode 2303, loss: 87.9418, total reward: 10.0100
Episode 2304, loss: 4.9193, total reward: 0.6300
Episode 2305, loss: -1.7922, total reward: 0.0100
Episode 2306, loss: 1.3358, total reward: 0.3200
Episode 2307, loss: 5.2874, total reward: 0.7300
Episode 2308, loss: 245.2568, total reward: 10

Episode 2451, loss: 154.0997, total reward: 10.2100
Episode 2452, loss: 199.9144, total reward: 10.1200
Episode 2453, loss: 3.1018, total reward: 0.4200
Episode 2454, loss: 5.1573, total reward: 0.4200
Episode 2455, loss: 129.7717, total reward: 10.1700
Episode 2456, loss: 174.3380, total reward: 10.4700
Episode 2457, loss: 6.9070, total reward: 0.6300
Episode 2458, loss: 235.6404, total reward: 10.6000
Episode 2459, loss: 205.1333, total reward: 10.2300
Episode 2460, loss: 204.4907, total reward: 10.3800
Episode 2461, loss: -0.7484, total reward: -0.0300
Episode 2462, loss: 4.6363, total reward: 0.5200
Episode 2463, loss: 168.4127, total reward: 10.4700
Episode 2464, loss: -0.0812, total reward: 0.3200
Episode 2465, loss: 158.0752, total reward: 10.3000
Episode 2466, loss: 149.8139, total reward: 10.0000
Episode 2467, loss: 141.6141, total reward: 10.3400
Episode 2468, loss: 182.9937, total reward: 10.2500
Episode 2469, loss: 165.6307, total reward: 10.1400
Episode 2470, loss: 3.8972,

Episode 2614, loss: 3.4654, total reward: 0.5200
Episode 2615, loss: 132.0529, total reward: 10.5100
Episode 2616, loss: 196.5718, total reward: 10.5000
Episode 2617, loss: 6.4910, total reward: 0.7300
Episode 2618, loss: 115.0273, total reward: 10.2200
Episode 2619, loss: 135.0905, total reward: 10.3600
Episode 2620, loss: 3.0252, total reward: 0.5200
Episode 2621, loss: 3.4731, total reward: 0.4200
Episode 2622, loss: 12.6420, total reward: 0.8300
Episode 2623, loss: -1.1672, total reward: 0.2200
Episode 2624, loss: 219.3688, total reward: 10.7400
Episode 2625, loss: 7.7478, total reward: 0.9300
Episode 2626, loss: 5.0068, total reward: 0.8000
Episode 2627, loss: 5.2784, total reward: 0.4200
Episode 2628, loss: 148.2658, total reward: 10.4300
Episode 2629, loss: 5.7114, total reward: 0.6300
Episode 2630, loss: 76.8781, total reward: 10.0900
Episode 2631, loss: 185.2191, total reward: 10.2200
Episode 2632, loss: 88.6431, total reward: 10.4000
Episode 2633, loss: 2.5373, total reward: 

Episode 2777, loss: 1.1808, total reward: 0.3200
Episode 2778, loss: 4.1814, total reward: 0.6100
Episode 2779, loss: 3.0858, total reward: 0.3200
Episode 2780, loss: 0.3174, total reward: -0.0900
Episode 2781, loss: 2.9132, total reward: 0.4200
Episode 2782, loss: 1.0520, total reward: 0.3200
Episode 2783, loss: 112.6270, total reward: 10.6400
Episode 2784, loss: 0.5603, total reward: 0.1100
Episode 2785, loss: 92.6882, total reward: 10.1300
Episode 2786, loss: -1.3410, total reward: 0.1100
Episode 2787, loss: 3.3997, total reward: 0.3200
Episode 2788, loss: 0.2263, total reward: 0.2200
Episode 2789, loss: 4.3008, total reward: 0.4200
Episode 2790, loss: 95.9523, total reward: 10.3700
Episode 2791, loss: 102.7047, total reward: 10.3100
Episode 2792, loss: 1.3514, total reward: 0.4200
Episode 2793, loss: 1.2804, total reward: 0.3200
Episode 2794, loss: 2.9247, total reward: 0.3500
Episode 2795, loss: 3.1010, total reward: 0.4200
Episode 2796, loss: 1.7608, total reward: 0.3200
Episode 

Episode 2940, loss: 107.8554, total reward: 10.1900
Episode 2941, loss: 138.9023, total reward: 10.5900
Episode 2942, loss: 20.6771, total reward: 10.1600
Episode 2943, loss: 204.5890, total reward: 10.0600
Episode 2944, loss: 86.6727, total reward: 10.1700
Episode 2945, loss: 165.9883, total reward: 10.3800
Episode 2946, loss: 4.3135, total reward: 0.4200
Episode 2947, loss: 70.1803, total reward: 10.2600
Episode 2948, loss: 136.5046, total reward: 10.0500
Episode 2949, loss: 0.2724, total reward: 0.0100
Episode 2950, loss: -0.2503, total reward: 0.1100
Episode 2951, loss: 2.1632, total reward: 0.4200
Episode 2952, loss: -0.8994, total reward: 0.0100
Episode 2953, loss: 7.7021, total reward: 0.9300
Episode 2954, loss: -1.5867, total reward: -0.1900
Episode 2955, loss: 3.8126, total reward: 0.6300
Episode 2956, loss: 0.5240, total reward: 0.0100
Episode 2957, loss: 4.2513, total reward: 0.4200
Episode 2958, loss: 1.5356, total reward: 0.6300
Episode 2959, loss: 172.1582, total reward: 

Episode 3101, loss: 2.0302, total reward: 0.3200
Episode 3102, loss: 64.4704, total reward: 10.1000
Episode 3103, loss: 133.9796, total reward: 10.2600
Episode 3104, loss: 119.3800, total reward: 10.1600
Episode 3105, loss: 198.1646, total reward: 10.1700
Episode 3106, loss: 192.9300, total reward: 10.2500
Episode 3107, loss: 195.8585, total reward: 10.5800
Episode 3108, loss: 3.9917, total reward: 0.4200
Episode 3109, loss: 192.1309, total reward: 10.2200
Episode 3110, loss: -1.3476, total reward: 0.0100
Episode 3111, loss: 5.3241, total reward: 0.4200
Episode 3112, loss: 0.1775, total reward: 9.9900
Episode 3113, loss: 0.2102, total reward: 0.0400
Episode 3114, loss: 159.7346, total reward: 10.1900
Episode 3115, loss: 170.6428, total reward: 10.4500
Episode 3116, loss: 162.3525, total reward: 10.9200
Episode 3117, loss: 116.8935, total reward: 10.1300
Episode 3118, loss: 157.4833, total reward: 10.3500
Episode 3119, loss: 223.4239, total reward: 10.4800
Episode 3120, loss: 186.8792, 

Episode 3263, loss: 75.2166, total reward: 10.1200
Episode 3264, loss: 93.8585, total reward: 10.1800
Episode 3265, loss: 2.8655, total reward: 0.3200
Episode 3266, loss: 8.4158, total reward: 0.8300
Episode 3267, loss: 2.3046, total reward: 0.3200
Episode 3268, loss: 2.0232, total reward: 0.2200
Episode 3269, loss: 6.5150, total reward: 0.6300
Episode 3270, loss: 154.0548, total reward: 10.1800
Episode 3271, loss: 3.2476, total reward: 0.4200
Episode 3272, loss: 5.5396, total reward: 0.6300
Episode 3273, loss: 158.5649, total reward: 10.2000
Episode 3274, loss: 204.8183, total reward: 10.1600
Episode 3275, loss: 3.2690, total reward: 0.4200
Episode 3276, loss: 4.0235, total reward: 0.6300
Episode 3277, loss: -3.5568, total reward: -0.1800
Episode 3278, loss: 0.9078, total reward: 0.2200
Episode 3279, loss: 109.1269, total reward: 10.3400
Episode 3280, loss: -2.0752, total reward: -0.0900
Episode 3281, loss: 33.9157, total reward: 9.9800
Episode 3282, loss: 84.1914, total reward: 10.34

Episode 3427, loss: 2.7535, total reward: 0.3200
Episode 3428, loss: 5.9615, total reward: 0.7300
Episode 3429, loss: 176.0325, total reward: 9.9600
Episode 3430, loss: 165.1736, total reward: 10.3500
Episode 3431, loss: 104.1057, total reward: 10.2300
Episode 3432, loss: 3.3949, total reward: 0.3200
Episode 3433, loss: 178.8362, total reward: 10.0200
Episode 3434, loss: 160.8144, total reward: 10.3900
Episode 3435, loss: 1.7111, total reward: 0.3200
Episode 3436, loss: 70.9250, total reward: 10.2100
Episode 3437, loss: 42.8310, total reward: 10.0900
Episode 3438, loss: 111.5112, total reward: 10.2100
Episode 3439, loss: 5.0186, total reward: 0.6300
Episode 3440, loss: 121.6587, total reward: 10.1900
Episode 3441, loss: 88.4003, total reward: 10.0200
Episode 3442, loss: 141.0084, total reward: 10.1700
Episode 3443, loss: 70.8715, total reward: 10.2300
Episode 3444, loss: 121.6264, total reward: 10.3100
Episode 3445, loss: 3.3016, total reward: 0.5200
Episode 3446, loss: 184.4426, total

Episode 3589, loss: 120.3328, total reward: 10.2100
Episode 3590, loss: 1.6072, total reward: 0.4200
Episode 3591, loss: -0.0373, total reward: 0.0400
Episode 3592, loss: 182.1458, total reward: 10.4300
Episode 3593, loss: 1.1477, total reward: 0.2200
Episode 3594, loss: 4.8473, total reward: 0.5200
Episode 3595, loss: 116.9617, total reward: 10.1400
Episode 3596, loss: 1.9806, total reward: 0.3200
Episode 3597, loss: 4.1254, total reward: 0.7300
Episode 3598, loss: 2.5158, total reward: 0.4200
Episode 3599, loss: 2.8767, total reward: 0.4200
Episode 3600, loss: 2.3971, total reward: 0.2200
Episode 3601, loss: 133.0762, total reward: 10.3000
Episode 3602, loss: 5.8400, total reward: 0.7300
Episode 3603, loss: -0.5246, total reward: 0.1100
Episode 3604, loss: 5.2718, total reward: 0.7300
Episode 3605, loss: 6.3113, total reward: 0.7300
Episode 3606, loss: 11.8939, total reward: 0.9300
Episode 3607, loss: 158.5864, total reward: 10.0000
Episode 3608, loss: 3.0341, total reward: 0.4200
Ep

Episode 3753, loss: 0.5517, total reward: 0.3200
Episode 3754, loss: -0.4055, total reward: 0.1100
Episode 3755, loss: 81.6571, total reward: 10.2200
Episode 3756, loss: 120.7684, total reward: 10.1100
Episode 3757, loss: 99.2507, total reward: 10.0200
Episode 3758, loss: 141.2532, total reward: 10.1900
Episode 3759, loss: 0.2435, total reward: 0.2200
Episode 3760, loss: 0.5205, total reward: 0.3200
Episode 3761, loss: 131.2902, total reward: 9.9200
Episode 3762, loss: 2.6986, total reward: 0.4200
Episode 3763, loss: -0.0433, total reward: -0.1300
Episode 3764, loss: 20.3302, total reward: 10.1600
Episode 3765, loss: 124.5731, total reward: 10.1800
Episode 3766, loss: 109.8971, total reward: 10.5800
Episode 3767, loss: 4.2822, total reward: 0.6300
Episode 3768, loss: 99.5557, total reward: 10.0900
Episode 3769, loss: 110.5938, total reward: 10.5300
Episode 3770, loss: 136.5562, total reward: 10.6000
Episode 3771, loss: 5.0220, total reward: 0.5200
Episode 3772, loss: 137.2118, total re

Episode 3917, loss: 2.0296, total reward: 0.4200
Episode 3918, loss: -0.1159, total reward: 0.1100
Episode 3919, loss: 2.3928, total reward: 0.4200
Episode 3920, loss: -0.6502, total reward: 0.2200
Episode 3921, loss: 104.1590, total reward: 10.2500
Episode 3922, loss: 83.6434, total reward: 10.2400
Episode 3923, loss: 0.7100, total reward: 0.2200
Episode 3924, loss: -0.0311, total reward: 0.1100
Episode 3925, loss: 2.5123, total reward: 0.5200
Episode 3926, loss: 78.5147, total reward: 10.5400
Episode 3927, loss: 86.5460, total reward: 10.3600
Episode 3928, loss: 73.2750, total reward: 9.8300
Episode 3929, loss: 85.2161, total reward: 10.3700
Episode 3930, loss: -0.5290, total reward: -0.0900
Episode 3931, loss: 2.0250, total reward: 0.4000
Episode 3932, loss: 113.2459, total reward: 9.9300
Episode 3933, loss: 74.0004, total reward: 10.3400
Episode 3934, loss: 1.3378, total reward: 0.4200
Episode 3935, loss: 96.2855, total reward: 10.3400
Episode 3936, loss: 9.4247, total reward: 10.1

Episode 4078, loss: 148.1524, total reward: 10.3200
Episode 4079, loss: 3.2591, total reward: 0.4200
Episode 4080, loss: 100.6263, total reward: 10.5500
Episode 4081, loss: 104.7961, total reward: 10.2700
Episode 4082, loss: 116.2360, total reward: 10.1300
Episode 4083, loss: 115.3152, total reward: 10.4600
Episode 4084, loss: 106.6937, total reward: 10.6400
Episode 4085, loss: -0.8686, total reward: -0.1900
Episode 4086, loss: 2.8237, total reward: 0.6300
Episode 4087, loss: 1.6187, total reward: 0.2200
Episode 4088, loss: 4.1204, total reward: 0.6300
Episode 4089, loss: 53.4789, total reward: 10.2500
Episode 4090, loss: 82.2016, total reward: 10.4300
Episode 4091, loss: 82.3103, total reward: 10.0200
Episode 4092, loss: 1.3513, total reward: 0.5200
Episode 4093, loss: 104.2061, total reward: 10.5900
Episode 4094, loss: -0.9334, total reward: 0.0100
Episode 4095, loss: 85.7842, total reward: 10.2000
Episode 4096, loss: 91.8588, total reward: 10.4000
Episode 4097, loss: 4.0838, total r

Episode 4242, loss: 4.0656, total reward: 0.7300
Episode 4243, loss: 0.9232, total reward: 0.4200
Episode 4244, loss: 17.6557, total reward: 10.1400
Episode 4245, loss: 67.6597, total reward: 10.2200
Episode 4246, loss: 3.7677, total reward: 0.5200
Episode 4247, loss: 89.0453, total reward: 10.4000
Episode 4248, loss: 2.3075, total reward: 0.4200
Episode 4249, loss: 83.3223, total reward: 10.0800
Episode 4250, loss: -8.3063, total reward: -0.8300
Episode 4251, loss: 80.5134, total reward: 10.4300
Episode 4252, loss: 88.7616, total reward: 10.1200
Episode 4253, loss: 38.4528, total reward: 9.8900
Episode 4254, loss: 107.9174, total reward: 10.3800
Episode 4255, loss: 99.5292, total reward: 10.2500
Episode 4256, loss: 102.1277, total reward: 10.8000
Episode 4257, loss: 151.4991, total reward: 10.7400
Episode 4258, loss: 120.8819, total reward: 10.4400
Episode 4259, loss: 74.1040, total reward: 10.0700
Episode 4260, loss: -0.9242, total reward: 0.0100
Episode 4261, loss: 1.5761, total rew

Episode 4405, loss: 82.1990, total reward: 10.1800
Episode 4406, loss: 122.4988, total reward: 10.3800
Episode 4407, loss: 50.2760, total reward: 10.0900
Episode 4408, loss: 0.6291, total reward: 0.0100
Episode 4409, loss: 2.1423, total reward: 0.2200
Episode 4410, loss: 52.2606, total reward: 10.0700
Episode 4411, loss: 107.9863, total reward: 10.2900
Episode 4412, loss: 2.7510, total reward: 0.3200
Episode 4413, loss: 174.3703, total reward: 10.7000
Episode 4414, loss: 120.0900, total reward: 10.5300
Episode 4415, loss: 112.0687, total reward: 9.8300
Episode 4416, loss: -2.0517, total reward: 0.0100
Episode 4417, loss: -0.4215, total reward: 0.2200
Episode 4418, loss: 76.5798, total reward: 10.2100
Episode 4419, loss: 3.0146, total reward: 0.4200
Episode 4420, loss: -1.2644, total reward: 0.1100
Episode 4421, loss: 129.4959, total reward: 10.5500
Episode 4422, loss: 122.2358, total reward: 10.6800
Episode 4423, loss: 0.7374, total reward: 0.5200
Episode 4424, loss: 114.1496, total re

Episode 4567, loss: 3.3656, total reward: 0.4200
Episode 4568, loss: 185.3888, total reward: 10.7800
Episode 4569, loss: 3.7259, total reward: 0.4200
Episode 4570, loss: 4.2317, total reward: 0.2200
Episode 4571, loss: 2.0154, total reward: 0.5200
Episode 4572, loss: 157.9866, total reward: 10.4200
Episode 4573, loss: 133.2148, total reward: 10.4500
Episode 4574, loss: 2.3994, total reward: 0.3200
Episode 4575, loss: 3.5481, total reward: 0.4200
Episode 4576, loss: 150.0833, total reward: 10.2100
Episode 4577, loss: 4.7603, total reward: 0.4200
Episode 4578, loss: 1.9683, total reward: 0.3200
Episode 4579, loss: 191.8602, total reward: 10.5200
Episode 4580, loss: 157.9406, total reward: 10.4000
Episode 4581, loss: 146.7288, total reward: 10.4500
Episode 4582, loss: 2.5895, total reward: 0.4200
Episode 4583, loss: 156.0365, total reward: 10.0200
Episode 4584, loss: -3.4871, total reward: -0.1600
Episode 4585, loss: 153.9463, total reward: 10.4900
Episode 4586, loss: 173.8105, total rewa

Episode 4730, loss: 164.7148, total reward: 10.2300
Episode 4731, loss: -0.1241, total reward: 0.2200
Episode 4732, loss: 90.5055, total reward: 9.9900
Episode 4733, loss: 175.6311, total reward: 10.5100
Episode 4734, loss: 1.6264, total reward: 0.5200
Episode 4735, loss: 181.7613, total reward: 10.3200
Episode 4736, loss: 104.9104, total reward: 10.3500
Episode 4737, loss: 0.7725, total reward: 0.2200
Episode 4738, loss: 159.2672, total reward: 10.3500
Episode 4739, loss: 174.6070, total reward: 10.0700
Episode 4740, loss: 167.6322, total reward: 10.2600
Episode 4741, loss: 183.3869, total reward: 10.6300
Episode 4742, loss: 0.6035, total reward: 0.0100
Episode 4743, loss: 158.9317, total reward: 10.5100
Episode 4744, loss: 138.6749, total reward: 10.1900
Episode 4745, loss: 127.2402, total reward: 10.3700
Episode 4746, loss: 226.4632, total reward: 10.5000
Episode 4747, loss: 157.2353, total reward: 10.2500
Episode 4748, loss: 174.5002, total reward: 10.4700
Episode 4749, loss: 89.24

Episode 4892, loss: 136.5787, total reward: 10.1300
Episode 4893, loss: 145.5681, total reward: 10.3300
Episode 4894, loss: 164.1485, total reward: 10.7900
Episode 4895, loss: 114.9614, total reward: 10.2100
Episode 4896, loss: 160.2066, total reward: 10.1200
Episode 4897, loss: 173.8672, total reward: 10.0900
Episode 4898, loss: 173.0524, total reward: 10.2400
Episode 4899, loss: 141.0749, total reward: 10.2300
Episode 4900, loss: 146.8012, total reward: 10.4800
Episode 4901, loss: 1.3264, total reward: 0.1100
Episode 4902, loss: 75.8068, total reward: 10.0000
Episode 4903, loss: 166.9656, total reward: 10.1600
Episode 4904, loss: 106.6210, total reward: 10.9000
Episode 4905, loss: 3.5612, total reward: 0.6300
Episode 4906, loss: 147.6466, total reward: 10.2700
Episode 4907, loss: 162.8404, total reward: 10.1800
Episode 4908, loss: 155.7948, total reward: 10.5600
Episode 4909, loss: 147.3651, total reward: 10.3900
Episode 4910, loss: 155.4075, total reward: 10.8700
Episode 4911, loss:

Episode 5052, loss: 104.4368, total reward: 10.1500
Episode 5053, loss: 147.7772, total reward: 10.2700
Episode 5054, loss: 3.8208, total reward: 0.5200
Episode 5055, loss: 146.2527, total reward: 10.6900
Episode 5056, loss: 3.5790, total reward: 0.6300
Episode 5057, loss: 176.0503, total reward: 10.4300
Episode 5058, loss: 1.6819, total reward: 0.3200
Episode 5059, loss: -0.2347, total reward: 0.4200
Episode 5060, loss: 134.1544, total reward: 10.4900
Episode 5061, loss: 73.7533, total reward: 10.2300
Episode 5062, loss: 128.8538, total reward: 10.4300
Episode 5063, loss: 109.1145, total reward: 10.2200
Episode 5064, loss: 0.9220, total reward: 0.4200
Episode 5065, loss: 143.3220, total reward: 10.5300
Episode 5066, loss: 102.5058, total reward: 10.5300
Episode 5067, loss: 62.7352, total reward: 10.3800
Episode 5068, loss: 106.5355, total reward: 10.3100
Episode 5069, loss: 108.0748, total reward: 10.4300
Episode 5070, loss: 84.0371, total reward: 10.5300
Episode 5071, loss: 5.2894, t

Episode 5215, loss: 3.3721, total reward: 0.5200
Episode 5216, loss: 0.0446, total reward: 0.3200
Episode 5217, loss: 114.2565, total reward: 10.7100
Episode 5218, loss: 101.8160, total reward: 10.6800
Episode 5219, loss: 53.9944, total reward: 10.2600
Episode 5220, loss: -0.4136, total reward: 0.2200
Episode 5221, loss: 1.6812, total reward: 0.2200
Episode 5222, loss: 35.2732, total reward: 9.9700
Episode 5223, loss: -0.8590, total reward: 0.0100
Episode 5224, loss: 118.7061, total reward: 10.2600
Episode 5225, loss: 66.9526, total reward: 10.3300
Episode 5226, loss: -1.1631, total reward: -0.0900
Episode 5227, loss: 4.2856, total reward: 0.5200
Episode 5228, loss: 97.8277, total reward: 10.4300
Episode 5229, loss: 89.8604, total reward: 10.2400
Episode 5230, loss: 1.1229, total reward: 0.4200
Episode 5231, loss: 2.4114, total reward: 0.4200
Episode 5232, loss: -1.3945, total reward: 0.1100
Episode 5233, loss: 45.9647, total reward: 10.5900
Episode 5234, loss: 0.6526, total reward: 0.

Episode 5378, loss: 1.8698, total reward: 0.4200
Episode 5379, loss: 50.2636, total reward: 10.2200
Episode 5380, loss: 0.9717, total reward: 0.2200
Episode 5381, loss: 91.8384, total reward: 10.1100
Episode 5382, loss: 45.0086, total reward: 10.2200
Episode 5383, loss: 112.6165, total reward: 10.4800
Episode 5384, loss: 125.1569, total reward: 10.5600
Episode 5385, loss: -2.0791, total reward: 0.1100
Episode 5386, loss: 160.3675, total reward: 10.2500
Episode 5387, loss: 88.7525, total reward: 10.4100
Episode 5388, loss: 136.4328, total reward: 10.4600
Episode 5389, loss: 156.6403, total reward: 10.1500
Episode 5390, loss: -0.7920, total reward: 0.1100
Episode 5391, loss: 2.7312, total reward: 0.6300
Episode 5392, loss: 56.2572, total reward: 10.1000
Episode 5393, loss: 2.5109, total reward: 0.5200
Episode 5394, loss: 50.3427, total reward: 10.3100
Episode 5395, loss: 119.9630, total reward: 10.1900
Episode 5396, loss: 4.1159, total reward: 0.4200
Episode 5397, loss: 74.4719, total re

Episode 5541, loss: 0.2750, total reward: 0.3200
Episode 5542, loss: 101.2233, total reward: 10.4400
Episode 5543, loss: 44.7794, total reward: 10.2300
Episode 5544, loss: 0.3015, total reward: 0.3200
Episode 5545, loss: 114.3979, total reward: 10.1700
Episode 5546, loss: 2.5410, total reward: 0.5200
Episode 5547, loss: 70.4235, total reward: 10.2000
Episode 5548, loss: 0.8491, total reward: 0.5200
Episode 5549, loss: 1.0525, total reward: 0.3000
Episode 5550, loss: 82.0250, total reward: 10.6600
Episode 5551, loss: 123.4617, total reward: 10.4400
Episode 5552, loss: 3.6960, total reward: 0.7300
Episode 5553, loss: 2.6994, total reward: 0.5200
Episode 5554, loss: 128.3747, total reward: 10.4300
Episode 5555, loss: 2.0859, total reward: 0.5200
Episode 5556, loss: 6.7731, total reward: 9.9900
Episode 5557, loss: 4.4050, total reward: 0.6300
Episode 5558, loss: 3.3389, total reward: 0.6300
Episode 5559, loss: 95.5872, total reward: 10.0800
Episode 5560, loss: 1.1148, total reward: 0.4200


Episode 5704, loss: 1.1892, total reward: 0.4200
Episode 5705, loss: 0.7372, total reward: 0.4200
Episode 5706, loss: 3.1007, total reward: 0.4200
Episode 5707, loss: 113.1412, total reward: 10.3300
Episode 5708, loss: 97.4537, total reward: 10.3300
Episode 5709, loss: 0.4141, total reward: 0.4200
Episode 5710, loss: 68.8567, total reward: 10.4300
Episode 5711, loss: 161.7784, total reward: 10.3200
Episode 5712, loss: 58.9469, total reward: 10.1000
Episode 5713, loss: 110.3384, total reward: 10.5600
Episode 5714, loss: -1.2140, total reward: 0.2200
Episode 5715, loss: 0.2563, total reward: 0.2200
Episode 5716, loss: 85.8044, total reward: 10.4000
Episode 5717, loss: 147.3305, total reward: 10.1200
Episode 5718, loss: 90.4859, total reward: 10.3000
Episode 5719, loss: 1.0981, total reward: 0.4200
Episode 5720, loss: 5.6338, total reward: 0.7600
Episode 5721, loss: 1.0647, total reward: 0.6300
Episode 5722, loss: 1.0709, total reward: 0.3200
Episode 5723, loss: 0.7919, total reward: 0.32

Episode 5867, loss: 4.3612, total reward: 0.5200
Episode 5868, loss: 10.0597, total reward: 10.1700
Episode 5869, loss: 31.0992, total reward: 10.2300
Episode 5870, loss: 106.6056, total reward: 10.2500
Episode 5871, loss: 113.7245, total reward: 10.1300
Episode 5872, loss: 138.2529, total reward: 10.1300
Episode 5873, loss: 84.2944, total reward: 10.4700
Episode 5874, loss: 85.2179, total reward: 10.2500
Episode 5875, loss: 1.6729, total reward: 0.5300
Episode 5876, loss: 89.6352, total reward: 10.4600
Episode 5877, loss: 0.7787, total reward: 0.3200
Episode 5878, loss: 2.0077, total reward: 0.4200
Episode 5879, loss: 4.7748, total reward: 10.0700
Episode 5880, loss: 6.1586, total reward: 10.0700
Episode 5881, loss: 128.6176, total reward: 10.0400
Episode 5882, loss: 3.0684, total reward: 0.3200
Episode 5883, loss: 47.9432, total reward: 10.2000
Episode 5884, loss: 1.5788, total reward: 0.5200
Episode 5885, loss: 0.3224, total reward: 0.1100
Episode 5886, loss: 1.8748, total reward: 0

Episode 6028, loss: 2.0738, total reward: 0.6300
Episode 6029, loss: 1.3178, total reward: 0.3200
Episode 6030, loss: 53.2948, total reward: 10.4500
Episode 6031, loss: 140.6680, total reward: 10.7500
Episode 6032, loss: 115.6769, total reward: 10.1400
Episode 6033, loss: 55.8974, total reward: 10.3500
Episode 6034, loss: 1.0623, total reward: 0.4200
Episode 6035, loss: 90.1766, total reward: 10.3600
Episode 6036, loss: 3.2494, total reward: 0.4500
Episode 6037, loss: 15.2904, total reward: 10.1500
Episode 6038, loss: 89.0805, total reward: 10.6100
Episode 6039, loss: 1.0157, total reward: 0.2200
Episode 6040, loss: 3.4862, total reward: 0.5200
Episode 6041, loss: 3.0149, total reward: 0.5200
Episode 6042, loss: 66.6851, total reward: 10.2600
Episode 6043, loss: 65.9310, total reward: 10.1800
Episode 6044, loss: 0.8033, total reward: 0.4200
Episode 6045, loss: 1.3143, total reward: 0.2200
Episode 6046, loss: 2.5242, total reward: 0.7300
Episode 6047, loss: 0.4210, total reward: 0.0800


Episode 6191, loss: 30.8606, total reward: 10.0400
Episode 6192, loss: 123.6632, total reward: 10.2000
Episode 6193, loss: 77.9786, total reward: 10.1100
Episode 6194, loss: 1.8345, total reward: 0.4100
Episode 6195, loss: 126.9537, total reward: 10.3400
Episode 6196, loss: 102.5474, total reward: 10.0200
Episode 6197, loss: 90.2461, total reward: 10.4000
Episode 6198, loss: 106.7941, total reward: 10.3600
Episode 6199, loss: 99.3414, total reward: 10.3200
Episode 6200, loss: 2.5746, total reward: 0.4200
Episode 6201, loss: 1.1941, total reward: 0.3200
Episode 6202, loss: -1.9005, total reward: -0.0100
Episode 6203, loss: 81.9738, total reward: 10.1900
Episode 6204, loss: 91.4756, total reward: 10.2000
Episode 6205, loss: 86.3284, total reward: 10.4700
Episode 6206, loss: -0.1594, total reward: 0.3200
Episode 6207, loss: 1.6525, total reward: 0.1100
Episode 6208, loss: 16.0815, total reward: 10.1300
Episode 6209, loss: 83.7242, total reward: 10.4000
Episode 6210, loss: 1.6581, total re

Episode 6354, loss: 19.6767, total reward: 10.1400
Episode 6355, loss: 124.8236, total reward: 10.2700
Episode 6356, loss: 0.3524, total reward: 0.1100
Episode 6357, loss: 122.6560, total reward: 10.1700
Episode 6358, loss: 51.4363, total reward: 10.2200
Episode 6359, loss: 150.6055, total reward: 10.0700
Episode 6360, loss: 109.1765, total reward: 10.3200
Episode 6361, loss: 110.4079, total reward: 10.1900
Episode 6362, loss: 80.4826, total reward: 10.3600
Episode 6363, loss: 61.3021, total reward: 10.2500
Episode 6364, loss: 88.7650, total reward: 10.3500
Episode 6365, loss: 1.7168, total reward: 0.4200
Episode 6366, loss: -1.3786, total reward: 0.1100
Episode 6367, loss: 63.9202, total reward: 10.3000
Episode 6368, loss: 98.3572, total reward: 10.0800
Episode 6369, loss: 58.5756, total reward: 9.9200
Episode 6370, loss: 89.0664, total reward: 10.6700
Episode 6371, loss: 68.9551, total reward: 10.5900
Episode 6372, loss: 69.7912, total reward: 10.2800
Episode 6373, loss: 1.2704, tota

Episode 6518, loss: 26.2536, total reward: 10.2300
Episode 6519, loss: 2.6384, total reward: 0.4200
Episode 6520, loss: 96.6370, total reward: 10.4100
Episode 6521, loss: 2.7699, total reward: 0.5200
Episode 6522, loss: 116.5994, total reward: 10.3300
Episode 6523, loss: 0.1595, total reward: 0.1100
Episode 6524, loss: -1.2393, total reward: 0.0100
Episode 6525, loss: 0.3380, total reward: 0.2200
Episode 6526, loss: 3.0986, total reward: 0.8300
Episode 6527, loss: 45.1032, total reward: 10.1300
Episode 6528, loss: 1.7598, total reward: 0.6100
Episode 6529, loss: 2.7986, total reward: 0.7300
Episode 6530, loss: 1.9695, total reward: 0.4200
Episode 6531, loss: 56.6677, total reward: 10.4600
Episode 6532, loss: 2.3801, total reward: 0.4200
Episode 6533, loss: 88.2092, total reward: 10.1700
Episode 6534, loss: 103.2557, total reward: 10.1400
Episode 6535, loss: 51.0885, total reward: 10.8500
Episode 6536, loss: 2.1471, total reward: 0.5200
Episode 6537, loss: 44.5887, total reward: 10.0100

Episode 6681, loss: 118.4520, total reward: 10.2700
Episode 6682, loss: 2.9037, total reward: 0.5200
Episode 6683, loss: 0.8096, total reward: 0.1600
Episode 6684, loss: -0.0087, total reward: 0.1100
Episode 6685, loss: 93.0168, total reward: 10.4600
Episode 6686, loss: 51.0427, total reward: 10.2300
Episode 6687, loss: 111.5641, total reward: 10.4400
Episode 6688, loss: 1.7965, total reward: 0.5200
Episode 6689, loss: 84.3784, total reward: 10.4000
Episode 6690, loss: 78.5898, total reward: 10.4400
Episode 6691, loss: 1.2437, total reward: 0.4000
Episode 6692, loss: 4.7846, total reward: 10.1400
Episode 6693, loss: 0.2075, total reward: 0.4200
Episode 6694, loss: 1.1184, total reward: 0.4200
Episode 6695, loss: 0.7239, total reward: 0.1100
Episode 6696, loss: 0.6701, total reward: 0.3200
Episode 6697, loss: 41.7833, total reward: 10.0700
Episode 6698, loss: 61.5188, total reward: 10.1800
Episode 6699, loss: 54.4944, total reward: 10.0100
Episode 6700, loss: 2.4127, total reward: 0.730

Episode 6845, loss: 96.3003, total reward: 10.5600
Episode 6846, loss: 96.7157, total reward: 10.3900
Episode 6847, loss: 12.2764, total reward: 10.0400
Episode 6848, loss: 99.0005, total reward: 10.1900
Episode 6849, loss: 98.0200, total reward: 10.2100
Episode 6850, loss: 108.0161, total reward: 10.5500
Episode 6851, loss: 79.6625, total reward: 10.2500
Episode 6852, loss: 103.7467, total reward: 10.2600
Episode 6853, loss: 28.0616, total reward: 10.2100
Episode 6854, loss: 3.8965, total reward: 0.6300
Episode 6855, loss: 69.5743, total reward: 10.5500
Episode 6856, loss: 58.6796, total reward: 10.2300
Episode 6857, loss: 68.7847, total reward: 10.4400
Episode 6858, loss: 116.5102, total reward: 10.6900
Episode 6859, loss: 1.4995, total reward: 0.4200
Episode 6860, loss: 0.4893, total reward: 0.1100
Episode 6861, loss: 144.4950, total reward: 10.4600
Episode 6862, loss: 100.4606, total reward: 10.1000
Episode 6863, loss: 73.6063, total reward: 10.2900
Episode 6864, loss: 108.4227, to

Episode 7005, loss: 79.1538, total reward: 10.2900
Episode 7006, loss: 54.6835, total reward: 10.3000
Episode 7007, loss: 86.0267, total reward: 10.5100
Episode 7008, loss: 43.7845, total reward: 10.3200
Episode 7009, loss: 95.4516, total reward: 10.4700
Episode 7010, loss: 107.4289, total reward: 10.2600
Episode 7011, loss: 1.5431, total reward: 0.6300
Episode 7012, loss: 52.4210, total reward: 10.6100
Episode 7013, loss: 38.2279, total reward: 10.3400
Episode 7014, loss: 71.0313, total reward: 10.6700
Episode 7015, loss: 0.9999, total reward: 0.2200
Episode 7016, loss: 0.4204, total reward: 0.3200
Episode 7017, loss: 98.0346, total reward: 10.3300
Episode 7018, loss: 1.0582, total reward: 0.3200
Episode 7019, loss: 70.1762, total reward: 10.2200
Episode 7020, loss: 0.1468, total reward: 0.0100
Episode 7021, loss: 17.1510, total reward: 10.2400
Episode 7022, loss: 98.5487, total reward: 10.6200
Episode 7023, loss: 66.9159, total reward: 10.5700
Episode 7024, loss: 29.5970, total rewar

Episode 7169, loss: 2.4621, total reward: 0.6300
Episode 7170, loss: 59.9407, total reward: 10.3100
Episode 7171, loss: 2.3110, total reward: 0.7300
Episode 7172, loss: 40.7061, total reward: 10.4900
Episode 7173, loss: 35.2420, total reward: 10.2000
Episode 7174, loss: 100.9681, total reward: 10.4100
Episode 7175, loss: 89.6632, total reward: 10.2700
Episode 7176, loss: 97.1683, total reward: 10.6900
Episode 7177, loss: -0.5757, total reward: 0.2200
Episode 7178, loss: 63.6705, total reward: 10.1400
Episode 7179, loss: 92.2693, total reward: 10.0900
Episode 7180, loss: 2.2985, total reward: 0.5200
Episode 7181, loss: 59.8110, total reward: 10.5600
Episode 7182, loss: 45.4551, total reward: 10.3800
Episode 7183, loss: 86.8904, total reward: 10.2800
Episode 7184, loss: 2.1883, total reward: 0.4200
Episode 7185, loss: 0.4552, total reward: 0.3200
Episode 7186, loss: 80.9765, total reward: 10.6300
Episode 7187, loss: 99.5571, total reward: 10.4400
Episode 7188, loss: 91.2715, total reward

Episode 7332, loss: 1.1521, total reward: 0.2200
Episode 7333, loss: 49.4506, total reward: 10.1700
Episode 7334, loss: 1.0236, total reward: 0.4200
Episode 7335, loss: 99.5352, total reward: 10.2800
Episode 7336, loss: 2.3461, total reward: 0.6300
Episode 7337, loss: 67.6142, total reward: 10.4000
Episode 7338, loss: 49.2222, total reward: 10.6400
Episode 7339, loss: 68.0690, total reward: 10.1100
Episode 7340, loss: 4.9985, total reward: 10.0500
Episode 7341, loss: 97.3512, total reward: 10.3600
Episode 7342, loss: 3.8793, total reward: 0.8300
Episode 7343, loss: 63.0463, total reward: 10.4100
Episode 7344, loss: 0.4611, total reward: 0.4200
Episode 7345, loss: 93.5313, total reward: 10.5600
Episode 7346, loss: 57.5190, total reward: 10.0500
Episode 7347, loss: 66.1672, total reward: 10.7200
Episode 7348, loss: 2.8932, total reward: 0.6300
Episode 7349, loss: 40.4457, total reward: 10.1900
Episode 7350, loss: 4.8760, total reward: 0.7300
Episode 7351, loss: 47.4019, total reward: 10.

Episode 7495, loss: 102.2141, total reward: 10.3600
Episode 7496, loss: 2.3057, total reward: 0.4200
Episode 7497, loss: 77.0879, total reward: 10.4200
Episode 7498, loss: 66.4447, total reward: 10.3100
Episode 7499, loss: 2.4781, total reward: 0.6300
Episode 7500, loss: 64.9033, total reward: 10.5300
Episode 7501, loss: 73.5037, total reward: 10.2200
Episode 7502, loss: 40.3980, total reward: 10.0700
Episode 7503, loss: 139.5348, total reward: 10.1900
Episode 7504, loss: 0.8547, total reward: 0.3900
Episode 7505, loss: -1.0207, total reward: -0.0900
Episode 7506, loss: 77.4305, total reward: 10.2400
Episode 7507, loss: 2.1502, total reward: 0.5200
Episode 7508, loss: 0.6601, total reward: 0.1800
Episode 7509, loss: 2.1056, total reward: 0.6300
Episode 7510, loss: 0.4541, total reward: 0.0100
Episode 7511, loss: 76.1074, total reward: 9.9500
Episode 7512, loss: 0.3598, total reward: 0.1100
Episode 7513, loss: 4.2713, total reward: 0.4200
Episode 7514, loss: 0.0121, total reward: 0.2200

Episode 7658, loss: 1.5345, total reward: 0.7300
Episode 7659, loss: 1.7401, total reward: 0.8300
Episode 7660, loss: 1.4209, total reward: 0.7300
Episode 7661, loss: 1.9071, total reward: 0.5200
Episode 7662, loss: 51.2881, total reward: 10.1600
Episode 7663, loss: 63.7940, total reward: 10.3700
Episode 7664, loss: 78.2866, total reward: 10.3200
Episode 7665, loss: 29.0539, total reward: 10.6200
Episode 7666, loss: 1.6082, total reward: 0.7100
Episode 7667, loss: 0.0490, total reward: 0.2200
Episode 7668, loss: 35.3125, total reward: 10.6000
Episode 7669, loss: 66.2859, total reward: 10.6400
Episode 7670, loss: 0.3398, total reward: 0.5200
Episode 7671, loss: 169.7657, total reward: 10.1900
Episode 7672, loss: 2.1307, total reward: 0.4200
Episode 7673, loss: 30.5633, total reward: 10.4800
Episode 7674, loss: 45.9712, total reward: 10.2400
Episode 7675, loss: -0.4007, total reward: 0.3200
Episode 7676, loss: 67.7713, total reward: 10.4200
Episode 7677, loss: 21.0347, total reward: 10.2

Episode 7821, loss: 1.6148, total reward: 0.2200
Episode 7822, loss: 1.7065, total reward: 0.5200
Episode 7823, loss: 69.0412, total reward: 10.6100
Episode 7824, loss: 1.4373, total reward: 0.3000
Episode 7825, loss: 3.7455, total reward: 10.4300
Episode 7826, loss: 41.1350, total reward: 10.4300
Episode 7827, loss: 1.9512, total reward: 0.6300
Episode 7828, loss: 27.2191, total reward: 9.8000
Episode 7829, loss: 56.3946, total reward: 10.4900
Episode 7830, loss: 0.1955, total reward: 0.3200
Episode 7831, loss: 36.9517, total reward: 10.6300
Episode 7832, loss: 1.8989, total reward: 0.6300
Episode 7833, loss: 47.9630, total reward: 10.1700
Episode 7834, loss: 3.7517, total reward: 0.7300
Episode 7835, loss: 98.5238, total reward: 11.0800
Episode 7836, loss: 73.9449, total reward: 10.5500
Episode 7837, loss: 69.1324, total reward: 10.1500
Episode 7838, loss: 60.4257, total reward: 10.3700
Episode 7839, loss: 60.9188, total reward: 10.2500
Episode 7840, loss: 33.9982, total reward: 10.1

Episode 7984, loss: 0.6248, total reward: 0.4200
Episode 7985, loss: 39.9206, total reward: 10.3000
Episode 7986, loss: 57.8339, total reward: 10.4100
Episode 7987, loss: 28.6205, total reward: 10.5600
Episode 7988, loss: -0.1334, total reward: 0.2200
Episode 7989, loss: 77.8050, total reward: 10.2700
Episode 7990, loss: 1.4172, total reward: 0.2200
Episode 7991, loss: 88.1027, total reward: 10.2600
Episode 7992, loss: 38.5935, total reward: 10.4800
Episode 7993, loss: 95.7869, total reward: 10.3900
Episode 7994, loss: 38.2671, total reward: 10.1600
Episode 7995, loss: 55.8353, total reward: 10.4400
Episode 7996, loss: 51.2163, total reward: 10.9600
Episode 7997, loss: 33.9012, total reward: 10.3200
Episode 7998, loss: 50.3740, total reward: 9.9400
Episode 7999, loss: 20.5802, total reward: 10.4200
Checkpoint saved at episode 8000 to datasets/rl_sort_transformer_easy/list6_transformer3_128_gamma098_step120/ckpt_8000_6.6160.pth
Episode 8000, loss: 21.4150, total reward: 10.0700
Episode 

Episode 8145, loss: 42.1504, total reward: 10.4100
Episode 8146, loss: 1.6335, total reward: 10.0500
Episode 8147, loss: 12.6449, total reward: 10.2400
Episode 8148, loss: 15.4415, total reward: 10.4100
Episode 8149, loss: 0.3817, total reward: 0.3200
Episode 8150, loss: 36.6975, total reward: 10.2000
Episode 8151, loss: 23.8574, total reward: 9.9700
Episode 8152, loss: 25.0374, total reward: 10.4100
Episode 8153, loss: 0.7827, total reward: 0.6300
Episode 8154, loss: 2.1579, total reward: 0.7300
Episode 8155, loss: 26.2702, total reward: 10.2400
Episode 8156, loss: 0.5985, total reward: 0.5200
Episode 8157, loss: 39.2176, total reward: 10.3500
Episode 8158, loss: 33.1347, total reward: 10.1000
Episode 8159, loss: 0.1507, total reward: 0.2200
Episode 8160, loss: 0.8066, total reward: 0.3000
Episode 8161, loss: 30.5585, total reward: 10.4400
Episode 8162, loss: 1.7710, total reward: 0.6300
Episode 8163, loss: 29.6666, total reward: 10.3100
Episode 8164, loss: 25.9378, total reward: 10.0

Episode 8309, loss: 0.2333, total reward: 0.6300
Episode 8310, loss: -0.5008, total reward: 0.2300
Episode 8311, loss: -0.0629, total reward: 0.1100
Episode 8312, loss: -0.1608, total reward: 0.6800
Episode 8313, loss: 23.8905, total reward: 10.2500
Episode 8314, loss: 16.2545, total reward: 10.2300
Episode 8315, loss: 50.5545, total reward: 10.2800
Episode 8316, loss: 1.1632, total reward: 0.6300
Episode 8317, loss: 0.8282, total reward: 0.7300
Episode 8318, loss: 0.1906, total reward: 0.3200
Episode 8319, loss: 10.0416, total reward: 10.3900
Episode 8320, loss: 17.4161, total reward: 10.0800
Episode 8321, loss: 9.7851, total reward: 10.3800
Episode 8322, loss: 3.8619, total reward: 10.3100
Episode 8323, loss: 39.7595, total reward: 10.4400
Episode 8324, loss: 28.1545, total reward: 10.1100
Episode 8325, loss: 31.0375, total reward: 10.6000
Episode 8326, loss: -0.0383, total reward: 0.3200
Episode 8327, loss: 1.8077, total reward: 0.4200
Episode 8328, loss: 33.0725, total reward: 10.8

Episode 8473, loss: 22.3389, total reward: 10.4700
Episode 8474, loss: 1.2942, total reward: 0.4200
Episode 8475, loss: -0.4392, total reward: 0.1100
Episode 8476, loss: -0.0415, total reward: 0.3200
Episode 8477, loss: 0.3021, total reward: 0.5200
Episode 8478, loss: -1.6654, total reward: 0.0100
Episode 8479, loss: 16.2483, total reward: 10.0100
Episode 8480, loss: 0.9329, total reward: 0.6300
Episode 8481, loss: 1.0922, total reward: 0.3200
Episode 8482, loss: 0.5187, total reward: 0.7300
Episode 8483, loss: 22.3885, total reward: 9.9900
Episode 8484, loss: 35.6709, total reward: 10.5400
Episode 8485, loss: 0.0419, total reward: 0.2200
Episode 8486, loss: 27.5126, total reward: 10.6100
Episode 8487, loss: 0.0524, total reward: 0.3200
Episode 8488, loss: 42.7302, total reward: 10.1600
Episode 8489, loss: -0.3675, total reward: 0.3200
Episode 8490, loss: 1.4566, total reward: 0.3200
Episode 8491, loss: -0.1855, total reward: 0.2200
Episode 8492, loss: 2.5668, total reward: 0.8300
Epis

Episode 8637, loss: 0.4687, total reward: 0.6300
Episode 8638, loss: 0.7936, total reward: 0.3200
Episode 8639, loss: 18.7881, total reward: 10.4400
Episode 8640, loss: 0.3656, total reward: 0.4200
Episode 8641, loss: 39.7012, total reward: 10.3600
Episode 8642, loss: 2.0207, total reward: 0.1200
Episode 8643, loss: 0.2973, total reward: 0.4200
Episode 8644, loss: 0.3826, total reward: 0.4200
Episode 8645, loss: 1.0681, total reward: 0.4200
Episode 8646, loss: 16.6748, total reward: 10.1900
Episode 8647, loss: 36.6098, total reward: 10.5100
Episode 8648, loss: 1.0715, total reward: 0.6300
Episode 8649, loss: -0.2849, total reward: 0.0100
Episode 8650, loss: 50.4093, total reward: 10.3400
Episode 8651, loss: 0.2631, total reward: 0.3000
Episode 8652, loss: -0.0288, total reward: 0.3200
Episode 8653, loss: 39.9218, total reward: 10.2500
Episode 8654, loss: 6.6228, total reward: 10.4200
Episode 8655, loss: 0.8992, total reward: 0.2200
Episode 8656, loss: 0.3662, total reward: 0.5200
Episo

Episode 8801, loss: 29.3916, total reward: 10.2200
Episode 8802, loss: 26.1562, total reward: 10.4400
Episode 8803, loss: 0.2141, total reward: 0.1100
Episode 8804, loss: 26.4673, total reward: 10.4200
Episode 8805, loss: 0.1780, total reward: 0.3200
Episode 8806, loss: -0.4668, total reward: 0.1100
Episode 8807, loss: 0.0967, total reward: 0.1100
Episode 8808, loss: 0.5011, total reward: 0.3200
Episode 8809, loss: 0.3991, total reward: 0.1200
Episode 8810, loss: 5.8625, total reward: 10.3100
Episode 8811, loss: -0.0298, total reward: 0.2200
Episode 8812, loss: 0.0059, total reward: 0.2200
Episode 8813, loss: 0.0547, total reward: 0.1100
Episode 8814, loss: 1.6771, total reward: 0.4200
Episode 8815, loss: 0.0077, total reward: 0.2200
Episode 8816, loss: 0.4972, total reward: 0.4200
Episode 8817, loss: 11.7024, total reward: 10.3100
Episode 8818, loss: 1.2144, total reward: 0.5200
Episode 8819, loss: 19.5283, total reward: 10.5100
Episode 8820, loss: 2.5771, total reward: 0.4200
Episode

Episode 8966, loss: 1.3118, total reward: 0.6300
Episode 8967, loss: 66.2462, total reward: 10.7700
Episode 8968, loss: 46.7751, total reward: 10.4500
Episode 8969, loss: 0.1138, total reward: 0.1100
Episode 8970, loss: 22.3789, total reward: 10.3500
Episode 8971, loss: -0.2620, total reward: 0.1100
Episode 8972, loss: 0.7676, total reward: 0.6300
Episode 8973, loss: 12.3584, total reward: 10.1400
Episode 8974, loss: -0.9113, total reward: 0.1100
Episode 8975, loss: -0.6079, total reward: -0.0400
Episode 8976, loss: -0.3205, total reward: 0.3200
Episode 8977, loss: 0.0135, total reward: 0.2200
Episode 8978, loss: 49.6186, total reward: 10.5100
Episode 8979, loss: 19.0861, total reward: 10.2800
Episode 8980, loss: 124.4685, total reward: 10.6300
Episode 8981, loss: -0.0266, total reward: 0.2200
Episode 8982, loss: -0.7118, total reward: -0.2100
Episode 8983, loss: 0.1544, total reward: 0.5200
Episode 8984, loss: 19.8042, total reward: 10.6400
Episode 8985, loss: 30.4009, total reward: 1

Episode 9127, loss: 2.7763, total reward: 10.1900
Episode 9128, loss: 13.2854, total reward: 10.1300
Episode 9129, loss: 2.7812, total reward: 1.1400
Episode 9130, loss: 42.6011, total reward: 10.7000
Episode 9131, loss: 1.1169, total reward: 0.4200
Episode 9132, loss: 46.3294, total reward: 10.5100
Episode 9133, loss: 13.1600, total reward: 10.4200
Episode 9134, loss: 47.1189, total reward: 10.5900
Episode 9135, loss: 23.4038, total reward: 10.5300
Episode 9136, loss: 45.5771, total reward: 10.2200
Episode 9137, loss: 9.1861, total reward: 10.4200
Episode 9138, loss: 13.8837, total reward: 10.1900
Episode 9139, loss: 3.4607, total reward: 10.4100
Episode 9140, loss: 8.4385, total reward: 10.4600
Episode 9141, loss: 18.5620, total reward: 10.6400
Episode 9142, loss: 29.6492, total reward: 10.4500
Episode 9143, loss: 34.7695, total reward: 10.5600
Episode 9144, loss: 63.6713, total reward: 10.8400
Episode 9145, loss: 5.7404, total reward: 10.4200
Episode 9146, loss: 37.7046, total rewar

Episode 9290, loss: 52.7164, total reward: 10.5600
Episode 9291, loss: 26.2908, total reward: 10.6800
Episode 9292, loss: 18.7667, total reward: 10.3600
Episode 9293, loss: 42.0391, total reward: 10.6500
Episode 9294, loss: 45.6769, total reward: 10.4800
Episode 9295, loss: 30.1505, total reward: 10.3700
Episode 9296, loss: 39.4936, total reward: 10.6500
Episode 9297, loss: 46.8478, total reward: 10.4800
Episode 9298, loss: 12.0808, total reward: 10.3900
Episode 9299, loss: 21.9265, total reward: 10.8400
Episode 9300, loss: 17.2594, total reward: 10.1200
Episode 9301, loss: 18.2214, total reward: 10.8400
Episode 9302, loss: 11.6467, total reward: 10.2900
Episode 9303, loss: 3.8515, total reward: 10.5700
Episode 9304, loss: 4.4263, total reward: 10.6500
Episode 9305, loss: 29.0617, total reward: 10.0500
Episode 9306, loss: 5.9386, total reward: 10.1300
Episode 9307, loss: 5.8405, total reward: 10.4200
Episode 9308, loss: 20.8957, total reward: 10.2000
Episode 9309, loss: 41.8232, total 

Episode 9453, loss: 0.4259, total reward: 0.6300
Episode 9454, loss: 21.5833, total reward: 10.1100
Episode 9455, loss: 0.0876, total reward: 0.3200
Episode 9456, loss: 0.1462, total reward: 0.2700
Episode 9457, loss: 4.2679, total reward: 10.4200
Episode 9458, loss: 13.7718, total reward: 10.4300
Episode 9459, loss: -0.5203, total reward: -0.2700
Episode 9460, loss: 8.8103, total reward: 10.5500
Episode 9461, loss: 11.3777, total reward: 10.3000
Episode 9462, loss: 17.0039, total reward: 10.2400
Episode 9463, loss: 12.3819, total reward: 10.7100
Episode 9464, loss: 22.0590, total reward: 10.7800
Episode 9465, loss: 0.0103, total reward: 0.5200
Episode 9466, loss: 0.2167, total reward: 0.3200
Episode 9467, loss: 17.0122, total reward: 10.1900
Episode 9468, loss: 2.2852, total reward: 10.4200
Episode 9469, loss: 9.0575, total reward: 9.8800
Episode 9470, loss: 41.7074, total reward: 10.8500
Episode 9471, loss: 0.7280, total reward: 0.8300
Episode 9472, loss: 0.5077, total reward: 0.3200

Episode 9617, loss: 4.3652, total reward: 10.1600
Episode 9618, loss: 12.9954, total reward: 10.3700
Episode 9619, loss: 0.0977, total reward: 10.0300
Episode 9620, loss: 0.4876, total reward: 0.6300
Episode 9621, loss: -0.2072, total reward: 0.1100
Episode 9622, loss: 0.1549, total reward: 0.4200
Episode 9623, loss: 22.8790, total reward: 10.2400
Episode 9624, loss: 1.5704, total reward: 10.4200
Episode 9625, loss: 0.1945, total reward: 0.3200
Episode 9626, loss: 1.0139, total reward: 0.6300
Episode 9627, loss: 0.8818, total reward: 0.6300
Episode 9628, loss: 7.7453, total reward: 10.2200
Episode 9629, loss: 19.7658, total reward: 10.2000
Episode 9630, loss: 0.7073, total reward: 0.7300
Episode 9631, loss: 3.0374, total reward: 10.6700
Episode 9632, loss: 0.1686, total reward: 0.5200
Episode 9633, loss: 1.3525, total reward: 0.6300
Episode 9634, loss: 29.2225, total reward: 10.7600
Episode 9635, loss: 10.3393, total reward: 9.9200
Episode 9636, loss: 1.0501, total reward: 0.6300
Episo

Episode 9781, loss: 0.2816, total reward: 0.6300
Episode 9782, loss: 49.8947, total reward: 10.3900
Episode 9783, loss: 0.6313, total reward: 0.5200
Episode 9784, loss: 0.5243, total reward: 0.5200
Episode 9785, loss: 0.6111, total reward: 0.5200
Episode 9786, loss: 5.6058, total reward: 10.0700
Episode 9787, loss: 0.1501, total reward: 0.6300
Episode 9788, loss: 0.3150, total reward: 0.3200
Episode 9789, loss: 4.2708, total reward: 10.2800
Episode 9790, loss: 12.2707, total reward: 10.2400
Episode 9791, loss: 2.3401, total reward: 10.3900
Episode 9792, loss: 26.5544, total reward: 10.4000
Episode 9793, loss: 18.9556, total reward: 10.1400
Episode 9794, loss: 19.2467, total reward: 10.4100
Episode 9795, loss: 28.0769, total reward: 10.4700
Episode 9796, loss: 16.8317, total reward: 9.9100
Episode 9797, loss: 24.1084, total reward: 10.3700
Episode 9798, loss: 0.3360, total reward: 0.5200
Episode 9799, loss: 3.8951, total reward: 0.6300
Episode 9800, loss: 0.1677, total reward: 0.4200
Ep

Episode 9945, loss: 0.9667, total reward: 0.5200
Episode 9946, loss: 21.8269, total reward: 10.2600
Episode 9947, loss: 13.9358, total reward: 10.3700
Episode 9948, loss: 10.3396, total reward: 10.6400
Episode 9949, loss: 23.0601, total reward: 10.2400
Episode 9950, loss: 21.0940, total reward: 10.4500
Episode 9951, loss: 3.6878, total reward: 10.3400
Episode 9952, loss: 17.0081, total reward: 10.1800
Episode 9953, loss: 42.3894, total reward: 10.4200
Episode 9954, loss: 0.5373, total reward: 0.6900
Episode 9955, loss: 10.8557, total reward: 10.5300
Episode 9956, loss: 12.9692, total reward: 10.5400
Episode 9957, loss: 21.8665, total reward: 10.4600
Episode 9958, loss: 16.0158, total reward: 10.1600
Episode 9959, loss: 28.7157, total reward: 10.0500
Episode 9960, loss: 16.7586, total reward: 10.5200
Episode 9961, loss: 14.9496, total reward: 10.3600
Episode 9962, loss: 12.8087, total reward: 10.7300
Episode 9963, loss: 12.4400, total reward: 10.8400
Episode 9964, loss: 1.5303, total re

Episode 10104, loss: 0.1513, total reward: 0.5100
Episode 10105, loss: 2.7841, total reward: 10.5000
Episode 10106, loss: 9.3207, total reward: 9.9700
Episode 10107, loss: 0.3606, total reward: 0.5200
Episode 10108, loss: 0.0415, total reward: 0.3200
Episode 10109, loss: -0.0873, total reward: 0.3200
Episode 10110, loss: 0.2999, total reward: 0.3200
Episode 10111, loss: 44.1623, total reward: 10.3900
Episode 10112, loss: 35.9908, total reward: 10.1500
Episode 10113, loss: 25.9543, total reward: 10.0500
Episode 10114, loss: 0.1232, total reward: 0.3200
Episode 10115, loss: 35.3712, total reward: 10.3400
Episode 10116, loss: 7.5624, total reward: 10.4500
Episode 10117, loss: 26.0870, total reward: 10.6200
Episode 10118, loss: 42.6519, total reward: 10.4300
Episode 10119, loss: 11.6735, total reward: 10.4100
Episode 10120, loss: 3.6815, total reward: 10.2000
Episode 10121, loss: 0.0074, total reward: 0.2200
Episode 10122, loss: 44.3565, total reward: 10.2300
Episode 10123, loss: 1.0053, t

Episode 10265, loss: -0.0116, total reward: 0.2200
Episode 10266, loss: 0.0745, total reward: 10.0600
Episode 10267, loss: -0.2759, total reward: 0.1100
Episode 10268, loss: -0.3359, total reward: -0.1900
Episode 10269, loss: 0.8536, total reward: 0.5200
Episode 10270, loss: 0.9103, total reward: 0.6300
Episode 10271, loss: 10.8347, total reward: 10.6800
Episode 10272, loss: 33.7882, total reward: 10.6700
Episode 10273, loss: -0.1389, total reward: 0.1100
Episode 10274, loss: 0.2887, total reward: 0.5200
Episode 10275, loss: 17.7337, total reward: 10.4500
Episode 10276, loss: 13.5181, total reward: 10.4100
Episode 10277, loss: 0.2168, total reward: 0.1100
Episode 10278, loss: 10.2103, total reward: 10.2000
Episode 10279, loss: 17.4184, total reward: 10.2200
Episode 10280, loss: 0.3095, total reward: 0.3200
Episode 10281, loss: 30.2082, total reward: 10.6600
Episode 10282, loss: 11.4221, total reward: 10.2400
Episode 10283, loss: -0.1055, total reward: 0.2200
Episode 10284, loss: 36.139

Episode 10426, loss: 0.2042, total reward: 0.7300
Episode 10427, loss: 0.0191, total reward: 0.1100
Episode 10428, loss: 27.3237, total reward: 10.5300
Episode 10429, loss: 16.5252, total reward: 9.9700
Episode 10430, loss: 0.6523, total reward: 0.3200
Episode 10431, loss: 52.0466, total reward: 10.6800
Episode 10432, loss: 0.3687, total reward: 0.3200
Episode 10433, loss: 16.1251, total reward: 10.4800
Episode 10434, loss: -0.0416, total reward: 0.4000
Episode 10435, loss: 36.1032, total reward: 10.5800
Episode 10436, loss: 0.4158, total reward: 0.7300
Episode 10437, loss: 28.6084, total reward: 10.3400
Episode 10438, loss: 0.3487, total reward: 0.3200
Episode 10439, loss: 10.7267, total reward: 10.2400
Episode 10440, loss: 0.2324, total reward: 0.5200
Episode 10441, loss: 0.0076, total reward: 0.3200
Episode 10442, loss: 3.4599, total reward: 10.5700
Episode 10443, loss: 0.0433, total reward: 0.2200
Episode 10444, loss: 1.1428, total reward: 0.4200
Episode 10445, loss: 43.9475, total

Episode 10587, loss: 5.3530, total reward: 10.1200
Episode 10588, loss: 4.7505, total reward: 10.3200
Episode 10589, loss: 0.8198, total reward: 0.3200
Episode 10590, loss: 36.4424, total reward: 10.5300
Episode 10591, loss: 3.7501, total reward: 10.1600
Episode 10592, loss: 2.6078, total reward: 10.2900
Episode 10593, loss: 0.2570, total reward: 0.2200
Episode 10594, loss: 18.4086, total reward: 10.5700
Episode 10595, loss: 0.5052, total reward: 0.8300
Episode 10596, loss: 12.2916, total reward: 10.6800
Episode 10597, loss: 38.6223, total reward: 10.8400
Episode 10598, loss: 8.8147, total reward: 9.9400
Episode 10599, loss: 10.5801, total reward: 10.6000
Episode 10600, loss: 17.4692, total reward: 10.6800
Episode 10601, loss: 0.6855, total reward: 0.6300
Episode 10602, loss: 17.6379, total reward: 10.3300
Episode 10603, loss: -0.1338, total reward: 0.3200
Episode 10604, loss: 4.8815, total reward: 10.1000
Episode 10605, loss: 0.1730, total reward: 0.4200
Episode 10606, loss: 35.6999, 

Episode 10748, loss: 19.6495, total reward: 10.6300
Episode 10749, loss: 6.6634, total reward: 10.2200
Episode 10750, loss: 1.0503, total reward: 0.7300
Episode 10751, loss: 1.5239, total reward: 0.7300
Episode 10752, loss: 35.7149, total reward: 10.2100
Episode 10753, loss: 0.4071, total reward: 0.7300
Episode 10754, loss: 2.5346, total reward: 10.3800
Episode 10755, loss: 22.0301, total reward: 10.1500
Episode 10756, loss: 23.2957, total reward: 10.3300
Episode 10757, loss: 0.7617, total reward: 0.7300
Episode 10758, loss: 38.4875, total reward: 10.2600
Episode 10759, loss: 4.7162, total reward: 10.4200
Episode 10760, loss: 0.7131, total reward: 10.2600
Episode 10761, loss: 0.1079, total reward: 0.3200
Episode 10762, loss: 27.5093, total reward: 10.2200
Episode 10763, loss: 37.0171, total reward: 10.1500
Episode 10764, loss: 0.5332, total reward: 10.3600
Episode 10765, loss: 0.5866, total reward: 0.9300
Episode 10766, loss: 2.0675, total reward: 0.6300
Episode 10767, loss: 0.6747, to

Episode 10910, loss: 0.1913, total reward: 0.4200
Episode 10911, loss: 0.6390, total reward: 0.2200
Episode 10912, loss: 0.2468, total reward: 0.0100
Episode 10913, loss: 0.2461, total reward: 0.2200
Episode 10914, loss: 0.2923, total reward: 0.5200
Episode 10915, loss: 0.2707, total reward: 0.4200
Episode 10916, loss: 19.1291, total reward: 10.2300
Episode 10917, loss: -0.2077, total reward: -0.3800
Episode 10918, loss: 4.3154, total reward: 10.2200
Episode 10919, loss: 2.9321, total reward: 10.4000
Episode 10920, loss: 7.5424, total reward: 10.6100
Episode 10921, loss: 1.9854, total reward: 10.2700
Episode 10922, loss: 0.1717, total reward: 0.4900
Episode 10923, loss: 0.7045, total reward: 0.5200
Episode 10924, loss: 0.0969, total reward: 0.3200
Episode 10925, loss: 42.0053, total reward: 10.3500
Episode 10926, loss: 0.0878, total reward: 0.2200
Episode 10927, loss: 0.5092, total reward: 0.3200
Episode 10928, loss: 0.7268, total reward: 0.2200
Episode 10929, loss: -0.3032, total rewa

Episode 11068, loss: 46.0627, total reward: 10.4800
Episode 11069, loss: 16.0896, total reward: 10.4500
Episode 11070, loss: 0.0826, total reward: 0.5200
Episode 11071, loss: 1.4143, total reward: 0.9300
Episode 11072, loss: 3.7811, total reward: 10.3200
Episode 11073, loss: 7.7435, total reward: 10.9000
Episode 11074, loss: 0.1881, total reward: 0.6300
Episode 11075, loss: -0.0424, total reward: 0.4200
Episode 11076, loss: 21.1295, total reward: 10.8600
Episode 11077, loss: 67.6108, total reward: 10.3500
Episode 11078, loss: 22.3805, total reward: 10.5700
Episode 11079, loss: 8.4641, total reward: 10.3100
Episode 11080, loss: 0.8986, total reward: 10.3100
Episode 11081, loss: 1.9422, total reward: 0.4900
Episode 11082, loss: 9.4390, total reward: 10.5100
Episode 11083, loss: 12.3133, total reward: 10.5900
Episode 11084, loss: 1.4761, total reward: 0.6300
Episode 11085, loss: 18.4465, total reward: 10.3700
Episode 11086, loss: 40.6975, total reward: 10.5600
Episode 11087, loss: 35.7412

Episode 11229, loss: 42.2032, total reward: 10.4900
Episode 11230, loss: 20.2478, total reward: 10.1100
Episode 11231, loss: 19.4933, total reward: 10.7600
Episode 11232, loss: 1.0461, total reward: 0.5200
Episode 11233, loss: 0.2811, total reward: 0.8300
Episode 11234, loss: 11.0079, total reward: 10.1800
Episode 11235, loss: 0.4425, total reward: 10.4300
Episode 11236, loss: 16.2817, total reward: 10.7300
Episode 11237, loss: 37.5037, total reward: 10.4200
Episode 11238, loss: 32.0606, total reward: 10.4200
Episode 11239, loss: 22.8868, total reward: 10.4500
Episode 11240, loss: 1.9012, total reward: 10.2500
Episode 11241, loss: 5.0246, total reward: 10.8800
Episode 11242, loss: 5.8229, total reward: 10.4700
Episode 11243, loss: 14.0441, total reward: 10.5600
Episode 11244, loss: 0.2302, total reward: 0.4200
Episode 11245, loss: 0.7932, total reward: 10.1000
Episode 11246, loss: 29.5908, total reward: 10.2800
Episode 11247, loss: 0.0374, total reward: 0.3200
Episode 11248, loss: 18.2

Episode 11390, loss: 6.1660, total reward: 10.4200
Episode 11391, loss: 0.2568, total reward: 0.6300
Episode 11392, loss: 59.8028, total reward: 10.3400
Episode 11393, loss: 37.4292, total reward: 10.3200
Episode 11394, loss: 8.8974, total reward: 10.1800
Episode 11395, loss: 9.0604, total reward: 10.3300
Episode 11396, loss: 0.9241, total reward: 0.6300
Episode 11397, loss: 5.3745, total reward: 9.9900
Episode 11398, loss: 12.5861, total reward: 10.2600
Episode 11399, loss: -0.3350, total reward: 0.1100
Episode 11400, loss: 4.4046, total reward: 10.5000
Episode 11401, loss: 0.0375, total reward: 0.8300
Episode 11402, loss: 4.6727, total reward: 10.5500
Episode 11403, loss: 34.1634, total reward: 9.9500
Episode 11404, loss: 0.5528, total reward: 0.7300
Episode 11405, loss: 0.2013, total reward: 10.4200
Episode 11406, loss: 0.5984, total reward: 0.8100
Episode 11407, loss: 0.3173, total reward: 0.4200
Episode 11408, loss: 0.0576, total reward: 0.5200
Episode 11409, loss: 10.7672, total 

Episode 11551, loss: 0.4856, total reward: 0.6300
Episode 11552, loss: 21.6162, total reward: 10.6400
Episode 11553, loss: 0.0468, total reward: 0.4200
Episode 11554, loss: 5.3695, total reward: 10.0900
Episode 11555, loss: -0.0098, total reward: 0.2200
Episode 11556, loss: 23.3379, total reward: 10.3200
Episode 11557, loss: 54.1153, total reward: 10.4300
Episode 11558, loss: 0.0810, total reward: 0.5100
Episode 11559, loss: 5.1603, total reward: 10.4300
Episode 11560, loss: 0.3203, total reward: 10.4300
Episode 11561, loss: 0.5121, total reward: 0.6300
Episode 11562, loss: 4.7734, total reward: 10.4300
Episode 11563, loss: 26.6381, total reward: 10.7400
Episode 11564, loss: 10.3292, total reward: 10.5500
Episode 11565, loss: 2.0974, total reward: 10.6500
Episode 11566, loss: 5.0480, total reward: 10.1300
Episode 11567, loss: 9.6303, total reward: 9.9500
Episode 11568, loss: 36.3803, total reward: 10.1300
Episode 11569, loss: 9.8475, total reward: 10.3900
Episode 11570, loss: 3.6431, t

Episode 11712, loss: 24.2221, total reward: 10.0000
Episode 11713, loss: 3.0139, total reward: 10.7600
Episode 11714, loss: 0.8590, total reward: 0.5200
Episode 11715, loss: 17.7667, total reward: 10.5600
Episode 11716, loss: 0.0885, total reward: 0.4200
Episode 11717, loss: 0.4368, total reward: 10.3700
Episode 11718, loss: 13.3551, total reward: 10.3900
Episode 11719, loss: 5.4909, total reward: 10.0600
Episode 11720, loss: -0.0533, total reward: 0.0100
Episode 11721, loss: 0.0727, total reward: 10.0500
Episode 11722, loss: 31.9418, total reward: 10.2100
Episode 11723, loss: 12.1965, total reward: 10.5400
Episode 11724, loss: 2.1539, total reward: 10.3300
Episode 11725, loss: 13.2940, total reward: 10.4200
Episode 11726, loss: 0.5752, total reward: 10.5000
Episode 11727, loss: 6.7507, total reward: 10.1800
Episode 11728, loss: 5.3509, total reward: 10.1200
Episode 11729, loss: 8.3873, total reward: 10.1200
Episode 11730, loss: 8.9661, total reward: 10.3800
Episode 11731, loss: 19.458

Episode 11873, loss: 0.3660, total reward: 10.3100
Episode 11874, loss: 0.1039, total reward: 0.1100
Episode 11875, loss: 0.0173, total reward: 0.1100
Episode 11876, loss: 30.2077, total reward: 10.3500
Episode 11877, loss: 0.0031, total reward: 0.5200
Episode 11878, loss: 20.2451, total reward: 10.6600
Episode 11879, loss: 3.2679, total reward: 10.5500
Episode 11880, loss: 1.2240, total reward: 0.9300
Episode 11881, loss: -0.0331, total reward: 0.4000
Episode 11882, loss: 14.5274, total reward: 10.5500
Episode 11883, loss: 0.0879, total reward: 0.5200
Episode 11884, loss: 6.9644, total reward: 10.5500
Episode 11885, loss: 9.8172, total reward: 10.3100
Episode 11886, loss: 1.2785, total reward: 9.9300
Episode 11887, loss: 2.8903, total reward: 10.5400
Episode 11888, loss: 1.7470, total reward: 10.4100
Episode 11889, loss: -0.0393, total reward: 0.5100
Episode 11890, loss: 0.2101, total reward: 0.5900
Episode 11891, loss: 4.1824, total reward: 10.2200
Episode 11892, loss: 14.8398, total

Episode 12031, loss: 2.4713, total reward: 10.6200
Episode 12032, loss: 2.9423, total reward: 10.1900
Episode 12033, loss: 1.8021, total reward: 10.4500
Episode 12034, loss: 14.2909, total reward: 10.4500
Episode 12035, loss: 8.6696, total reward: 10.5800
Episode 12036, loss: 3.7791, total reward: 10.1400
Episode 12037, loss: 0.2595, total reward: 0.1100
Episode 12038, loss: 0.1228, total reward: 0.6800
Episode 12039, loss: 15.7883, total reward: 10.2300
Episode 12040, loss: 0.4672, total reward: 10.5800
Episode 12041, loss: 7.2405, total reward: 10.5100
Episode 12042, loss: 31.7150, total reward: 10.5600
Episode 12043, loss: 5.7332, total reward: 10.1400
Episode 12044, loss: 1.8637, total reward: 10.1200
Episode 12045, loss: 11.3549, total reward: 10.2600
Episode 12046, loss: 1.1196, total reward: 9.9600
Episode 12047, loss: 2.6682, total reward: 10.2500
Episode 12048, loss: 0.0186, total reward: 0.3200
Episode 12049, loss: 0.1243, total reward: 0.4200
Episode 12050, loss: 5.3266, tot

Episode 12193, loss: 0.0413, total reward: 0.3200
Episode 12194, loss: 1.3103, total reward: 10.1100
Episode 12195, loss: 0.0025, total reward: 0.2200
Episode 12196, loss: 0.8477, total reward: 0.5200
Episode 12197, loss: 11.9896, total reward: 10.2700
Episode 12198, loss: 0.5709, total reward: 0.3200
Episode 12199, loss: 0.4386, total reward: 10.0200
Episode 12200, loss: 0.1671, total reward: -0.0900
Episode 12201, loss: -0.4818, total reward: -0.0900
Episode 12202, loss: 0.1265, total reward: 0.5200
Episode 12203, loss: -0.0016, total reward: 0.0100
Episode 12204, loss: 6.2508, total reward: 10.3200
Episode 12205, loss: 0.1394, total reward: 0.8300
Episode 12206, loss: -0.0917, total reward: 0.1300
Episode 12207, loss: 0.0582, total reward: 0.3200
Episode 12208, loss: 0.0715, total reward: 0.1100
Episode 12209, loss: 0.0384, total reward: 0.2200
Episode 12210, loss: 0.0961, total reward: 0.3200
Episode 12211, loss: -0.0117, total reward: -0.0900
Episode 12212, loss: 0.7523, total rew

Episode 12355, loss: 0.3540, total reward: 0.6300
Episode 12356, loss: 33.4138, total reward: 10.1600
Episode 12357, loss: 9.6713, total reward: 10.6000
Episode 12358, loss: 20.9249, total reward: 10.5500
Episode 12359, loss: 1.6419, total reward: 10.4200
Episode 12360, loss: 8.3890, total reward: 10.3900
Episode 12361, loss: 8.5309, total reward: 10.5200
Episode 12362, loss: 7.2386, total reward: 10.9500
Episode 12363, loss: 32.8277, total reward: 10.7600
Episode 12364, loss: 0.8267, total reward: 10.7400
Episode 12365, loss: 0.7035, total reward: 0.7300
Episode 12366, loss: 5.6245, total reward: 10.3700
Episode 12367, loss: 0.9725, total reward: 0.6300
Episode 12368, loss: 0.7561, total reward: 10.6400
Episode 12369, loss: 1.0009, total reward: 10.1400
Episode 12370, loss: 9.9437, total reward: 10.4500
Episode 12371, loss: 11.8914, total reward: 10.1300
Episode 12372, loss: -0.0047, total reward: 0.1500
Episode 12373, loss: 0.5384, total reward: 10.2000
Episode 12374, loss: 2.4242, t

Episode 12517, loss: 1.1516, total reward: 10.5800
Episode 12518, loss: 1.5524, total reward: 10.5200
Episode 12519, loss: 14.2142, total reward: 10.2500
Episode 12520, loss: 0.0071, total reward: 0.2200
Episode 12521, loss: 30.7240, total reward: 10.1500
Episode 12522, loss: 0.0119, total reward: 10.1400
Episode 12523, loss: 0.0026, total reward: 0.4200
Episode 12524, loss: 8.0983, total reward: 10.7700
Episode 12525, loss: 0.7445, total reward: 0.4200
Episode 12526, loss: 0.0558, total reward: 10.4100
Episode 12527, loss: 0.1345, total reward: 0.8300
Episode 12528, loss: -0.1642, total reward: 0.2200
Episode 12529, loss: 6.1930, total reward: 10.4500
Episode 12530, loss: 9.4968, total reward: 10.3100
Episode 12531, loss: 12.2095, total reward: 10.7600
Episode 12532, loss: 1.7832, total reward: 10.5500
Episode 12533, loss: 1.4413, total reward: 10.3300
Episode 12534, loss: 0.0306, total reward: 0.5200
Episode 12535, loss: 0.8799, total reward: 10.8600
Episode 12536, loss: 0.0107, tota

Episode 12678, loss: 0.4932, total reward: 0.7300
Episode 12679, loss: 1.1113, total reward: 10.6400
Episode 12680, loss: 0.0624, total reward: 0.3200
Episode 12681, loss: 8.2041, total reward: 10.3500
Episode 12682, loss: 1.4694, total reward: 10.5200
Episode 12683, loss: 0.6753, total reward: 10.4900
Episode 12684, loss: 8.3864, total reward: 10.3400
Episode 12685, loss: 0.3419, total reward: 10.1300
Episode 12686, loss: 7.5646, total reward: 10.6800
Episode 12687, loss: 0.7491, total reward: 0.5200
Episode 12688, loss: 10.8762, total reward: 10.3500
Episode 12689, loss: 0.0259, total reward: 10.1500
Episode 12690, loss: 0.2900, total reward: 0.6300
Episode 12691, loss: 2.5518, total reward: 10.4400
Episode 12692, loss: 3.0867, total reward: 10.1800
Episode 12693, loss: 9.9956, total reward: 10.2300
Episode 12694, loss: 0.0162, total reward: 10.3200
Episode 12695, loss: 1.1089, total reward: 10.3300
Episode 12696, loss: 1.3537, total reward: 10.3200
Episode 12697, loss: 0.0979, total

Episode 12839, loss: -0.1276, total reward: -0.0900
Episode 12840, loss: -0.2028, total reward: -0.0900
Episode 12841, loss: -0.0793, total reward: 0.2200
Episode 12842, loss: -0.0488, total reward: 0.3200
Episode 12843, loss: 0.0188, total reward: 0.3200
Episode 12844, loss: -0.2301, total reward: 0.1100
Episode 12845, loss: -0.4039, total reward: 0.0100
Episode 12846, loss: -0.1277, total reward: -0.0900
Episode 12847, loss: -0.1254, total reward: -0.3200
Episode 12848, loss: -0.2891, total reward: 0.1100
Episode 12849, loss: -0.3376, total reward: -0.1900
Episode 12850, loss: 0.4074, total reward: 0.2200
Episode 12851, loss: -0.1899, total reward: 0.2200
Episode 12852, loss: 5.6205, total reward: 10.1300
Episode 12853, loss: -0.0465, total reward: -0.0900
Episode 12854, loss: -0.2950, total reward: -0.0400
Episode 12855, loss: 0.0803, total reward: 0.2200
Episode 12856, loss: -0.1290, total reward: 0.1100
Episode 12857, loss: -0.0515, total reward: 0.3200
Episode 12858, loss: 16.682

Checkpoint saved at episode 13000 to datasets/rl_sort_transformer_easy/list6_transformer3_128_gamma098_step120/ckpt_13000_5.7069.pth
Episode 13000, loss: -0.1079, total reward: 0.0100
Episode 13001, loss: -0.1172, total reward: 0.2200
Episode 13002, loss: 0.3900, total reward: 0.2200
Episode 13003, loss: -0.3360, total reward: -0.0900
Episode 13004, loss: 19.2550, total reward: 10.2200
Episode 13005, loss: 0.1638, total reward: 0.0100
Episode 13006, loss: -0.1950, total reward: -0.0900
Episode 13007, loss: 23.7445, total reward: 10.2100
Episode 13008, loss: -0.5816, total reward: 0.0100
Episode 13009, loss: -0.3847, total reward: -0.0900
Episode 13010, loss: 0.0815, total reward: 0.1800
Episode 13011, loss: -0.0045, total reward: 0.2200
Episode 13012, loss: 0.2836, total reward: 0.0100
Episode 13013, loss: -0.3469, total reward: -0.0900
Episode 13014, loss: 0.0159, total reward: 0.0100
Episode 13015, loss: -1.1130, total reward: -0.0900
Episode 13016, loss: 55.1761, total reward: 9.970

Episode 13159, loss: -0.1124, total reward: 0.0100
Episode 13160, loss: 0.0982, total reward: 0.1100
Episode 13161, loss: -2.1570, total reward: -0.2700
Episode 13162, loss: -0.1010, total reward: 0.2200
Episode 13163, loss: -0.9739, total reward: -0.1900
Episode 13164, loss: 0.8470, total reward: 0.1100
Episode 13165, loss: 10.3192, total reward: 10.2300
Episode 13166, loss: -0.5434, total reward: 0.1100
Episode 13167, loss: -0.2568, total reward: -0.0900
Episode 13168, loss: -0.4175, total reward: 0.0100
Episode 13169, loss: -0.1993, total reward: 0.0100
Episode 13170, loss: 0.1328, total reward: 0.1100
Episode 13171, loss: -0.1276, total reward: 0.1100
Episode 13172, loss: -1.2092, total reward: 0.0100
Episode 13173, loss: -2.6018, total reward: -0.2500
Episode 13174, loss: 0.0728, total reward: 0.2200
Episode 13175, loss: 0.1662, total reward: 0.1100
Episode 13176, loss: -0.0379, total reward: 0.0100
Episode 13177, loss: -1.4786, total reward: -0.0900
Episode 13178, loss: 0.1352, t

Episode 13320, loss: 4.4179, total reward: 10.5800
Episode 13321, loss: 0.2781, total reward: 10.3400
Episode 13322, loss: 3.0777, total reward: 10.0300
Episode 13323, loss: 8.2284, total reward: 10.2900
Episode 13324, loss: 4.2914, total reward: 10.3900
Episode 13325, loss: 4.8768, total reward: 10.4800
Episode 13326, loss: 10.7341, total reward: 10.6700
Episode 13327, loss: 1.5656, total reward: 10.4700
Episode 13328, loss: 0.6661, total reward: 9.8900
Episode 13329, loss: -0.0247, total reward: 0.0100
Episode 13330, loss: 15.3455, total reward: 10.2800
Episode 13331, loss: 4.6246, total reward: 10.1700
Episode 13332, loss: 11.0253, total reward: 10.3200
Episode 13333, loss: 0.7422, total reward: 0.8300
Episode 13334, loss: 1.5245, total reward: 10.3400
Episode 13335, loss: 6.7287, total reward: 10.1000
Episode 13336, loss: 11.4797, total reward: 10.4500
Episode 13337, loss: 11.3996, total reward: 10.2400
Episode 13338, loss: 1.6154, total reward: 10.4500
Episode 13339, loss: 2.4378,

Episode 13481, loss: 8.3127, total reward: 10.2900
Episode 13482, loss: 5.2229, total reward: 10.4800
Episode 13483, loss: 2.9147, total reward: 9.9700
Episode 13484, loss: 2.3766, total reward: 10.5300
Episode 13485, loss: 0.0589, total reward: 0.4200
Episode 13486, loss: 12.3734, total reward: 10.2800
Episode 13487, loss: 5.4349, total reward: 10.6200
Episode 13488, loss: 3.3143, total reward: 10.3000
Episode 13489, loss: 0.0701, total reward: 0.8300
Episode 13490, loss: 34.5609, total reward: 10.3900
Episode 13491, loss: 1.9700, total reward: 10.3300
Episode 13492, loss: 1.0010, total reward: 0.5200
Episode 13493, loss: 0.0437, total reward: 0.3200
Episode 13494, loss: 0.1373, total reward: 0.8300
Episode 13495, loss: 4.8861, total reward: 10.5500
Episode 13496, loss: 0.7024, total reward: 0.6300
Episode 13497, loss: 1.7677, total reward: 0.8300
Episode 13498, loss: 0.0272, total reward: 10.9600
Episode 13499, loss: 4.3610, total reward: 10.5800
Episode 13500, loss: 21.4169, total r

Episode 13643, loss: 11.1192, total reward: 10.0900
Episode 13644, loss: 0.0013, total reward: 10.0600
Episode 13645, loss: 2.7901, total reward: 10.3600
Episode 13646, loss: 1.6546, total reward: 0.7300
Episode 13647, loss: 34.0094, total reward: 10.2700
Episode 13648, loss: 0.0173, total reward: 0.2200
Episode 13649, loss: -0.0400, total reward: 0.2200
Episode 13650, loss: 0.0583, total reward: 0.3200
Episode 13651, loss: 20.7918, total reward: 10.2000
Episode 13652, loss: 3.8718, total reward: 10.2800
Episode 13653, loss: 0.7948, total reward: 10.5200
Episode 13654, loss: 24.9848, total reward: 10.4100
Episode 13655, loss: 0.1189, total reward: 10.0700
Episode 13656, loss: 0.2710, total reward: 0.3900
Episode 13657, loss: 31.9193, total reward: 10.4500
Episode 13658, loss: 2.3681, total reward: 10.3800
Episode 13659, loss: 1.3041, total reward: 10.2800
Episode 13660, loss: 0.4013, total reward: 10.0400
Episode 13661, loss: 0.0094, total reward: 0.2200
Episode 13662, loss: 2.9244, to

Episode 13805, loss: 0.0224, total reward: 10.6700
Episode 13806, loss: 0.4245, total reward: 10.4700
Episode 13807, loss: 0.4029, total reward: 10.3300
Episode 13808, loss: -0.0011, total reward: 0.4200
Episode 13809, loss: 0.1105, total reward: 0.7300
Episode 13810, loss: 1.8255, total reward: 10.2800
Episode 13811, loss: 0.0296, total reward: 0.3200
Episode 13812, loss: 28.4974, total reward: 10.0600
Episode 13813, loss: 8.4743, total reward: 10.4500
Episode 13814, loss: 0.3337, total reward: 10.2600
Episode 13815, loss: -0.0409, total reward: 0.2500
Episode 13816, loss: 24.3960, total reward: 10.4500
Episode 13817, loss: 0.2280, total reward: 0.2200
Episode 13818, loss: 6.8325, total reward: 10.5500
Episode 13819, loss: 4.2427, total reward: 10.3700
Episode 13820, loss: 0.0607, total reward: 10.1600
Episode 13821, loss: 27.5306, total reward: 10.5000
Episode 13822, loss: 0.6488, total reward: 10.3500
Episode 13823, loss: 0.0072, total reward: 0.8100
Episode 13824, loss: 6.1805, tot

Episode 13966, loss: 43.1509, total reward: 10.1900
Episode 13967, loss: 12.8724, total reward: 10.1500
Episode 13968, loss: 0.0000, total reward: 9.9900
Episode 13969, loss: 19.5241, total reward: 10.5300
