In [1]:
import numpy as np
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import math
import random

# --- Task and training configuration -----------------------------------------
# SortingEnv.reset() draws the list length uniformly from
# [MIN_LIST_LEN, MAX_LIST_LEN] (both inclusive).
MIN_LIST_LEN = 8
MAX_LIST_LEN = 8
# Step budget of an episode; train() also uses it to bound the token sequence.
MAX_STEPS = 210
# Reward for the swap that leaves the list fully sorted.
SUCCESS_REWARD = 10.0
# Exploration schedule used by train():
#   eps = EPS_END + (EPS_START - EPS_END) * exp(-episode / EPS_DECAY)
EPS_START = 0.5
EPS_END = 0.05
EPS_DECAY = 1000
# Discount factor for the reward-to-go.
GAMMA = 0.99
NUM_EPISODES = 100000
# A checkpoint is written every EPISODES_SAVE episodes.
EPISODES_SAVE = 1000
OUTPUT_DIR = 'datasets/rl_sort_transformer_easy/list8_transformer3_128_gamma099_step210'

# Define the vocabulary
# The token table is derived from MAX_LIST_LEN so that the index tokens
# ('0' .. str(MAX_LIST_LEN - 1)) and the length tokens ('len1' .. 'len<MAX>')
# always cover every list the environment can generate. With MAX_LIST_LEN = 8
# this yields exactly the ids 0..20 in the order:
#   Comparison, Swap, 0..7, less, equal, more, len1..len8
# NOTE: token ids after the index tokens shift when MAX_LIST_LEN changes, so
# checkpoints trained with a different MAX_LIST_LEN are not interchangeable.
_vocab_tokens = (
    ['Comparison', 'Swap']
    + [str(position) for position in range(MAX_LIST_LEN)]
    + ['less', 'equal', 'more']
    + ['len{}'.format(length) for length in range(1, MAX_LIST_LEN + 1)]
)
vocab = {token: token_id for token_id, token in enumerate(_vocab_tokens)}
inv_vocab = {v: k for k, v in vocab.items()}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define the environment
class SortingEnv:
    """Token-driven environment in which an agent sorts a hidden integer list.

    The agent acts through token sequences:

    * ``[Comparison, i, j]`` returns a ``less`` / ``equal`` / ``more`` token
      describing ``list[i]`` relative to ``list[j]`` and remembers ``(i, j)``.
    * ``[Swap]`` swaps the two positions of the most recent comparison and
      then forgets them.

    The episode ends when a swap leaves the list sorted, when an action is
    malformed (penalty ``-1.0``), or when the step counter reaches
    ``max_steps``.
    """

    def __init__(self):
        self.max_steps = MAX_STEPS
        # Start in a well-defined "no episode yet" state so that get_list(),
        # get_list_len() and step() do not raise AttributeError when they are
        # called before the first reset().
        self.length = 0
        self.list = []
        self.indices = None
        self.current_step = 0
        self.done = False

    def reset(self):
        """Start a new episode with a fresh, not-yet-sorted random list.

        Returns:
            tuple: ``(initial_token_id, list_copy)`` where the token is the
            ``len<N>`` vocabulary id for the sampled list length.
        """
        self.length = random.randint(MIN_LIST_LEN, MAX_LIST_LEN)
        self.list = [random.randint(1, 100) for _ in range(self.length)]
        # An already-sorted list would leave nothing to do, so resample.
        while self.list == sorted(self.list):
            self.list = [random.randint(1, 100) for _ in range(self.length)]
        self.indices = None
        self.current_step = 0
        self.done = False
        initial_token = 'len{}'.format(self.length)
        return vocab[initial_token], self.list.copy()

    def get_list(self):
        """Return the environment's internal list (the live object, not a copy)."""
        return self.list

    def get_list_len(self):
        """Return the number of elements in the current list."""
        return len(self.list)

    def _abort(self):
        """End the episode with the penalty used for malformed actions.

        The step counter is deliberately not advanced on this path.
        """
        self.done = True
        return None, -1.0, self.done, self.list.copy()

    def step(self, action_tokens):
        """Apply one action and report its outcome.

        Args:
            action_tokens (list[int]): ``[Comparison, index1_tok, index2_tok]``
                or ``[Swap]``, expressed as vocabulary ids.

        Returns:
            tuple: ``(response_token, reward, done, list_copy)``.
            ``response_token`` is the comparison result id, or ``None`` for
            swaps and for rejected actions.
        """
        # An empty action is treated like any other unknown action instead of
        # raising IndexError.
        action = action_tokens[0] if action_tokens else None
        response_token = None

        if action == vocab['Comparison']:
            if len(action_tokens) != 3:
                return self._abort()
            index1 = action_tokens[1] - vocab['0']
            index2 = action_tokens[2] - vocab['0']
            if not (0 <= index1 < self.length and 0 <= index2 < self.length):
                return self._abort()
            self.indices = (index1, index2)
            left, right = self.list[index1], self.list[index2]
            if left < right:
                response_token = vocab['less']
                reward = -0.01
            elif left == right:
                # Comparing equal elements costs slightly more.
                response_token = vocab['equal']
                reward = -0.02
            else:
                response_token = vocab['more']
                reward = -0.01
        elif action == vocab['Swap']:
            if self.indices is None:
                # A swap is only valid right after a comparison.
                return self._abort()
            index1, index2 = self.indices
            low, high = min(index1, index2), max(index1, index2)
            # Values before the swap, ordered by position. After swapping two
            # distinct positions the pair's order is simply reversed, so the
            # pre-swap order alone decides whether the swap helped.
            before_low, before_high = self.list[low], self.list[high]
            self.list[index1], self.list[index2] = self.list[index2], self.list[index1]
            if self.list == sorted(self.list):
                reward = SUCCESS_REWARD
                self.done = True
            elif before_low > before_high:
                # The pair was out of order and is now in order.
                reward = 0.1
            elif before_low < before_high:
                # The pair was in order and the swap broke it.
                reward = -0.1
            else:
                # Equal values (or identical positions): nothing changed.
                reward = -0.01
            self.indices = None
        else:
            reward = -1.0
            self.done = True

        self.current_step += 1
        if self.current_step >= self.max_steps:
            self.done = True
        return response_token, reward, self.done, self.list.copy()


Using device: cuda


In [2]:
# Positional Encoding for Transformer
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a sequence, then apply dropout.

    Args:
        d_model (int): Size of the feature dimension. Odd values are supported.
        max_len (int): Number of positions for which encodings are precomputed.
        dropout (float): Dropout probability applied after adding the encoding.
    """

    def __init__(self, d_model, max_len=256, dropout=0.1):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)  # (max_len, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)  # (max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)  # Even feature indices
        # With an odd d_model there is one cosine column fewer than sine
        # columns, so trim the angles to the number of odd feature indices.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])  # Odd feature indices
        pe = pe.unsqueeze(1)  # (max_len, 1, d_model)
        # A buffer moves with .to(device) and is stored in state_dict, but is
        # not a trainable parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:seq_len])``.

        Args:
            x (Tensor): Sequence-first input; the encoding is sliced along
                dimension 0 and broadcast over dimension 1, i.e. the expected
                layout is ``(seq_len, batch, d_model)`` with
                ``seq_len <= max_len``.
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

# Transformer Model
class TransformerModel(nn.Module):
    """Transformer-encoder network producing vocabulary logits for every input position.

    Args:
        vocab_size (int): Number of distinct token ids (embedding rows and output logits).
        d_model (int): Width of the embeddings and encoder layers.
        nhead (int): Attention heads per encoder layer.
        num_layers (int): Number of stacked encoder layers.
    """

    def __init__(self, vocab_size, d_model=128, nhead=8, num_layers=3):
        super().__init__()
        self.model_type = 'Transformer'
        self.d_model = d_model

        # Sub-modules are built in this fixed order so that their default
        # initialisation draws from the RNG in the same sequence every time.
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model, nhead),
            num_layers,
        )
        self.decoder = nn.Linear(d_model, vocab_size)

        self.init_weights()

    def init_weights(self):
        """Re-initialise the embedding and output projection uniformly in [-0.1, 0.1]; zero the output bias."""
        bound = 0.1
        nn.init.uniform_(self.embedding.weight, -bound, bound)
        nn.init.uniform_(self.decoder.weight, -bound, bound)
        nn.init.zeros_(self.decoder.bias)

    def forward(self, src):
        """Map token ids to logits.

        Args:
            src (LongTensor): Token ids; train() passes shape ``(seq_len, batch)``.

        Returns:
            Tensor: Logits with a trailing dimension of ``vocab_size``.
        """
        # Embeddings are scaled by sqrt(d_model) before the positions are added.
        scaled = self.embedding(src) * math.sqrt(self.d_model)
        encoded = self.transformer_encoder(self.pos_encoder(scaled))
        return self.decoder(encoded)

def decode(input_tokens, inv_vocab):
    """Render a sequence of token ids as a space-separated string of token names.

    Args:
        input_tokens: Iterable of token ids.
        inv_vocab: Mapping from token id to token name.
    """
    names = map(inv_vocab.__getitem__, input_tokens)
    return ' '.join(names)


def save_checkpoint(model, optimizer, episode, folder, filename):
    """
    Save the model and optimizer state to ``folder/filename``.

    Missing parent directories are created. The checkpoint is a dict with the
    keys ``'episode'``, ``'model_state_dict'`` and ``'optimizer_state_dict'``.

    Args:
        model (nn.Module): The model to save.
        optimizer (torch.optim.Optimizer): The optimizer whose state to save.
        episode (int): The current episode number.
        folder (str): Directory in which to store the checkpoint. May be empty
            to write into the current working directory.
        filename (str): File name of the checkpoint inside ``folder``.
    """
    filepath = os.path.join(folder, filename)
    # Ensure the directory exists. os.makedirs('') raises FileNotFoundError,
    # so skip it when the path has no directory component.
    directory = os.path.dirname(filepath)
    if directory:
        os.makedirs(directory, exist_ok=True)
    # Save the checkpoint
    torch.save({
        'episode': episode,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, filepath)
    print(f"Checkpoint saved at episode {episode} to {filepath}")

def load_checkpoint(filepath, model, optimizer):
    """
    Restore model and optimizer state from a checkpoint file, if it exists.

    Tensors are mapped onto the module-level ``device`` while loading.

    Args:
        filepath (str): Location of the checkpoint file.
        model (nn.Module): Model that receives ``'model_state_dict'``.
        optimizer (torch.optim.Optimizer): Optimizer that receives ``'optimizer_state_dict'``.

    Returns:
        int: The episode number stored in the checkpoint, or 0 when no file
        exists at ``filepath`` (model and optimizer are then left untouched).
    """
    # Guard clause: nothing to restore.
    if not os.path.isfile(filepath):
        print(f"No checkpoint found at {filepath}, starting from scratch.")
        return 0

    state = torch.load(filepath, map_location=device)
    model.load_state_dict(state['model_state_dict'])
    optimizer.load_state_dict(state['optimizer_state_dict'])
    resumed_episode = state['episode']
    print(f"Checkpoint loaded from {filepath}, resuming from episode {resumed_episode}")
    return resumed_episode

In [None]:
# Training Loop
def _valid_token_ids(state, list_length, last_token):
    """Return the vocabulary ids the policy may emit in the given parser state.

    * ``'expect_index1'``: every position except the last one, so that a larger
      second index always exists.
    * ``'expect_index2'``: positions whose token id is greater than
      ``last_token`` (the first index just emitted), which enforces
      ``index1 < index2``.
    * anything else (including ``'expect_action'``): ``Comparison`` or ``Swap``.
    """
    if state == 'expect_index1':
        return [vocab[str(i)] for i in range(list_length - 1)]
    if state == 'expect_index2':
        index_tokens = [vocab[str(i)] for i in range(list_length)]
        return [tok for tok in index_tokens if tok > last_token]
    return [vocab['Comparison'], vocab['Swap']]


def _discounted_returns(rewards, gamma):
    """Return the discounted reward-to-go ``G_t = r_t + gamma * G_{t+1}`` for every step."""
    returns = []
    running = 0.0
    for reward in reversed(rewards):
        running = reward + gamma * running
        returns.append(running)
    returns.reverse()
    return returns


def train(verbose=False, detect_anomaly=True):
    """Train the transformer policy on the sorting task with REINFORCE.

    One episode is rolled out per iteration. The policy emits one token per
    forward pass over the whole token history; tokens are restricted to the
    ones that are valid in the current parser state. After the episode the
    loss ``-sum_t log_prob_t * G_t`` is minimised with Adam, using gradient
    clipping at norm 1.0. A checkpoint is written every ``EPISODES_SAVE``
    episodes into ``OUTPUT_DIR``.

    Args:
        verbose (bool): Print the token history, the list and each environment
            reward while an episode runs.
        detect_anomaly (bool): Enable autograd anomaly detection. This is a
            debugging aid that slows training down; pass ``False`` for long
            runs. Defaults to ``True``.
    """
    torch.autograd.set_detect_anomaly(detect_anomaly)  # Detect anomalies in autograd
    vocab_size = len(vocab)
    model = TransformerModel(vocab_size).to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-4)  # Reduced learning rate
    # To resume from a previous run, call load_checkpoint(<path>, model, optimizer) here.

    env = SortingEnv()  # reset() re-initialises all episode state, so one instance suffices
    mask_value = -1e9  # Use a large negative value instead of -inf
    episode_cnt = 0
    total_reward = 0.0
    for episode in range(NUM_EPISODES):
        model.train()  # Set model to training mode
        initial_token_id, _ = env.reset()
        input_tokens = [initial_token_id]
        log_probs = []
        rewards = []
        state = 'expect_action'
        done = False
        index1_token_id = None
        # Probability of sampling from a flattened distribution at each step.
        # It depends only on the episode number.
        eps_threshold = EPS_END + (EPS_START - EPS_END) * np.exp(-1.0 * episode / EPS_DECAY)

        while not done and len(input_tokens) < env.max_steps:
            if verbose:
                print(decode(input_tokens, inv_vocab))
                print(env.get_list())
            # Prepare input tensor: (seq_len, batch_size=1)
            input_seq = torch.tensor(input_tokens, dtype=torch.long, device=device).unsqueeze(1)
            # Logits for the last position: (vocab_size,)
            logits = model(input_seq)[-1, 0, :]

            # Check for NaNs in logits
            if torch.isnan(logits).any():
                print(f"Episode {episode}, NaNs in logits before masking.")
                break

            # Mask invalid tokens
            valid_token_ids = _valid_token_ids(state, env.length, input_tokens[-1])
            mask = torch.full_like(logits, mask_value)
            mask[valid_token_ids] = 0
            masked_logits = logits + mask

            # Exploration: with probability eps, flatten the distribution by
            # dividing the logits by 4 (masked entries stay negligible).
            if random.random() < eps_threshold:
                masked_logits = masked_logits / 4

            # Compute probabilities. A NaN here also covers non-finite
            # masked logits, so no separate check is needed before softmax.
            probs = F.softmax(masked_logits, dim=0)
            if torch.isnan(probs).any():
                print(f"Episode {episode}, NaNs in probs after softmax.")
                break

            try:
                m = torch.distributions.Categorical(probs)
                action_token = m.sample()
                log_prob = m.log_prob(action_token)
            except ValueError as e:
                print(f"Episode {episode}, error in sampling action: {e}")
                break

            action = action_token.item()
            log_probs.append(log_prob)
            input_tokens.append(action)

            # Every sampled token gets exactly one reward entry. Tokens that
            # only build up an action (Comparison, first index) get 0.0.
            reward = 0.0
            if state == 'expect_action':
                if action == vocab['Comparison']:
                    state = 'expect_index1'
                elif action == vocab['Swap']:
                    if env.indices is None:
                        # Can't perform Swap without a previous Comparison
                        reward = -1.0
                        done = True
                    else:
                        _, reward, done, _ = env.step([vocab['Swap']])
                        if verbose:
                            print("Reward:", reward)
                else:
                    # Invalid action, end the episode
                    reward = -1.0
                    done = True
            elif state == 'expect_index1':
                index1_token_id = action
                state = 'expect_index2'
            elif state == 'expect_index2':
                response_token, reward, done, _ = env.step(
                    [vocab['Comparison'], index1_token_id, action])
                if verbose:
                    print("Reward:", reward)
                # The comparison result is fed back to the policy as a token.
                if response_token is not None:
                    input_tokens.append(response_token)
                state = 'expect_action'
            else:
                # Invalid state, end the episode
                reward = -1.0
                done = True
            rewards.append(reward)

        # Save checkpoint. The file name carries the mean episode reward since
        # the previous checkpoint; the small constant avoids division by zero.
        if episode > 0 and episode % EPISODES_SAVE == 0:
            avg_reward = total_reward / (episode_cnt + 0.00001)
            episode_cnt = 0
            total_reward = 0.0
            save_checkpoint(model, optimizer, episode, OUTPUT_DIR, f"ckpt_{episode}_{avg_reward:.4f}.pth")

        assert len(log_probs) == len(rewards), "log_probs and rewards have different sizes!"

        if len(log_probs) == 0:
            continue  # Skip if no actions were taken

        # Compute returns on the same device as the log-probabilities.
        # Returns are used as-is (no standardisation).
        returns = torch.tensor(_discounted_returns(rewards, GAMMA),
                               dtype=torch.float32, device=device)

        # Check for NaNs in returns
        if torch.isnan(returns).any():
            print(f"Episode {episode}, NaNs in returns.")
            continue

        # Compute loss: -sum_t log_prob_t * G_t
        loss = -(torch.stack(log_probs) * returns).sum()

        # Check for NaNs in loss
        if torch.isnan(loss):
            print(f"Episode {episode}, NaN in loss.")
            continue

        optimizer.zero_grad()
        loss.backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
        # Clipping a non-finite norm turns the gradients into NaN, and stepping
        # with them would destroy the weights, so drop this update instead.
        if not math.isfinite(float(grad_norm)):
            print(f"Episode {episode}, non-finite gradient norm, skipping update.")
            optimizer.zero_grad()
            continue
        optimizer.step()

        episode_cnt += 1
        episode_reward = sum(rewards)
        total_reward += episode_reward
        print(f"Episode {episode}, loss: {loss.item():.4f}, total reward: {episode_reward:.4f}")

if __name__ == "__main__":
    # Entry point: start training with the default, non-verbose setting.
    train()




Episode 0, loss: -23.7438, total reward: -0.9600
Episode 1, loss: -8.2705, total reward: -0.9200
Episode 2, loss: -49.2720, total reward: -0.8300
Episode 3, loss: -6.1235, total reward: -0.9200
Episode 4, loss: -25.5703, total reward: -1.0800
Episode 5, loss: -1.5408, total reward: -1.0000
Episode 6, loss: -12.4977, total reward: -0.9400
Episode 7, loss: -12.0577, total reward: -0.9300
Episode 8, loss: -25.3064, total reward: -1.2600
Episode 9, loss: -90.4774, total reward: -1.4700
Episode 10, loss: -0.9747, total reward: -1.0000
Episode 11, loss: -92.2992, total reward: -1.2600
Episode 12, loss: -1.2158, total reward: -1.0000
Episode 13, loss: -87.7318, total reward: -1.3500
Episode 14, loss: -85.9030, total reward: -1.3700
Episode 15, loss: -1.3806, total reward: -1.0000
Episode 16, loss: -40.0651, total reward: -0.8100
Episode 17, loss: -16.6363, total reward: -0.2100
Episode 18, loss: -36.2025, total reward: -0.6100
Episode 19, loss: -13.0180, total reward: -0.3100
Episode 20, loss

Episode 164, loss: -19.2539, total reward: -0.4000
Episode 165, loss: -19.6225, total reward: -0.4600
Episode 166, loss: -13.1475, total reward: -0.4000
Episode 167, loss: -6.5519, total reward: -0.2000
Episode 168, loss: -19.2145, total reward: -0.4200
Episode 169, loss: -61.9480, total reward: -1.4000
Episode 170, loss: -9.8491, total reward: -0.2100
Episode 171, loss: -2.7164, total reward: -1.0000
Episode 172, loss: -19.5869, total reward: -0.5200
Episode 173, loss: -16.3433, total reward: -0.3100
Episode 174, loss: -56.6991, total reward: -1.1700
Episode 175, loss: -11.8551, total reward: -0.3200
Episode 176, loss: -18.9831, total reward: -0.3000
Episode 177, loss: -30.8720, total reward: -0.6200
Episode 178, loss: -24.7453, total reward: -0.6200
Episode 179, loss: -17.1370, total reward: -0.4200
Episode 180, loss: -11.8952, total reward: -0.3200
Episode 181, loss: -17.2916, total reward: -0.3100
Episode 182, loss: -51.0275, total reward: -1.0100
Episode 183, loss: -29.2453, total

Episode 328, loss: 749.3007, total reward: 10.3300
Episode 329, loss: 7.5285, total reward: 0.2100
Episode 330, loss: 13.4468, total reward: 0.3100
Episode 331, loss: -66.4397, total reward: -0.4600
Episode 332, loss: -21.0749, total reward: -0.9700
Episode 333, loss: 9.1101, total reward: 0.4000
Episode 334, loss: 9.0945, total reward: 0.5900
Episode 335, loss: -33.0390, total reward: -0.8500
Episode 336, loss: 12.2236, total reward: 0.2000
Episode 337, loss: 709.4496, total reward: 10.4100
Episode 338, loss: -10.5209, total reward: -0.9300
Episode 339, loss: -5.1837, total reward: 0.2000
Episode 340, loss: 4.9529, total reward: 0.4000
Episode 341, loss: 3.8488, total reward: 0.3000
Episode 342, loss: 14.7049, total reward: 0.4200
Episode 343, loss: 15.5944, total reward: 0.6100
Episode 344, loss: -30.0694, total reward: -0.7300
Episode 345, loss: 25.1903, total reward: 0.4100
Episode 346, loss: 15.9257, total reward: 0.6100
Episode 347, loss: 1.4205, total reward: 0.4000
Episode 348,

Episode 494, loss: 576.0110, total reward: 10.3300
Episode 495, loss: 4.5627, total reward: 0.3000
Episode 496, loss: 3.2746, total reward: 0.1500
Episode 497, loss: -39.6313, total reward: -0.3800
Episode 498, loss: -5.3008, total reward: 0.5100
Episode 499, loss: -4.2608, total reward: -0.1000
Episode 500, loss: -21.9715, total reward: -0.6000
Episode 501, loss: 686.6964, total reward: 10.3000
Episode 502, loss: 678.8643, total reward: 11.0800
Episode 503, loss: -16.9917, total reward: -0.8500
Episode 504, loss: 23.0591, total reward: 1.0300
Episode 505, loss: -2.7500, total reward: 0.2000
Episode 506, loss: 2.3261, total reward: 0.2100
Episode 507, loss: 2.3443, total reward: 0.4100
Episode 508, loss: 2.6717, total reward: 0.2000
Episode 509, loss: 17.6787, total reward: 0.5000
Episode 510, loss: 3.4408, total reward: 0.4000
Episode 511, loss: -0.0421, total reward: 0.3000
Episode 512, loss: -11.3312, total reward: 0.0800
Episode 513, loss: 6.7249, total reward: 0.5000
Episode 514, 

Episode 660, loss: -0.9675, total reward: 0.2900
Episode 661, loss: 22.2861, total reward: 0.7100
Episode 662, loss: 11.4331, total reward: 0.3000
Episode 663, loss: -11.5247, total reward: -0.1100
Episode 664, loss: 29.5327, total reward: 0.7100
Episode 665, loss: 14.3145, total reward: 0.5000
Episode 666, loss: 567.9266, total reward: 10.7400
Episode 667, loss: 2.1936, total reward: 0.2000
Episode 668, loss: -3.6626, total reward: 0.0900
Episode 669, loss: 3.8538, total reward: 0.2000
Episode 670, loss: 4.1095, total reward: 0.2000
Episode 671, loss: 2.0981, total reward: 0.1300
Episode 672, loss: -0.4290, total reward: 0.1000
Episode 673, loss: 484.8794, total reward: 10.3000
Episode 674, loss: 17.4731, total reward: 0.4000
Episode 675, loss: -5.6419, total reward: 0.2000
Episode 676, loss: 4.0221, total reward: 0.5000
Episode 677, loss: 439.4657, total reward: 10.2800
Episode 678, loss: 3.9963, total reward: 0.4000
Episode 679, loss: 17.0796, total reward: 0.4900
Episode 680, loss:

Episode 826, loss: 22.4030, total reward: 0.9100
Episode 827, loss: 591.6357, total reward: 10.5200
Episode 828, loss: 11.0363, total reward: 0.4100
Episode 829, loss: 11.9017, total reward: 0.2800
Episode 830, loss: -8.3114, total reward: -0.9200
Episode 831, loss: 22.4256, total reward: 0.5900
Episode 832, loss: 13.3925, total reward: 0.9100
Episode 833, loss: -14.8083, total reward: -0.1200
Episode 834, loss: 10.0258, total reward: 0.5000
Episode 835, loss: 22.3269, total reward: 0.9100
Episode 836, loss: 745.4657, total reward: 10.7300
Episode 837, loss: 22.4372, total reward: 0.7200
Episode 838, loss: -2.4415, total reward: 0.1800
Episode 839, loss: 674.2295, total reward: 10.6300
Episode 840, loss: -7.4627, total reward: -0.8200
Episode 841, loss: -27.0972, total reward: -0.2900
Episode 842, loss: 623.9779, total reward: 10.6100
Episode 843, loss: 20.2902, total reward: 0.8100
Episode 844, loss: -2.9496, total reward: 0.0900
Episode 845, loss: 6.1813, total reward: 0.2000
Episode

Episode 991, loss: 476.5938, total reward: 10.2200
Episode 992, loss: 1.2601, total reward: 0.3000
Episode 993, loss: 12.7029, total reward: 0.8100
Episode 994, loss: 682.6763, total reward: 10.2700
Episode 995, loss: 540.7328, total reward: 10.6900
Episode 996, loss: 35.6629, total reward: 1.0200
Episode 997, loss: 6.5292, total reward: 0.4700
Episode 998, loss: 15.7708, total reward: 0.8100
Episode 999, loss: 28.5657, total reward: 0.8100
Checkpoint saved at episode 1000 to datasets/rl_sort_transformer_easy/list8_transformer3_128_gamma099_step210/ckpt_1000_1.5105.pth
Episode 1000, loss: 37.0406, total reward: 1.3300
Episode 1001, loss: 625.2566, total reward: 10.2800
Episode 1002, loss: 612.5110, total reward: 10.9900
Episode 1003, loss: 591.2728, total reward: 10.7400
Episode 1004, loss: 536.5583, total reward: 10.6700
Episode 1005, loss: 26.8643, total reward: 0.9100
Episode 1006, loss: 456.9966, total reward: 10.4600
Episode 1007, loss: 4.5162, total reward: 0.4800
Episode 1008, l

Episode 1151, loss: 3.5173, total reward: 0.3800
Episode 1152, loss: 31.3455, total reward: 0.7700
Episode 1153, loss: -5.2549, total reward: 0.2000
Episode 1154, loss: 19.8789, total reward: 0.5400
Episode 1155, loss: 573.3726, total reward: 10.2500
Episode 1156, loss: -4.1752, total reward: 0.1700
Episode 1157, loss: 34.9045, total reward: 0.8100
Episode 1158, loss: -3.5999, total reward: 0.0900
Episode 1159, loss: -1.8924, total reward: 0.0900
Episode 1160, loss: 633.7203, total reward: 10.6400
Episode 1161, loss: 508.5619, total reward: 10.5000
Episode 1162, loss: 685.9933, total reward: 10.1300
Episode 1163, loss: 21.4941, total reward: 0.9100
Episode 1164, loss: 21.2333, total reward: 0.6100
Episode 1165, loss: -11.9875, total reward: 0.0900
Episode 1166, loss: 613.8163, total reward: 10.3300
Episode 1167, loss: -6.2994, total reward: 0.2000
Episode 1168, loss: 517.6212, total reward: 10.4100
Episode 1169, loss: 14.4242, total reward: 0.7100
Episode 1170, loss: -8.8744, total rew

Episode 1314, loss: 692.8121, total reward: 10.3400
Episode 1315, loss: 721.3171, total reward: 10.3400
Episode 1316, loss: 629.3378, total reward: 10.2600
Episode 1317, loss: 615.9164, total reward: 10.4400
Episode 1318, loss: 15.9555, total reward: 0.6100
Episode 1319, loss: 682.9700, total reward: 10.3200
Episode 1320, loss: 24.1603, total reward: 0.6100
Episode 1321, loss: 534.0952, total reward: 10.9000
Episode 1322, loss: 726.1182, total reward: 10.2200
Episode 1323, loss: 626.4240, total reward: 10.4900
Episode 1324, loss: 1.9266, total reward: 0.3000
Episode 1325, loss: -1.5947, total reward: 0.2000
Episode 1326, loss: 18.3860, total reward: 0.7100
Episode 1327, loss: 647.1521, total reward: 10.8800
Episode 1328, loss: 610.9395, total reward: 10.9800
Episode 1329, loss: 9.7632, total reward: 0.6100
Episode 1330, loss: 641.5715, total reward: 10.6000
Episode 1331, loss: 9.5589, total reward: 0.5100
Episode 1332, loss: 350.8838, total reward: 10.5000
Episode 1333, loss: 527.0356,

Episode 1476, loss: 4.2013, total reward: 0.3800
Episode 1477, loss: 244.7622, total reward: 10.0900
Episode 1478, loss: -16.0826, total reward: -0.2200
Episode 1479, loss: 29.2467, total reward: 1.2200
Episode 1480, loss: 22.2605, total reward: 1.1200
Episode 1481, loss: 2.0812, total reward: 0.4000
Episode 1482, loss: 554.9307, total reward: 10.7400
Episode 1483, loss: 573.5751, total reward: 10.4600
Episode 1484, loss: -26.0593, total reward: -0.8900
Episode 1485, loss: -12.1819, total reward: -0.6500
Episode 1486, loss: 1.0930, total reward: 0.5000
Episode 1487, loss: 23.3516, total reward: 1.0200
Episode 1488, loss: 19.3974, total reward: 0.9100
Episode 1489, loss: 29.1351, total reward: 0.8100
Episode 1490, loss: 502.6752, total reward: 10.5200
Episode 1491, loss: 532.9206, total reward: 10.6800
Episode 1492, loss: -0.1541, total reward: 0.5000
Episode 1493, loss: 20.1317, total reward: 0.6100
Episode 1494, loss: -29.6382, total reward: -0.0700
Episode 1495, loss: 633.4770, total

Episode 1637, loss: 32.8855, total reward: 0.7200
Episode 1638, loss: 562.5188, total reward: 10.9200
Episode 1639, loss: 410.4023, total reward: 10.2200
Episode 1640, loss: 12.0951, total reward: 0.7100
Episode 1641, loss: -3.0624, total reward: 0.0900
Episode 1642, loss: 16.7257, total reward: 0.8100
Episode 1643, loss: 445.2908, total reward: 10.7600
Episode 1644, loss: 582.3433, total reward: 10.4500
Episode 1645, loss: 517.7614, total reward: 10.2000
Episode 1646, loss: 8.4654, total reward: 0.4000
Episode 1647, loss: 42.0807, total reward: 1.1200
Episode 1648, loss: 544.2308, total reward: 10.7500
Episode 1649, loss: 460.8096, total reward: 11.0300
Episode 1650, loss: 513.4156, total reward: 10.4600
Episode 1651, loss: 8.0160, total reward: 0.3200
Episode 1652, loss: -9.0740, total reward: 0.2000
Episode 1653, loss: 470.0962, total reward: 10.4900
Episode 1654, loss: 563.1912, total reward: 11.1800
Episode 1655, loss: 457.9173, total reward: 10.7100
Episode 1656, loss: 423.2159, 

Episode 1799, loss: 6.6962, total reward: 0.4000
Episode 1800, loss: 317.7871, total reward: 10.5100
Episode 1801, loss: 455.2339, total reward: 10.6900
Episode 1802, loss: -1.7777, total reward: 0.2000
Episode 1803, loss: 466.3156, total reward: 10.3000
Episode 1804, loss: 550.6601, total reward: 10.7000
Episode 1805, loss: 12.4393, total reward: 0.6100
Episode 1806, loss: 13.2480, total reward: 0.5000
Episode 1807, loss: 26.1798, total reward: 0.8100
Episode 1808, loss: 27.9091, total reward: 1.1200
Episode 1809, loss: -2.1412, total reward: 0.0900
Episode 1810, loss: 7.4632, total reward: 0.3000
Episode 1811, loss: 7.4101, total reward: 0.5000
Episode 1812, loss: 602.1835, total reward: 10.5200
Episode 1813, loss: 28.6457, total reward: 0.9100
Episode 1814, loss: 6.6768, total reward: 0.4100
Episode 1815, loss: 532.2180, total reward: 9.9800
Episode 1816, loss: 301.8887, total reward: 10.6000
Episode 1817, loss: 508.3443, total reward: 10.4000
Episode 1818, loss: 8.8360, total rewar

Episode 1962, loss: 26.1981, total reward: 0.9100
Episode 1963, loss: 492.9653, total reward: 10.4300
Episode 1964, loss: 16.1078, total reward: 0.4000
Episode 1965, loss: 25.4189, total reward: 1.0000
Episode 1966, loss: 468.2427, total reward: 11.0800
Episode 1967, loss: 13.8826, total reward: 0.7100
Episode 1968, loss: 21.7491, total reward: 1.0000
Episode 1969, loss: 19.1233, total reward: 0.7100
Episode 1970, loss: 488.3560, total reward: 10.2900
Episode 1971, loss: 24.9830, total reward: 0.8100
Episode 1972, loss: 13.1623, total reward: 0.6100
Episode 1973, loss: 8.8004, total reward: 0.5800
Episode 1974, loss: 11.5345, total reward: 0.7100
Episode 1975, loss: 482.9799, total reward: 10.9000
Episode 1976, loss: 565.8511, total reward: 11.1100
Episode 1977, loss: 474.9586, total reward: 10.5300
Episode 1978, loss: 499.4875, total reward: 10.5400
Episode 1979, loss: 432.6892, total reward: 10.5300
Episode 1980, loss: -24.8118, total reward: 0.0400
Episode 1981, loss: 10.7925, total

Episode 2121, loss: 517.9854, total reward: 10.6000
Episode 2122, loss: 508.8041, total reward: 10.4700
Episode 2123, loss: 575.7135, total reward: 10.5300
Episode 2124, loss: 32.8233, total reward: 1.2200
Episode 2125, loss: 16.5563, total reward: 0.7100
Episode 2126, loss: 383.9846, total reward: 10.8900
Episode 2127, loss: 485.5167, total reward: 10.4800
Episode 2128, loss: 502.2335, total reward: 10.7100
Episode 2129, loss: 20.7073, total reward: 0.8100
Episode 2130, loss: 10.8879, total reward: 0.6100
Episode 2131, loss: 515.5662, total reward: 10.9200
Episode 2132, loss: 3.8753, total reward: 0.4000
Episode 2133, loss: 35.1664, total reward: 1.2200
Episode 2134, loss: 486.3871, total reward: 10.6700
Episode 2135, loss: 379.6382, total reward: 11.0500
Episode 2136, loss: 26.6568, total reward: 1.1200
Episode 2137, loss: 13.2696, total reward: 0.7100
Episode 2138, loss: 353.7141, total reward: 10.5600
Episode 2139, loss: -1.5028, total reward: 0.3000
Episode 2140, loss: 5.0356, tot

Episode 2283, loss: 16.2043, total reward: 0.4000
Episode 2284, loss: 25.9401, total reward: 0.9200
Episode 2285, loss: 27.5930, total reward: 1.1200
Episode 2286, loss: 16.3138, total reward: 0.7100
Episode 2287, loss: 387.3170, total reward: 10.7700
Episode 2288, loss: 16.5008, total reward: 0.8100
Episode 2289, loss: 400.1574, total reward: 10.3700
Episode 2290, loss: 20.5932, total reward: 1.0200
Episode 2291, loss: 27.1707, total reward: 0.8200
Episode 2292, loss: 0.7116, total reward: 0.5000
Episode 2293, loss: 415.3020, total reward: 10.5700
Episode 2294, loss: 17.4300, total reward: 0.9100
Episode 2295, loss: 4.4664, total reward: 0.4000
Episode 2296, loss: 18.4388, total reward: 0.8100
Episode 2297, loss: 381.6587, total reward: 10.2800
Episode 2298, loss: -7.9056, total reward: -0.0200
Episode 2299, loss: 23.8050, total reward: 0.9100
Episode 2300, loss: 546.8028, total reward: 11.0900
Episode 2301, loss: 17.9954, total reward: 0.8200
Episode 2302, loss: -21.4521, total rewar

Episode 2445, loss: 548.7073, total reward: 10.6400
Episode 2446, loss: 13.1757, total reward: 0.5000
Episode 2447, loss: 464.0812, total reward: 10.7800
Episode 2448, loss: 413.9391, total reward: 10.3100
Episode 2449, loss: 426.3899, total reward: 10.6100
Episode 2450, loss: 502.0689, total reward: 10.5600
Episode 2451, loss: 16.7950, total reward: 0.6100
Episode 2452, loss: 2.8124, total reward: 0.5800
Episode 2453, loss: 373.4306, total reward: 10.1700
Episode 2454, loss: 29.8255, total reward: 1.0200
Episode 2455, loss: 20.4438, total reward: 0.8100
Episode 2456, loss: 369.4114, total reward: 10.0600
Episode 2457, loss: 5.3037, total reward: 0.4000
Episode 2458, loss: 22.6318, total reward: 0.7000
Episode 2459, loss: 17.6313, total reward: 0.8100
Episode 2460, loss: 13.3380, total reward: 0.7100
Episode 2461, loss: -37.8267, total reward: -0.3900
Episode 2462, loss: 14.8160, total reward: 0.7100
Episode 2463, loss: 20.9774, total reward: 0.9100
Episode 2464, loss: 6.7631, total re

Episode 2607, loss: 389.5705, total reward: 10.6200
Episode 2608, loss: -3.4828, total reward: 0.3000
Episode 2609, loss: 416.4588, total reward: 10.7700
Episode 2610, loss: 498.0789, total reward: 10.9700
Episode 2611, loss: 399.9868, total reward: 10.7700
Episode 2612, loss: 13.1115, total reward: 0.5000
Episode 2613, loss: 439.5825, total reward: 10.9400
Episode 2614, loss: 38.4427, total reward: 1.4300
Episode 2615, loss: 506.5558, total reward: 10.4700
Episode 2616, loss: 528.7701, total reward: 10.3900
Episode 2617, loss: 311.9646, total reward: 10.4900
Episode 2618, loss: 393.3316, total reward: 10.8000
Episode 2619, loss: 29.4817, total reward: 0.9600
Episode 2620, loss: 486.6043, total reward: 10.9700
Episode 2621, loss: -2.1261, total reward: 0.0900
Episode 2622, loss: 504.0191, total reward: 10.7200
Episode 2623, loss: 18.3019, total reward: 0.5000
Episode 2624, loss: 26.3804, total reward: 1.3200
Episode 2625, loss: 456.7261, total reward: 10.7000
Episode 2626, loss: 540.34

Episode 2768, loss: 11.7015, total reward: 0.8100
Episode 2769, loss: 467.5552, total reward: 10.4300
Episode 2770, loss: 27.5104, total reward: 1.0200
Episode 2771, loss: 25.5743, total reward: 0.8100
Episode 2772, loss: 13.7809, total reward: 0.6100
Episode 2773, loss: 2.2099, total reward: 0.1500
Episode 2774, loss: 479.6068, total reward: 10.4700
Episode 2775, loss: 4.7436, total reward: 0.4000
Episode 2776, loss: 21.9613, total reward: 0.8200
Episode 2777, loss: 8.9815, total reward: 0.5400
Episode 2778, loss: -7.0539, total reward: -0.0100
Episode 2779, loss: 20.4134, total reward: 0.9100
Episode 2780, loss: 388.8250, total reward: 10.5300
Episode 2781, loss: 436.2858, total reward: 10.7600
Episode 2782, loss: 18.9790, total reward: 0.7100
Episode 2783, loss: 16.3812, total reward: 0.7100
Episode 2784, loss: 1.0670, total reward: 0.3000
Episode 2785, loss: 14.4825, total reward: 0.6100
Episode 2786, loss: 434.5109, total reward: 10.6300
Episode 2787, loss: -2.8210, total reward: 

Episode 2931, loss: 27.5556, total reward: 0.9100
Episode 2932, loss: 4.9087, total reward: 0.4000
Episode 2933, loss: -15.6265, total reward: -0.3100
Episode 2934, loss: 31.2354, total reward: 1.3200
Episode 2935, loss: 0.8849, total reward: 0.2000
Episode 2936, loss: -0.7230, total reward: 0.5000
Episode 2937, loss: 467.0063, total reward: 10.9000
Episode 2938, loss: 7.2714, total reward: 0.5000
Episode 2939, loss: 20.3462, total reward: 0.8000
Episode 2940, loss: -4.0071, total reward: 0.0900
Episode 2941, loss: 5.3819, total reward: 0.5800
Episode 2942, loss: 2.0680, total reward: 0.4000
Episode 2943, loss: 5.8148, total reward: 0.3400
Episode 2944, loss: 13.7760, total reward: 0.7100
Episode 2945, loss: 17.6942, total reward: 1.0200
Episode 2946, loss: 17.0822, total reward: 0.7100
Episode 2947, loss: 16.8368, total reward: 0.7900
Episode 2948, loss: 11.2882, total reward: 0.6100
Episode 2949, loss: 26.8179, total reward: 1.0200
Episode 2950, loss: 18.7043, total reward: 0.7800
Ep

Episode 3091, loss: 28.8183, total reward: 1.1200
Episode 3092, loss: 20.7201, total reward: 0.9100
Episode 3093, loss: 434.9932, total reward: 10.7000
Episode 3094, loss: 473.8149, total reward: 10.2000
Episode 3095, loss: 13.2536, total reward: 0.5000
Episode 3096, loss: 19.1469, total reward: 0.7100
Episode 3097, loss: 423.2624, total reward: 10.0500
Episode 3098, loss: 1.2701, total reward: 0.3000
Episode 3099, loss: 502.8227, total reward: 10.4700
Episode 3100, loss: 17.3558, total reward: 0.7100
Episode 3101, loss: 428.4019, total reward: 10.4900
Episode 3102, loss: 7.1563, total reward: 0.5000
Episode 3103, loss: 505.1360, total reward: 10.3000
Episode 3104, loss: 34.2206, total reward: 1.1200
Episode 3105, loss: 11.6099, total reward: 0.4000
Episode 3106, loss: 8.5634, total reward: 0.6100
Episode 3107, loss: 289.0178, total reward: 10.6800
Episode 3108, loss: 462.9630, total reward: 10.6700
Episode 3109, loss: 16.8000, total reward: 0.6000
Episode 3110, loss: 432.0688, total r

Episode 3254, loss: 300.5802, total reward: 10.4600
Episode 3255, loss: 7.9069, total reward: 0.4000
Episode 3256, loss: 272.0983, total reward: 10.4000
Episode 3257, loss: 37.1959, total reward: 1.3200
Episode 3258, loss: 5.4693, total reward: 0.2000
Episode 3259, loss: 8.0967, total reward: 0.3000
Episode 3260, loss: 344.5338, total reward: 10.7600
Episode 3261, loss: 12.6975, total reward: 0.6100
Episode 3262, loss: 27.3655, total reward: 1.0200
Episode 3263, loss: 12.2529, total reward: 0.6100
Episode 3264, loss: 13.8450, total reward: 0.7100
Episode 3265, loss: 1.7623, total reward: 0.1700
Episode 3266, loss: 382.1815, total reward: 11.0100
Episode 3267, loss: 485.4964, total reward: 10.2500
Episode 3268, loss: 8.7094, total reward: 0.3000
Episode 3269, loss: 29.8281, total reward: 1.0200
Episode 3270, loss: 15.1624, total reward: 0.5000
Episode 3271, loss: 529.4842, total reward: 10.6900
Episode 3272, loss: 343.7894, total reward: 10.7700
Episode 3273, loss: 493.0847, total rewar

Episode 3417, loss: 397.0450, total reward: 10.6200
Episode 3418, loss: 15.5473, total reward: 0.3000
Episode 3419, loss: 20.7024, total reward: 0.6700
Episode 3420, loss: 38.2193, total reward: 0.8200
Episode 3421, loss: 9.6070, total reward: 0.5000
Episode 3422, loss: 25.6152, total reward: 0.7100
Episode 3423, loss: 7.7478, total reward: 0.4000
Episode 3424, loss: 662.0132, total reward: 9.9500
Episode 3425, loss: 658.1498, total reward: 10.3400
Episode 3426, loss: 20.6453, total reward: 0.6100
Episode 3427, loss: 0.3771, total reward: 0.6100
Episode 3428, loss: 17.8061, total reward: 0.7100
Episode 3429, loss: 10.1285, total reward: 0.6100
Episode 3430, loss: -7.2109, total reward: 0.3000
Episode 3431, loss: 3.5698, total reward: 0.4000
Episode 3432, loss: 597.7056, total reward: 11.0200
Episode 3433, loss: -3.0587, total reward: 0.0300
Episode 3434, loss: 17.6097, total reward: 0.3400
Episode 3435, loss: 582.6671, total reward: 10.8300
Episode 3436, loss: 421.3841, total reward: 1

Episode 3579, loss: 559.8599, total reward: 11.0900
Episode 3580, loss: 27.8415, total reward: 1.2200
Episode 3581, loss: 400.0879, total reward: 10.6500
Episode 3582, loss: 476.4767, total reward: 11.4600
Episode 3583, loss: 274.9175, total reward: 10.5100
Episode 3584, loss: 459.1164, total reward: 10.4600
Episode 3585, loss: 31.0656, total reward: 1.1200
Episode 3586, loss: -0.7909, total reward: 0.0400
Episode 3587, loss: 356.9447, total reward: 10.4600
Episode 3588, loss: 289.1660, total reward: 10.6200
Episode 3589, loss: 28.5157, total reward: 1.0200
Episode 3590, loss: -16.6992, total reward: -0.5000
Episode 3591, loss: 483.6058, total reward: 10.4700
Episode 3592, loss: 468.3132, total reward: 10.4200
Episode 3593, loss: 443.3046, total reward: 10.9400
Episode 3594, loss: 27.5477, total reward: 1.3200
Episode 3595, loss: -0.1546, total reward: 0.2000
Episode 3596, loss: 22.1500, total reward: 0.9200
Episode 3597, loss: 539.6580, total reward: 11.0300
Episode 3598, loss: 437.32

Episode 3741, loss: 11.9829, total reward: 0.5000
Episode 3742, loss: 20.0002, total reward: 0.9100
Episode 3743, loss: 23.6620, total reward: 0.6100
Episode 3744, loss: 17.4482, total reward: 1.1200
Episode 3745, loss: 2.1795, total reward: 0.3000
Episode 3746, loss: -8.3544, total reward: 0.3000
Episode 3747, loss: 428.0380, total reward: 10.1900
Episode 3748, loss: 436.8572, total reward: 10.7500
Episode 3749, loss: 5.1019, total reward: 0.2000
Episode 3750, loss: 20.3153, total reward: 0.8100
Episode 3751, loss: 9.8970, total reward: 0.3000
Episode 3752, loss: 6.7434, total reward: 0.7100
Episode 3753, loss: 275.0763, total reward: 10.5200
Episode 3754, loss: 5.9134, total reward: 0.3000
Episode 3755, loss: 441.7685, total reward: 10.9500
Episode 3756, loss: 13.2315, total reward: 0.8800
Episode 3757, loss: 22.8823, total reward: 0.7100
Episode 3758, loss: -5.9922, total reward: -0.0100
Episode 3759, loss: 512.4402, total reward: 10.3500
Episode 3760, loss: 18.5242, total reward: 1

Episode 3903, loss: 265.7253, total reward: 10.4200
Episode 3904, loss: 8.3921, total reward: 0.4000
Episode 3905, loss: 470.6439, total reward: 10.5100
Episode 3906, loss: 496.0209, total reward: 11.4400
Episode 3907, loss: 462.8291, total reward: 11.1300
Episode 3908, loss: 28.6806, total reward: 1.1200
Episode 3909, loss: 7.3821, total reward: 0.7100
Episode 3910, loss: 543.5219, total reward: 10.6400
Episode 3911, loss: 9.2607, total reward: 0.5000
Episode 3912, loss: 340.0934, total reward: 10.5400
Episode 3913, loss: 445.3577, total reward: 10.6700
Episode 3914, loss: 363.5053, total reward: 10.2200
Episode 3915, loss: -4.5258, total reward: 0.3000
Episode 3916, loss: 7.0056, total reward: 0.6800
Episode 3917, loss: 30.2286, total reward: 1.0200
Episode 3918, loss: 488.1907, total reward: 10.7400
Episode 3919, loss: 502.1655, total reward: 11.3400
Episode 3920, loss: 350.8698, total reward: 10.9500
Episode 3921, loss: 10.7262, total reward: 0.6100
Episode 3922, loss: 452.1872, to

Episode 4062, loss: 0.8074, total reward: 0.4000
Episode 4063, loss: 532.6288, total reward: 10.5300
Episode 4064, loss: 17.0546, total reward: 0.7100
Episode 4065, loss: 27.9387, total reward: 0.8600
Episode 4066, loss: 30.5201, total reward: 1.1200
Episode 4067, loss: 10.2685, total reward: 0.6100
Episode 4068, loss: 20.3057, total reward: 0.8100
Episode 4069, loss: 33.6181, total reward: 1.3200
Episode 4070, loss: 12.0610, total reward: 0.5000
Episode 4071, loss: 516.1060, total reward: 10.3900
Episode 4072, loss: 39.6341, total reward: 1.5300
Episode 4073, loss: 492.7663, total reward: 10.6000
Episode 4074, loss: 356.6655, total reward: 11.2400
Episode 4075, loss: 517.5443, total reward: 11.0600
Episode 4076, loss: 30.7294, total reward: 1.1200
Episode 4077, loss: 18.5763, total reward: 0.8100
Episode 4078, loss: 449.2263, total reward: 10.4500
Episode 4079, loss: 512.9849, total reward: 10.3500
Episode 4080, loss: 448.5081, total reward: 10.6200
Episode 4081, loss: 357.3340, total

Episode 4224, loss: 585.3347, total reward: 11.1400
Episode 4225, loss: 37.7708, total reward: 0.9100
Episode 4226, loss: 13.4968, total reward: 0.7100
Episode 4227, loss: 585.1719, total reward: 10.8400
Episode 4228, loss: 511.1462, total reward: 10.3900
Episode 4229, loss: 34.8013, total reward: 1.2200
Episode 4230, loss: 17.0091, total reward: 0.7100
Episode 4231, loss: 462.8297, total reward: 10.6200
Episode 4232, loss: 1.9645, total reward: 0.2000
Episode 4233, loss: -0.8981, total reward: 0.2500
Episode 4234, loss: 21.3607, total reward: 0.8400
Episode 4235, loss: 386.6273, total reward: 10.5100
Episode 4236, loss: 8.7439, total reward: 0.8100
Episode 4237, loss: 542.7317, total reward: 10.9700
Episode 4238, loss: 24.8435, total reward: 0.9100
Episode 4239, loss: 438.9559, total reward: 10.7900
Episode 4240, loss: 443.0273, total reward: 10.1000
Episode 4241, loss: 400.1855, total reward: 10.5400
Episode 4242, loss: -9.5169, total reward: 0.0900
Episode 4243, loss: 590.7303, tota

Episode 4386, loss: 496.9979, total reward: 10.5200
Episode 4387, loss: 471.0236, total reward: 11.3000
Episode 4388, loss: -4.9401, total reward: 0.1200
Episode 4389, loss: 501.1861, total reward: 10.7100
Episode 4390, loss: 374.9508, total reward: 10.1800
Episode 4391, loss: 454.3419, total reward: 10.5400
Episode 4392, loss: 7.2869, total reward: 0.6100
Episode 4393, loss: 3.1404, total reward: 0.3000
Episode 4394, loss: 8.0087, total reward: 0.3000
Episode 4395, loss: 17.9161, total reward: 0.7100
Episode 4396, loss: -9.7785, total reward: -0.0200
Episode 4397, loss: 1.3238, total reward: 0.3000
Episode 4398, loss: 10.6723, total reward: 0.4000
Episode 4399, loss: 7.8038, total reward: 0.3000
Episode 4400, loss: 9.5743, total reward: 0.6100
Episode 4401, loss: 466.5218, total reward: 10.9500
Episode 4402, loss: 15.3364, total reward: 0.6600
Episode 4403, loss: 6.9291, total reward: 0.3000
Episode 4404, loss: 18.6919, total reward: 0.7100
Episode 4405, loss: 484.8786, total reward: 

Episode 4548, loss: 497.9284, total reward: 11.0400
Episode 4549, loss: 17.6706, total reward: 0.6100
Episode 4550, loss: 20.5441, total reward: 0.8000
Episode 4551, loss: 18.3086, total reward: 1.1200
Episode 4552, loss: 476.3589, total reward: 10.8600
Episode 4553, loss: 392.3083, total reward: 9.9100
Episode 4554, loss: 3.6648, total reward: 0.4000
Episode 4555, loss: -1.2870, total reward: 0.3000
Episode 4556, loss: 517.4930, total reward: 10.4600
Episode 4557, loss: 34.3393, total reward: 1.1200
Episode 4558, loss: 4.5941, total reward: 0.4000
Episode 4559, loss: 376.0137, total reward: 10.4900
Episode 4560, loss: 485.6198, total reward: 10.4400
Episode 4561, loss: 431.5811, total reward: 10.4300
Episode 4562, loss: -1.8868, total reward: 0.4000
Episode 4563, loss: 38.5807, total reward: 1.3200
Episode 4564, loss: 2.8054, total reward: 0.5000
Episode 4565, loss: 12.0679, total reward: 0.8100
Episode 4566, loss: 431.1245, total reward: 10.4900
Episode 4567, loss: -1.5609, total rew

Episode 4709, loss: 9.9499, total reward: 0.4000
Episode 4710, loss: 473.0465, total reward: 10.6800
Episode 4711, loss: 505.2024, total reward: 10.9700
Episode 4712, loss: 503.3110, total reward: 10.9200
Episode 4713, loss: 321.3040, total reward: 10.2700
Episode 4714, loss: 8.5397, total reward: 0.6100
Episode 4715, loss: 366.9935, total reward: 11.1200
Episode 4716, loss: 29.9843, total reward: 1.1200
Episode 4717, loss: 531.3974, total reward: 11.0600
Episode 4718, loss: 16.7196, total reward: 0.9100
Episode 4719, loss: 24.5500, total reward: 0.9700
Episode 4720, loss: 7.6579, total reward: 0.1800
Episode 4721, loss: 484.3242, total reward: 11.1100
Episode 4722, loss: 33.2831, total reward: 1.0200
Episode 4723, loss: 18.5438, total reward: 1.0200
Episode 4724, loss: 482.5650, total reward: 10.5600
Episode 4725, loss: 28.9805, total reward: 1.2900
Episode 4726, loss: 442.4203, total reward: 10.6500
Episode 4727, loss: 28.5134, total reward: 1.1200
Episode 4728, loss: 292.8797, total

Episode 4871, loss: 417.5939, total reward: 10.4000
Episode 4872, loss: 339.2785, total reward: 10.1900
Episode 4873, loss: -1.7869, total reward: -0.0600
Episode 4874, loss: 5.3535, total reward: 0.3000
Episode 4875, loss: 18.0391, total reward: 0.9100
Episode 4876, loss: 7.0987, total reward: 0.3000
Episode 4877, loss: 21.8477, total reward: 0.7100
Episode 4878, loss: 336.4945, total reward: 10.7500
Episode 4879, loss: 26.6864, total reward: 1.0200
Episode 4880, loss: 457.3881, total reward: 10.5600
Episode 4881, loss: 11.6256, total reward: 0.6100
Episode 4882, loss: 4.5644, total reward: 0.2000
Episode 4883, loss: 427.9793, total reward: 10.7300
Episode 4884, loss: 410.6623, total reward: 10.5000
Episode 4885, loss: -13.6407, total reward: -0.1500
Episode 4886, loss: 3.2523, total reward: 0.4000
Episode 4887, loss: 9.4899, total reward: 0.9100
Episode 4888, loss: 436.6411, total reward: 10.3600
Episode 4889, loss: 21.7835, total reward: 1.0200
Episode 4890, loss: 11.2511, total rew

Episode 5030, loss: 27.6309, total reward: 1.0200
Episode 5031, loss: 11.7553, total reward: 0.7100
Episode 5032, loss: 1.7188, total reward: 0.3000
Episode 5033, loss: 273.7173, total reward: 10.5100
Episode 5034, loss: 2.0352, total reward: 0.3000
Episode 5035, loss: 451.8881, total reward: 10.2800
Episode 5036, loss: 6.5884, total reward: 0.5000
Episode 5037, loss: 8.6152, total reward: 0.4000
Episode 5038, loss: 17.1886, total reward: 0.7100
Episode 5039, loss: 29.4730, total reward: 1.1200
Episode 5040, loss: 5.6098, total reward: 0.7100
Episode 5041, loss: 419.0297, total reward: 10.3700
Episode 5042, loss: 5.5866, total reward: 0.4000
Episode 5043, loss: 10.1428, total reward: 0.7100
Episode 5044, loss: 357.7568, total reward: 10.5400
Episode 5045, loss: 473.7633, total reward: 10.6500
Episode 5046, loss: 295.9516, total reward: 10.5000
Episode 5047, loss: 23.8756, total reward: 0.8100
Episode 5048, loss: 212.7175, total reward: 10.3800
Episode 5049, loss: 494.7844, total reward

Episode 5192, loss: 25.6264, total reward: 1.1200
Episode 5193, loss: 284.3680, total reward: 10.4900
Episode 5194, loss: 28.0136, total reward: 1.1200
Episode 5195, loss: 20.2700, total reward: 0.9100
Episode 5196, loss: 546.9194, total reward: 10.5000
Episode 5197, loss: 507.5603, total reward: 10.8900
Episode 5198, loss: 11.6782, total reward: 0.6100
Episode 5199, loss: 475.1486, total reward: 10.7800
Episode 5200, loss: 30.2270, total reward: 1.1200
Episode 5201, loss: 14.1342, total reward: 0.7100
Episode 5202, loss: -3.6116, total reward: 0.3000
Episode 5203, loss: 8.6228, total reward: 0.5000
Episode 5204, loss: 546.9011, total reward: 10.8400
Episode 5205, loss: 6.9848, total reward: 0.6100
Episode 5206, loss: 569.9246, total reward: 10.3400
Episode 5207, loss: -0.0847, total reward: 0.3000
Episode 5208, loss: 25.1053, total reward: 1.1200
Episode 5209, loss: 28.7286, total reward: 1.0200
Episode 5210, loss: 2.0832, total reward: 0.4000
Episode 5211, loss: 0.1469, total reward:

Episode 5355, loss: 444.4166, total reward: 10.8300
Episode 5356, loss: 524.2040, total reward: 10.3100
Episode 5357, loss: 5.7992, total reward: 0.4100
Episode 5358, loss: -2.4637, total reward: 0.3000
Episode 5359, loss: 13.5277, total reward: 0.7300
Episode 5360, loss: 380.2889, total reward: 10.2200
Episode 5361, loss: 501.1335, total reward: 11.0700
Episode 5362, loss: -8.1939, total reward: 0.0600
Episode 5363, loss: 17.9973, total reward: 0.9100
Episode 5364, loss: 19.0189, total reward: 0.6100
Episode 5365, loss: 465.6303, total reward: 10.3200
Episode 5366, loss: 19.1437, total reward: 0.8100
Episode 5367, loss: 355.1174, total reward: 10.1400
Episode 5368, loss: 15.6182, total reward: 0.7100
Episode 5369, loss: 351.6204, total reward: 10.9500
Episode 5370, loss: 521.0527, total reward: 11.0400
Episode 5371, loss: 25.7283, total reward: 1.0200
Episode 5372, loss: 333.0594, total reward: 10.4700
Episode 5373, loss: 10.5158, total reward: 0.6100
Episode 5374, loss: 478.6178, tot

Episode 5517, loss: 6.0303, total reward: 0.5000
Episode 5518, loss: 8.0103, total reward: 0.2000
Episode 5519, loss: 27.1991, total reward: 0.8100
Episode 5520, loss: -13.2621, total reward: -0.3200
Episode 5521, loss: 10.1981, total reward: 0.6100
Episode 5522, loss: 5.3266, total reward: 0.4000
Episode 5523, loss: 17.2259, total reward: 0.6100
Episode 5524, loss: 4.8860, total reward: 0.3000
Episode 5525, loss: 400.2965, total reward: 10.2000
Episode 5526, loss: 5.4094, total reward: 0.2000
Episode 5527, loss: -36.2956, total reward: -0.5300
Episode 5528, loss: 8.2103, total reward: 0.3000
Episode 5529, loss: 369.1623, total reward: 9.9700
Episode 5530, loss: 376.6547, total reward: 10.5200
Episode 5531, loss: 2.4357, total reward: 0.2700
Episode 5532, loss: 14.3308, total reward: 0.9100
Episode 5533, loss: 493.4334, total reward: 10.6000
Episode 5534, loss: -12.1846, total reward: -0.1300
Episode 5535, loss: 3.3903, total reward: 0.3000
Episode 5536, loss: 292.5331, total reward: 9

Episode 5679, loss: 480.3232, total reward: 10.5300
Episode 5680, loss: 490.2000, total reward: 10.7300
Episode 5681, loss: 361.3458, total reward: 10.7200
Episode 5682, loss: 515.0471, total reward: 11.2100
Episode 5683, loss: 524.0227, total reward: 10.5200
Episode 5684, loss: 535.7862, total reward: 10.9600
Episode 5685, loss: 490.6200, total reward: 10.6200
Episode 5686, loss: -11.0435, total reward: 0.1600
Episode 5687, loss: -2.8556, total reward: 0.0700
Episode 5688, loss: 442.5643, total reward: 10.1700
Episode 5689, loss: 12.5365, total reward: 0.5000
Episode 5690, loss: 565.7883, total reward: 10.3100
Episode 5691, loss: 526.2252, total reward: 10.5000
Episode 5692, loss: 15.7942, total reward: 0.8000
Episode 5693, loss: 330.2782, total reward: 10.6700
Episode 5694, loss: 321.1094, total reward: 10.7100
Episode 5695, loss: 420.6284, total reward: 10.9200
Episode 5696, loss: 36.6188, total reward: 1.4300
Episode 5697, loss: 425.7189, total reward: 10.9100
Episode 5698, loss: 4

Episode 5840, loss: 35.2405, total reward: 1.3200
Episode 5841, loss: 12.1561, total reward: 0.8100
Episode 5842, loss: 458.8957, total reward: 10.5900
Episode 5843, loss: 449.8443, total reward: 11.0200
Episode 5844, loss: 531.8431, total reward: 11.2000
Episode 5845, loss: -7.5948, total reward: -0.9200
Episode 5846, loss: 467.2527, total reward: 10.3800
Episode 5847, loss: 1.7038, total reward: 0.3100
Episode 5848, loss: 386.0350, total reward: 10.6900
Episode 5849, loss: 474.2354, total reward: 9.9700
Episode 5850, loss: -20.2047, total reward: -0.1200
Episode 5851, loss: 519.8279, total reward: 10.6200
Episode 5852, loss: 507.1036, total reward: 11.2300
Episode 5853, loss: 305.5536, total reward: 10.4900
Episode 5854, loss: 21.4090, total reward: 1.1000
Episode 5855, loss: 16.3893, total reward: 0.9100
Episode 5856, loss: 7.7419, total reward: 0.6500
Episode 5857, loss: 377.6950, total reward: 10.6800
Episode 5858, loss: 440.8074, total reward: 10.1600
Episode 5859, loss: 513.8326

Episode 6000, loss: 309.9715, total reward: 10.4700
Episode 6001, loss: 8.2731, total reward: 0.5000
Episode 6002, loss: -8.1837, total reward: 0.0900
Episode 6003, loss: -1.2013, total reward: -0.0000
Episode 6004, loss: 7.4215, total reward: 0.7800
Episode 6005, loss: 8.4503, total reward: 0.6100
Episode 6006, loss: 421.0553, total reward: 10.1800
Episode 6007, loss: 15.3180, total reward: 0.6400
Episode 6008, loss: 401.4290, total reward: 10.2600
Episode 6009, loss: 26.3806, total reward: 1.1200
Episode 6010, loss: 460.0020, total reward: 10.4100
Episode 6011, loss: 359.1476, total reward: 11.2400
Episode 6012, loss: 445.3334, total reward: 10.7500
Episode 6013, loss: 451.7928, total reward: 10.8900
Episode 6014, loss: 15.1347, total reward: 0.4000
Episode 6015, loss: 29.9188, total reward: 1.1200
Episode 6016, loss: 29.7383, total reward: 1.0800
Episode 6017, loss: 17.5537, total reward: 1.0200
Episode 6018, loss: 28.9740, total reward: 1.2200
Episode 6019, loss: 27.5361, total rew

Episode 6162, loss: 22.9943, total reward: 0.8400
Episode 6163, loss: 475.7440, total reward: 10.7500
Episode 6164, loss: 400.6502, total reward: 10.5900
Episode 6165, loss: 497.3976, total reward: 10.7700
Episode 6166, loss: 540.9766, total reward: 10.4600
Episode 6167, loss: 8.1749, total reward: 0.6100
Episode 6168, loss: 495.7202, total reward: 10.8500
Episode 6169, loss: 402.6975, total reward: 10.3200
Episode 6170, loss: 449.2144, total reward: 11.0000
Episode 6171, loss: 3.4235, total reward: 0.4000
Episode 6172, loss: 462.6757, total reward: 10.3100
Episode 6173, loss: 432.4046, total reward: 10.8900
Episode 6174, loss: 26.1785, total reward: 1.0200
Episode 6175, loss: 511.2021, total reward: 10.8700
Episode 6176, loss: -3.2903, total reward: 0.2000
Episode 6177, loss: 27.6710, total reward: 0.6900
Episode 6178, loss: 3.1120, total reward: 0.1000
Episode 6179, loss: 480.8599, total reward: 10.8500
Episode 6180, loss: 436.7139, total reward: 10.4200
Episode 6181, loss: 518.6154,

Episode 6323, loss: 366.6848, total reward: 10.3600
Episode 6324, loss: 9.0154, total reward: 0.8100
Episode 6325, loss: 422.7582, total reward: 10.9300
Episode 6326, loss: 20.9116, total reward: 0.8100
Episode 6327, loss: 393.2744, total reward: 10.6800
Episode 6328, loss: 412.3310, total reward: 10.5200
Episode 6329, loss: 405.4269, total reward: 10.7600
Episode 6330, loss: 471.4120, total reward: 10.8200
Episode 6331, loss: 485.6783, total reward: 10.2900
Episode 6332, loss: 7.9947, total reward: 0.5000
Episode 6333, loss: 30.4849, total reward: 1.4300
Episode 6334, loss: 390.5097, total reward: 10.4100
Episode 6335, loss: 7.3058, total reward: 0.6100
Episode 6336, loss: 5.0749, total reward: 0.4000
Episode 6337, loss: 457.0896, total reward: 10.6900
Episode 6338, loss: 20.7929, total reward: 1.0200
Episode 6339, loss: 17.1708, total reward: 0.6100
Episode 6340, loss: 394.8373, total reward: 10.1300
Episode 6341, loss: 477.9189, total reward: 10.4600
Episode 6342, loss: 475.0057, to

Episode 6484, loss: 34.4453, total reward: 1.2200
Episode 6485, loss: 444.8752, total reward: 10.7500
Episode 6486, loss: 9.8557, total reward: 0.7100
Episode 6487, loss: 34.7203, total reward: 1.6300
Episode 6488, loss: 413.6505, total reward: 10.2000
Episode 6489, loss: 463.1225, total reward: 10.4900
Episode 6490, loss: 15.0353, total reward: 0.8100
Episode 6491, loss: 26.0186, total reward: 0.8100
Episode 6492, loss: 32.8371, total reward: 1.1200
Episode 6493, loss: 507.2659, total reward: 10.6200
Episode 6494, loss: 22.3901, total reward: 0.9100
Episode 6495, loss: 396.8325, total reward: 10.9100
Episode 6496, loss: 15.4143, total reward: 0.9100
Episode 6497, loss: 8.6355, total reward: 1.0200
Episode 6498, loss: 492.9344, total reward: 10.7300
Episode 6499, loss: 466.2620, total reward: 11.0900
Episode 6500, loss: 30.5775, total reward: 1.4300
Episode 6501, loss: 7.8290, total reward: 0.4000
Episode 6502, loss: 299.2420, total reward: 10.3900
Episode 6503, loss: 10.1160, total re

Episode 6645, loss: 495.5981, total reward: 11.5900
Episode 6646, loss: 567.1111, total reward: 10.4400
Episode 6647, loss: 24.1402, total reward: 0.9100
Episode 6648, loss: 456.3778, total reward: 10.9900
Episode 6649, loss: 334.5207, total reward: 10.5500
Episode 6650, loss: 10.7866, total reward: 0.8100
Episode 6651, loss: 300.8925, total reward: 9.9600
Episode 6652, loss: 22.2701, total reward: 0.8100
Episode 6653, loss: 388.6577, total reward: 10.9700
Episode 6654, loss: 365.4349, total reward: 10.7200
Episode 6655, loss: 521.7049, total reward: 10.9400
Episode 6656, loss: 400.1730, total reward: 10.9600
Episode 6657, loss: 31.1523, total reward: 1.3200
Episode 6658, loss: 445.3248, total reward: 10.6600
Episode 6659, loss: 453.7401, total reward: 10.6900
Episode 6660, loss: 29.4400, total reward: 1.3200
Episode 6661, loss: 25.0160, total reward: 1.0200
Episode 6662, loss: 22.1784, total reward: 0.9100
Episode 6663, loss: 413.1061, total reward: 10.5600
Episode 6664, loss: -3.2562

Episode 6807, loss: 11.4210, total reward: 0.6100
Episode 6808, loss: 21.2686, total reward: 0.7500
Episode 6809, loss: 372.2980, total reward: 10.4100
Episode 6810, loss: 15.5879, total reward: 0.6500
Episode 6811, loss: 9.6739, total reward: 0.4000
Episode 6812, loss: 22.0712, total reward: 1.0200
Episode 6813, loss: 34.1191, total reward: 1.4300
Episode 6814, loss: 421.2258, total reward: 10.6900
Episode 6815, loss: 9.4024, total reward: 0.4200
Episode 6816, loss: 383.2122, total reward: 10.5800
Episode 6817, loss: 525.4831, total reward: 10.6800
Episode 6818, loss: 362.7377, total reward: 10.6800
Episode 6819, loss: 17.4603, total reward: 0.9100
Episode 6820, loss: 2.3301, total reward: 0.5000
Episode 6821, loss: 13.1843, total reward: 0.7100
Episode 6822, loss: 464.1040, total reward: 10.2300
Episode 6823, loss: 1.8620, total reward: 0.3000
Episode 6824, loss: 469.4177, total reward: 10.6100
Episode 6825, loss: 399.6546, total reward: 10.5000
Episode 6826, loss: 429.2999, total re

Episode 6970, loss: 21.7123, total reward: 0.8100
Episode 6971, loss: 449.1548, total reward: 11.2400
Episode 6972, loss: 18.6093, total reward: 0.7100
Episode 6973, loss: 40.8872, total reward: 1.3200
Episode 6974, loss: 438.3781, total reward: 10.6000
Episode 6975, loss: 7.2042, total reward: 0.4000
Episode 6976, loss: 5.5631, total reward: 0.3000
Episode 6977, loss: 492.7589, total reward: 10.8700
Episode 6978, loss: 0.5947, total reward: 0.3200
Episode 6979, loss: -1.4819, total reward: 0.1200
Episode 6980, loss: 7.0662, total reward: 0.6100
Episode 6981, loss: 7.1175, total reward: 0.6100
Episode 6982, loss: -2.1938, total reward: 0.2000
Episode 6983, loss: -3.3339, total reward: 0.0400
Episode 6984, loss: 13.1671, total reward: 0.8100
Episode 6985, loss: 344.0905, total reward: 10.3700
Episode 6986, loss: 36.5745, total reward: 1.4300
Episode 6987, loss: 10.8795, total reward: 0.6100
Episode 6988, loss: 188.2787, total reward: 10.2200
Episode 6989, loss: 6.2907, total reward: 0.4

Episode 7131, loss: 15.5754, total reward: 0.5800
Episode 7132, loss: 512.2665, total reward: 10.5700
Episode 7133, loss: 13.9077, total reward: 0.3000
Episode 7134, loss: 17.2141, total reward: 0.9100
Episode 7135, loss: 21.9246, total reward: 0.8100
Episode 7136, loss: 20.8420, total reward: 1.0200
Episode 7137, loss: 8.0590, total reward: 0.6100
Episode 7138, loss: 10.1556, total reward: 0.4000
Episode 7139, loss: 24.0334, total reward: 1.0000
Episode 7140, loss: 10.2001, total reward: 0.7100
Episode 7141, loss: 216.6592, total reward: 10.1700
Episode 7142, loss: 486.6580, total reward: 10.7100
Episode 7143, loss: 21.7727, total reward: 0.6100
Episode 7144, loss: 17.9253, total reward: 0.9100
Episode 7145, loss: 320.0338, total reward: 9.9400
Episode 7146, loss: 1.8529, total reward: 0.5000
Episode 7147, loss: 381.6085, total reward: 10.2800
Episode 7148, loss: 428.6165, total reward: 11.4700
Episode 7149, loss: 15.2933, total reward: 0.8100
Episode 7150, loss: 24.8602, total reward

Episode 7294, loss: 392.0667, total reward: 10.9200
Episode 7295, loss: 388.9366, total reward: 10.3000
Episode 7296, loss: 8.3899, total reward: 0.3000
Episode 7297, loss: 420.0296, total reward: 10.3400
Episode 7298, loss: 13.7441, total reward: 0.5000
Episode 7299, loss: 19.9093, total reward: 0.8100
Episode 7300, loss: 296.9108, total reward: 10.4700
Episode 7301, loss: 14.7707, total reward: 0.7100
Episode 7302, loss: 25.1250, total reward: 0.9100
Episode 7303, loss: 7.5332, total reward: 0.3000
Episode 7304, loss: 11.0043, total reward: 0.6100
Episode 7305, loss: 9.3127, total reward: 0.6100
Episode 7306, loss: 8.9973, total reward: 0.5000
Episode 7307, loss: 8.3106, total reward: 0.5000
Episode 7308, loss: 373.9878, total reward: 10.2300
Episode 7309, loss: 23.2961, total reward: 0.8200
Episode 7310, loss: 3.1838, total reward: 0.2000
Episode 7311, loss: 21.4486, total reward: 1.0200
Episode 7312, loss: 11.8754, total reward: 0.3000
Episode 7313, loss: 27.0185, total reward: 0.9

Episode 7457, loss: 12.2448, total reward: 0.5800
Episode 7458, loss: 248.9795, total reward: 10.4500
Episode 7459, loss: 15.6731, total reward: 0.6100
Episode 7460, loss: 7.4482, total reward: 0.3000
Episode 7461, loss: 1.1670, total reward: 0.1300
Episode 7462, loss: 9.6980, total reward: 0.4000
Episode 7463, loss: 28.6856, total reward: 1.1200
Episode 7464, loss: 20.0472, total reward: 0.7100
Episode 7465, loss: 8.1356, total reward: 0.3000
Episode 7466, loss: 221.8976, total reward: 10.9700
Episode 7467, loss: 15.8547, total reward: 0.9100
Episode 7468, loss: 5.8622, total reward: 0.4000
Episode 7469, loss: 385.1317, total reward: 10.3300
Episode 7470, loss: 340.1133, total reward: 10.2200
Episode 7471, loss: 342.9001, total reward: 10.7300
Episode 7472, loss: 523.8492, total reward: 10.1500
Episode 7473, loss: 4.1381, total reward: 0.5000
Episode 7474, loss: 430.5395, total reward: 10.3100
Episode 7475, loss: 5.9847, total reward: 0.3000
Episode 7476, loss: 362.2344, total reward:

Episode 7620, loss: 4.3121, total reward: -0.0100
Episode 7621, loss: -0.4091, total reward: 0.0900
Episode 7622, loss: 4.0207, total reward: 0.4000
Episode 7623, loss: 4.6095, total reward: 0.2000
Episode 7624, loss: 2.2854, total reward: 0.1000
Episode 7625, loss: 271.4564, total reward: 10.2700
Episode 7626, loss: 5.1969, total reward: 0.4800
Episode 7627, loss: 2.5075, total reward: 0.0900
Episode 7628, loss: 3.4094, total reward: 0.3000
Episode 7629, loss: 3.9616, total reward: 0.0900
Episode 7630, loss: 5.8092, total reward: 0.0900
Episode 7631, loss: 5.6772, total reward: 0.4000
Episode 7632, loss: 0.3276, total reward: 0.0900
Episode 7633, loss: 11.9688, total reward: 0.6100
Episode 7634, loss: -4.3941, total reward: -0.3800
Episode 7635, loss: 14.1339, total reward: 0.5000
Episode 7636, loss: 3.1573, total reward: -0.0100
Episode 7637, loss: 8.7310, total reward: 0.5000
Episode 7638, loss: 13.5250, total reward: 0.9100
Episode 7639, loss: 16.0162, total reward: 0.7100
Episode 

Episode 7784, loss: 13.7910, total reward: 0.6100
Episode 7785, loss: 225.7816, total reward: 10.4100
Episode 7786, loss: 7.3942, total reward: 0.4000
Episode 7787, loss: 5.7865, total reward: 0.3000
Episode 7788, loss: -0.4209, total reward: 0.2600
Episode 7789, loss: 11.4433, total reward: 0.5000
Episode 7790, loss: 10.2249, total reward: 0.6100
Episode 7791, loss: 22.9250, total reward: 1.2200
Episode 7792, loss: 19.4462, total reward: 1.1200
Episode 7793, loss: 391.1446, total reward: 10.5200
Episode 7794, loss: 15.7831, total reward: 0.7100
Episode 7795, loss: 345.3772, total reward: 10.6300
Episode 7796, loss: 15.0397, total reward: 0.6100
Episode 7797, loss: 7.9363, total reward: 0.3800
Episode 7798, loss: 2.3501, total reward: 0.1600
Episode 7799, loss: 8.2213, total reward: 0.6100
Episode 7800, loss: 20.0171, total reward: 0.8100
Episode 7801, loss: 9.4952, total reward: 0.3000
Episode 7802, loss: 180.5032, total reward: 10.4200
Episode 7803, loss: 6.8073, total reward: 0.2000

Episode 7949, loss: 5.3457, total reward: 0.2000
Episode 7950, loss: 11.2877, total reward: 0.6100
Episode 7951, loss: -3.9147, total reward: -0.1100
Episode 7952, loss: 12.9687, total reward: 0.8100
Episode 7953, loss: 345.5776, total reward: 10.6500
Episode 7954, loss: 18.6214, total reward: 0.8100
Episode 7955, loss: 345.7709, total reward: 10.3200
Episode 7956, loss: 15.6994, total reward: 0.8100
Episode 7957, loss: 9.7017, total reward: 0.3000
Episode 7958, loss: 4.0235, total reward: 0.4000
Episode 7959, loss: 14.2457, total reward: 0.6100
Episode 7960, loss: 5.0886, total reward: 0.5000
Episode 7961, loss: 12.1035, total reward: 0.5000
Episode 7962, loss: 396.6172, total reward: 10.1000
Episode 7963, loss: 20.0764, total reward: 0.7100
Episode 7964, loss: 8.6385, total reward: 0.5000
Episode 7965, loss: 3.0165, total reward: 0.2000
Episode 7966, loss: -3.4849, total reward: -1.0000
Episode 7967, loss: 17.5351, total reward: 1.0200
Episode 7968, loss: 257.2099, total reward: 10.3

Episode 8111, loss: 5.1687, total reward: 0.4000
Episode 8112, loss: 8.8702, total reward: 0.4000
Episode 8113, loss: 10.8965, total reward: 0.6100
Episode 8114, loss: -3.5494, total reward: -0.1100
Episode 8115, loss: 233.9340, total reward: 10.4000
Episode 8116, loss: 12.7193, total reward: 0.7000
Episode 8117, loss: 4.8113, total reward: 0.4000
Episode 8118, loss: 5.3513, total reward: 0.4000
Episode 8119, loss: 9.0886, total reward: 0.3500
Episode 8120, loss: 10.3474, total reward: 0.5000
Episode 8121, loss: 9.2777, total reward: 0.5000
Episode 8122, loss: 9.1254, total reward: 0.4000
Episode 8123, loss: 222.3010, total reward: 10.3800
Episode 8124, loss: 9.5290, total reward: 0.4000
Episode 8125, loss: 16.4376, total reward: 0.9100
Episode 8126, loss: 13.8465, total reward: 0.5500
Episode 8127, loss: 250.5634, total reward: 10.3500
Episode 8128, loss: 14.8880, total reward: 0.6100
Episode 8129, loss: 13.5704, total reward: 0.8100
Episode 8130, loss: 0.3677, total reward: 0.0900
Ep

Episode 8274, loss: 7.9725, total reward: 0.6100
Episode 8275, loss: 27.0696, total reward: 1.2200
Episode 8276, loss: 205.4267, total reward: 10.5400
Episode 8277, loss: 7.3422, total reward: 0.5000
Episode 8278, loss: 16.4882, total reward: 0.9100
Episode 8279, loss: 12.7664, total reward: 1.0200
Episode 8280, loss: 217.5026, total reward: 10.1400
Episode 8281, loss: 22.7528, total reward: 1.2200
Episode 8282, loss: 5.1765, total reward: 0.3900
Episode 8283, loss: 403.7319, total reward: 9.9700
Episode 8284, loss: 8.7425, total reward: 0.6100
Episode 8285, loss: 2.9788, total reward: 0.5000
Episode 8286, loss: 8.9182, total reward: 0.8100
Episode 8287, loss: 11.2784, total reward: 0.7100
Episode 8288, loss: 13.9676, total reward: 0.8100
Episode 8289, loss: 0.0786, total reward: 0.0600
Episode 8290, loss: 16.3314, total reward: 0.5000
Episode 8291, loss: 2.6349, total reward: 0.2000
Episode 8292, loss: 7.1546, total reward: 0.7000
Episode 8293, loss: 9.5539, total reward: 1.0200
Episo

Episode 8438, loss: 6.0763, total reward: 0.6100
Episode 8439, loss: 7.0410, total reward: 0.2000
Episode 8440, loss: 25.6617, total reward: 0.6100
Episode 8441, loss: -4.8859, total reward: -1.1200
Episode 8442, loss: 6.3473, total reward: 0.3900
Episode 8443, loss: 1.6853, total reward: 0.1900
Episode 8444, loss: 404.9311, total reward: 10.3900
Episode 8445, loss: 17.9642, total reward: 0.7100
Episode 8446, loss: -5.0451, total reward: -0.2400
Episode 8447, loss: 20.4718, total reward: 0.7100
Episode 8448, loss: 18.0708, total reward: 0.7100
Episode 8449, loss: 18.8196, total reward: 0.6100
Episode 8450, loss: 6.1364, total reward: 0.4000
Episode 8451, loss: 4.4458, total reward: 0.0900
Episode 8452, loss: 11.2570, total reward: 0.4000
Episode 8453, loss: 419.1392, total reward: 10.9500
Episode 8454, loss: 10.6298, total reward: 0.4000
Episode 8455, loss: 340.2365, total reward: 10.5900
Episode 8456, loss: 6.5728, total reward: 0.2000
Episode 8457, loss: 5.7703, total reward: 0.7000


Episode 8602, loss: 322.4901, total reward: 10.6400
Episode 8603, loss: 298.5258, total reward: 10.4700
Episode 8604, loss: 7.7614, total reward: 0.8100
Episode 8605, loss: 17.2760, total reward: 0.7100
Episode 8606, loss: 12.1931, total reward: 0.8100
Episode 8607, loss: 390.2918, total reward: 10.9200
Episode 8608, loss: 6.8335, total reward: 0.6100
Episode 8609, loss: 18.9189, total reward: 1.0200
Episode 8610, loss: 353.0122, total reward: 10.2100
Episode 8611, loss: 26.4440, total reward: 1.2200
Episode 8612, loss: 398.5760, total reward: 10.9300
Episode 8613, loss: 346.7159, total reward: 11.0500
Episode 8614, loss: 4.5614, total reward: 0.3000
Episode 8615, loss: 14.8462, total reward: 0.7700
Episode 8616, loss: 363.8192, total reward: 10.6600
Episode 8617, loss: 16.7543, total reward: 1.1200
Episode 8618, loss: 347.7448, total reward: 10.6200
Episode 8619, loss: 15.8480, total reward: 0.8100
Episode 8620, loss: 11.0318, total reward: 0.8100
Episode 8621, loss: 2.2866, total rew

Episode 8765, loss: 11.9158, total reward: 0.7000
Episode 8766, loss: 3.7512, total reward: 0.4000
Episode 8767, loss: 9.6087, total reward: 0.6100
Episode 8768, loss: 248.7309, total reward: 10.3100
Episode 8769, loss: 8.7480, total reward: 0.8100
Episode 8770, loss: -7.5026, total reward: -0.0100
Episode 8771, loss: 197.5176, total reward: 10.4300
Episode 8772, loss: 8.1355, total reward: 0.7100
Episode 8773, loss: 10.7091, total reward: 1.0200
Episode 8774, loss: 18.7264, total reward: 1.3200
Episode 8775, loss: 3.2435, total reward: 0.3300
Episode 8776, loss: 344.3057, total reward: 10.4500
Episode 8777, loss: 386.4619, total reward: 10.9400
Episode 8778, loss: 13.4293, total reward: 1.1600
Episode 8779, loss: -6.8249, total reward: -0.0100
Episode 8780, loss: 287.2036, total reward: 10.8500
Episode 8781, loss: 22.5643, total reward: 1.0000
Episode 8782, loss: 2.1887, total reward: 0.3000
Episode 8783, loss: 12.0262, total reward: 0.5000
Episode 8784, loss: 350.1885, total reward: 

Episode 8928, loss: 404.9117, total reward: 10.5800
Episode 8929, loss: 281.8673, total reward: 10.8700
Episode 8930, loss: 316.0353, total reward: 10.7100
Episode 8931, loss: 2.5429, total reward: 0.4500
Episode 8932, loss: 404.7154, total reward: 10.8700
Episode 8933, loss: 323.6533, total reward: 10.5200
Episode 8934, loss: 3.4076, total reward: 0.7100
Episode 8935, loss: -0.2217, total reward: 0.1500
Episode 8936, loss: 335.3756, total reward: 10.6600
Episode 8937, loss: 368.8226, total reward: 10.4500
Episode 8938, loss: 225.1572, total reward: 10.7600
Episode 8939, loss: 11.6403, total reward: 0.7500
Episode 8940, loss: 15.1166, total reward: 0.7100
Episode 8941, loss: 232.6169, total reward: 10.5700
Episode 8942, loss: 355.5169, total reward: 10.9800
Episode 8943, loss: 4.2492, total reward: 0.4000
Episode 8944, loss: 5.5151, total reward: 0.8100
Episode 8945, loss: 10.8436, total reward: 0.6600
Episode 8946, loss: 20.8516, total reward: 1.2200
Episode 8947, loss: 2.1905, total 

Episode 9088, loss: 8.2489, total reward: 0.5000
Episode 9089, loss: 401.1524, total reward: 10.9200
Episode 9090, loss: 11.5122, total reward: 0.6800
Episode 9091, loss: 374.6037, total reward: 10.9400
Episode 9092, loss: 427.3425, total reward: 11.0700
Episode 9093, loss: 1.6442, total reward: 0.3000
Episode 9094, loss: 422.8432, total reward: 10.8200
Episode 9095, loss: 4.8067, total reward: 0.5000
Episode 9096, loss: 292.2606, total reward: 10.5300
Episode 9097, loss: 406.4485, total reward: 10.8400
Episode 9098, loss: 224.1617, total reward: 10.3500
Episode 9099, loss: 378.8236, total reward: 10.7100
Episode 9100, loss: 13.4084, total reward: 0.6100
Episode 9101, loss: 24.2718, total reward: 0.9100
Episode 9102, loss: 287.9417, total reward: 10.1600
Episode 9103, loss: 21.7034, total reward: 1.0200
Episode 9104, loss: 271.3450, total reward: 10.4300
Episode 9105, loss: 120.4624, total reward: 10.0500
Episode 9106, loss: 11.3113, total reward: 0.6100
Episode 9107, loss: 8.9048, tot

Episode 9250, loss: 20.1411, total reward: 1.0200
Episode 9251, loss: 259.1786, total reward: 10.1800
Episode 9252, loss: 397.8271, total reward: 10.4100
Episode 9253, loss: 295.5074, total reward: 10.8900
Episode 9254, loss: 17.6052, total reward: 0.8100
Episode 9255, loss: 11.8931, total reward: 0.7100
Episode 9256, loss: 412.9704, total reward: 10.8500
Episode 9257, loss: 12.4560, total reward: 0.6100
Episode 9258, loss: 193.5136, total reward: 10.6000
Episode 9259, loss: 5.0137, total reward: 0.4000
Episode 9260, loss: 443.4702, total reward: 10.7100
Episode 9261, loss: 16.0245, total reward: 0.9100
Episode 9262, loss: 14.4974, total reward: 0.9100
Episode 9263, loss: 21.0261, total reward: 1.1200
Episode 9264, loss: 1.5900, total reward: 0.2000
Episode 9265, loss: 357.5265, total reward: 10.3000
Episode 9266, loss: 13.1261, total reward: 0.6900
Episode 9267, loss: 331.6938, total reward: 10.5400
Episode 9268, loss: 11.9518, total reward: 0.4000
Episode 9269, loss: 478.1418, total 

Episode 9413, loss: 377.8770, total reward: 10.4300
Episode 9414, loss: 12.0232, total reward: 0.6100
Episode 9415, loss: 19.0598, total reward: 0.9100
Episode 9416, loss: 397.4279, total reward: 11.1500
Episode 9417, loss: 17.3232, total reward: 0.6100
Episode 9418, loss: 364.9455, total reward: 10.1700
Episode 9419, loss: 339.3853, total reward: 10.7300
Episode 9420, loss: 27.7815, total reward: 1.2100
Episode 9421, loss: 20.3327, total reward: 1.0200
Episode 9422, loss: 329.9977, total reward: 10.6400
Episode 9423, loss: 15.7027, total reward: 1.2200
Episode 9424, loss: 22.6095, total reward: 0.8100
Episode 9425, loss: 20.6382, total reward: 1.2200
Episode 9426, loss: 493.1225, total reward: 11.1000
Episode 9427, loss: 417.7303, total reward: 11.0400
Episode 9428, loss: 11.0971, total reward: 0.8100
Episode 9429, loss: 5.5098, total reward: 0.1800
Episode 9430, loss: 13.9053, total reward: 0.7100
Episode 9431, loss: 397.3907, total reward: 11.0300
Episode 9432, loss: 405.8262, total

Episode 9575, loss: 315.6824, total reward: 10.4200
Episode 9576, loss: 16.3779, total reward: 0.9100
Episode 9577, loss: 12.0849, total reward: 0.8100
Episode 9578, loss: 295.2998, total reward: 10.4700
Episode 9579, loss: 360.8483, total reward: 10.2600
Episode 9580, loss: 382.2279, total reward: 10.4300
Episode 9581, loss: 8.0845, total reward: 0.8100
Episode 9582, loss: 18.5445, total reward: 0.9100
Episode 9583, loss: 10.7119, total reward: 1.0800
Episode 9584, loss: 20.0329, total reward: 1.0200
Episode 9585, loss: 23.8107, total reward: 1.1000
Episode 9586, loss: 352.5722, total reward: 10.9600
Episode 9587, loss: 500.5046, total reward: 10.8600
Episode 9588, loss: 302.1758, total reward: 10.6400
Episode 9589, loss: 27.6344, total reward: 1.3200
Episode 9590, loss: 28.7180, total reward: 1.2200
Episode 9591, loss: 434.5637, total reward: 11.0600
Episode 9592, loss: 397.9453, total reward: 11.0500
Episode 9593, loss: 13.4911, total reward: 0.8100
Episode 9594, loss: 2.8668, total

Episode 9737, loss: 7.5099, total reward: 0.5000
Episode 9738, loss: 391.6208, total reward: 10.7300
Episode 9739, loss: 419.0998, total reward: 10.9600
Episode 9740, loss: 297.7206, total reward: 10.6800
Episode 9741, loss: 21.4754, total reward: 1.0100
Episode 9742, loss: 288.3954, total reward: 10.5100
Episode 9743, loss: 2.9044, total reward: 0.4000
Episode 9744, loss: 28.8934, total reward: 1.2200
Episode 9745, loss: 8.4399, total reward: 0.8100
Episode 9746, loss: 0.8482, total reward: 0.0900
Episode 9747, loss: 12.3558, total reward: 0.7100
Episode 9748, loss: 14.6904, total reward: 0.8100
Episode 9749, loss: 280.3533, total reward: 10.4100
Episode 9750, loss: -5.9677, total reward: -0.0300
Episode 9751, loss: 3.3839, total reward: 0.3000
Episode 9752, loss: 22.5928, total reward: 1.0800
Episode 9753, loss: 378.2153, total reward: 10.4100
Episode 9754, loss: 352.2977, total reward: 10.7800
Episode 9755, loss: 7.1783, total reward: 0.5100
Episode 9756, loss: -2.5286, total reward

Episode 9899, loss: 2.8822, total reward: 0.4000
Episode 9900, loss: 6.7354, total reward: 0.4000
Episode 9901, loss: 14.0420, total reward: 1.1200
Episode 9902, loss: 16.5277, total reward: 0.9100
Episode 9903, loss: 5.2984, total reward: 0.5000
Episode 9904, loss: 22.1081, total reward: 1.1200
Episode 9905, loss: 9.8264, total reward: 0.6100
Episode 9906, loss: 432.0928, total reward: 11.1000
Episode 9907, loss: 1.8379, total reward: 0.3000
Episode 9908, loss: 267.5060, total reward: 10.2800
Episode 9909, loss: 5.0294, total reward: 0.4900
Episode 9910, loss: 12.8839, total reward: 0.7100
Episode 9911, loss: 13.0628, total reward: 0.8700
Episode 9912, loss: -1.5055, total reward: 0.4000
Episode 9913, loss: 344.6174, total reward: 10.3900
Episode 9914, loss: 400.6361, total reward: 10.3100
Episode 9915, loss: 11.0339, total reward: 0.7000
Episode 9916, loss: 308.0763, total reward: 10.5300
Episode 9917, loss: 12.8004, total reward: 1.0200
Episode 9918, loss: -4.1756, total reward: 0.0