In [1]:
import numpy as np
import os
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import math
import random

# Inclusive bounds for the length of each randomly generated list
# (SortingEnv.reset draws random.randint(MIN_LIST_LEN, MAX_LIST_LEN)).
# Equal values pin every episode to the same length.
MIN_LIST_LEN = 6
MAX_LIST_LEN = 6
# Stored as SortingEnv.max_steps. It caps the number of environment steps and
# is also used by the training loop as the cap on the token-sequence length.
MAX_STEPS = 120

# Reward shaping applied by SortingEnv.step().
# SUCCESS_REWARD: the swap that leaves the list fully sorted (ends the episode).
SUCCESS_REWARD = 2.0
# STEP_REWARD: every comparison (doubled when the two elements are equal) and
# any swap that neither fixes nor creates an inversion.
STEP_REWARD = -0.1
# SWAP_REWARD: a swap that puts an out-of-order pair in order; the negated
# value is given for a swap that puts an ordered pair out of order.
SWAP_REWARD = 1.0
# INVALID_ACTION_REWARD: malformed or illegal action; the episode ends.
INVALID_ACTION_REWARD = -10.0

# Exploration schedule used in train():
#   eps = EPS_END + (EPS_START - EPS_END) * exp(-episode / EPS_DECAY)
# With probability eps the masked logits are divided by 4 before sampling.
EPS_START = 0.5
EPS_END = 0.05
EPS_DECAY = 1000
# Discount factor for the per-step returns.
GAMMA = 0.8
# Total number of training episodes.
NUM_EPISODES = 100000
# A checkpoint is written every EPISODES_SAVE episodes.
EPISODES_SAVE = 1000
# Folder that receives the checkpoints.
# NOTE(review): the folder name says "gamma098" while GAMMA above is 0.8 --
# confirm which of the two is intended.
OUTPUT_DIR = 'datasets/rl_sort_transformer_easy/list6_transformer3_128_gamma098_step120_v1'

# Define the vocabulary
# Token ids are assigned in listing order: the two action tokens, the index
# tokens '0'..'7', the three comparison outcomes, then the list-length tokens
# 'len1'..'len8'.
vocab = {
    token: token_id
    for token_id, token in enumerate(
        ['Comparison', 'Swap']
        + [str(position) for position in range(8)]
        + ['less', 'equal', 'more']
        + ['len{}'.format(size) for size in range(1, 9)]
    )
}
# Reverse lookup: token id -> token name.
inv_vocab = dict(zip(vocab.values(), vocab))

# Use the GPU when torch reports CUDA as available, otherwise fall back to the
# CPU. The chosen device is printed once when the cell runs.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define the environment
class SortingEnv:
    """Token-driven environment in which an agent sorts a hidden list.

    The agent never sees the list values. It can only
      * compare two positions  -> action tokens [Comparison, idx1, idx2];
        the environment answers with a 'less' / 'equal' / 'more' token, and
      * swap the two positions it compared last -> action tokens [Swap].

    An episode ends when the list becomes sorted, when an invalid action is
    issued, or when ``max_steps`` valid steps have been taken.
    """

    def __init__(self):
        self.max_steps = MAX_STEPS
        # Episode state. reset() fills these in; they are defined here so that
        # the accessors and step() do not raise AttributeError on a fresh
        # instance (step() then simply reports an invalid action).
        self.length = 0
        self.list = []
        self.indices = None      # pair selected by the most recent comparison
        self.current_step = 0
        self.done = False

    def reset(self):
        """Start a new episode with a random, not-yet-sorted list.

        Returns:
            tuple: (id of the 'len<N>' token for the drawn length, copy of the list).

        Raises:
            ValueError: if the drawn length is below 2. Such a list is always
                sorted, so the "resample until unsorted" loop could never end.
        """
        self.length = random.randint(MIN_LIST_LEN, MAX_LIST_LEN)
        if self.length < 2:
            raise ValueError(
                "SortingEnv needs a list of at least 2 elements, got length {}".format(self.length)
            )
        self.list = [random.randint(1, 100) for _ in range(self.length)]
        # Resample until there is actually something to sort.
        while self.list == sorted(self.list):
            self.list = [random.randint(1, 100) for _ in range(self.length)]
        self.indices = None
        self.current_step = 0
        self.done = False
        initial_token = 'len{}'.format(self.length)
        return vocab[initial_token], self.list.copy()

    def get_list(self):
        """Return the live internal list (not a copy)."""
        return self.list

    def get_list_len(self):
        """Return the number of elements in the current list."""
        return len(self.list)

    def _invalid_action(self):
        """End the episode and build the step() result for an invalid action."""
        self.done = True
        return None, INVALID_ACTION_REWARD, self.done, self.list.copy()

    def step(self, action_tokens):
        """Apply one complete action.

        Args:
            action_tokens: ``[Comparison, idx1_token, idx2_token]`` or ``[Swap]``
                (token ids from ``vocab``).

        Returns:
            tuple: (response_token, reward, done, list_copy). ``response_token``
            is the 'less' / 'equal' / 'more' token id for a comparison and
            ``None`` otherwise.
        """
        response_token = None

        # An empty action carries no action token at all; treat it like any
        # other malformed action instead of failing with IndexError.
        if len(action_tokens) == 0:
            return self._invalid_action()

        action = action_tokens[0]
        if action == vocab['Comparison']:
            if len(action_tokens) != 3:
                return self._invalid_action()
            # Index tokens are laid out contiguously starting at vocab['0'].
            index1 = action_tokens[1] - vocab['0']
            index2 = action_tokens[2] - vocab['0']
            if not (0 <= index1 < self.length and 0 <= index2 < self.length):
                return self._invalid_action()
            self.indices = (index1, index2)
            left, right = self.list[index1], self.list[index2]
            if left < right:
                response_token = vocab['less']
                reward = STEP_REWARD
            elif left == right:
                # Comparing equal elements costs twice the normal step penalty.
                response_token = vocab['equal']
                reward = STEP_REWARD * 2
            else:
                response_token = vocab['more']
                reward = STEP_REWARD
        elif action == vocab['Swap']:
            # A swap always refers to the pair of the preceding comparison.
            if self.indices is None:
                return self._invalid_action()
            index1, index2 = self.indices
            low, high = min(index1, index2), max(index1, index2)
            before_low, before_high = self.list[low], self.list[high]
            self.list[index1], self.list[index2] = self.list[index2], self.list[index1]
            if self.list == sorted(self.list):
                reward = SUCCESS_REWARD
                self.done = True
            elif before_low > before_high:
                # The pair was inverted and the swap put it in order.
                reward = SWAP_REWARD
            elif before_low < before_high:
                # The pair was in order and the swap inverted it.
                reward = -SWAP_REWARD
            else:
                # Equal values (or the same position twice): nothing changed.
                reward = STEP_REWARD
            # A new comparison is required before the next swap.
            self.indices = None
        else:
            reward = INVALID_ACTION_REWARD
            self.done = True

        self.current_step += 1
        if self.current_step >= self.max_steps:
            self.done = True
        return response_token, reward, self.done, self.list.copy()


Using device: cuda


In [2]:
# Positional Encoding for Transformer
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to embeddings, then apply dropout.

    Even feature columns hold ``sin(pos * w_k)`` and odd columns hold
    ``cos(pos * w_k)`` with ``w_k = 10000 ** (-2k / d_model)``.

    Args:
        d_model (int): embedding width. Odd widths are supported.
        max_len (int): longest sequence the precomputed table covers.
        dropout (float): dropout probability applied after the addition.
    """

    def __init__(self, d_model, max_len=256, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)  # (max_len, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)  # (max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)  # Even indices
        # With an odd d_model there is one odd column fewer than even columns,
        # so the cosine table has to be trimmed to d_model // 2 columns.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])  # Odd indices
        pe = pe.unsqueeze(1)  # (max_len, 1, d_model): broadcasts over the batch axis
        # A buffer moves with the module across devices but is not trained.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:x.size(0)])``.

        ``x`` is indexed by sequence position along dim 0, so it must not be
        longer than ``max_len`` along that dimension.
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

# Transformer Model
class TransformerModel(nn.Module):
    """Encoder-only transformer that maps a token-id sequence to per-position vocabulary logits.

    Pipeline: token embedding (scaled by sqrt(d_model)) -> positional encoding
    -> stack of ``num_layers`` encoder layers -> linear projection to
    ``vocab_size`` logits.
    """

    def __init__(self, vocab_size, d_model=128, nhead=8, num_layers=3):
        super().__init__()
        self.model_type = 'Transformer'
        self.d_model = d_model

        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model, nhead),
            num_layers,
        )
        self.decoder = nn.Linear(d_model, vocab_size)

        self.init_weights()

    def init_weights(self):
        """Uniformly initialise embedding and output weights in [-0.1, 0.1]; zero the output bias."""
        bound = 0.1
        nn.init.uniform_(self.embedding.weight, -bound, bound)
        nn.init.uniform_(self.decoder.weight, -bound, bound)
        nn.init.zeros_(self.decoder.bias)

    def forward(self, src):
        """Return logits for every position of ``src``.

        ``src`` holds token ids; the training loop in this notebook passes it
        with shape (seq_len, batch).
        """
        scaled = self.embedding(src) * math.sqrt(self.d_model)
        encoded = self.transformer_encoder(self.pos_encoder(scaled))
        return self.decoder(encoded)

def decode(input_tokens, inv_vocab):
    """Render a sequence of token ids as a space-separated string of token names."""
    names = (inv_vocab[token_id] for token_id in input_tokens)
    return ' '.join(names)


def save_checkpoint(model, optimizer, episode, folder, filename):
    """
    Save the model and optimizer state to ``folder/filename``.

    Args:
        model (nn.Module): The model to save.
        optimizer (torch.optim.Optimizer): The optimizer whose state to save.
        episode (int): The current episode number.
        folder (str): Directory that receives the checkpoint. It is created if
            missing; an empty string means the current working directory.
        filename (str): File name of the checkpoint inside ``folder``.
    """
    filepath = os.path.join(folder, filename)
    # Ensure the directory exists. os.makedirs('') raises FileNotFoundError,
    # so skip it when the path has no directory component.
    directory = os.path.dirname(filepath)
    if directory:
        os.makedirs(directory, exist_ok=True)
    # Save the checkpoint
    torch.save({
        'episode': episode,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, filepath)
    print(f"Checkpoint saved at episode {episode} to {filepath}")

def load_checkpoint(filepath, model, optimizer):
    """
    Restore model and optimizer state from a checkpoint file, if it exists.

    Args:
        filepath (str): Location of the checkpoint file.
        model (nn.Module): Model that receives the stored ``model_state_dict``.
        optimizer (torch.optim.Optimizer): Optimizer that receives the stored
            ``optimizer_state_dict``.

    Returns:
        int: The episode number stored in the checkpoint, or 0 when no file
        exists at ``filepath``.
    """
    # Guard clause: nothing to restore.
    if not os.path.isfile(filepath):
        print(f"No checkpoint found at {filepath}, starting from scratch.")
        return 0

    # Tensors are mapped onto the notebook-wide `device` while loading.
    stored = torch.load(filepath, map_location=device)
    model.load_state_dict(stored['model_state_dict'])
    optimizer.load_state_dict(stored['optimizer_state_dict'])
    episode = stored['episode']
    print(f"Checkpoint loaded from {filepath}, resuming from episode {episode}")
    return episode

In [None]:
# Training Loop
def _valid_token_ids(state, list_len, last_token):
    """Return the token ids the policy may emit next for the given parser state.

    Args:
        state (str): 'expect_action', 'expect_index1' or 'expect_index2'.
        list_len (int): length of the list being sorted.
        last_token (int): most recently emitted token id (the first index
            token when ``state`` is 'expect_index2').
    """
    index_tokens = [vocab[str(i)] for i in range(list_len)]
    if state == 'expect_index1':
        # The second index must be strictly greater than the first, so the
        # first index can never be the last position.
        return index_tokens[:-1]
    if state == 'expect_index2':
        return [tok for tok in index_tokens if tok > last_token]
    # 'expect_action', and the fallback for any unexpected state.
    return [vocab['Comparison'], vocab['Swap']]


def train(verbose=False):
    """Train the transformer policy on the sorting task with a REINFORCE-style update.

    Every episode the policy emits tokens one at a time. Tokens that are not
    legal in the current parser state are masked out before sampling. A
    completed action (``Comparison i j`` or ``Swap``) is sent to ``SortingEnv``,
    and a comparison's response token is appended to the sequence. After the
    episode, discounted returns weight the log-probabilities of the sampled
    tokens and one optimizer step is taken. Every ``EPISODES_SAVE`` episodes a
    summary line is printed and a checkpoint is written to ``OUTPUT_DIR``.

    Args:
        verbose (bool): when True, print the decoded token sequence, the
            current list and each reward while an episode runs.
    """
    vocab_size = len(vocab)
    model = TransformerModel(vocab_size).to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-4)  # Reduced learning rate
    # Optionally, load a checkpoint
    # load_checkpoint("path_to_checkpoint", model, optimizer)

    # Running statistics over the episodes trained since the last checkpoint.
    episode_cnt = 0
    total_reward = 0.0
    num_successes = 0
    total_steps = 0

    for episode in range(NUM_EPISODES):
        t1 = time.time()
        model.train()  # Set model to training mode
        env = SortingEnv()
        initial_token_id, current_list = env.reset()
        input_tokens = [initial_token_id]
        log_probs = []
        rewards = []

        state = 'expect_action'
        done = False
        success = False

        # The exploration probability only depends on the episode number.
        eps_threshold = EPS_END + (EPS_START - EPS_END) * np.exp(-1.0 * episode / EPS_DECAY)

        while not done and len(input_tokens) < env.max_steps:
            if verbose:
                print(decode(input_tokens, inv_vocab))
                print(env.get_list())
            # Prepare input tensor: the whole history is re-encoded every step.
            input_seq = torch.tensor(input_tokens, dtype=torch.long, device=device).unsqueeze(1)  # (seq_len, batch_size)
            # Get model output
            with torch.autocast(device_type=device.type, dtype=torch.bfloat16):
                output = model(input_seq)  # (seq_len, batch_size, vocab_size)
                # Get logits for the last token
                logits = output[-1, 0, :]  # (vocab_size)

                # Check for NaNs in logits
                if torch.isnan(logits).any():
                    print(f"Episode {episode}, NaNs in logits before masking.")
                    break

                valid_token_ids = _valid_token_ids(state, env.length, input_tokens[-1])

                # Ensure valid_token_ids are within the vocab range
                if any(idx >= vocab_size or idx < 0 for idx in valid_token_ids):
                    print(f"Episode {episode}, invalid indices in valid_token_ids: {valid_token_ids}")
                    break

                # Mask invalid tokens
                mask_value = -1e9  # Use a large negative value instead of -inf
                mask = torch.full_like(logits, mask_value)
                mask[valid_token_ids] = 0
                masked_logits = logits + mask

                # Exploration: with probability eps, flatten the distribution
                # over the valid tokens by dividing the logits by 4.
                if random.random() < eps_threshold:
                    masked_logits = masked_logits / 4

                # Check for NaNs in masked_logits
                if torch.isnan(masked_logits).any():
                    print(f"Episode {episode}, NaNs in masked_logits after masking.")
                    break

                # Compute probabilities
                probs = F.softmax(masked_logits, dim=0)

                # Check for NaNs in probs
                if torch.isnan(probs).any():
                    print(f"Episode {episode}, NaNs in probs after softmax.")
                    break

                try:
                    m = torch.distributions.Categorical(probs)
                    action_token = m.sample()
                    log_prob = m.log_prob(action_token)
                except ValueError as e:
                    print(f"Episode {episode}, error in sampling action: {e}")
                    break

            log_probs.append(log_prob)
            input_tokens.append(action_token.item())

            action = action_token.item()
            # Tokens that do not complete an action earn no reward of their own.
            reward = 0.0
            if state == 'expect_action':
                if action == vocab['Comparison']:
                    state = 'expect_index1'
                elif action == vocab['Swap']:
                    # A swap without a preceding comparison is illegal.
                    if env.indices is None:
                        reward = INVALID_ACTION_REWARD
                        rewards.append(reward)
                        done = True
                        continue
                    action_tokens = [vocab['Swap']]
                    response_token, reward, done, current_list = env.step(action_tokens)
                    if done and reward == SUCCESS_REWARD:
                        success = True
                    if verbose:
                        print("Reward:", reward)
                    state = 'expect_action'
                else:
                    reward = INVALID_ACTION_REWARD
                    done = True
            elif state == 'expect_index1':
                index1_token = action_token
                state = 'expect_index2'
            elif state == 'expect_index2':
                index2_token = action_token
                action_tokens = [vocab['Comparison'], index1_token.item(), index2_token.item()]
                response_token, reward, done, current_list = env.step(action_tokens)
                if done and reward == SUCCESS_REWARD:
                    success = True
                if verbose:
                    print("Reward:", reward)
                # The comparison outcome becomes part of the model's context.
                if response_token is not None:
                    input_tokens.append(response_token)
                state = 'expect_action'
            else:
                reward = INVALID_ACTION_REWARD
                done = True

            rewards.append(reward)

        # Save checkpoint. The statistics cover the episodes trained since the
        # previous checkpoint (the current episode is counted afterwards).
        if episode > 0 and episode % EPISODES_SAVE == 0:
            # Every episode of the window may have been skipped; avoid 0/0.
            window = max(episode_cnt, 1)
            avg_reward = total_reward / window
            success_rate = num_successes / window
            avg_steps = total_steps / window
            print(f"Summary of {episode_cnt} trained episodes: avg reward:{avg_reward:.4f}, success rate:{success_rate:.4f}, avg steps:{avg_steps:.2f}")
            episode_cnt = 0
            total_reward = 0.0
            num_successes = 0
            total_steps = 0
            save_checkpoint(model, optimizer, episode, OUTPUT_DIR, f"ckpt_{episode}_{success_rate:.4f}_{avg_steps:.2f}.pth")

        assert len(log_probs) == len(rewards), "log_probs and returns have different sizes!"

        if len(log_probs) == 0:
            continue  # Skip if no actions were taken

        # Compute returns and loss within autocast
        with torch.autocast(device_type=device.type, dtype=torch.bfloat16):
            # Discounted return for every step, accumulated from the end.
            returns = []
            R = 0
            for r in reversed(rewards):
                R = r + GAMMA * R
                returns.append(R)
            returns.reverse()
            returns = torch.tensor(returns, device=device)

            # Check for NaNs in returns
            if torch.isnan(returns).any():
                print(f"Episode {episode}, NaNs in returns.")
                continue

            # Policy-gradient loss: -sum_t log pi(a_t) * R_t
            loss = -(torch.stack(log_probs) * returns).sum()

            # Check for NaNs in loss
            if torch.isnan(loss):
                print(f"Episode {episode}, NaN in loss.")
                continue

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
        optimizer.step()

        # Only episodes that produced an update enter the statistics, so the
        # success count and the episode count cover the same episodes.
        episode_cnt += 1
        total_reward += sum(rewards)
        total_steps += len(rewards)
        if success:
            num_successes += 1
        t2 = time.time()
        print(f"Episode {episode}, loss:{loss.item():.4f}, {'succeed' if success else 'fail'}, steps:{len(rewards)}, total reward:{sum(rewards):.4f}, {t2-t1} sec")

# Entry point: run the training loop with the per-step trace printing
# (decoded tokens, current list, rewards) turned off.
if __name__ == "__main__":
    train(verbose=False)




Episode 0, loss:-56.5354, fail, steps:26, total reward:-11.8000, 0.4371922016143799 sec
Episode 1, loss:-7.0741, fail, steps:1, total reward:-10.0000, 0.005114078521728516 sec
Episode 2, loss:-59.6471, fail, steps:26, total reward:-12.7000, 0.05188727378845215 sec
Episode 3, loss:-47.3797, fail, steps:9, total reward:-10.2000, 0.018344640731811523 sec
Episode 4, loss:-49.1862, fail, steps:34, total reward:-10.0000, 0.06811308860778809 sec
Episode 5, loss:-8.9220, fail, steps:1, total reward:-10.0000, 0.003808259963989258 sec
Episode 6, loss:-49.8247, fail, steps:17, total reward:-11.5000, 0.03262758255004883 sec
Episode 7, loss:-8.7725, fail, steps:91, total reward:-1.8000, 0.20496225357055664 sec
Episode 8, loss:-19.1759, fail, steps:90, total reward:-3.9000, 0.18864917755126953 sec
Episode 9, loss:-13.5208, fail, steps:1, total reward:-10.0000, 0.007322072982788086 sec
Episode 10, loss:-12.0954, fail, steps:90, total reward:-2.9000, 0.17507171630859375 sec
Episode 11, loss:-11.3962, 

Episode 94, loss:-55.0067, fail, steps:60, total reward:-5.7000, 0.125762939453125 sec
Episode 95, loss:-23.8561, fail, steps:1, total reward:-10.0000, 0.006243705749511719 sec
Episode 96, loss:6.9020, succeed, steps:82, total reward:3.5000, 0.1584022045135498 sec
Episode 97, loss:-2.7751, fail, steps:92, total reward:1.2000, 0.18661880493164062 sec
Episode 98, loss:10.4001, fail, steps:92, total reward:3.2000, 0.1897110939025879 sec
Episode 99, loss:-81.3177, fail, steps:43, total reward:-10.3000, 0.09238481521606445 sec
Episode 100, loss:3.8695, fail, steps:91, total reward:3.2000, 0.18343639373779297 sec
Episode 101, loss:-41.9504, fail, steps:48, total reward:-6.4000, 0.10259199142456055 sec
Episode 102, loss:3.3006, fail, steps:91, total reward:2.2000, 0.18774700164794922 sec
Episode 103, loss:0.0317, fail, steps:92, total reward:2.3000, 0.190887451171875 sec
Episode 104, loss:10.0640, succeed, steps:72, total reward:4.7000, 0.15116429328918457 sec
Episode 105, loss:0.7365, fail, 

Episode 187, loss:-5.5013, fail, steps:90, total reward:0.1000, 0.22359466552734375 sec
Episode 188, loss:13.3439, succeed, steps:84, total reward:5.5000, 0.17084598541259766 sec
Episode 189, loss:24.2125, succeed, steps:73, total reward:8.9000, 0.15061402320861816 sec
Episode 190, loss:18.1315, succeed, steps:86, total reward:6.4000, 0.17567920684814453 sec
Episode 191, loss:8.0054, succeed, steps:55, total reward:3.3000, 0.11604475975036621 sec
Episode 192, loss:6.5158, succeed, steps:54, total reward:2.3000, 0.11428213119506836 sec
Episode 193, loss:5.7710, succeed, steps:30, total reward:3.1000, 0.062325477600097656 sec
Episode 194, loss:11.3322, succeed, steps:56, total reward:4.3000, 0.11161494255065918 sec
Episode 195, loss:-47.7976, fail, steps:5, total reward:-9.1000, 0.013273477554321289 sec
Episode 196, loss:12.6555, succeed, steps:33, total reward:6.1000, 0.06359076499938965 sec
Episode 197, loss:12.2167, fail, steps:92, total reward:5.3000, 0.1831493377685547 sec
Episode 1

Episode 279, loss:-1.6763, fail, steps:91, total reward:1.2000, 0.18364310264587402 sec
Episode 280, loss:16.6226, succeed, steps:60, total reward:6.3000, 0.12438821792602539 sec
Episode 281, loss:-49.7493, fail, steps:22, total reward:-7.6000, 0.047635555267333984 sec
Episode 282, loss:-0.4283, fail, steps:91, total reward:1.2000, 0.18080615997314453 sec
Episode 283, loss:8.4619, fail, steps:92, total reward:4.3000, 0.18828845024108887 sec
Episode 284, loss:4.2992, succeed, steps:11, total reward:2.7000, 0.028060436248779297 sec
Episode 285, loss:12.7682, succeed, steps:76, total reward:5.7000, 0.14895033836364746 sec
Episode 286, loss:5.1742, fail, steps:91, total reward:2.2000, 0.18470191955566406 sec
Episode 287, loss:5.8176, succeed, steps:42, total reward:2.7000, 0.08843111991882324 sec
Episode 288, loss:10.4529, fail, steps:91, total reward:4.2000, 0.1829066276550293 sec
Episode 289, loss:-39.9328, fail, steps:5, total reward:-9.1000, 0.015851259231567383 sec
Episode 290, loss:1

Episode 371, loss:-55.3964, fail, steps:32, total reward:-8.9000, 0.0705251693725586 sec
Episode 372, loss:6.4455, fail, steps:91, total reward:3.2000, 0.1826639175415039 sec
Episode 373, loss:-78.1991, fail, steps:61, total reward:-11.1000, 0.12791967391967773 sec
Episode 374, loss:14.2620, succeed, steps:82, total reward:6.6000, 0.16784262657165527 sec
Episode 375, loss:-3.9329, fail, steps:91, total reward:0.9000, 0.18787813186645508 sec
Episode 376, loss:-2.8870, fail, steps:91, total reward:2.2000, 0.18933367729187012 sec
Episode 377, loss:-2.1560, fail, steps:90, total reward:0.1000, 0.18835687637329102 sec
Episode 378, loss:10.8516, succeed, steps:45, total reward:5.7000, 0.09647130966186523 sec
Episode 379, loss:13.1225, succeed, steps:59, total reward:7.3000, 0.12085628509521484 sec
Episode 380, loss:11.2962, succeed, steps:79, total reward:5.6000, 0.16044163703918457 sec
Episode 381, loss:13.9154, fail, steps:92, total reward:6.3000, 0.1935107707977295 sec
Episode 382, loss:1

Episode 463, loss:16.2397, succeed, steps:69, total reward:8.0000, 0.14427781105041504 sec
Episode 464, loss:10.0741, succeed, steps:54, total reward:5.3000, 0.11172723770141602 sec
Episode 465, loss:7.0152, fail, steps:92, total reward:5.2000, 0.18773293495178223 sec
Episode 466, loss:14.5695, succeed, steps:62, total reward:7.2000, 0.13334870338439941 sec
Episode 467, loss:14.2974, fail, steps:92, total reward:7.3000, 0.189042329788208 sec
Episode 468, loss:13.9167, fail, steps:93, total reward:7.3000, 0.19345426559448242 sec
Episode 469, loss:11.3093, succeed, steps:64, total reward:6.1000, 0.13612151145935059 sec
Episode 470, loss:8.2356, succeed, steps:74, total reward:3.7000, 0.15274810791015625 sec
Episode 471, loss:6.9624, fail, steps:91, total reward:4.2000, 0.18796420097351074 sec
Episode 472, loss:6.1582, succeed, steps:78, total reward:4.6000, 0.1632084846496582 sec
Episode 473, loss:6.1647, succeed, steps:78, total reward:4.6000, 0.1630570888519287 sec
Episode 474, loss:5.

Episode 556, loss:-5.0738, fail, steps:91, total reward:0.2000, 0.1890885829925537 sec
Episode 557, loss:5.7904, fail, steps:91, total reward:3.2000, 0.190765380859375 sec
Episode 558, loss:17.6057, fail, steps:92, total reward:8.3000, 0.19260358810424805 sec
Episode 559, loss:-0.4086, fail, steps:90, total reward:-0.0000, 0.18845152854919434 sec
Episode 560, loss:6.7282, fail, steps:91, total reward:4.2000, 0.19122838973999023 sec
Episode 561, loss:12.0482, succeed, steps:53, total reward:7.5000, 0.11351537704467773 sec
Episode 562, loss:8.1184, succeed, steps:34, total reward:4.0000, 0.07206296920776367 sec
Episode 563, loss:2.2579, fail, steps:91, total reward:2.2000, 0.18584418296813965 sec
Episode 564, loss:2.2394, succeed, steps:31, total reward:1.0000, 0.06655001640319824 sec
Episode 565, loss:-0.8019, fail, steps:91, total reward:2.2000, 0.18201518058776855 sec
Episode 566, loss:10.9428, succeed, steps:82, total reward:5.5000, 0.1691732406616211 sec
Episode 567, loss:7.5729, su

Episode 650, loss:4.5681, fail, steps:91, total reward:3.2000, 0.1808769702911377 sec
Episode 651, loss:5.6365, succeed, steps:36, total reward:2.9000, 0.07596302032470703 sec
Episode 652, loss:-2.0978, fail, steps:90, total reward:0.1000, 0.17999053001403809 sec
Episode 653, loss:12.5168, succeed, steps:46, total reward:6.7000, 0.09610676765441895 sec
Episode 654, loss:12.0626, succeed, steps:40, total reward:6.9000, 0.08126330375671387 sec
Episode 655, loss:-44.5494, fail, steps:65, total reward:-4.9000, 0.1311802864074707 sec
Episode 656, loss:8.4082, fail, steps:92, total reward:5.2000, 0.1859283447265625 sec
Episode 657, loss:9.9421, succeed, steps:62, total reward:4.1000, 0.12970614433288574 sec
Episode 658, loss:12.5044, succeed, steps:72, total reward:7.9000, 0.1468663215637207 sec
Episode 659, loss:9.1032, succeed, steps:78, total reward:5.7000, 0.16069912910461426 sec
Episode 660, loss:12.6902, fail, steps:92, total reward:5.2000, 0.1930992603302002 sec
Episode 661, loss:9.47

Episode 742, loss:5.1946, succeed, steps:80, total reward:4.6000, 0.15903973579406738 sec
Episode 743, loss:3.8802, fail, steps:91, total reward:2.2000, 0.18669652938842773 sec
Episode 744, loss:15.6951, succeed, steps:82, total reward:8.6000, 0.17013883590698242 sec
Episode 745, loss:5.7263, fail, steps:91, total reward:4.2000, 0.18893003463745117 sec
Episode 746, loss:4.3139, succeed, steps:7, total reward:1.8000, 0.020335912704467773 sec
Episode 747, loss:0.8090, fail, steps:91, total reward:1.1000, 0.17808794975280762 sec
Episode 748, loss:1.8276, succeed, steps:66, total reward:3.0000, 0.1353163719177246 sec
Episode 749, loss:-7.7916, fail, steps:90, total reward:-2.8000, 0.18209505081176758 sec
Episode 750, loss:11.7258, succeed, steps:77, total reward:6.7000, 0.1593308448791504 sec
Episode 751, loss:14.1927, fail, steps:92, total reward:7.3000, 0.18905210494995117 sec
Episode 752, loss:11.1156, succeed, steps:64, total reward:6.1000, 0.13480591773986816 sec
Episode 753, loss:9.2

Episode 835, loss:2.4306, succeed, steps:70, total reward:2.8000, 0.1471390724182129 sec
Episode 836, loss:11.0915, succeed, steps:70, total reward:5.9000, 0.14357995986938477 sec
Episode 837, loss:8.6605, succeed, steps:46, total reward:3.6000, 0.09514021873474121 sec
Episode 838, loss:4.7637, fail, steps:91, total reward:4.2000, 0.18303704261779785 sec
Episode 839, loss:11.7288, succeed, steps:38, total reward:4.9000, 0.08193707466125488 sec
Episode 840, loss:10.7081, succeed, steps:74, total reward:6.8000, 0.14754152297973633 sec
Episode 841, loss:6.0369, succeed, steps:46, total reward:3.6000, 0.09509873390197754 sec
Episode 842, loss:15.9944, succeed, steps:62, total reward:7.2000, 0.12450838088989258 sec
Episode 843, loss:10.0785, fail, steps:92, total reward:6.3000, 0.18698787689208984 sec
Episode 844, loss:10.5484, succeed, steps:75, total reward:4.7000, 0.15535306930541992 sec
Episode 845, loss:16.8738, succeed, steps:86, total reward:9.5000, 0.17769408226013184 sec
Episode 84

Episode 927, loss:7.3161, fail, steps:92, total reward:5.2000, 0.1809556484222412 sec
Episode 928, loss:17.1015, succeed, steps:92, total reward:9.3000, 0.18730521202087402 sec
Episode 929, loss:3.8605, succeed, steps:16, total reward:4.6000, 0.03882765769958496 sec
Episode 930, loss:8.6484, succeed, steps:60, total reward:5.2000, 0.1173865795135498 sec
Episode 931, loss:5.6624, succeed, steps:50, total reward:4.5000, 0.10056376457214355 sec
Episode 932, loss:10.0605, succeed, steps:73, total reward:5.8000, 0.1457836627960205 sec
Episode 933, loss:8.1817, fail, steps:92, total reward:5.2000, 0.18738436698913574 sec
Episode 934, loss:11.2224, succeed, steps:88, total reward:6.4000, 0.18408203125 sec
Episode 935, loss:4.8892, succeed, steps:28, total reward:4.2000, 0.06325960159301758 sec
Episode 936, loss:10.3899, fail, steps:92, total reward:7.3000, 0.18259692192077637 sec
Episode 937, loss:6.3976, succeed, steps:54, total reward:2.0000, 0.11262655258178711 sec
Episode 938, loss:5.3769

Episode 1019, loss:-2.1848, fail, steps:92, total reward:3.2000, 0.18745875358581543 sec
Episode 1020, loss:2.9184, fail, steps:92, total reward:4.3000, 0.1899101734161377 sec
Episode 1021, loss:9.9922, succeed, steps:76, total reward:5.7000, 0.15968990325927734 sec
Episode 1022, loss:8.7659, succeed, steps:61, total reward:6.2000, 0.1267993450164795 sec
Episode 1023, loss:2.6708, succeed, steps:13, total reward:1.6000, 0.030118703842163086 sec
Episode 1024, loss:6.1723, succeed, steps:47, total reward:4.6000, 0.09170794486999512 sec
Episode 1025, loss:18.6414, succeed, steps:76, total reward:8.8000, 0.15078234672546387 sec
Episode 1026, loss:5.7221, succeed, steps:17, total reward:2.5000, 0.03908586502075195 sec
Episode 1027, loss:9.5513, fail, steps:92, total reward:6.3000, 0.18232035636901855 sec
Episode 1028, loss:9.6334, succeed, steps:46, total reward:6.7000, 0.09514760971069336 sec
Episode 1029, loss:2.9762, succeed, steps:17, total reward:2.3000, 0.03627347946166992 sec
Episode

Episode 1110, loss:9.6757, fail, steps:92, total reward:8.3000, 0.18481230735778809 sec
Episode 1111, loss:11.0918, succeed, steps:68, total reward:7.0000, 0.14089488983154297 sec
Episode 1112, loss:4.2675, fail, steps:92, total reward:5.2000, 0.18924307823181152 sec
Episode 1113, loss:5.3123, succeed, steps:68, total reward:3.9000, 0.14162158966064453 sec
Episode 1114, loss:2.1933, succeed, steps:42, total reward:2.7000, 0.0879521369934082 sec
Episode 1115, loss:-44.1715, fail, steps:20, total reward:-6.5000, 0.04230070114135742 sec
Episode 1116, loss:14.9877, succeed, steps:80, total reward:9.6000, 0.15682768821716309 sec
Episode 1117, loss:4.8150, succeed, steps:74, total reward:3.7000, 0.1509385108947754 sec
Episode 1118, loss:3.5357, succeed, steps:81, total reward:4.4000, 0.16618824005126953 sec
Episode 1119, loss:13.4549, succeed, steps:73, total reward:8.9000, 0.15266990661621094 sec
Episode 1120, loss:4.4500, fail, steps:91, total reward:3.2000, 0.18772673606872559 sec
Episode

Episode 1201, loss:6.0444, succeed, steps:55, total reward:6.4000, 0.11404037475585938 sec
Episode 1202, loss:3.0223, succeed, steps:91, total reward:5.2000, 0.1842024326324463 sec
Episode 1203, loss:3.4032, succeed, steps:7, total reward:1.8000, 0.020119667053222656 sec
Episode 1204, loss:4.7989, succeed, steps:82, total reward:5.5000, 0.15918779373168945 sec
Episode 1205, loss:4.5135, succeed, steps:49, total reward:3.5000, 0.1005096435546875 sec
Episode 1206, loss:0.8128, fail, steps:91, total reward:4.2000, 0.18200039863586426 sec
Episode 1207, loss:5.0312, fail, steps:91, total reward:4.2000, 0.186967134475708 sec
Episode 1208, loss:8.1622, fail, steps:92, total reward:6.3000, 0.1903095245361328 sec
Episode 1209, loss:4.4907, succeed, steps:32, total reward:5.1000, 0.06979656219482422 sec
Episode 1210, loss:5.0274, succeed, steps:45, total reward:5.7000, 0.09019589424133301 sec
Episode 1211, loss:3.7994, succeed, steps:24, total reward:3.3000, 0.04900026321411133 sec
Episode 1212,

Episode 1292, loss:12.6766, succeed, steps:55, total reward:9.5000, 0.11415886878967285 sec
Episode 1293, loss:0.4787, fail, steps:92, total reward:6.3000, 0.1848926544189453 sec
Episode 1294, loss:4.8729, succeed, steps:83, total reward:6.5000, 0.17292499542236328 sec
Episode 1295, loss:6.6863, succeed, steps:37, total reward:7.0000, 0.08125472068786621 sec
Episode 1296, loss:8.3417, succeed, steps:81, total reward:10.7000, 0.16280698776245117 sec
Episode 1297, loss:2.3060, succeed, steps:51, total reward:2.4000, 0.10483813285827637 sec
Episode 1298, loss:7.7751, fail, steps:92, total reward:8.3000, 0.18580031394958496 sec
Episode 1299, loss:2.3574, succeed, steps:39, total reward:2.8000, 0.08245515823364258 sec
Episode 1300, loss:3.9930, succeed, steps:26, total reward:5.3000, 0.05370497703552246 sec
Episode 1301, loss:0.2585, succeed, steps:90, total reward:1.1000, 0.17813777923583984 sec
Episode 1302, loss:6.9002, succeed, steps:79, total reward:8.7000, 0.1612858772277832 sec
Episo

Episode 1383, loss:6.4970, succeed, steps:56, total reward:7.4000, 0.11240553855895996 sec
Episode 1384, loss:5.6233, succeed, steps:46, total reward:3.6000, 0.09233283996582031 sec
Episode 1385, loss:5.3004, fail, steps:92, total reward:6.3000, 0.184234619140625 sec
Episode 1386, loss:6.0978, succeed, steps:53, total reward:4.4000, 0.10989928245544434 sec
Episode 1387, loss:5.3928, succeed, steps:69, total reward:8.0000, 0.1403825283050537 sec
Episode 1388, loss:3.7623, succeed, steps:55, total reward:4.4000, 0.11214613914489746 sec
Episode 1389, loss:-0.2435, fail, steps:91, total reward:1.1000, 0.18643712997436523 sec
Episode 1390, loss:5.4377, succeed, steps:73, total reward:8.9000, 0.1531832218170166 sec
Episode 1391, loss:5.0467, succeed, steps:63, total reward:5.1000, 0.13048028945922852 sec
Episode 1392, loss:5.2395, succeed, steps:54, total reward:5.4000, 0.11048555374145508 sec
Episode 1393, loss:5.7502, succeed, steps:62, total reward:7.2000, 0.12585020065307617 sec
Episode 

Episode 1476, loss:9.4222, succeed, steps:78, total reward:7.7000, 0.15399384498596191 sec
Episode 1477, loss:5.3037, succeed, steps:44, total reward:4.7000, 0.0898735523223877 sec
Episode 1478, loss:7.2537, succeed, steps:60, total reward:5.2000, 0.12041258811950684 sec
Episode 1479, loss:2.5406, succeed, steps:54, total reward:5.4000, 0.1134493350982666 sec
Episode 1480, loss:5.5018, succeed, steps:37, total reward:7.0000, 0.07510924339294434 sec
Episode 1481, loss:4.1930, fail, steps:92, total reward:6.3000, 0.1832435131072998 sec
Episode 1482, loss:2.0041, fail, steps:91, total reward:4.2000, 0.18710780143737793 sec
Episode 1483, loss:4.4317, succeed, steps:51, total reward:5.5000, 0.10834503173828125 sec
Episode 1484, loss:8.9250, fail, steps:92, total reward:8.3000, 0.18593406677246094 sec
Episode 1485, loss:4.2177, succeed, steps:44, total reward:4.7000, 0.0936441421508789 sec
Episode 1486, loss:9.1768, succeed, steps:71, total reward:10.0000, 0.14288878440856934 sec
Episode 148

Episode 1568, loss:1.9883, succeed, steps:43, total reward:3.7000, 0.09134769439697266 sec
Episode 1569, loss:4.4453, succeed, steps:85, total reward:5.4000, 0.1703813076019287 sec
Episode 1570, loss:-1.0292, succeed, steps:89, total reward:4.3000, 0.182997465133667 sec
Episode 1571, loss:5.8373, succeed, steps:70, total reward:5.9000, 0.1452193260192871 sec
Episode 1572, loss:3.1176, succeed, steps:63, total reward:8.2000, 0.12982988357543945 sec
Episode 1573, loss:2.1490, succeed, steps:19, total reward:4.5000, 0.04158163070678711 sec
Episode 1574, loss:9.2771, succeed, steps:74, total reward:9.9000, 0.14514827728271484 sec
Episode 1575, loss:3.5677, succeed, steps:52, total reward:6.5000, 0.10673713684082031 sec
Episode 1576, loss:9.0495, succeed, steps:69, total reward:11.1000, 0.13797998428344727 sec
Episode 1577, loss:7.1183, succeed, steps:80, total reward:9.7000, 0.16289734840393066 sec
Episode 1578, loss:2.6194, fail, steps:91, total reward:3.2000, 0.1863236427307129 sec
Episo

Episode 1661, loss:8.2987, succeed, steps:42, total reward:5.8000, 0.08529520034790039 sec
Episode 1662, loss:7.0789, succeed, steps:69, total reward:8.0000, 0.13711190223693848 sec
Episode 1663, loss:6.6511, succeed, steps:67, total reward:9.1000, 0.1361236572265625 sec
Episode 1664, loss:4.7566, succeed, steps:44, total reward:4.6000, 0.08969569206237793 sec
Episode 1665, loss:5.7097, succeed, steps:48, total reward:6.7000, 0.0962362289428711 sec
Episode 1666, loss:5.0205, succeed, steps:85, total reward:5.4000, 0.1697087287902832 sec
Episode 1667, loss:3.8100, succeed, steps:52, total reward:9.6000, 0.10729527473449707 sec
Episode 1668, loss:4.2865, succeed, steps:33, total reward:6.1000, 0.06755232810974121 sec
Episode 1669, loss:5.8728, succeed, steps:64, total reward:9.2000, 0.12734222412109375 sec
Episode 1670, loss:5.3748, succeed, steps:59, total reward:4.2000, 0.11914253234863281 sec
Episode 1671, loss:1.8910, succeed, steps:28, total reward:4.2000, 0.05781865119934082 sec
Ep

Episode 1753, loss:5.1393, succeed, steps:51, total reward:5.5000, 0.10688400268554688 sec
Episode 1754, loss:5.1616, fail, steps:92, total reward:7.3000, 0.18521833419799805 sec
Episode 1755, loss:2.8098, succeed, steps:25, total reward:1.2000, 0.05625796318054199 sec
Episode 1756, loss:4.7332, succeed, steps:30, total reward:6.2000, 0.06319594383239746 sec
Episode 1757, loss:-3.2839, fail, steps:90, total reward:-0.9000, 0.1828303337097168 sec
Episode 1758, loss:3.1929, fail, steps:92, total reward:8.3000, 0.18888640403747559 sec
Episode 1759, loss:4.3459, succeed, steps:52, total reward:6.5000, 0.10966825485229492 sec
Episode 1760, loss:7.1582, succeed, steps:53, total reward:7.5000, 0.10819005966186523 sec
Episode 1761, loss:7.0118, succeed, steps:51, total reward:5.5000, 0.10398316383361816 sec
Episode 1762, loss:0.7326, succeed, steps:30, total reward:3.1000, 0.06185269355773926 sec
Episode 1763, loss:5.2805, succeed, steps:70, total reward:5.9000, 0.13814687728881836 sec
Episode

Episode 1844, loss:-0.6767, fail, steps:92, total reward:5.3000, 0.17908620834350586 sec
Episode 1845, loss:7.5592, succeed, steps:46, total reward:6.7000, 0.14372539520263672 sec
Episode 1846, loss:4.4600, succeed, steps:37, total reward:7.0000, 0.07512617111206055 sec
Episode 1847, loss:2.1937, succeed, steps:68, total reward:7.0000, 0.13393902778625488 sec
Episode 1848, loss:5.0889, succeed, steps:68, total reward:10.1000, 0.13781380653381348 sec
Episode 1849, loss:6.2622, succeed, steps:54, total reward:8.5000, 0.11009645462036133 sec
Episode 1850, loss:3.0580, succeed, steps:38, total reward:4.9000, 0.07731437683105469 sec
Episode 1851, loss:3.0439, succeed, steps:34, total reward:7.1000, 0.0687856674194336 sec
Episode 1852, loss:4.2295, succeed, steps:74, total reward:13.0000, 0.14578700065612793 sec
Episode 1853, loss:7.6154, fail, steps:92, total reward:6.0000, 0.18631553649902344 sec
Episode 1854, loss:0.1123, succeed, steps:14, total reward:2.6000, 0.03337240219116211 sec
Epi

Episode 1937, loss:0.4148, succeed, steps:43, total reward:3.7000, 0.08594608306884766 sec
Episode 1938, loss:10.0709, succeed, steps:66, total reward:11.2000, 0.13110113143920898 sec
Episode 1939, loss:2.0000, succeed, steps:49, total reward:6.6000, 0.09987711906433105 sec
Episode 1940, loss:3.2421, fail, steps:93, total reward:9.3000, 0.18604803085327148 sec
Episode 1941, loss:1.7672, succeed, steps:29, total reward:5.2000, 0.0629281997680664 sec
Episode 1942, loss:4.8714, succeed, steps:35, total reward:5.0000, 0.06989312171936035 sec
Episode 1943, loss:2.8592, succeed, steps:49, total reward:6.6000, 0.09631943702697754 sec
Episode 1944, loss:1.4123, succeed, steps:32, total reward:5.1000, 0.06459856033325195 sec
Episode 1945, loss:2.7791, succeed, steps:35, total reward:5.0000, 0.07177495956420898 sec
Episode 1946, loss:6.0393, succeed, steps:92, total reward:9.3000, 0.18374967575073242 sec
Episode 1947, loss:2.8548, succeed, steps:38, total reward:4.9000, 0.07933235168457031 sec
E

Episode 2026, loss:4.6381, succeed, steps:78, total reward:7.7000, 0.16246271133422852 sec
Episode 2027, loss:-0.6610, fail, steps:91, total reward:1.5000, 0.18721413612365723 sec
Episode 2028, loss:2.0384, succeed, steps:42, total reward:5.8000, 0.09060502052307129 sec
Episode 2029, loss:5.2523, succeed, steps:39, total reward:5.9000, 0.07980585098266602 sec
Episode 2030, loss:2.6823, succeed, steps:81, total reward:4.5000, 0.16172027587890625 sec
Episode 2031, loss:3.9052, succeed, steps:51, total reward:8.6000, 0.1069481372833252 sec
Episode 2032, loss:0.0350, succeed, steps:16, total reward:1.5000, 0.035523176193237305 sec
Episode 2033, loss:3.0181, succeed, steps:61, total reward:6.2000, 0.12212824821472168 sec
Episode 2034, loss:7.3322, succeed, steps:43, total reward:6.8000, 0.0885624885559082 sec
Episode 2035, loss:1.0292, fail, steps:92, total reward:6.3000, 0.1870124340057373 sec
Episode 2036, loss:-0.9508, fail, steps:92, total reward:5.2000, 0.18820595741271973 sec
Episode 

Episode 2117, loss:3.4351, succeed, steps:90, total reward:7.3000, 0.18270015716552734 sec
Episode 2118, loss:0.2037, succeed, steps:37, total reward:3.9000, 0.07822012901306152 sec
Episode 2119, loss:3.7992, succeed, steps:56, total reward:7.4000, 0.1120147705078125 sec
Episode 2120, loss:4.8383, succeed, steps:46, total reward:6.7000, 0.09322094917297363 sec
Episode 2121, loss:0.2879, succeed, steps:21, total reward:3.4000, 0.04374098777770996 sec
Episode 2122, loss:1.8469, succeed, steps:51, total reward:5.4000, 0.09977197647094727 sec
Episode 2123, loss:0.2351, succeed, steps:46, total reward:3.6000, 0.09196972846984863 sec
Episode 2124, loss:4.2249, succeed, steps:80, total reward:9.7000, 0.159196138381958 sec
Episode 2125, loss:2.7204, succeed, steps:51, total reward:5.5000, 0.10818171501159668 sec
Episode 2126, loss:1.5058, succeed, steps:24, total reward:3.3000, 0.05213642120361328 sec
Episode 2127, loss:4.4594, succeed, steps:55, total reward:9.5000, 0.10953021049499512 sec
Ep

Episode 2209, loss:2.4704, succeed, steps:51, total reward:5.5000, 0.10727477073669434 sec
Episode 2210, loss:1.2464, succeed, steps:25, total reward:4.3000, 0.05288243293762207 sec
Episode 2211, loss:4.9469, succeed, steps:89, total reward:9.4000, 0.17639803886413574 sec
Episode 2212, loss:1.5902, fail, steps:92, total reward:6.3000, 0.18790507316589355 sec
Episode 2213, loss:2.9730, succeed, steps:36, total reward:6.0000, 0.07821893692016602 sec
Episode 2214, loss:0.8820, succeed, steps:25, total reward:4.3000, 0.05148577690124512 sec
Episode 2215, loss:2.6646, succeed, steps:33, total reward:6.1000, 0.06498575210571289 sec
Episode 2216, loss:1.6489, succeed, steps:28, total reward:4.2000, 0.05553603172302246 sec
Episode 2217, loss:2.2182, fail, steps:91, total reward:4.2000, 0.18450069427490234 sec
Episode 2218, loss:3.1101, succeed, steps:67, total reward:6.0000, 0.14018654823303223 sec
Episode 2219, loss:0.3121, fail, steps:90, total reward:-0.9000, 0.23315644264221191 sec
Episode

Episode 2300, loss:1.7649, succeed, steps:27, total reward:6.3000, 0.05818486213684082 sec
Episode 2301, loss:4.1536, succeed, steps:59, total reward:7.3000, 0.11663818359375 sec
Episode 2302, loss:4.7359, succeed, steps:59, total reward:7.2000, 0.11892175674438477 sec
Episode 2303, loss:3.6393, succeed, steps:54, total reward:5.4000, 0.10920047760009766 sec
Episode 2304, loss:0.1868, succeed, steps:34, total reward:4.0000, 0.06914448738098145 sec
Episode 2305, loss:-0.5376, succeed, steps:92, total reward:10.4000, 0.18458032608032227 sec
Episode 2306, loss:3.5364, succeed, steps:48, total reward:8.7000, 0.09973573684692383 sec
Episode 2307, loss:5.0224, succeed, steps:61, total reward:6.2000, 0.12378621101379395 sec
Episode 2308, loss:1.3387, succeed, steps:42, total reward:5.8000, 0.08588576316833496 sec
Episode 2309, loss:3.3045, succeed, steps:67, total reward:9.1000, 0.13419890403747559 sec
Episode 2310, loss:1.0694, succeed, steps:4, total reward:1.9000, 0.012352228164672852 sec


Episode 2391, loss:2.9580, succeed, steps:60, total reward:8.3000, 0.12149548530578613 sec
Episode 2392, loss:1.9833, succeed, steps:38, total reward:4.9000, 0.07772040367126465 sec
Episode 2393, loss:1.6802, succeed, steps:37, total reward:3.8000, 0.07414054870605469 sec
Episode 2394, loss:1.5349, succeed, steps:33, total reward:6.1000, 0.0658559799194336 sec
Episode 2395, loss:2.1802, succeed, steps:37, total reward:3.9000, 0.0754096508026123 sec
Episode 2396, loss:2.8057, succeed, steps:60, total reward:8.3000, 0.12028193473815918 sec
Episode 2397, loss:1.0421, succeed, steps:44, total reward:4.7000, 0.08981943130493164 sec
Episode 2398, loss:7.5468, fail, steps:93, total reward:9.3000, 0.18549370765686035 sec
Episode 2399, loss:3.1394, succeed, steps:53, total reward:7.5000, 0.11077165603637695 sec
Episode 2400, loss:2.8209, succeed, steps:54, total reward:8.5000, 0.10982775688171387 sec
Episode 2401, loss:1.1559, succeed, steps:45, total reward:5.7000, 0.09143209457397461 sec
Epis

Episode 2483, loss:0.0735, succeed, steps:34, total reward:7.1000, 0.0695352554321289 sec
Episode 2484, loss:0.4294, succeed, steps:46, total reward:9.8000, 0.09068059921264648 sec
Episode 2485, loss:0.2662, succeed, steps:32, total reward:5.1000, 0.06408166885375977 sec
Episode 2486, loss:0.6397, succeed, steps:40, total reward:6.9000, 0.07942509651184082 sec
Episode 2487, loss:2.3395, succeed, steps:50, total reward:7.6000, 0.09868359565734863 sec
Episode 2488, loss:2.7711, succeed, steps:83, total reward:9.6000, 0.1664116382598877 sec
Episode 2489, loss:0.1448, succeed, steps:28, total reward:4.2000, 0.05973958969116211 sec
Episode 2490, loss:5.3696, succeed, steps:30, total reward:6.2000, 0.059729576110839844 sec
Episode 2491, loss:2.7043, succeed, steps:69, total reward:8.0000, 0.13661599159240723 sec
Episode 2492, loss:1.2822, succeed, steps:54, total reward:5.4000, 0.10909223556518555 sec
Episode 2493, loss:1.1454, succeed, steps:35, total reward:5.0000, 0.07106637954711914 sec


Episode 2575, loss:2.3676, succeed, steps:35, total reward:5.0000, 0.06920647621154785 sec
Episode 2576, loss:2.6848, succeed, steps:50, total reward:4.5000, 0.09859919548034668 sec
Episode 2577, loss:1.8544, succeed, steps:57, total reward:8.4000, 0.11389422416687012 sec
Episode 2578, loss:0.6320, succeed, steps:86, total reward:6.1000, 0.17316484451293945 sec
Episode 2579, loss:-1.6055, succeed, steps:88, total reward:5.3000, 0.1808607578277588 sec
Episode 2580, loss:3.5058, succeed, steps:77, total reward:9.8000, 0.15932965278625488 sec
Episode 2581, loss:1.8227, succeed, steps:43, total reward:6.8000, 0.0953989028930664 sec
Episode 2582, loss:-0.0577, succeed, steps:55, total reward:3.3000, 0.1132211685180664 sec
Episode 2583, loss:0.6444, succeed, steps:18, total reward:3.5000, 0.03947710990905762 sec
Episode 2584, loss:2.4524, succeed, steps:49, total reward:6.6000, 0.09778165817260742 sec
Episode 2585, loss:1.4696, succeed, steps:57, total reward:5.3000, 0.11618542671203613 sec


Episode 2668, loss:1.0404, succeed, steps:51, total reward:8.6000, 0.10177040100097656 sec
Episode 2669, loss:0.7218, succeed, steps:60, total reward:8.3000, 0.1203000545501709 sec
Episode 2670, loss:3.5973, succeed, steps:23, total reward:5.4000, 0.04832029342651367 sec
Episode 2671, loss:0.3395, succeed, steps:61, total reward:6.2000, 0.12071919441223145 sec
Episode 2672, loss:2.3075, succeed, steps:47, total reward:7.7000, 0.09470248222351074 sec
Episode 2673, loss:3.1511, succeed, steps:74, total reward:3.7000, 0.14890050888061523 sec
Episode 2674, loss:2.6577, succeed, steps:74, total reward:9.9000, 0.19836854934692383 sec
Episode 2675, loss:4.5475, succeed, steps:60, total reward:8.3000, 0.12452244758605957 sec
Episode 2676, loss:1.6676, succeed, steps:30, total reward:6.2000, 0.0628652572631836 sec
Episode 2677, loss:0.9503, succeed, steps:54, total reward:5.4000, 0.10682010650634766 sec
Episode 2678, loss:2.4679, succeed, steps:22, total reward:4.4000, 0.0465695858001709 sec
Ep

Episode 2761, loss:4.4748, succeed, steps:68, total reward:10.1000, 0.1344437599182129 sec
Episode 2762, loss:3.9120, succeed, steps:66, total reward:8.1000, 0.13325166702270508 sec
Episode 2763, loss:0.2975, succeed, steps:42, total reward:5.7000, 0.08684802055358887 sec
Episode 2764, loss:0.4149, succeed, steps:68, total reward:10.1000, 0.1358504295349121 sec
Episode 2765, loss:1.7331, succeed, steps:34, total reward:4.0000, 0.0706624984741211 sec
Episode 2766, loss:5.8919, succeed, steps:75, total reward:10.9000, 0.14942073822021484 sec
Episode 2767, loss:1.6639, succeed, steps:30, total reward:3.1000, 0.06349444389343262 sec
Episode 2768, loss:2.4149, succeed, steps:46, total reward:6.7000, 0.09129190444946289 sec
Episode 2769, loss:3.9070, succeed, steps:50, total reward:10.7000, 0.09955096244812012 sec
Episode 2770, loss:2.7712, fail, steps:92, total reward:5.2000, 0.18965888023376465 sec
Episode 2771, loss:3.1110, succeed, steps:68, total reward:7.0000, 0.14104199409484863 sec
E

Episode 2853, loss:2.5531, succeed, steps:34, total reward:4.0000, 0.06789994239807129 sec
Episode 2854, loss:1.7602, succeed, steps:63, total reward:8.2000, 0.12382864952087402 sec
Episode 2855, loss:0.9241, succeed, steps:49, total reward:6.6000, 0.09921526908874512 sec
Episode 2856, loss:0.6141, succeed, steps:55, total reward:6.4000, 0.11025381088256836 sec
Episode 2857, loss:1.1362, succeed, steps:64, total reward:9.2000, 0.13257336616516113 sec
Episode 2858, loss:0.7963, succeed, steps:58, total reward:6.3000, 0.11873555183410645 sec
Episode 2859, loss:1.1039, succeed, steps:45, total reward:8.8000, 0.09209299087524414 sec
Episode 2860, loss:0.2578, succeed, steps:34, total reward:4.0000, 0.06881022453308105 sec
Episode 2861, loss:3.9906, succeed, steps:61, total reward:9.3000, 0.12113690376281738 sec
Episode 2862, loss:0.2745, fail, steps:91, total reward:3.2000, 0.1834254264831543 sec
Episode 2863, loss:0.7242, succeed, steps:66, total reward:8.1000, 0.13668417930603027 sec
Epi

Episode 2944, loss:2.4111, succeed, steps:48, total reward:5.6000, 0.0946199893951416 sec
Episode 2945, loss:0.4139, succeed, steps:58, total reward:9.4000, 0.11507534980773926 sec
Episode 2946, loss:0.1084, succeed, steps:18, total reward:3.5000, 0.03839683532714844 sec
Episode 2947, loss:1.9098, succeed, steps:82, total reward:5.5000, 0.16187334060668945 sec
Episode 2948, loss:0.6018, succeed, steps:37, total reward:7.0000, 0.07664608955383301 sec
Episode 2949, loss:1.1961, succeed, steps:42, total reward:5.8000, 0.08379459381103516 sec
Episode 2950, loss:1.7151, succeed, steps:43, total reward:6.8000, 0.08643627166748047 sec
Episode 2951, loss:0.2418, succeed, steps:32, total reward:5.1000, 0.06410765647888184 sec
Episode 2952, loss:0.1937, succeed, steps:35, total reward:5.0000, 0.06923127174377441 sec
Episode 2953, loss:2.1395, succeed, steps:81, total reward:7.6000, 0.16085457801818848 sec
Episode 2954, loss:2.0219, succeed, steps:61, total reward:9.1000, 0.125030517578125 sec
Ep

Episode 3034, loss:2.1858, succeed, steps:70, total reward:9.0000, 0.14014196395874023 sec
Episode 3035, loss:-0.0014, succeed, steps:67, total reward:6.0000, 0.1361558437347412 sec
Episode 3036, loss:2.5711, succeed, steps:47, total reward:7.7000, 0.09713339805603027 sec
Episode 3037, loss:2.4627, succeed, steps:74, total reward:9.9000, 0.14835667610168457 sec
Episode 3038, loss:0.5955, succeed, steps:54, total reward:8.5000, 0.11305069923400879 sec
Episode 3039, loss:4.3694, succeed, steps:41, total reward:4.8000, 0.08572030067443848 sec
Episode 3040, loss:0.0916, succeed, steps:31, total reward:4.1000, 0.06260085105895996 sec
Episode 3041, loss:0.2165, succeed, steps:35, total reward:5.0000, 0.06965446472167969 sec
Episode 3042, loss:9.7426, succeed, steps:84, total reward:10.6000, 0.16527366638183594 sec
Episode 3043, loss:1.8701, succeed, steps:92, total reward:9.3000, 0.1883983612060547 sec
Episode 3044, loss:-2.6840, succeed, steps:52, total reward:7.6000, 0.10903263092041016 se

Episode 3126, loss:1.5094, succeed, steps:58, total reward:6.3000, 0.12178182601928711 sec
Episode 3127, loss:0.2706, succeed, steps:38, total reward:4.9000, 0.07797908782958984 sec
Episode 3128, loss:2.0888, succeed, steps:71, total reward:10.0000, 0.1416616439819336 sec
Episode 3129, loss:1.7843, fail, steps:92, total reward:7.3000, 0.1871805191040039 sec
Episode 3130, loss:-0.5342, fail, steps:91, total reward:3.2000, 0.18950319290161133 sec
Episode 3131, loss:1.3252, succeed, steps:90, total reward:10.4000, 0.19000816345214844 sec
Episode 3132, loss:2.2986, succeed, steps:32, total reward:5.1000, 0.07078671455383301 sec
Episode 3133, loss:-1.1034, fail, steps:91, total reward:3.2000, 0.18163418769836426 sec
Episode 3134, loss:-3.0250, fail, steps:91, total reward:1.1000, 0.18787217140197754 sec
Episode 3135, loss:2.2384, succeed, steps:84, total reward:7.5000, 0.17412877082824707 sec
Episode 3136, loss:-0.5205, fail, steps:91, total reward:2.2000, 0.18904900550842285 sec
Episode 31

Episode 3217, loss:1.1054, succeed, steps:30, total reward:3.1000, 0.06467413902282715 sec
Episode 3218, loss:0.3014, succeed, steps:40, total reward:6.9000, 0.08011436462402344 sec
Episode 3219, loss:2.3320, succeed, steps:46, total reward:9.8000, 0.09093642234802246 sec
Episode 3220, loss:0.5138, succeed, steps:29, total reward:5.2000, 0.05858349800109863 sec
Episode 3221, loss:0.5394, succeed, steps:50, total reward:7.6000, 0.0976865291595459 sec
Episode 3222, loss:4.9243, succeed, steps:21, total reward:3.4000, 0.043160200119018555 sec
Episode 3223, loss:0.0100, succeed, steps:19, total reward:4.5000, 0.03806591033935547 sec
Episode 3224, loss:2.8677, succeed, steps:49, total reward:9.7000, 0.09509611129760742 sec
Episode 3225, loss:0.0156, succeed, steps:30, total reward:6.2000, 0.05987238883972168 sec
Episode 3226, loss:0.2339, succeed, steps:24, total reward:3.3000, 0.04776406288146973 sec
Episode 3227, loss:5.6927, fail, steps:93, total reward:9.3000, 0.1824512481689453 sec
Epi

Episode 3309, loss:0.2215, succeed, steps:33, total reward:6.0000, 0.06923055648803711 sec
Episode 3310, loss:0.0326, succeed, steps:52, total reward:9.6000, 0.10345816612243652 sec
Episode 3311, loss:0.1588, succeed, steps:26, total reward:5.2000, 0.053050994873046875 sec
Episode 3312, loss:2.4960, succeed, steps:75, total reward:7.5000, 0.14835596084594727 sec
Episode 3313, loss:1.0492, succeed, steps:69, total reward:11.1000, 0.1399850845336914 sec
Episode 3314, loss:0.5396, succeed, steps:29, total reward:5.2000, 0.06096625328063965 sec
Episode 3315, loss:0.6239, succeed, steps:41, total reward:4.6000, 0.08147072792053223 sec
Episode 3316, loss:1.5158, succeed, steps:60, total reward:11.4000, 0.11860847473144531 sec
Episode 3317, loss:0.7059, succeed, steps:40, total reward:6.9000, 0.08144664764404297 sec
Episode 3318, loss:0.4617, succeed, steps:34, total reward:4.0000, 0.06814861297607422 sec
Episode 3319, loss:0.5071, succeed, steps:37, total reward:7.0000, 0.07287740707397461 s

Episode 3401, loss:1.0614, succeed, steps:76, total reward:8.8000, 0.1512305736541748 sec
Episode 3402, loss:1.7439, succeed, steps:49, total reward:6.6000, 0.10010695457458496 sec
Episode 3403, loss:0.6106, succeed, steps:66, total reward:11.2000, 0.13294529914855957 sec
Episode 3404, loss:3.5698, succeed, steps:40, total reward:6.7000, 0.08264946937561035 sec
Episode 3405, loss:4.1270, succeed, steps:49, total reward:9.7000, 0.10191702842712402 sec
Episode 3406, loss:2.9843, succeed, steps:74, total reward:9.8000, 0.15154194831848145 sec
Episode 3407, loss:0.0531, succeed, steps:41, total reward:7.9000, 0.08608412742614746 sec
Episode 3408, loss:0.6033, succeed, steps:46, total reward:6.7000, 0.09182953834533691 sec
Episode 3409, loss:1.3192, succeed, steps:21, total reward:3.4000, 0.04417133331298828 sec
Episode 3410, loss:0.3977, succeed, steps:41, total reward:7.9000, 0.08024787902832031 sec
Episode 3411, loss:0.2293, succeed, steps:44, total reward:7.8000, 0.0865316390991211 sec


Episode 3492, loss:0.2742, succeed, steps:48, total reward:8.7000, 0.0944221019744873 sec
Episode 3493, loss:0.4376, succeed, steps:60, total reward:11.3000, 0.11894917488098145 sec
Episode 3494, loss:0.0015, succeed, steps:30, total reward:6.2000, 0.06326603889465332 sec
Episode 3495, loss:0.2108, succeed, steps:29, total reward:5.2000, 0.060564279556274414 sec
Episode 3496, loss:0.1755, succeed, steps:39, total reward:5.9000, 0.07774686813354492 sec
Episode 3497, loss:3.2422, succeed, steps:52, total reward:9.6000, 0.10408568382263184 sec
Episode 3498, loss:0.0077, succeed, steps:22, total reward:4.4000, 0.04673600196838379 sec
Episode 3499, loss:0.1546, succeed, steps:41, total reward:7.9000, 0.08138751983642578 sec
Episode 3500, loss:1.5670, succeed, steps:56, total reward:10.5000, 0.11052179336547852 sec
Episode 3501, loss:0.0021, succeed, steps:32, total reward:5.1000, 0.06695795059204102 sec
Episode 3502, loss:1.8863, succeed, steps:61, total reward:9.3000, 0.12244415283203125 s

Episode 3585, loss:2.0754, succeed, steps:69, total reward:8.0000, 0.13620829582214355 sec
Episode 3586, loss:-0.3506, fail, steps:92, total reward:5.2000, 0.18532252311706543 sec
Episode 3587, loss:2.6965, succeed, steps:45, total reward:8.8000, 0.09481215476989746 sec
Episode 3588, loss:0.2004, succeed, steps:28, total reward:4.2000, 0.05794262886047363 sec
Episode 3589, loss:-0.3384, fail, steps:91, total reward:3.2000, 0.17989063262939453 sec
Episode 3590, loss:0.4016, succeed, steps:27, total reward:3.2000, 0.0585024356842041 sec
Episode 3591, loss:0.1513, succeed, steps:61, total reward:6.2000, 0.120361328125 sec
Episode 3592, loss:-0.3016, fail, steps:91, total reward:3.2000, 0.1827843189239502 sec
Episode 3593, loss:0.0698, succeed, steps:42, total reward:5.8000, 0.08838343620300293 sec
Episode 3594, loss:2.0291, succeed, steps:87, total reward:7.4000, 0.17507457733154297 sec
Episode 3595, loss:0.0218, succeed, steps:73, total reward:5.8000, 0.15043926239013672 sec
Episode 3596

Episode 3677, loss:0.0016, succeed, steps:36, total reward:2.8000, 0.07343602180480957 sec
Episode 3678, loss:6.3664, succeed, steps:87, total reward:7.4000, 0.1722729206085205 sec
Episode 3679, loss:0.4637, succeed, steps:49, total reward:6.6000, 0.10179662704467773 sec
Episode 3680, loss:0.3902, succeed, steps:37, total reward:7.0000, 0.07545232772827148 sec
Episode 3681, loss:-0.0375, fail, steps:91, total reward:2.2000, 0.1831064224243164 sec
Episode 3682, loss:1.8845, succeed, steps:77, total reward:9.8000, 0.15955138206481934 sec
Episode 3683, loss:5.6643, succeed, steps:40, total reward:6.9000, 0.08374977111816406 sec
Episode 3684, loss:0.0885, fail, steps:91, total reward:4.2000, 0.1817033290863037 sec
Episode 3685, loss:1.0803, succeed, steps:75, total reward:7.8000, 0.15526533126831055 sec
Episode 3686, loss:0.5576, succeed, steps:44, total reward:7.8000, 0.0913999080657959 sec
Episode 3687, loss:0.0046, succeed, steps:22, total reward:4.4000, 0.04607677459716797 sec
Episode 

Episode 3769, loss:0.2740, succeed, steps:36, total reward:6.0000, 0.07221460342407227 sec
Episode 3770, loss:1.7875, succeed, steps:18, total reward:3.5000, 0.03718829154968262 sec
Episode 3771, loss:0.1520, succeed, steps:43, total reward:6.8000, 0.08314657211303711 sec
Episode 3772, loss:0.4918, succeed, steps:45, total reward:8.8000, 0.08919906616210938 sec
Episode 3773, loss:0.2563, succeed, steps:73, total reward:8.9000, 0.14479732513427734 sec
Episode 3774, loss:1.2457, succeed, steps:40, total reward:6.9000, 0.08240771293640137 sec
Episode 3775, loss:0.5271, succeed, steps:39, total reward:5.9000, 0.07897424697875977 sec
Episode 3776, loss:0.2181, succeed, steps:52, total reward:9.6000, 0.10260581970214844 sec
Episode 3777, loss:0.1876, succeed, steps:37, total reward:6.9000, 0.07527780532836914 sec
Episode 3778, loss:0.0005, succeed, steps:25, total reward:4.2000, 0.050513267517089844 sec
Episode 3779, loss:1.4978, succeed, steps:68, total reward:10.1000, 0.13265609741210938 s

Episode 3860, loss:0.0115, succeed, steps:34, total reward:7.1000, 0.06757068634033203 sec
Episode 3861, loss:2.7087, succeed, steps:63, total reward:11.3000, 0.12437844276428223 sec
Episode 3862, loss:0.1060, succeed, steps:36, total reward:6.0000, 0.07306194305419922 sec
Episode 3863, loss:-0.4162, fail, steps:92, total reward:6.0000, 0.18298053741455078 sec
Episode 3864, loss:0.1987, succeed, steps:33, total reward:6.1000, 0.0702199935913086 sec
Episode 3865, loss:1.8348, succeed, steps:34, total reward:7.1000, 0.06810903549194336 sec
Episode 3866, loss:0.2334, succeed, steps:55, total reward:9.5000, 0.1088719367980957 sec
Episode 3867, loss:0.8228, succeed, steps:44, total reward:7.8000, 0.08847904205322266 sec
Episode 3868, loss:1.1730, succeed, steps:70, total reward:5.9000, 0.13976144790649414 sec
Episode 3869, loss:0.6291, succeed, steps:38, total reward:8.0000, 0.07833123207092285 sec
Episode 3870, loss:1.0721, succeed, steps:33, total reward:6.1000, 0.06653428077697754 sec
Ep

Episode 3952, loss:1.1308, succeed, steps:85, total reward:11.2000, 0.1694941520690918 sec
Episode 3953, loss:0.0010, succeed, steps:75, total reward:4.7000, 0.15420174598693848 sec
Episode 3954, loss:-0.1368, fail, steps:91, total reward:3.2000, 0.18852996826171875 sec
Episode 3955, loss:1.2644, succeed, steps:51, total reward:8.6000, 0.1066286563873291 sec
Episode 3956, loss:-0.0333, succeed, steps:69, total reward:4.8000, 0.1432936191558838 sec
Episode 3957, loss:1.6413, fail, steps:92, total reward:6.3000, 0.1902623176574707 sec
Episode 3958, loss:1.1028, succeed, steps:61, total reward:9.3000, 0.12848997116088867 sec
Episode 3959, loss:0.6044, succeed, steps:44, total reward:7.8000, 0.0925588607788086 sec
Episode 3960, loss:-0.0014, succeed, steps:86, total reward:3.3000, 0.1742253303527832 sec
Episode 3961, loss:0.9283, succeed, steps:61, total reward:9.3000, 0.12766265869140625 sec
Episode 3962, loss:0.0008, succeed, steps:25, total reward:4.2000, 0.05293440818786621 sec
Episode

Episode 4042, loss:0.3681, succeed, steps:85, total reward:11.6000, 0.1768476963043213 sec
Episode 4043, loss:0.0372, succeed, steps:33, total reward:6.1000, 0.07088184356689453 sec
Episode 4044, loss:0.0943, succeed, steps:38, total reward:8.0000, 0.07725119590759277 sec
Episode 4045, loss:0.5152, succeed, steps:50, total reward:7.6000, 0.09942913055419922 sec
Episode 4046, loss:3.8997, succeed, steps:64, total reward:6.1000, 0.12707924842834473 sec
Episode 4047, loss:0.0067, succeed, steps:15, total reward:3.6000, 0.033327579498291016 sec
Episode 4048, loss:0.0004, succeed, steps:26, total reward:5.3000, 0.051354169845581055 sec
Episode 4049, loss:0.0296, succeed, steps:38, total reward:8.0000, 0.07392644882202148 sec
Episode 4050, loss:0.0490, succeed, steps:42, total reward:5.8000, 0.08231210708618164 sec
Episode 4051, loss:0.1942, succeed, steps:28, total reward:4.2000, 0.05644965171813965 sec
Episode 4052, loss:2.4492, succeed, steps:36, total reward:6.0000, 0.07084441184997559 s