In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.optim.lr_scheduler as lr_scheduler
import math
import random

# Not referenced by the code visible in this section; presumably the size of
# the index space (the vocabulary has 16 index tokens '0'..'15') -- confirm.
NUM_POSITIONS = 16

# Inclusive bounds for the random list length drawn in SortingEnv.reset().
MIN_LIST_LEN = 4
MAX_LIST_LEN = 4
# Cap on environment steps per episode; train() also stops generating once the
# token sequence reaches this length.
MAX_STEPS = 40

# Reward for the swap that leaves the list sorted; also used as the terminal
# bonus in the long-horizon return computed by train().
SUCCESS_REWARD = 0.5
# Reward for a comparison (doubled when the elements are equal) and for a swap
# that neither fixes nor breaks the order of the swapped pair.
STEP_REWARD = -0.3
# Weight of the comparison-entropy shaping term; its only use in train() is
# currently disabled.
COMPARISON_ENTROPY_MULTIPLIER = -0.00
# Reward for a swap that puts the pair in order (negated when it breaks order).
SWAP_REWARD = 1.0
# Reward for a malformed or out-of-sequence action; such an action ends the episode.
INVALID_ACTION_REWARD = -10.0
# Discount factor applied to the terminal success bonus.
LONGTERM_GAMMA = 0.99
# Discount factor applied to the per-step rewards.
SHORTTERM_GAMMA = 0.7

# Exploration schedule: the probability of flattening the policy logits decays
# exponentially from EPS_START to EPS_END with time constant EPS_DECAY episodes.
EPS_START = 0.5
EPS_END = 0.05
EPS_DECAY = 1000
# Multiplicative learning-rate decay, applied each time a checkpoint is saved.
LR_SCHEDULER_GAMMA = 0.93
# Total number of training episodes.
NUM_EPISODES = 100000
# A checkpoint is written every EPISODES_SAVE episodes.
EPISODES_SAVE = 1000
# Directory that receives the checkpoints.
OUTPUT_DIR = '/home/mcwave/code/autocode/datasets/rl_sort_transformer_curriculum/list4_transformer4_192_gamma07_step40_v1'

# Token vocabulary. Ids are assigned in listing order:
#   0-1    action tokens ('Comparison', 'Swap')
#   2-4    comparison outcomes ('less', 'equal', 'more')
#   5-20   index tokens '0'..'15'
#   21-36  list-length tokens 'len1'..'len16'
#   37-52  start-position tokens 'start0'..'start15'
vocab = {
    token: token_id
    for token_id, token in enumerate(
        ['Comparison', 'Swap', 'less', 'equal', 'more']
        + [str(position) for position in range(16)]
        + ['len{}'.format(length) for length in range(1, 17)]
        + ['start{}'.format(position) for position in range(16)]
    )
}
# Reverse lookup: token id -> token string.
inv_vocab = dict(zip(vocab.values(), vocab.keys()))

def compute_entropy(N, alpha=1):
    """Return the integers 0..2**N - 1 and their code lengths in bits.

    Each integer k is given probability proportional to exp(-alpha * k); the
    second return value is -log2 of that probability.

    Args:
        N: Exponent; the support has 2**N integers.
        alpha: Decay rate of the exponential weighting.

    Returns:
        Tuple (values, bits) of two arrays of length 2**N.
    """
    support = np.arange(2**N)
    weights = np.exp(-alpha * support)
    probabilities = weights / weights.sum()
    return support, -np.log2(probabilities)

# Lookup table of -log2(p) values for the integers 0..15 (N=4 yields 2**4 entries).
_, int_entropy = compute_entropy(4)

def get_entropy_of_integer(x):
    """Return the `int_entropy` table entry for |x|, saturating at index 15."""
    magnitude = abs(x)
    if magnitude > 15:
        magnitude = 15
    return int_entropy[magnitude]

def compute_min_delta_entropy(comparisons):
    """Return the cheapest encoding cost of the most recent comparison.

    Only the last (x, y) pair in `comparisons` is scored. Several candidate
    encodings of that pair are built as integer triples, each triple is priced
    by summing `get_entropy_of_integer` over its members, and the lowest price
    is returned. Ties go to the candidate listed first.

    Args:
        comparisons: Non-empty sequence of (x, y) integer pairs, oldest first.

    Returns:
        The minimal cost; tripled when `comparisons` holds a single pair.
    """
    def cost(candidate):
        return sum([get_entropy_of_integer(value) for value in candidate])

    last = len(comparisons) - 1
    x_cur, y_cur = comparisons[last]

    # Candidate 1: the pair on its own (y taken absolutely or relative to x).
    candidates = [(x_cur, min(y_cur, y_cur - x_cur), 0)]

    if last != 0:
        # Candidate 2: first difference against the previous pair.
        x_prev, y_prev = comparisons[last - 1]
        candidates.append((x_cur - x_prev, y_cur - y_prev, 0))

        # Candidate 3: second difference, which needs two earlier pairs.
        if last > 1:
            x_prev2, y_prev2 = comparisons[last - 2]
            candidates.append((
                (x_cur - x_prev) - (x_prev - x_prev2),
                (y_cur - y_prev) - (y_prev - y_prev2),
                0,
            ))

        # Remaining candidates: difference against every earlier pair, with
        # the third member paying for how far that pair is from either end.
        for j, (x_ref, y_ref) in enumerate(comparisons[:last]):
            candidates.append((x_cur - x_ref, y_cur - y_ref, min(j, last - j)))

    best = min(candidates, key=cost)
    total = cost(best)
    return 3 * total if len(comparisons) == 1 else total

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define the environment
class SortingEnv:
    """Environment in which an agent sorts a random list with compare/swap actions.

    The list occupies `length` consecutive positions starting at `start_pos`
    inside an index space of NUM_POSITIONS slots. Index tokens passed to
    `step` are absolute positions in that space ('0'..'15'), so they are
    shifted by `start_pos` before they address the list.

    A 'Swap' action always exchanges the two elements named by the most recent
    'Comparison'; swapping without a pending comparison is invalid.
    """

    def __init__(self):
        self.max_steps = MAX_STEPS

    def reset(self):
        """Start a new episode with a fresh, unsorted random list.

        Returns:
            Tuple (length_token_id, start_token_id, list_copy): the vocabulary
            ids of the 'len<N>' and 'start<P>' tokens describing the episode,
            and a copy of the initial list.
        """
        self.length = random.randint(MIN_LIST_LEN, MAX_LIST_LEN)
        # Keep the whole window inside the index space so that every element
        # has a matching index token.
        self.start_pos = random.randint(0, NUM_POSITIONS - self.length)
        self.list = [random.randint(1, 100) for _ in range(self.length)]
        # An already sorted list would leave the agent nothing to do.
        while self.list == sorted(self.list):
            self.list = [random.randint(1, 100) for _ in range(self.length)]
        self.indices = None
        self.current_step = 0
        self.done = False
        length_token = vocab['len{}'.format(self.length)]
        start_token = vocab['start{}'.format(self.start_pos)]
        return length_token, start_token, self.list.copy()

    def get_list(self):
        """Return the live list (not a copy)."""
        return self.list

    def get_length(self):
        """Return the number of elements in the current list."""
        return self.length

    def get_start_pos(self):
        """Return the absolute position of the first list element."""
        return self.start_pos

    def step(self, action_tokens):
        """Apply one action and return its outcome.

        Args:
            action_tokens: `[Comparison, index1_token, index2_token]` or
                `[Swap]`, all given as vocabulary ids.

        Returns:
            Tuple (response_token, reward, done, list_copy). `response_token`
            is the id of 'less'/'equal'/'more' after a valid comparison and
            None otherwise.
        """
        action = action_tokens[0]
        response_token = None

        if action == vocab['Comparison']:
            if len(action_tokens) != 3:
                print("Error! Comparison without 2 indices!")
                reward = INVALID_ACTION_REWARD
                self.done = True
                return response_token, reward, self.done, self.list.copy()
            # Convert absolute index tokens into offsets within the list.
            index1 = action_tokens[1] - vocab['0'] - self.start_pos
            index2 = action_tokens[2] - vocab['0'] - self.start_pos
            if not (0 <= index1 < self.length and 0 <= index2 < self.length):
                print(f"Error! Comparison with invalid indices {index1} {index2}")
                reward = INVALID_ACTION_REWARD
                self.done = True
                return response_token, reward, self.done, self.list.copy()
            # Remember the pair; a following Swap acts on it.
            self.indices = (index1, index2)
            if self.list[index1] < self.list[index2]:
                response_token = vocab['less']
                reward = STEP_REWARD
            elif self.list[index1] == self.list[index2]:
                response_token = vocab['equal']
                # Comparing equal elements is penalised twice as much.
                reward = STEP_REWARD * 2
            else:
                response_token = vocab['more']
                reward = STEP_REWARD
        elif action == vocab['Swap']:
            if self.indices is None:
                reward = INVALID_ACTION_REWARD
                self.done = True
                return response_token, reward, self.done, self.list.copy()
            index1, index2 = self.indices
            # Values at the lower and higher position before the swap.
            low_pos, high_pos = min(index1, index2), max(index1, index2)
            low_before, high_before = self.list[low_pos], self.list[high_pos]
            self.list[index1], self.list[index2] = self.list[index2], self.list[index1]
            if self.list == sorted(self.list):
                reward = SUCCESS_REWARD
                self.done = True
            elif low_pos != high_pos and low_before > high_before:
                # The pair was out of order and the swap repaired it.
                reward = SWAP_REWARD
            elif low_pos != high_pos and low_before < high_before:
                # The pair was in order and the swap broke it.
                reward = -SWAP_REWARD
            else:
                # Equal values or identical positions: the swap changed nothing.
                reward = STEP_REWARD
            # Each comparison licenses at most one swap.
            self.indices = None
        else:
            reward = INVALID_ACTION_REWARD
            self.done = True

        self.current_step += 1
        if self.current_step >= self.max_steps:
            self.done = True
        return response_token, reward, self.done, self.list.copy()


Using device: cuda


In [2]:
# Positional Encoding for Transformer
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a sequence of embeddings.

    Args:
        d_model: Embedding width. Odd widths are supported; the last cosine
            column is simply dropped.
        max_len: Longest sequence length that can be encoded.
        dropout: Dropout probability applied to the sum of embedding and
            encoding.
    """

    def __init__(self, d_model, max_len=512, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)  # (max_len, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)  # (max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)  # Even feature indices
        # With an odd d_model there is one fewer odd index than even index,
        # so trim the angle matrix to the number of odd columns.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])  # Odd feature indices
        # Stored as (max_len, 1, d_model) so it broadcasts over the batch axis.
        self.register_buffer('pe', pe.unsqueeze(1))

    def forward(self, x):
        """Return `x` plus the encoding of its leading positions, after dropout.

        Args:
            x: Tensor of shape (seq_len, batch, d_model) with seq_len <= max_len.
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

# Transformer Model
class TransformerModel(nn.Module):
    """Token-level transformer encoder with a linear projection back to the vocabulary.

    Args:
        vocab_size: Number of distinct tokens (embedding rows and output logits).
        d_model: Embedding / hidden width.
        nhead: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
    """

    def __init__(self, vocab_size, d_model=192, nhead=8, num_layers=4):
        super().__init__()
        self.model_type = 'Transformer'
        self.d_model = d_model

        # Sub-modules are created in a fixed order so that parameter
        # initialisation consumes the random stream identically on every run.
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        layer = nn.TransformerEncoderLayer(d_model, nhead)
        self.transformer_encoder = nn.TransformerEncoder(layer, num_layers)
        self.decoder = nn.Linear(d_model, vocab_size)

        self.init_weights()

    def init_weights(self):
        """Re-initialise embedding and output weights uniformly in [-0.1, 0.1]; zero the output bias."""
        bound = 0.1
        for weight in (self.embedding.weight, self.decoder.weight):
            nn.init.uniform_(weight, -bound, bound)
        nn.init.zeros_(self.decoder.bias)

    def forward(self, src):
        """Map token ids to per-position vocabulary logits.

        Args:
            src: Integer tensor of token ids; `train` passes shape (seq_len, batch).

        Returns:
            Tensor of logits with a trailing dimension of size vocab_size.
        """
        scaled = self.embedding(src) * math.sqrt(self.d_model)
        encoded = self.transformer_encoder(self.pos_encoder(scaled))
        return self.decoder(encoded)

def decode(input_tokens, inv_vocab):
    """Render a sequence of token ids as a space-separated string of token names."""
    names = (inv_vocab[token_id] for token_id in input_tokens)
    return ' '.join(names)


def save_checkpoint(model, optimizer, episode, folder, filename):
    """
    Save the model and optimizer state to `folder/filename`.

    Args:
        model (nn.Module): The model to save.
        optimizer (torch.optim.Optimizer): The optimizer whose state to save.
        episode (int): The current episode number.
        folder (str): Directory that receives the checkpoint; created if missing.
        filename (str): File name of the checkpoint inside `folder`.
    """
    filepath = os.path.join(folder, filename)
    # Ensure the directory exists. The directory part is empty when the
    # checkpoint goes to the current directory, and os.makedirs('') raises.
    directory = os.path.dirname(filepath)
    if directory:
        os.makedirs(directory, exist_ok=True)
    # Save the checkpoint
    torch.save({
        'episode': episode,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, filepath)
    print(f"Checkpoint saved at episode {episode} to {filepath}")

def load_checkpoint(filepath, model, optimizer):
    """
    Restore model and optimizer state from a checkpoint file, if it exists.

    Args:
        filepath (str): Location of the checkpoint file.
        model (nn.Module): Model that receives the stored parameters.
        optimizer (torch.optim.Optimizer): Optimizer that receives the stored state.

    Returns:
        int: The episode number stored in the checkpoint, or 0 when no file
        exists at `filepath` (model and optimizer are then left untouched).
    """
    # Guard clause: nothing to restore.
    if not os.path.isfile(filepath):
        print(f"No checkpoint found at {filepath}, starting from scratch.")
        return 0

    # Tensors are mapped onto the module-level `device` while loading.
    stored = torch.load(filepath, map_location=device)
    model.load_state_dict(stored['model_state_dict'])
    optimizer.load_state_dict(stored['optimizer_state_dict'])
    episode = stored['episode']
    print(f"Checkpoint loaded from {filepath}, resuming from episode {episode}")
    return episode

In [3]:
def compute_bellman_returns(raw_rewards, gamma):
    """Return the discounted return-to-go for every step of an episode.

    Entry t of the result is raw_rewards[t] + gamma * (entry t + 1), with the
    return after the final step taken as 0.

    Args:
        raw_rewards: Sequence of per-step rewards in chronological order.
        gamma: Discount factor.

    Returns:
        List of returns, same length and order as `raw_rewards`.
    """
    returns_backwards = []
    running = 0
    # Accumulate from the last step towards the first. Appending and then
    # reversing once keeps this linear, unlike inserting at the front.
    for reward in reversed(raw_rewards):
        running = reward + gamma * running
        returns_backwards.append(running)
    returns_backwards.reverse()
    return returns_backwards

# Training Loop
def _valid_token_ids(env, state, last_token):
    """Return the vocabulary ids the policy may emit in the given parser state."""
    window_start = env.get_start_pos()
    index_tokens = [vocab[str(pos)] for pos in range(window_start, window_start + env.get_length())]
    if state == 'expect_index1':
        # The first index must leave room for a strictly larger second index.
        return index_tokens[:-1]
    if state == 'expect_index2':
        # Only positions after the first index, which is the last emitted token.
        return [token for token in index_tokens if token > last_token]
    # 'expect_action', and any unexpected state, fall back to the action tokens.
    return [vocab['Comparison'], vocab['Swap']]


def train(verbose=False):
    """Train the transformer policy on the sorting task with REINFORCE.

    Each episode the model generates a token sequence one token at a time.
    Invalid tokens are masked out, complete actions are sent to a SortingEnv,
    and the environment's response token is appended to the sequence. The
    policy-gradient loss weights every sampled token's log-probability by the
    sum of two discounted returns: the per-step rewards (SHORTTERM_GAMMA) and
    a terminal success bonus (LONGTERM_GAMMA).

    Side effects: prints progress every episode and, every EPISODES_SAVE
    episodes, writes a checkpoint to OUTPUT_DIR and decays the learning rate.

    Args:
        verbose: When True, print the token sequence, the list and each reward
            at every step.
    """
    vocab_size = len(vocab)
    model = TransformerModel(vocab_size).to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-4)  # Reduced learning rate
    scheduler = lr_scheduler.ExponentialLR(optimizer, gamma=LR_SCHEDULER_GAMMA)

    # Optionally, load a checkpoint.
    # NOTE(review): the episode number returned by load_checkpoint is
    # discarded, so episode numbering and the exploration schedule restart at
    # 0 -- this looks intended for curriculum fine-tuning; confirm.
    checkpoint_path = os.path.join("/home/mcwave/code/autocode/datasets/rl_sort_transformer_curriculum/list2_transformer4_192_gamma07_step10_v1/ckpt_10000_1.0000_4.00.pth")
    load_checkpoint(checkpoint_path, model, optimizer)

    # Running statistics over the episodes trained since the last checkpoint.
    episode_cnt = 0
    total_reward = 0.0
    num_successes = 0
    total_steps = 0

    mask_value = -1e9  # Use a large negative value instead of -inf

    for episode in range(NUM_EPISODES):
        t1 = time.time()
        model.train()  # Set model to training mode
        env = SortingEnv()
        initial_token_id, start_pos, current_list = env.reset()
        input_tokens = [initial_token_id, start_pos]
        log_probs = []
        rewards = []
        comparisons = []

        state = 'expect_action'
        done = False
        success = False
        # The exploration probability depends only on the episode number.
        eps_threshold = EPS_END + (EPS_START - EPS_END) * np.exp(-1.0 * episode / EPS_DECAY)

        while not done and len(input_tokens) < env.max_steps:
            if verbose:
                print(decode(input_tokens, inv_vocab))
                print(env.get_list())
            # Prepare input tensor
            input_seq = torch.tensor(input_tokens, dtype=torch.long, device=device).unsqueeze(1)  # (seq_len, batch_size)
            with torch.autocast(device_type=device.type, dtype=torch.bfloat16):
                output = model(input_seq)  # (seq_len, batch_size, vocab_size)
                # Get logits for the last token
                logits = output[-1, 0, :]  # (vocab_size)

                if torch.isnan(logits).any():
                    print(f"Episode {episode}, NaNs in logits before masking.")
                    break

                valid_token_ids = _valid_token_ids(env, state, input_tokens[-1])

                # Ensure valid_token_ids are within the vocab range
                if any(idx >= vocab_size or idx < 0 for idx in valid_token_ids):
                    print(f"Episode {episode}, invalid indices in valid_token_ids: {valid_token_ids}")
                    break

                # Mask invalid tokens
                mask = torch.full_like(logits, mask_value)
                mask[valid_token_ids] = 0
                masked_logits = logits + mask

                # Exploration: with probability eps_threshold, flatten the
                # distribution over the valid tokens before sampling.
                if random.random() < eps_threshold:
                    masked_logits = masked_logits / 4

                if torch.isnan(masked_logits).any():
                    print(f"Episode {episode}, NaNs in masked_logits after masking.")
                    break

                probs = F.softmax(masked_logits, dim=0)

                if torch.isnan(probs).any():
                    print(f"Episode {episode}, NaNs in probs after softmax.")
                    break

                try:
                    m = torch.distributions.Categorical(probs)
                    action_token = m.sample()
                    log_prob = m.log_prob(action_token)
                except ValueError as e:
                    print(f"Episode {episode}, error in sampling action: {e}")
                    break

            log_probs.append(log_prob)
            action = action_token.item()
            input_tokens.append(action)

            # Tokens that do not complete an action earn no reward of their own.
            reward = 0.0
            if state == 'expect_action':
                if action == vocab['Comparison']:
                    state = 'expect_index1'
                elif action == vocab['Swap']:
                    if env.indices is None:
                        # Swap without a preceding comparison: end the episode
                        # without consulting the environment.
                        rewards.append(INVALID_ACTION_REWARD)
                        done = True
                        continue
                    response_token, reward, done, current_list = env.step([vocab['Swap']])
                    # Judge success by the list itself rather than by comparing
                    # the reward with a constant that other outcomes could share.
                    if done and current_list == sorted(current_list):
                        success = True
                        if episode % 100 == 0:
                            print(decode(input_tokens, inv_vocab))
                    if verbose:
                        print("Reward:", reward)
                else:
                    reward = INVALID_ACTION_REWARD
                    done = True
            elif state == 'expect_index1':
                index1_token = action
                state = 'expect_index2'
            elif state == 'expect_index2':
                index2_token = action
                comparisons.append((int(inv_vocab[index1_token]), int(inv_vocab[index2_token])))
                response_token, reward, done, current_list = env.step(
                    [vocab['Comparison'], index1_token, index2_token])
                # A comparison cannot sort the list, so there is no success
                # check here.
                # NOTE: entropy-based shaping of comparison rewards via
                # compute_min_delta_entropy(comparisons) is currently disabled.
                if verbose:
                    print("Reward:", reward)
                if response_token is not None:
                    input_tokens.append(response_token)
                state = 'expect_action'
            else:
                reward = INVALID_ACTION_REWARD
                done = True

            rewards.append(reward)

        # Terminal bonus, discounted separately with the long-horizon gamma.
        success_rewards = [0.0] * len(rewards)
        if success:
            success_rewards[-1] = SUCCESS_REWARD

        # Save checkpoint
        if episode > 0 and episode % EPISODES_SAVE == 0:
            # Every episode since the last checkpoint may have been skipped,
            # so guard the averages against an empty window.
            if episode_cnt > 0:
                success_rate = num_successes / episode_cnt
                avg_steps = total_steps / episode_cnt
            else:
                success_rate = 0.0
                avg_steps = 0.0
            episode_cnt = 0
            total_reward = 0.0
            num_successes = 0
            total_steps = 0
            save_checkpoint(model, optimizer, episode, OUTPUT_DIR, f"ckpt_{episode}_{success_rate:.4f}_{avg_steps:.2f}.pth")
            # Reduce the lr
            scheduler.step()
            current_lr = scheduler.get_last_lr()[0]
            print(f"Learning rate = {current_lr:.6f}")

        assert len(log_probs) == len(rewards), "log_probs and returns have different sizes!"

        if len(log_probs) == 0:
            continue  # Skip if no actions were taken

        # Return-to-go for every sampled token.
        returns1 = compute_bellman_returns(rewards, SHORTTERM_GAMMA)
        returns2 = compute_bellman_returns(success_rewards, LONGTERM_GAMMA)
        returns = torch.tensor(np.array(returns1) + np.array(returns2)).to(device)

        if torch.isnan(returns).any():
            print(f"Episode {episode}, NaNs in returns.")
            continue

        # REINFORCE loss: negative return-weighted log-likelihood.
        loss = -(torch.stack(log_probs) * returns).sum()

        if torch.isnan(loss):
            print(f"Episode {episode}, NaN in loss.")
            continue

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
        optimizer.step()

        # Update all statistics together so they describe the same episodes.
        episode_cnt += 1
        num_successes += int(success)
        total_reward += sum(rewards)
        total_steps += len(rewards)
        t2 = time.time()
        print(f"Episode {episode}, loss:{loss.item():.4f}, {'succeed' if success else 'fail'}, steps:{len(rewards)}, total reward:{sum(rewards):.4f}, {t2-t1} sec")

# Start training when this file is run as a script.
if __name__ == "__main__":
    train(verbose=False)




Checkpoint loaded from /home/mcwave/code/autocode/datasets/rl_sort_transformer_curriculum/list2_transformer4_192_gamma07_step10_v1/ckpt_10000_1.0000_4.00.pth, resuming from episode 10000
Episode 0, loss:-9.2804, fail, steps:5, total reward:-11.3000, 0.38872671127319336 sec
Episode 1, loss:-4.3093, fail, steps:5, total reward:-9.3000, 0.01579427719116211 sec
Episode 2, loss:-9.4230, fail, steps:5, total reward:-11.3000, 0.013831377029418945 sec
Episode 3, loss:-9.1705, fail, steps:5, total reward:-11.3000, 0.013719558715820312 sec
Episode 4, loss:-2.7957, fail, steps:5, total reward:-9.3000, 0.013746976852416992 sec
Episode 5, loss:-8.9026, fail, steps:5, total reward:-9.3000, 0.013704299926757812 sec
Episode 6, loss:-4.4113, fail, steps:5, total reward:-11.3000, 0.013674020767211914 sec
Episode 7, loss:-5.9139, fail, steps:5, total reward:-9.3000, 0.01370692253112793 sec
Episode 8, loss:-9.5696, fail, steps:5, total reward:-9.3000, 0.01372671127319336 sec
Episode 9, loss:-7.9788, fail,

Episode 92, loss:-0.2066, fail, steps:30, total reward:1.6000, 0.07510733604431152 sec
Episode 93, loss:-1.8699, fail, steps:29, total reward:-0.7000, 0.0731196403503418 sec
Episode 94, loss:1.5858, succeed, steps:16, total reward:-1.0000, 0.0972754955291748 sec
Episode 95, loss:-3.1116, fail, steps:29, total reward:-1.7000, 0.0740506649017334 sec
Episode 96, loss:-1.8908, fail, steps:29, total reward:-0.7000, 0.07322478294372559 sec
Episode 97, loss:3.7184, succeed, steps:27, total reward:0.1000, 0.06854963302612305 sec
Episode 98, loss:-4.2276, fail, steps:29, total reward:-2.7000, 0.0734870433807373 sec
Episode 99, loss:-2.7220, fail, steps:29, total reward:-0.7000, 0.07335662841796875 sec
len4 start12 Comparison 12 13 less Comparison 14 15 more Swap
Episode 100, loss:0.9348, succeed, steps:7, total reward:-0.1000, 0.02003026008605957 sec
Episode 101, loss:5.1438, succeed, steps:18, total reward:1.0000, 0.04533100128173828 sec
Episode 102, loss:2.0073, succeed, steps:17, total rewar

Episode 186, loss:-1.4531, fail, steps:29, total reward:-0.7000, 0.07362842559814453 sec
Episode 187, loss:2.1969, fail, steps:30, total reward:1.6000, 0.07557535171508789 sec
Episode 188, loss:1.2408, succeed, steps:7, total reward:-0.1000, 0.02017688751220703 sec
Episode 189, loss:-3.9033, fail, steps:29, total reward:-1.7000, 0.07183122634887695 sec
Episode 190, loss:2.2697, succeed, steps:14, total reward:0.3000, 0.03707265853881836 sec
Episode 191, loss:-3.5910, fail, steps:29, total reward:-1.7000, 0.07191252708435059 sec
Episode 192, loss:3.9258, succeed, steps:25, total reward:1.4000, 0.06312346458435059 sec
Episode 193, loss:-3.3953, fail, steps:29, total reward:-0.7000, 0.07341837882995605 sec
Episode 194, loss:0.2298, succeed, steps:4, total reward:0.2000, 0.012944459915161133 sec
Episode 195, loss:-0.8398, fail, steps:30, total reward:0.3000, 0.07360219955444336 sec
Episode 196, loss:-0.1020, fail, steps:30, total reward:0.3000, 0.07484555244445801 sec
Episode 197, loss:0.9

Episode 280, loss:1.8143, succeed, steps:16, total reward:-1.0000, 0.04113173484802246 sec
Episode 281, loss:-2.3233, fail, steps:30, total reward:-1.7000, 0.07416558265686035 sec
Episode 282, loss:-0.4072, fail, steps:30, total reward:0.3000, 0.07516789436340332 sec
Episode 283, loss:-1.7628, fail, steps:29, total reward:-1.3000, 0.07352209091186523 sec
Episode 284, loss:-7.3939, fail, steps:30, total reward:-1.7000, 0.07564473152160645 sec
Episode 285, loss:-3.0759, fail, steps:29, total reward:-1.7000, 0.07319068908691406 sec
Episode 286, loss:1.9485, succeed, steps:23, total reward:-0.6000, 0.058994293212890625 sec
Episode 287, loss:-2.1270, fail, steps:29, total reward:-0.7000, 0.07394599914550781 sec
Episode 288, loss:0.9905, fail, steps:30, total reward:2.6000, 0.07893872261047363 sec
Episode 289, loss:-2.1770, fail, steps:29, total reward:-0.7000, 0.07396245002746582 sec
Episode 290, loss:-3.0745, fail, steps:29, total reward:-0.7000, 0.07377910614013672 sec
Episode 291, loss:0

Episode 374, loss:-0.7457, fail, steps:30, total reward:0.3000, 0.07776570320129395 sec
Episode 375, loss:-4.0257, fail, steps:29, total reward:-1.7000, 0.07508134841918945 sec
Episode 376, loss:-0.7929, fail, steps:30, total reward:0.3000, 0.07738709449768066 sec
Episode 377, loss:0.9418, succeed, steps:7, total reward:-0.1000, 0.02057337760925293 sec
Episode 378, loss:-2.1812, fail, steps:29, total reward:-0.7000, 0.07352662086486816 sec
Episode 379, loss:1.8831, succeed, steps:11, total reward:0.6000, 0.03010869026184082 sec
Episode 380, loss:0.1342, fail, steps:30, total reward:1.6000, 0.07465696334838867 sec
Episode 381, loss:3.2881, succeed, steps:29, total reward:-1.2000, 0.07274794578552246 sec
Episode 382, loss:0.3111, fail, steps:30, total reward:1.6000, 0.07620501518249512 sec
Episode 383, loss:-2.0053, fail, steps:29, total reward:-0.7000, 0.07328414916992188 sec
Episode 384, loss:1.3262, succeed, steps:13, total reward:-0.7000, 0.03463625907897949 sec
Episode 385, loss:2.0

Episode 468, loss:1.1614, fail, steps:30, total reward:1.6000, 0.07706904411315918 sec
Episode 469, loss:-0.8164, fail, steps:30, total reward:0.3000, 0.07538366317749023 sec
Episode 470, loss:-1.3172, fail, steps:30, total reward:0.3000, 0.07551717758178711 sec
Episode 471, loss:-0.7945, fail, steps:30, total reward:0.3000, 0.0760962963104248 sec
Episode 472, loss:2.6481, succeed, steps:11, total reward:0.6000, 0.02998971939086914 sec
Episode 473, loss:5.5572, succeed, steps:23, total reward:2.7000, 0.05740785598754883 sec
Episode 474, loss:3.0053, succeed, steps:18, total reward:1.0000, 0.04576230049133301 sec
Episode 475, loss:3.6467, succeed, steps:23, total reward:-0.6000, 0.05762887001037598 sec
Episode 476, loss:-2.8106, fail, steps:29, total reward:-1.7000, 0.07290196418762207 sec
Episode 477, loss:-4.4476, fail, steps:29, total reward:-1.7000, 0.0730433464050293 sec
Episode 478, loss:2.8107, succeed, steps:18, total reward:1.0000, 0.04644513130187988 sec
Episode 479, loss:3.84

Episode 562, loss:-2.3500, fail, steps:29, total reward:-0.7000, 0.07272791862487793 sec
Episode 563, loss:-2.3089, fail, steps:29, total reward:-0.7000, 0.07284832000732422 sec
Episode 564, loss:3.3567, succeed, steps:18, total reward:1.0000, 0.04632925987243652 sec
Episode 565, loss:5.2701, succeed, steps:29, total reward:2.1000, 0.07216835021972656 sec
Episode 566, loss:4.1271, succeed, steps:28, total reward:1.1000, 0.0706326961517334 sec
Episode 567, loss:2.1323, succeed, steps:20, total reward:-0.3000, 0.051609039306640625 sec
Episode 568, loss:-0.5695, fail, steps:29, total reward:-0.7000, 0.07244086265563965 sec
Episode 569, loss:2.1466, succeed, steps:17, total reward:0.0000, 0.04417586326599121 sec
Episode 570, loss:2.6694, succeed, steps:20, total reward:-0.3000, 0.050981760025024414 sec
Episode 571, loss:5.8342, succeed, steps:29, total reward:2.1000, 0.07204937934875488 sec
Episode 572, loss:2.3506, succeed, steps:20, total reward:-0.3000, 0.05108022689819336 sec
Episode 5

Episode 656, loss:-1.5504, fail, steps:29, total reward:-0.7000, 0.07320904731750488 sec
Episode 657, loss:-3.5791, fail, steps:29, total reward:-1.7000, 0.07306289672851562 sec
Episode 658, loss:1.2397, succeed, steps:11, total reward:0.6000, 0.02978038787841797 sec
Episode 659, loss:4.5812, succeed, steps:18, total reward:1.0000, 0.04530215263366699 sec
Episode 660, loss:-0.5520, fail, steps:29, total reward:-0.7000, 0.07191634178161621 sec
Episode 661, loss:4.9966, succeed, steps:18, total reward:1.0000, 0.04649686813354492 sec
Episode 662, loss:-3.4979, fail, steps:29, total reward:-1.7000, 0.07214021682739258 sec
Episode 663, loss:2.9678, succeed, steps:16, total reward:2.3000, 0.041426658630371094 sec
Episode 664, loss:4.5689, succeed, steps:21, total reward:0.7000, 0.05326390266418457 sec
Episode 665, loss:-0.5730, fail, steps:30, total reward:0.3000, 0.07449483871459961 sec
Episode 666, loss:2.5920, succeed, steps:15, total reward:1.3000, 0.039379119873046875 sec
Episode 667, l

Episode 749, loss:-3.9465, fail, steps:30, total reward:-0.9000, 0.07551264762878418 sec
Episode 750, loss:2.4204, succeed, steps:20, total reward:-0.3000, 0.05136394500732422 sec
Episode 751, loss:0.6912, succeed, steps:4, total reward:0.2000, 0.01251220703125 sec
Episode 752, loss:-1.1329, fail, steps:30, total reward:0.3000, 0.07680392265319824 sec
Episode 753, loss:-3.9943, fail, steps:29, total reward:-1.7000, 0.07473444938659668 sec
Episode 754, loss:-1.5921, fail, steps:29, total reward:-0.7000, 0.07464861869812012 sec
Episode 755, loss:-2.9351, fail, steps:29, total reward:-1.7000, 0.07431674003601074 sec
Episode 756, loss:-1.8469, fail, steps:29, total reward:-0.7000, 0.07509279251098633 sec
Episode 757, loss:3.4220, succeed, steps:19, total reward:2.0000, 0.05009627342224121 sec
Episode 758, loss:-1.8687, fail, steps:29, total reward:-0.7000, 0.07309103012084961 sec
Episode 759, loss:3.2700, succeed, steps:24, total reward:-0.5000, 0.06082296371459961 sec
Episode 760, loss:-1

Episode 841, loss:2.6955, succeed, steps:21, total reward:0.7000, 0.05435776710510254 sec
Episode 842, loss:-1.6845, fail, steps:29, total reward:-0.7000, 0.07266736030578613 sec
Episode 843, loss:-5.1395, fail, steps:29, total reward:-2.7000, 0.07293367385864258 sec
Episode 844, loss:-0.2155, fail, steps:30, total reward:1.6000, 0.07591867446899414 sec
Episode 845, loss:-1.1622, fail, steps:29, total reward:-0.7000, 0.07319021224975586 sec
Episode 846, loss:3.1752, succeed, steps:27, total reward:0.1000, 0.06812810897827148 sec
Episode 847, loss:-1.0997, fail, steps:29, total reward:-0.7000, 0.07332897186279297 sec
Episode 848, loss:3.4422, succeed, steps:11, total reward:0.6000, 0.029901981353759766 sec
Episode 849, loss:2.1726, succeed, steps:20, total reward:-0.3000, 0.05026721954345703 sec
Episode 850, loss:1.7917, fail, steps:30, total reward:1.6000, 0.07425999641418457 sec
Episode 851, loss:-1.0569, fail, steps:30, total reward:0.3000, 0.07551074028015137 sec
Episode 852, loss:-

Episode 934, loss:-1.0321, fail, steps:29, total reward:-0.7000, 0.07310247421264648 sec
Episode 935, loss:3.3177, succeed, steps:23, total reward:-0.6000, 0.05852198600769043 sec
Episode 936, loss:1.9639, succeed, steps:17, total reward:0.0000, 0.04394340515136719 sec
Episode 937, loss:-3.1290, fail, steps:29, total reward:-1.7000, 0.07201290130615234 sec
Episode 938, loss:3.6690, succeed, steps:21, total reward:0.7000, 0.05382084846496582 sec
Episode 939, loss:2.7267, succeed, steps:21, total reward:0.7000, 0.05332779884338379 sec
Episode 940, loss:-1.1301, fail, steps:29, total reward:-0.7000, 0.07239818572998047 sec
Episode 941, loss:-3.4873, fail, steps:29, total reward:-1.7000, 0.07308697700500488 sec
Episode 942, loss:2.4983, succeed, steps:15, total reward:1.3000, 0.039762258529663086 sec
Episode 943, loss:-2.5849, fail, steps:29, total reward:-1.7000, 0.07241582870483398 sec
Episode 944, loss:-2.2789, fail, steps:29, total reward:-0.7000, 0.07254552841186523 sec
Episode 945, l

Episode 1026, loss:-0.3105, fail, steps:29, total reward:-0.7000, 0.07290101051330566 sec
Episode 1027, loss:-1.2838, fail, steps:30, total reward:0.3000, 0.07694172859191895 sec
Episode 1028, loss:2.1782, succeed, steps:17, total reward:0.0000, 0.04504871368408203 sec
Episode 1029, loss:4.2378, succeed, steps:24, total reward:0.4000, 0.06122183799743652 sec
Episode 1030, loss:1.7242, succeed, steps:17, total reward:0.0000, 0.04499244689941406 sec
Episode 1031, loss:-1.8024, fail, steps:29, total reward:-0.7000, 0.07557463645935059 sec
Episode 1032, loss:3.2325, succeed, steps:12, total reward:1.6000, 0.032470703125 sec
Episode 1033, loss:0.7062, fail, steps:30, total reward:1.6000, 0.07582306861877441 sec
Episode 1034, loss:-3.6504, fail, steps:29, total reward:-1.7000, 0.07485485076904297 sec
Episode 1035, loss:4.8963, succeed, steps:18, total reward:1.0000, 0.047556400299072266 sec
Episode 1036, loss:-2.4479, fail, steps:29, total reward:-1.7000, 0.0735330581665039 sec
Episode 1037,

Episode 1119, loss:-1.4578, fail, steps:29, total reward:-0.7000, 0.07248783111572266 sec
Episode 1120, loss:4.1063, succeed, steps:24, total reward:0.4000, 0.0609743595123291 sec
Episode 1121, loss:0.0975, fail, steps:30, total reward:1.6000, 0.07558584213256836 sec
Episode 1122, loss:-3.0790, fail, steps:29, total reward:-1.7000, 0.07341980934143066 sec
Episode 1123, loss:-1.4337, fail, steps:29, total reward:-0.7000, 0.0735468864440918 sec
Episode 1124, loss:-2.0979, fail, steps:29, total reward:-1.7000, 0.07298851013183594 sec
Episode 1125, loss:-1.3624, fail, steps:29, total reward:-0.7000, 0.07358050346374512 sec
Episode 1126, loss:-2.1942, fail, steps:29, total reward:-1.7000, 0.07322812080383301 sec
Episode 1127, loss:-0.7195, fail, steps:30, total reward:1.6000, 0.07540535926818848 sec
Episode 1128, loss:1.3366, succeed, steps:4, total reward:0.2000, 0.012821674346923828 sec
Episode 1129, loss:-0.4595, fail, steps:30, total reward:0.3000, 0.07400298118591309 sec
Episode 1130, 

Episode 1212, loss:4.8203, succeed, steps:24, total reward:0.4000, 0.06160593032836914 sec
Episode 1213, loss:1.6840, succeed, steps:17, total reward:0.0000, 0.0438532829284668 sec
Episode 1214, loss:4.1061, succeed, steps:28, total reward:1.1000, 0.0728297233581543 sec
Episode 1215, loss:1.0112, succeed, steps:8, total reward:0.9000, 0.02280712127685547 sec
Episode 1216, loss:5.6217, succeed, steps:26, total reward:2.4000, 0.06570792198181152 sec
Episode 1217, loss:2.8365, succeed, steps:18, total reward:1.0000, 0.047112226486206055 sec
Episode 1218, loss:2.0887, fail, steps:30, total reward:2.6000, 0.07596921920776367 sec
Episode 1219, loss:-1.2821, fail, steps:29, total reward:-0.7000, 0.07421183586120605 sec
Episode 1220, loss:-0.9838, fail, steps:29, total reward:-0.7000, 0.07483696937561035 sec
Episode 1221, loss:-1.8483, fail, steps:29, total reward:-0.7000, 0.0747830867767334 sec
Episode 1222, loss:1.1651, succeed, steps:11, total reward:0.6000, 0.030338525772094727 sec
Episode

Episode 1305, loss:1.0334, succeed, steps:13, total reward:-0.7000, 0.03406882286071777 sec
Episode 1306, loss:1.9907, succeed, steps:11, total reward:0.6000, 0.0288088321685791 sec
Episode 1307, loss:-3.9379, fail, steps:29, total reward:-1.7000, 0.07101988792419434 sec
Episode 1308, loss:2.2517, succeed, steps:26, total reward:-0.9000, 0.06523704528808594 sec
Episode 1309, loss:4.7157, succeed, steps:29, total reward:2.1000, 0.0732264518737793 sec
Episode 1310, loss:5.2270, succeed, steps:30, total reward:3.1000, 0.07526135444641113 sec
Episode 1311, loss:0.7260, succeed, steps:8, total reward:0.9000, 0.022476911544799805 sec
Episode 1312, loss:2.6994, succeed, steps:21, total reward:0.7000, 0.052118539810180664 sec
Episode 1313, loss:2.8364, succeed, steps:18, total reward:1.0000, 0.0457000732421875 sec
Episode 1314, loss:0.4993, succeed, steps:7, total reward:-0.1000, 0.019649982452392578 sec
Episode 1315, loss:-1.9223, fail, steps:29, total reward:-0.7000, 0.07091379165649414 sec


Episode 1398, loss:3.6549, succeed, steps:29, total reward:2.1000, 0.07599639892578125 sec
Episode 1399, loss:1.2298, succeed, steps:10, total reward:-0.4000, 0.029070138931274414 sec
len4 start4 Comparison 4 5 more Swap Comparison 6 7 less Comparison 5 6 more Swap Comparison 6 7 more Swap Comparison 4 5 more Swap
Episode 1400, loss:2.5641, succeed, steps:19, total reward:2.0000, 0.10238981246948242 sec
Episode 1401, loss:2.9685, succeed, steps:27, total reward:0.1000, 0.06739473342895508 sec
Episode 1402, loss:3.6765, succeed, steps:27, total reward:-0.5000, 0.06847500801086426 sec
Episode 1403, loss:7.0937, succeed, steps:26, total reward:2.4000, 0.06560945510864258 sec
Episode 1404, loss:0.3337, succeed, steps:8, total reward:0.9000, 0.02236628532409668 sec
Episode 1405, loss:1.5281, succeed, steps:10, total reward:-0.4000, 0.0261380672454834 sec
Episode 1406, loss:-1.3458, fail, steps:29, total reward:-0.7000, 0.07112383842468262 sec
Episode 1407, loss:1.8863, succeed, steps:14, to

Episode 1490, loss:-1.5332, fail, steps:29, total reward:-1.7000, 0.07188200950622559 sec
Episode 1491, loss:1.6391, succeed, steps:16, total reward:2.3000, 0.04165196418762207 sec
Episode 1492, loss:1.8754, succeed, steps:4, total reward:0.2000, 0.012044906616210938 sec
Episode 1493, loss:-1.9861, fail, steps:29, total reward:-1.7000, 0.07104825973510742 sec
Episode 1494, loss:0.8144, fail, steps:30, total reward:1.6000, 0.0747673511505127 sec
Episode 1495, loss:-1.1371, fail, steps:29, total reward:-2.7000, 0.07308006286621094 sec
Episode 1496, loss:3.6105, succeed, steps:27, total reward:0.1000, 0.06830167770385742 sec
Episode 1497, loss:3.8185, succeed, steps:29, total reward:2.1000, 0.07283139228820801 sec
Episode 1498, loss:3.4449, succeed, steps:22, total reward:1.7000, 0.05621004104614258 sec
Episode 1499, loss:2.5673, succeed, steps:15, total reward:1.3000, 0.039524078369140625 sec
len4 start4 Comparison 4 5 less Comparison 6 7 more Swap
Episode 1500, loss:0.7581, succeed, ste

Episode 1583, loss:1.7815, succeed, steps:24, total reward:0.4000, 0.059880971908569336 sec
Episode 1584, loss:-0.4625, fail, steps:30, total reward:0.3000, 0.0745553970336914 sec
Episode 1585, loss:-1.1295, fail, steps:29, total reward:-1.7000, 0.07373571395874023 sec
Episode 1586, loss:2.4357, succeed, steps:26, total reward:-0.9000, 0.06912541389465332 sec
Episode 1587, loss:0.3949, succeed, steps:8, total reward:0.9000, 0.022960424423217773 sec
Episode 1588, loss:0.2241, succeed, steps:4, total reward:0.2000, 0.011725425720214844 sec
Episode 1589, loss:0.4375, succeed, steps:12, total reward:1.6000, 0.030435800552368164 sec
Episode 1590, loss:2.4270, succeed, steps:27, total reward:0.1000, 0.06632161140441895 sec
Episode 1591, loss:1.7889, succeed, steps:21, total reward:0.7000, 0.05326080322265625 sec
Episode 1592, loss:0.0812, fail, steps:30, total reward:1.6000, 0.0754086971282959 sec
Episode 1593, loss:1.3747, succeed, steps:20, total reward:-0.3000, 0.051189422607421875 sec
Ep

Episode 1676, loss:-0.5796, fail, steps:30, total reward:0.3000, 0.07696890830993652 sec
Episode 1677, loss:2.1914, succeed, steps:15, total reward:1.3000, 0.03929853439331055 sec
Episode 1678, loss:0.9034, succeed, steps:16, total reward:-1.0000, 0.04094552993774414 sec
Episode 1679, loss:-0.1724, fail, steps:30, total reward:1.6000, 0.07412838935852051 sec
Episode 1680, loss:-1.5691, fail, steps:29, total reward:-0.7000, 0.07319259643554688 sec
Episode 1681, loss:3.3445, succeed, steps:22, total reward:1.7000, 0.056169986724853516 sec
Episode 1682, loss:0.4751, succeed, steps:15, total reward:1.3000, 0.03878641128540039 sec
Episode 1683, loss:2.1084, succeed, steps:11, total reward:0.6000, 0.028850793838500977 sec
Episode 1684, loss:-2.5286, fail, steps:29, total reward:-0.7000, 0.07139396667480469 sec
Episode 1685, loss:3.2356, succeed, steps:29, total reward:2.1000, 0.07276344299316406 sec
Episode 1686, loss:1.1234, succeed, steps:7, total reward:-0.1000, 0.01999640464782715 sec
Ep

Episode 1768, loss:-0.8835, fail, steps:29, total reward:-0.7000, 0.0723268985748291 sec
Episode 1769, loss:2.1983, succeed, steps:28, total reward:1.1000, 0.0705265998840332 sec
Episode 1770, loss:0.2624, succeed, steps:4, total reward:0.2000, 0.012622833251953125 sec
Episode 1771, loss:0.4650, succeed, steps:12, total reward:1.6000, 0.030599355697631836 sec
Episode 1772, loss:4.0954, succeed, steps:21, total reward:0.7000, 0.05203557014465332 sec
Episode 1773, loss:0.9061, succeed, steps:22, total reward:-1.6000, 0.055258989334106445 sec
Episode 1774, loss:-1.0065, fail, steps:29, total reward:-0.7000, 0.07238602638244629 sec
Episode 1775, loss:-1.0630, fail, steps:29, total reward:-1.7000, 0.07268071174621582 sec
Episode 1776, loss:1.9657, succeed, steps:18, total reward:1.0000, 0.046639442443847656 sec
Episode 1777, loss:2.0000, succeed, steps:25, total reward:1.4000, 0.06328654289245605 sec
Episode 1778, loss:0.5907, succeed, steps:17, total reward:0.0000, 0.04361987113952637 sec



KeyboardInterrupt

