In [1]:
import numpy as np
import os
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import math
import random

MIN_LIST_LEN = 10
MAX_LIST_LEN = 10
MAX_STEPS = 330

SUCCESS_REWARD = 2.0
STEP_REWARD = -0.1
SWAP_REWARD = 1.0
INVALID_ACTION_REWARD = -10.0

EPS_START = 0.5
EPS_END = 0.05
EPS_DECAY = 1000
GAMMA = 0.8
NUM_EPISODES = 100000
EPISODES_SAVE = 1000
OUTPUT_DIR = 'datasets/rl_sort_transformer_easy/list10_transformer3_128_gamma08_step330_v1'

# Define the vocabulary
vocab = {
    'Comparison': 0,
    'Swap': 1,
    'less': 2,
    'equal': 3,
    'more': 4,
    '0': 5,
    '1': 6,
    '2': 7,
    '3': 8,
    '4': 9,
    '5': 10,
    '6': 11,
    '7': 12,
    '8': 13,
    '9': 14,
    '10': 15,
    '11': 16,
    '12': 17,
    '13': 18,
    '14': 19,
    '15': 20,
    'len1': 21,
    'len2': 22,
    'len3': 23,
    'len4': 24,
    'len5': 25,
    'len6': 26,
    'len7': 27,
    'len8': 28,
    'len9': 29,
    'len10': 30,
    'len11': 31,
    'len12': 32,
    'len13': 33,
    'len14': 34,
    'len15': 35,
    'len16': 36,
}
inv_vocab = {v: k for k, v in vocab.items()}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define the environment
class SortingEnv:
    def __init__(self):
        self.max_steps = MAX_STEPS

    def reset(self):
        self.length = random.randint(MIN_LIST_LEN, MAX_LIST_LEN)
        self.list = [random.randint(1, 100) for _ in range(self.length)]
        while self.list == sorted(self.list):
            self.list = [random.randint(1, 100) for _ in range(self.length)]
        self.indices = None
        self.current_step = 0
        self.done = False
        initial_token = 'len{}'.format(self.length)
        return vocab[initial_token], self.list.copy()
    
    def get_list(self):
        return self.list
    
    def get_list_len(self):
        return len(self.list)

    def step(self, action_tokens):
        action = action_tokens[0]
        reward = -0.01  # default penalty
        response_token = None

        if action == vocab['Comparison']:
            if len(action_tokens) != 3:
                reward = INVALID_ACTION_REWARD
                self.done = True
                return response_token, reward, self.done, self.list.copy()
            index1 = action_tokens[1] - vocab['0']
            index2 = action_tokens[2] - vocab['0']
            if index1 >= self.length or index2 >= self.length or index1 < 0 or index2 < 0:
                reward = INVALID_ACTION_REWARD
                self.done = True
                return response_token, reward, self.done, self.list.copy()
            self.indices = (index1, index2)
            if self.list[index1] < self.list[index2]:
                response_token = vocab['less']
                reward = STEP_REWARD
            elif self.list[index1] == self.list[index2]:
                response_token = vocab['equal']
                reward = STEP_REWARD * 2
            else:
                response_token = vocab['more']
                reward = STEP_REWARD
        elif action == vocab['Swap']:
            if self.indices is None:
                reward = INVALID_ACTION_REWARD
                self.done = True
                return response_token, reward, self.done, self.list.copy()
            index1, index2 = self.indices
            prev_list = self.list.copy()
            self.list[index1], self.list[index2] = self.list[index2], self.list[index1]
            if self.list == sorted(self.list):
                reward = SUCCESS_REWARD
                self.done = True
            #elif prev_list[index1] > prev_list[index2] and self.list[index1] <= self.list[index2]:
            #    reward = 0.1
            elif (index1 < index2 and prev_list[index1] > prev_list[index2] and self.list[index1] <= self.list[index2]) or \
                (index1 > index2 and prev_list[index1] < prev_list[index2] and self.list[index1] >= self.list[index2]):
                reward = SWAP_REWARD
            elif (index1 < index2 and prev_list[index1] < prev_list[index2] and self.list[index1] >= self.list[index2]) or \
                (index1 > index2 and prev_list[index1] > prev_list[index2] and self.list[index1] <= self.list[index2]):
                reward = -SWAP_REWARD
            else:
                reward = STEP_REWARD
            self.indices = None
        else:
            reward = INVALID_ACTION_REWARD
            self.done = True

        self.current_step += 1
        if self.current_step >= self.max_steps:
            self.done = True
        return response_token, reward, self.done, self.list.copy()


Using device: cuda


In [2]:
# Positional Encoding for Transformer
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=512):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=0.1)

        pe = torch.zeros(max_len, d_model)  # (max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)  # (max_len, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)  # Even indices
        pe[:, 1::2] = torch.cos(position * div_term)  # Odd indices
        pe = pe.unsqueeze(1)  # (max_len, 1, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

# Transformer Model
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, d_model=128, nhead=8, num_layers=3):
        super(TransformerModel, self).__init__()
        self.model_type = 'Transformer'
        self.d_model = d_model

        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        encoder_layers = nn.TransformerEncoderLayer(d_model, nhead)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        self.decoder = nn.Linear(d_model, vocab_size)

        self.init_weights()

    def init_weights(self):
        initrange = 0.1
        nn.init.uniform_(self.embedding.weight, -initrange, initrange)
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    def forward(self, src):
        src = self.embedding(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src)
        output = self.decoder(output)
        return output

def decode(input_tokens, inv_vocab):
    return ' '.join([inv_vocab[x] for x in input_tokens])


def save_checkpoint(model, optimizer, episode, folder, filename):
    """
    Save the model and optimizer state to the designated filepath.

    Args:
        model (nn.Module): The model to save.
        optimizer (torch.optim.Optimizer): The optimizer whose state to save.
        episode (int): The current episode number.
        filepath (str): The path where to save the checkpoint.
    """
    filepath = os.path.join(folder, filename)
    # Ensure the directory exists
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    # Save the checkpoint
    torch.save({
        'episode': episode,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, filepath)
    print(f"Checkpoint saved at episode {episode} to {filepath}")

def load_checkpoint(filepath, model, optimizer):
    """
    Load the model and optimizer state from the designated filepath.

    Args:
        filepath (str): The path from where to load the checkpoint.
        model (nn.Module): The model into which to load the state_dict.
        optimizer (torch.optim.Optimizer): The optimizer into which to load the state.

    Returns:
        int: The episode number to resume from.
    """
    if os.path.isfile(filepath):
        checkpoint = torch.load(filepath, map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        episode = checkpoint['episode']
        print(f"Checkpoint loaded from {filepath}, resuming from episode {episode}")
        return episode
    else:
        print(f"No checkpoint found at {filepath}, starting from scratch.")
        return 0

In [3]:
# Training Loop
def train(verbose=False):
    # Removed torch.autograd.set_detect_anomaly(True)
    vocab_size = len(vocab)
    model = TransformerModel(vocab_size).to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-4)  # Reduced learning rate
    # Optionally, load a checkpoint
    load_checkpoint("/home/mcwave/code/autocode/datasets/rl_sort_transformer_easy/list10_transformer3_128_gamma08_step330_v1/ckpt_32000_0.8880_178.56.pth", model, optimizer)

    episode_cnt = 0
    total_reward = 0.0
    num_successes = 0
    total_steps = 0
    
    for episode in range(NUM_EPISODES):
        t1 = time.time()
        model.train()  # Set model to training mode
        env = SortingEnv()
        initial_token_id, current_list = env.reset()
        input_tokens = [initial_token_id]
        log_probs = []
        rewards = []
        
        state = 'expect_action'
        done = False
        success = False

        while not done and len(input_tokens) < env.max_steps:
            if verbose:
                print(decode(input_tokens, inv_vocab))
                print(env.get_list())
            # Prepare input tensor
            input_seq = torch.tensor(input_tokens, dtype=torch.long, device=device).unsqueeze(1)  # (seq_len, batch_size)
            # Get model output
            with torch.autocast(device_type=device.type, dtype=torch.bfloat16):
                output = model(input_seq)  # (seq_len, batch_size, vocab_size)
                # Get logits for the last token
                logits = output[-1, 0, :]  # (vocab_size)

                # Check for NaNs in logits
                if torch.isnan(logits).any():
                    print(f"Episode {episode}, NaNs in logits before masking.")
                    break

                # Get valid tokens based on state
                def get_valid_tokens(state):
                    action_tokens = [vocab['Comparison'], vocab['Swap']]
                    index_tokens = [vocab[str(i)] for i in range(env.length)]
                    if state == 'expect_action':
                        return action_tokens
                    elif state == 'expect_index1':
                        return index_tokens[:-1]
                    elif state == 'expect_index2':
                        return [x for x in index_tokens if x > input_tokens[-1]]
                    else:
                        # Handle unexpected states by defaulting to expect_action
                        return action_tokens

                valid_token_ids = get_valid_tokens(state)

                # Ensure valid_token_ids are within the vocab range
                if any(idx >= vocab_size or idx < 0 for idx in valid_token_ids):
                    print(f"Episode {episode}, invalid indices in valid_token_ids: {valid_token_ids}")
                    break

                # Mask invalid tokens
                mask_value = -1e9  # Use a large negative value instead of -inf
                mask = torch.full_like(logits, mask_value).to(device)
                mask[valid_token_ids] = 0
                masked_logits = logits + mask

                # Sample action. Have some chance to randomly pick a valid action.
                eps_threshold = EPS_END + (EPS_START - EPS_END) * np.exp(-1.0 * episode / EPS_DECAY)
                if random.random() < eps_threshold:
                    masked_logits = masked_logits / 4

                # Check for NaNs in masked_logits
                if torch.isnan(masked_logits).any():
                    print(f"Episode {episode}, NaNs in masked_logits after masking.")
                    break

                # Compute probabilities
                probs = F.softmax(masked_logits, dim=0)

                # Check for NaNs in probs
                if torch.isnan(probs).any():
                    print(f"Episode {episode}, NaNs in probs after softmax.")
                    break

                try:
                    m = torch.distributions.Categorical(probs)
                    action_token = m.sample()
                    log_prob = m.log_prob(action_token)
                except ValueError as e:
                    print(f"Episode {episode}, error in sampling action: {e}")
                    break

            log_probs.append(log_prob)
            input_tokens.append(action_token.item())

            action = action_token.item()
            reward = 0.0
            if state == 'expect_action':
                if action == vocab['Comparison']:
                    state = 'expect_index1'
                elif action == vocab['Swap']:
                    if env.indices is None:
                        reward = INVALID_ACTION_REWARD
                        rewards.append(reward)
                        done = True
                        continue
                    action_tokens = [vocab['Swap']]
                    response_token, reward, done, current_list = env.step(action_tokens)
                    if done and reward == SUCCESS_REWARD:
                        success = True
                    if verbose:
                        print("Reward:", reward)
                    state = 'expect_action'
                else:
                    reward = INVALID_ACTION_REWARD
                    done = True
            elif state == 'expect_index1':
                index1_token = action_token
                state = 'expect_index2'
            elif state == 'expect_index2':
                index2_token = action_token
                action_tokens = [vocab['Comparison'], index1_token.item(), index2_token.item()]
                response_token, reward, done, current_list = env.step(action_tokens)
                if done and reward == SUCCESS_REWARD:
                    success = True
                if verbose:
                    print("Reward:", reward)
                if response_token is not None:
                    input_tokens.append(response_token)
                state = 'expect_action'
            else:
                reward = INVALID_ACTION_REWARD
                done = True

            rewards.append(reward)
        #
        if success: 
            num_successes += 1
        # Save checkpoint
        if episode > 0 and episode % EPISODES_SAVE == 0:
            avg_reward = total_reward / episode_cnt
            success_rate = num_successes / episode_cnt
            avg_steps = total_steps / episode_cnt
            episode_cnt = 0
            total_reward = 0.0
            num_successes = 0
            total_steps = 0
            save_checkpoint(model, optimizer, episode, OUTPUT_DIR, f"ckpt_{episode}_{success_rate:.4f}_{avg_steps:.2f}.pth")
        #
        assert len(log_probs) == len(rewards), "log_probs and returns have different sizes!"

        if len(log_probs) == 0:
            continue  # Skip if no actions were taken

        # Compute returns and loss within autocast
        with torch.autocast(device_type=device.type, dtype=torch.bfloat16):
            # Compute returns
            returns = []
            R = 0
            gamma = GAMMA
            for r in rewards[::-1]:
                R = r + gamma * R
                returns.insert(0, R)
            returns = torch.tensor(returns).to(device)

            # Check for NaNs in returns
            if torch.isnan(returns).any():
                print(f"Episode {episode}, NaNs in returns.")
                continue

            # Compute loss
            loss = 0
            for log_prob, R in zip(log_probs, returns):
                loss -= log_prob * R

            # Check for NaNs in loss
            if torch.isnan(loss):
                print(f"Episode {episode}, NaN in loss.")
                continue

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
        optimizer.step()

        episode_cnt += 1
        total_reward += sum(rewards)
        total_steps += len(rewards)
        t2 = time.time()
        if episode % 1 == 0:
            print(f"Episode {episode}, loss:{loss.item():.4f}, {'succeed' if success else 'fail'}, steps:{len(rewards)}, total reward:{sum(rewards):.4f}, {t2-t1} sec")

if __name__ == "__main__":
    train(verbose=True)




Checkpoint loaded from /home/mcwave/code/autocode/datasets/rl_sort_transformer_easy/list10_transformer3_128_gamma08_step330_v1/ckpt_32000_0.8880_178.56.pth, resuming from episode 32000
len10
[30, 4, 27, 42, 2, 71, 7, 24, 84, 86]
len10 Comparison
[30, 4, 27, 42, 2, 71, 7, 24, 84, 86]
len10 Comparison 2
[30, 4, 27, 42, 2, 71, 7, 24, 84, 86]
Reward: -0.1
len10 Comparison 2 8 less
[30, 4, 27, 42, 2, 71, 7, 24, 84, 86]
len10 Comparison 2 8 less Comparison
[30, 4, 27, 42, 2, 71, 7, 24, 84, 86]
len10 Comparison 2 8 less Comparison 1
[30, 4, 27, 42, 2, 71, 7, 24, 84, 86]
Reward: -0.1
len10 Comparison 2 8 less Comparison 1 2 less
[30, 4, 27, 42, 2, 71, 7, 24, 84, 86]
len10 Comparison 2 8 less Comparison 1 2 less Comparison
[30, 4, 27, 42, 2, 71, 7, 24, 84, 86]
len10 Comparison 2 8 less Comparison 1 2 less Comparison 0
[30, 4, 27, 42, 2, 71, 7, 24, 84, 86]
Reward: -0.1
len10 Comparison 2 8 less Comparison 1 2 less Comparison 0 1 more
[30, 4, 27, 42, 2, 71, 7, 24, 84, 86]
Reward: 1.0
len10 Compar

Reward: -0.1
len10 Comparison 2 8 less Comparison 1 2 less Comparison 0 1 more Swap Comparison 5 6 more Swap Comparison 4 7 less Comparison 3 4 more Swap Comparison 3 4 less Comparison 2 3 more Swap Comparison 1 2 more Swap Comparison 5 6 less Comparison 7 8 less Comparison 8 9 less Comparison 3 4 less Comparison 5 6 less Comparison 3 4 less Comparison 5 6 less Comparison 7 8 less Comparison 2 3 more Swap Comparison 1 2 less Comparison 4 5 more
[4, 2, 27, 30, 42, 7, 71, 24, 84, 86]
Reward: 1.0
len10 Comparison 2 8 less Comparison 1 2 less Comparison 0 1 more Swap Comparison 5 6 more Swap Comparison 4 7 less Comparison 3 4 more Swap Comparison 3 4 less Comparison 2 3 more Swap Comparison 1 2 more Swap Comparison 5 6 less Comparison 7 8 less Comparison 8 9 less Comparison 3 4 less Comparison 5 6 less Comparison 3 4 less Comparison 5 6 less Comparison 7 8 less Comparison 2 3 more Swap Comparison 1 2 less Comparison 4 5 more Swap
[4, 2, 27, 30, 7, 42, 71, 24, 84, 86]
len10 Comparison 2 8 l

Episode 0, loss:9.8245, succeed, steps:101, total reward:12.1000, 1.288557529449463 sec
len10
[26, 65, 35, 47, 24, 87, 64, 56, 87, 9]
len10 Comparison
[26, 65, 35, 47, 24, 87, 64, 56, 87, 9]
len10 Comparison 2
[26, 65, 35, 47, 24, 87, 64, 56, 87, 9]
Reward: -0.1
len10 Comparison 2 3 less
[26, 65, 35, 47, 24, 87, 64, 56, 87, 9]
len10 Comparison 2 3 less Comparison
[26, 65, 35, 47, 24, 87, 64, 56, 87, 9]
len10 Comparison 2 3 less Comparison 1
[26, 65, 35, 47, 24, 87, 64, 56, 87, 9]
Reward: -0.1
len10 Comparison 2 3 less Comparison 1 2 more
[26, 65, 35, 47, 24, 87, 64, 56, 87, 9]
Reward: 1.0
len10 Comparison 2 3 less Comparison 1 2 more Swap
[26, 35, 65, 47, 24, 87, 64, 56, 87, 9]
len10 Comparison 2 3 less Comparison 1 2 more Swap Comparison
[26, 35, 65, 47, 24, 87, 64, 56, 87, 9]
len10 Comparison 2 3 less Comparison 1 2 more Swap Comparison 0
[26, 35, 65, 47, 24, 87, 64, 56, 87, 9]
Reward: -0.1
len10 Comparison 2 3 less Comparison 1 2 more Swap Comparison 0 1 less
[26, 35, 65, 47, 24, 87

len10 Comparison 2 3 less Comparison 1 2 more Swap Comparison 0 1 less Comparison 5 6 more Swap Comparison 4 5 less Comparison 3 4 more Swap Comparison 2 3 more Swap Comparison 7 8 less Comparison 8 9 more Swap Comparison 1 2 more Swap Comparison 3 4 more Swap Comparison 5 6 less Comparison 2
[26, 24, 35, 47, 65, 64, 87, 56, 9, 87]
Reward: -0.1
len10 Comparison 2 3 less Comparison 1 2 more Swap Comparison 0 1 less Comparison 5 6 more Swap Comparison 4 5 less Comparison 3 4 more Swap Comparison 2 3 more Swap Comparison 7 8 less Comparison 8 9 more Swap Comparison 1 2 more Swap Comparison 3 4 more Swap Comparison 5 6 less Comparison 2 3 less
[26, 24, 35, 47, 65, 64, 87, 56, 9, 87]
len10 Comparison 2 3 less Comparison 1 2 more Swap Comparison 0 1 less Comparison 5 6 more Swap Comparison 4 5 less Comparison 3 4 more Swap Comparison 2 3 more Swap Comparison 7 8 less Comparison 8 9 more Swap Comparison 1 2 more Swap Comparison 3 4 more Swap Comparison 5 6 less Comparison 2 3 less Comparison


Reward: -0.1
len10 Comparison 2 3 less Comparison 1 2 more Swap Comparison 0 1 less Comparison 5 6 more Swap Comparison 4 5 less Comparison 3 4 more Swap Comparison 2 3 more Swap Comparison 7 8 less Comparison 8 9 more Swap Comparison 1 2 more Swap Comparison 3 4 more Swap Comparison 5 6 less Comparison 2 3 less Comparison 7 8 more Swap Comparison 4 5 more Swap Comparison 3 4 less Comparison 6 7 more Swap Comparison 5 6 more Swap Comparison 2 3 less Comparison 8 9 less Comparison 5 6 less Comparison 4 5 more Swap Comparison 7 8 more Swap Comparison 3 4 more Swap Comparison 1 2 less Comparison 7 8 less Comparison 3 4 less
[26, 24, 35, 9, 47, 64, 65, 56, 87, 87]
len10 Comparison 2 3 less Comparison 1 2 more Swap Comparison 0 1 less Comparison 5 6 more Swap Comparison 4 5 less Comparison 3 4 more Swap Comparison 2 3 more Swap Comparison 7 8 less Comparison 8 9 more Swap Comparison 1 2 more Swap Comparison 3 4 more Swap Comparison 5 6 less Comparison 2 3 less Comparison 7 8 more Swap Compa

len10 Comparison 2 3 less Comparison 1 2 more Swap Comparison 0 1 less Comparison 5 6 more Swap Comparison 4 5 less Comparison 3 4 more Swap Comparison 2 3 more Swap Comparison 7 8 less Comparison 8 9 more Swap Comparison 1 2 more Swap Comparison 3 4 more Swap Comparison 5 6 less Comparison 2 3 less Comparison 7 8 more Swap Comparison 4 5 more Swap Comparison 3 4 less Comparison 6 7 more Swap Comparison 5 6 more Swap Comparison 2 3 less Comparison 8 9 less Comparison 5 6 less Comparison 4 5 more Swap Comparison 7 8 more Swap Comparison 3 4 more Swap Comparison 1 2 less Comparison 7 8 less Comparison 3 4 less Comparison 2 3 more Swap Comparison 5 6 less Comparison 0 6 less Comparison 7 8 less Comparison 4 5 less Comparison 3 4 less Comparison 8 9 equal Comparison 7 8 less Comparison 5 6 less Comparison 8 9 equal Comparison 2 3 less Comparison 1 2 more Swap Comparison 6 7 more Swap Comparison 4
[26, 9, 24, 35, 47, 64, 56, 65, 87, 87]
Reward: -0.1
len10 Comparison 2 3 less Comparison 1 2 

Reward: -0.1
len10 Comparison 2 3 less Comparison 1 2 more Swap Comparison 0 1 less Comparison 5 6 more Swap Comparison 4 5 less Comparison 3 4 more Swap Comparison 2 3 more Swap Comparison 7 8 less Comparison 8 9 more Swap Comparison 1 2 more Swap Comparison 3 4 more Swap Comparison 5 6 less Comparison 2 3 less Comparison 7 8 more Swap Comparison 4 5 more Swap Comparison 3 4 less Comparison 6 7 more Swap Comparison 5 6 more Swap Comparison 2 3 less Comparison 8 9 less Comparison 5 6 less Comparison 4 5 more Swap Comparison 7 8 more Swap Comparison 3 4 more Swap Comparison 1 2 less Comparison 7 8 less Comparison 3 4 less Comparison 2 3 more Swap Comparison 5 6 less Comparison 0 6 less Comparison 7 8 less Comparison 4 5 less Comparison 3 4 less Comparison 8 9 equal Comparison 7 8 less Comparison 5 6 less Comparison 8 9 equal Comparison 2 3 less Comparison 1 2 more Swap Comparison 6 7 more Swap Comparison 4 5 less Comparison 3 4 less Comparison 7 8 less Comparison 1 2 less Comparison 7 8

Episode 1, loss:2.3124, succeed, steps:233, total reward:13.5000, 1.5106308460235596 sec
len10
[79, 49, 79, 64, 37, 94, 30, 15, 56, 11]
len10 Comparison
[79, 49, 79, 64, 37, 94, 30, 15, 56, 11]
len10 Comparison 2
[79, 49, 79, 64, 37, 94, 30, 15, 56, 11]
Reward: -0.1
len10 Comparison 2 3 more
[79, 49, 79, 64, 37, 94, 30, 15, 56, 11]
Reward: 1.0
len10 Comparison 2 3 more Swap
[79, 49, 64, 79, 37, 94, 30, 15, 56, 11]



KeyboardInterrupt

