In [1]:
import torch
from transformers import AutoModelForCausalLM, PreTrainedTokenizerFast, Trainer, TrainingArguments
from datasets import load_dataset, DatasetDict

# Fixed sequence length used when padding/truncating tokenized examples.
MAX_LENGTH = 1024

# Token -> id table. Ids follow listing order:
#    0-4   operation / outcome words
#    5-20  the numerals '0'..'15'
#   21-36  length markers 'len1'..'len16'
#   37-52  position markers 'pos0'..'pos15'
vocab = {
    token: token_id
    for token_id, token in enumerate(
        ['Comparison', 'Swap', 'less', 'equal', 'more']
        + [str(n) for n in range(16)]
        + [f'len{n}' for n in range(1, 17)]
        + [f'pos{n}' for n in range(16)]
    )
}

# Inverse table mapping id -> token.
id_to_token = dict(zip(vocab.values(), vocab.keys()))

# Training corpus: a plain-text file with one action sequence per line.
input_file_path = "/home/mcwave/code/autocode/datasets/rl_sort_transformer_easy/list16_transformer4_192_gamma07_step640_v3_data/action_sequence.txt"

# Load the dataset from a text file (one example per line, exposed as the
# 'text' column of the 'train' split).
dataset = load_dataset('text', data_files=input_file_path)

# Create a tokenizer using the given vocabulary
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace

# WordLevel looks its unk_token up in its own vocabulary, so '[UNK]' has to be
# a real entry there; without it, any out-of-vocabulary word makes encoding
# fail instead of falling back to '[UNK]'. '[PAD]' is registered the same way
# so both special tokens get ids directly after the task tokens. The task
# `vocab` dict itself is left unmodified.
UNK_TOKEN = "[UNK]"
PAD_TOKEN = "[PAD]"
tokenizer_vocab = dict(vocab)
for special_token in (UNK_TOKEN, PAD_TOKEN):
    tokenizer_vocab.setdefault(special_token, len(tokenizer_vocab))

tokenizer_model = Tokenizer(WordLevel(vocab=tokenizer_vocab, unk_token=UNK_TOKEN))
tokenizer_model.pre_tokenizer = Whitespace()
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer_model,
    unk_token=UNK_TOKEN,
    pad_token=PAD_TOKEN,
)

# Add a padding token if it doesn't exist (kept as a safety net; the pad token
# is normally already set by the constructor call above).
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': PAD_TOKEN})
# Tokenize the dataset
def tokenize_function(examples):
    """Encode a batch of raw text lines with the module-level tokenizer.

    Args:
        examples: Batch dict from ``Dataset.map(batched=True)``; its 'text'
            entry holds the lines to encode.

    Returns:
        The tokenizer output for the batch, padded and truncated to exactly
        MAX_LENGTH tokens per example and returned as PyTorch tensors.
    """
    encode_options = dict(
        padding='max_length',
        max_length=MAX_LENGTH,
        truncation=True,
        return_tensors='pt',
    )
    return tokenizer(examples['text'], **encode_options)


tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Tokens that count as prediction targets -- the three comparison outcomes
# followed by the sixteen position markers -- and their vocabulary ids.
label_tokens = ['more', 'less', 'equal', *(f'pos{k}' for k in range(16))]
label_ids = [vocab[name] for name in label_tokens]

# Prepare the labels in the dataset
def prepare_labels(examples):
    """Add a 'labels' column that keeps only the target-token ids.

    Args:
        examples: Batch dict containing 'input_ids' (a list of equal-length
            id lists).

    Returns:
        The same dict with a new 'labels' entry: a copy of 'input_ids' in
        which every id not listed in ``label_ids`` is replaced by -100.
    """
    token_ids = torch.tensor(examples['input_ids'])
    is_label = torch.isin(token_ids, torch.tensor(label_ids))
    # -100 is the default ignore_index of PyTorch's cross-entropy loss, so
    # positions carrying it do not contribute to the training loss.
    masked = torch.where(is_label, token_ids, torch.full_like(token_ids, -100))
    examples['labels'] = masked.tolist()
    return examples


tokenized_datasets = tokenized_datasets.map(prepare_labels, batched=True)

def mask_attention(examples):
    """Zero the attention mask at every 'pos0'..'pos15' token.

    Args:
        examples: Batch dict containing 'input_ids' and 'attention_mask'
            (lists of equal-length integer lists).

    Returns:
        The same dict with 'attention_mask' rewritten so that entries aligned
        with a position-marker token are 0; all other entries are unchanged.
    """
    hidden_ids = torch.tensor([vocab[f'pos{k}'] for k in range(16)])
    is_position_token = torch.isin(torch.tensor(examples['input_ids']), hidden_ids)

    # Clear the mask wherever a position marker sits.
    visible = torch.tensor(examples['attention_mask']).masked_fill(is_position_token, 0)

    # Hand plain Python lists back to Hugging Face datasets.
    examples['attention_mask'] = visible.tolist()
    return examples


tokenized_datasets = tokenized_datasets.map(mask_attention, batched=True)

# Split dataset into train and validation (90% / 10%).
# The shuffle is seeded so the held-out set is identical on every run; with an
# unseeded split, a kernel restart reshuffles the data, which makes validation
# losses incomparable between runs and lets validation rows end up in the
# training set when training is resumed from a checkpoint.
SPLIT_SEED = 42
split_datasets = tokenized_datasets['train'].train_test_split(test_size=0.1, seed=SPLIT_SEED)
tokenized_train = split_datasets['train']
tokenized_test = split_datasets['test']

In [None]:
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, default_data_collator

# Load the model and adjust the embedding size
model_name = 'Qwen/Qwen2.5-0.5B'  # Replace with 'Gemma-2-2b' or your specific model
model = AutoModelForCausalLM.from_pretrained(model_name)
# Resize the embedding matrix to the size of the custom tokenizer's vocabulary.
model.resize_token_embeddings(len(tokenizer))

# DataCollatorForLanguageModeling(mlm=False) rebuilds `labels` as a copy of
# `input_ids` (masking only padding), which throws away the -100 masking
# produced by prepare_labels and trains on every token. The examples are
# already padded to MAX_LENGTH, so the default collator -- which simply stacks
# the existing columns, `labels` included -- is what is needed here.
data_collator = default_data_collator

training_args = TrainingArguments(
    output_dir="datasets/predict-action-outcome-qwen05b",
    evaluation_strategy="steps", #"epochs"
    learning_rate=1e-5,  # PAY ATTENTION TO LEARNING RATE!
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    num_train_epochs=4,
    bf16=True,
    max_grad_norm=1.0,
    save_steps=10000,
    eval_steps=10000,
    logging_steps=10000,
    save_total_limit=3,
    #load_best_model_at_end=True,
    push_to_hub=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=data_collator,
)

#cp_path = 'datasets/predict-hyp-inst-label-gemma-2b/checkpoint-680000'

trainer.train()



[2024-12-02 23:19:29,147] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/mcwave/anaconda3/envs/axiom/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/mcwave/anaconda3/envs/axiom/compiler_compat/ld: /usr/local/cuda-12.2/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'
/home/mcwave/anaconda3/envs/axiom/compiler_compat/ld: /usr/local/cuda-12.2/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'
/home/mcwave/anaconda3/envs/axiom/compiler_compat/ld: /usr/local/cuda-12.2/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'
/home/mcwave/anaconda3/envs/axiom/compiler_compat/ld: /usr/local/cuda-12.2/lib64/libcufile.so: undefined reference to `std::chrono::_V2::steady_clock::now()@GLIBCXX_3.4.19'
/home/mcwave/anaconda3/envs/axiom/compiler_compat/ld: /usr/local/cuda-12.2/lib64/libcufile.so: undefined reference to `std::string::_M_replace_aux(unsigned long, unsigned long, unsigned long, char)@GLIBCXX_3.4'

Step,Training Loss,Validation Loss


In [None]:
"""
Prompt:

I want to train a model to predict intermediate results in a sorting process, where the algorithm has only two
operations: Comparison and Swap. Here is an example sequence:

len16 Comparison 1 7 more pos15 pos4 Swap Comparison 7 12 more pos15 pos13 Swap Comparison 12 15 more pos15 pos8 Swap Comparison 11 12 less pos1 pos8 Comparison 6 9 more pos10 pos3 Swap Comparison 7 14 less pos13 pos14 Comparison 1 7 less pos4 pos13 Comparison 6 11 more pos3 pos1 Swap Comparison 2 7 less pos7 pos13 Comparison 7 12 more pos13 pos8 Swap Comparison 14 15 less pos14 pos15 Comparison 5 7 less pos6 pos8 Comparison 1 5 less pos4 pos6 Comparison 10 15 less pos11 pos15 Comparison 13 14 less pos12 pos14 Comparison 6 14 less pos1 pos14 Comparison 7 11 more pos8 pos3 Swap Comparison 11 14 less pos8 pos14 Comparison 7 11 less pos3 pos8 Comparison 3 7 more pos9 pos3 Swap Comparison 7 10 less pos9 pos11 Comparison 5 7 less pos6 pos9 Comparison 8 11 less pos0 pos8

Where len16 says the array has 16 elements. "Comparison 1 7 more pos15 pos4" means it is a comparison operation
on elements at position 1 and position 7 (both 0-indexed). "more" means the first element is more than the second.
"pos15" means the first element should be put at position 15 in the sorted array. "Swap" means swap these two elements.

There can be "success" and "failure" in the sequence, which should be ignored.

Here is my vocab:

vocab = {
    'Comparison': 0,
    'Swap': 1,
    'less': 2,
    'equal': 3,
    'more': 4,
    '0': 5,
    '1': 6,
    '2': 7,
    '3': 8,
    '4': 9,
    '5': 10,
    '6': 11,
    '7': 12,
    '8': 13,
    '9': 14,
    '10': 15,
    '11': 16,
    '12': 17,
    '13': 18,
    '14': 19,
    '15': 20,
    'len1': 21,
    'len2': 22,
    'len3': 23,
    'len4': 24,
    'len5': 25,
    'len6': 26,
    'len7': 27,
    'len8': 28,
    'len9': 29,
    'len10': 30,
    'len11': 31,
    'len12': 32,
    'len13': 33,
    'len14': 34,
    'len15': 35,
    'len16': 36,
    'pos0': 37,
    'pos1': 38,
    'pos2': 39,
    'pos3': 40,
    'pos4': 41,
    'pos5': 42,
    'pos6': 43,
    'pos7': 44,
    'pos8': 45,
    'pos9': 46,
    'pos10': 47,
    'pos11': 48,
    'pos12': 49,
    'pos13': 50,
    'pos14': 51,
    'pos15': 52,
}

Now please write code to use huggingface AutoModelForCausalLM (such as Gemma-2-2b) to train a model to predict the
next token. It should only consider the following tokens as labels: more, less, equal, pos0, pos1, ..., pos15.
It should not use the following tokens in attention (i.e., these tokens should be invisible in the training process):
pos0, pos1, ..., pos15.

The code should load data from a text file, which contains many lines and one line for each sequence.

"""