1. Import Libraries

In [1]:
import torch
print(torch.backends.mps.is_available()) #the MacOS is higher than 12.3+
print(torch.cuda.is_available()) #the MacOS is higher than 12.3+
import sys
print(sys.executable)


from transformers import T5ForConditionalGeneration, T5Tokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

True
False
/Users/lucasoliveira/miniconda3/bin/python


  from .autonotebook import tqdm as notebook_tqdm


2. Load Data and Inspect
Load your JSON data and inspect its structure.

In [2]:
import json

with open('data/nl2bash-data.json', 'r') as f:
    data = json.load(f)

# Show first few examples
for i, (key, value) in enumerate(data.items()):
    if i > 5:
        break
    print(f"ID: {key}, Invocation: {value['invocation']}, Command: {value['cmd']}")




ID: 1, Invocation: Copy loadable kernel module "mymodule.ko" to the drivers in modules directory matchig current kernel., Command: sudo cp mymodule.ko /lib/modules/$(uname -r)/kernel/drivers/
ID: 2, Invocation: Display all lines containing "IP_MROUTE" in the current kernel's compile-time config file., Command: cat /boot/config-`uname -r` | grep IP_MROUTE
ID: 3, Invocation: Display current running kernel's compile-time config file., Command: cat /boot/config-`uname -r`
ID: 4, Invocation: Find all loadable modules for current kernel, whose name includes "perf", Command: find /lib/modules/`uname -r` -regex .*perf.*
ID: 5, Invocation: Look for any instance of "HIGHMEM" in the current kernel's compile-time config file., Command: grep “HIGHMEM” /boot/config-`uname -r`
ID: 6, Invocation: Search for command "tail" in the maps of the process with PID 2671, Command: cat /proc/2671/maps | grep `which tail`


3. Preprocess Data
Convert your data into a format suitable for training. This might involve tokenization or other forms of preprocessing.

In [3]:
formatted_data = []

for key, value in data.items():
    invocation = value['invocation']
    cmd = value['cmd']
    formatted_data.append(f"{invocation} </s> {cmd}")

print(formatted_data)

train_data, val_data = train_test_split(formatted_data, test_size=0.2)

with open('data/formatted_train_nl2bash_data.txt', 'w') as f:
    for line in train_data:
        f.write(f"{line}\n")

with open('data/formatted_val_nl2bash_data.txt', 'w') as f:
    for line in val_data:
        f.write(f"{line}\n")



['Copy loadable kernel module "mymodule.ko" to the drivers in modules directory matchig current kernel. </s> sudo cp mymodule.ko /lib/modules/$(uname -r)/kernel/drivers/', 'Display all lines containing "IP_MROUTE" in the current kernel\'s compile-time config file. </s> cat /boot/config-`uname -r` | grep IP_MROUTE', "Display current running kernel's compile-time config file. </s> cat /boot/config-`uname -r`", 'Find all loadable modules for current kernel, whose name includes "perf" </s> find /lib/modules/`uname -r` -regex .*perf.*', 'Look for any instance of "HIGHMEM" in the current kernel\'s compile-time config file. </s> grep “HIGHMEM” /boot/config-`uname -r`', 'Search for command "tail" in the maps of the process with PID 2671 </s> cat /proc/2671/maps | grep `which tail`', "Display all lines containing PROBES in the current kernel's compile-time config file. </s> grep PROBES /boot/config-$(uname -r)", "Display all lines containing UTRACE in the current kernel's compile-time config fi

4. Initialize Model and Tokenizer

In [4]:
# Initialize the T5 base model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("t5-small")
tokenizer = T5Tokenizer.from_pretrained("t5-small")


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=True`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [5]:
from torch.utils.data import Dataset

class CustomTextDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        item = self.data[index]
        source_text, target_text = item.split(" </s> ")
        
        source_tokenized = self.tokenizer(
            source_text, 
            max_length=self.max_length, 
            padding='max_length', 
            truncation=True, 
            return_tensors="pt"
        )
        
        target_tokenized = self.tokenizer(
            target_text, 
            max_length=self.max_length, 
            padding='max_length', 
            truncation=True, 
            return_tensors="pt"
        )
        
        return {
            "input_ids": source_tokenized["input_ids"].squeeze(),
            "attention_mask": source_tokenized["attention_mask"].squeeze(),
            "labels": target_tokenized["input_ids"].squeeze()
        }


5. Create Dataset and DataLoader

In [6]:
# Create TextDataset objects for training and validation sets
train_dataset = CustomTextDataset(train_data, tokenizer)
val_dataset = CustomTextDataset(val_data, tokenizer)

# Create DataLoaders for training and validation sets
train_data_loader = DataLoader(train_dataset, batch_size=8)
val_data_loader = DataLoader(val_dataset, batch_size=8)

6. Initialize Data Collator

In [7]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)


7. Initialize Trainer

In [8]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    save_steps=10_000,
    save_total_limit=2,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir='./logs',  # NEW: Logging directory
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

print(train_dataset[0])
print(val_dataset[0])



{'input_ids': tensor([17483,  1429,     5,    75,     7,   208,  2073,    16,     8,   750,
         8174,  2195,    28,    70, 10552,     7,  5816,    16,     3,    87,
         5529,    87,  9269,    87,  8292,    87,     1,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0, 

8. Train the model

In [9]:
# Train the model
trainer.train()
eval_results = trainer.evaluate()
print(eval_results)


 48%|████▊     | 500/1035 [15:04<13:37,  1.53s/it]

{'loss': 1.8979, 'learning_rate': 1.033816425120773e-05, 'epoch': 0.48}


 97%|█████████▋| 1000/1035 [30:11<01:21,  2.33s/it]

{'loss': 0.1973, 'learning_rate': 6.763285024154589e-07, 'epoch': 0.97}


                                                   
100%|██████████| 1035/1035 [32:37<00:00,  1.89s/it]


{'eval_loss': 0.17076970636844635, 'eval_runtime': 76.5517, 'eval_samples_per_second': 27.041, 'eval_steps_per_second': 3.383, 'epoch': 1.0}
{'train_runtime': 1957.2752, 'train_samples_per_second': 4.229, 'train_steps_per_second': 0.529, 'train_loss': 1.018580554648874, 'epoch': 1.0}


100%|██████████| 259/259 [01:17<00:00,  3.34it/s]

{'eval_loss': 0.17076970636844635, 'eval_runtime': 77.7017, 'eval_samples_per_second': 26.64, 'eval_steps_per_second': 3.333, 'epoch': 1.0}





Test model 

In [None]:
# Test the model on a sample input text
sample_input_text = "Display current running kernel's compile-time config file."
input_tokenized = tokenizer.encode_plus(
    sample_input_text, 
    max_length=512, 
    padding='max_length', 
    truncation=True, 
    return_tensors="pt"
)

# Generate output
output_ids = model.generate(
    input_ids=input_tokenized["input_ids"], 
    attention_mask=input_tokenized["attention_mask"]
)

# Decode and print the output
decoded_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("Generated Output:", decoded_output)


9. Save model

In [10]:
model.save_pretrained("model")
model = T5ForConditionalGeneration.from_pretrained("model")

