In [None]:
# BibTeX entry for the NL2Bash paper (LREC 2018).
# No other cell shown in this notebook reads this constant.
_CITATION = """\
@inproceedings{LinWZE2018:NL2Bash, 
  author = {Xi Victoria Lin and Chenglong Wang and Luke Zettlemoyer and Michael D. Ernst}, 
  title = {NL2Bash: A Corpus and Semantic Parser for Natural Language Interface to the Linux Operating System}, 
  booktitle = {Proceedings of the Eleventh International Conference on Language Resources
               and Evaluation {LREC} 2018, Miyazaki (Japan), 7-12 May, 2018.},
  year = {2018} 
}
"""

# One-line provenance note pointing at the upstream dataset repository.
_DESCRIPTION = """\
The dataset is constructed from
https://github.com/TellinaTool/nl2bash
"""

1. Import Libraries

In [96]:
import torch
from transformers import (T5ForConditionalGeneration, T5Tokenizer, 
                          DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, 
                          Seq2SeqTrainer)
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
import json
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

2. Load Data and Inspect
Load your JSON data and inspect its structure.

In [97]:
def load_data(file_path):
    """Read a JSON file and return the decoded Python object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's top-level value.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the contents are not valid JSON.
    """
    # Without an explicit encoding, open() falls back to the platform locale
    # and can mis-decode non-ASCII text; JSON files are expected to be UTF-8.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Load the examples; the path is relative to the notebook's working directory.
data = load_data('data/nl2bash-data.json')


3. Preprocess Data
Format each invocation/command pair as a single string of the form `invocation </s> command </s>`, then split the examples 80/20 into training and validation sets. Tokenization happens later, inside the dataset class.

In [98]:
def preprocess_data(data, test_size=0.2, random_state=None):
    """Format every example as one string and split into train/validation lists.

    Each example becomes ``"<invocation> </s> <cmd> </s>"``.

    Args:
        data: Mapping whose values are dicts with ``'invocation'`` and
            ``'cmd'`` entries. The mapping's keys are not used.
        test_size: Forwarded to ``train_test_split``; the default of 0.2
            matches the previously hard-coded value.
        random_state: Forwarded to ``train_test_split``. ``None`` (the
            default) leaves the split unseeded; pass an int to make the
            split reproducible across kernel restarts.

    Returns:
        The ``[train, validation]`` pair produced by ``train_test_split``.
    """
    formatted_data = [
        f"{value['invocation']} </s> {value['cmd']} </s>"
        for value in data.values()
    ]
    return train_test_split(
        formatted_data, test_size=test_size, random_state=random_state
    )

# Formatted strings for training and for validation, in that order.
train_data, val_data = preprocess_data(data)

4. Initialize Model and Tokenizer

In [99]:
# Initialize the T5 base model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("t5-small")
tokenizer = T5Tokenizer.from_pretrained("t5-small")


In [100]:
# Two positional strings are encoded together as one text pair; the cell
# then shows which fields the tokenizer returned.
sample_encoding = tokenizer("Display current running kernel's compile-time config file.","cat /boot/config-`uname -r`")
sample_encoding.keys()

dict_keys(['input_ids', 'attention_mask'])

In [101]:
# Token ids of the encoded pair.
print(sample_encoding["input_ids"])

[11677, 750, 1180, 20563, 31, 7, 2890, 699, 18, 715, 3, 20303, 1042, 5, 1, 1712, 3, 87, 18475, 87, 20303, 18, 2, 76, 4350, 3, 18, 52, 2, 1]


In [102]:
# Attention mask of the encoded pair (no padding was requested for this sample).
print(sample_encoding["attention_mask"])

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [103]:
# Decode every token id on its own and join the pieces with spaces, which
# makes the individual sub-word tokens of the encoded pair visible.
preds = [
    tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    for g in sample_encoding["input_ids"]
]
" ".join(preds)

"Display current running kernel ' s comp ile - time  config file . </s> cat  / boot / config - <unk> u name  - r <unk> </s>"

In [104]:
# Forward-pass smoke test: encode the sample pair as tensors padded to 512
# tokens and run it through the model.
input_test = tokenizer(
    "Display current running kernel's compile-time config file.",
    "cat /boot/config-`uname -r`", 
    max_length=512, 
    padding="max_length", 
    return_tensors="pt"
)   

# NOTE(review): the padded input ids are reused as labels, so padding
# positions are part of the target here — presumably this inflates the loss
# shown below; confirm before reading anything into the number.
output_test = model(
    input_ids=input_test["input_ids"],
    attention_mask=input_test["attention_mask"],
    labels=input_test["input_ids"]
)

output_test.logits.shape


torch.Size([1, 512, 32128])

In [105]:
output_test.loss

tensor(14.7304, grad_fn=<NllLossBackward0>)

In [106]:
class BioQAModel(pl.LightningModule):
    """LightningModule that wraps a pretrained T5 model for seq2seq training.

    Args:
        model_name: Checkpoint name or path passed to
            ``T5ForConditionalGeneration.from_pretrained``. Keyword-only;
            defaults to ``"t5-small"``.
        learning_rate: Learning rate of the Adam optimizer. Keyword-only;
            defaults to ``0.001``.
    """

    def __init__(self, *, model_name="t5-small", learning_rate=0.001):
        super().__init__()
        self.learning_rate = learning_rate
        self.model = T5ForConditionalGeneration.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and return ``(loss, logits)`` from its output."""
        output = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        return output.loss, output.logits

    def _shared_step(self, batch, metric_name, **log_kwargs):
        """Compute the loss for one batch and log it under ``metric_name``."""
        loss, _ = self(
            batch["input_ids"],
            batch["attention_mask"],
            batch["labels"],
        )
        self.log(metric_name, loss, **log_kwargs)
        return loss

    def training_step(self, batch, batch_idx):
        """Return the training loss; logged as ``train_loss`` on the progress bar."""
        return self._shared_step(batch, "train_loss", prog_bar=True, logger=True)

    def validation_step(self, batch, batch_idx):
        """Return the validation loss; logged as ``val_loss``."""
        return self._shared_step(batch, "val_loss")

    def test_step(self, batch, batch_idx):
        """Return the test loss; logged as ``test_loss``."""
        return self._shared_step(batch, "test_loss")

    def configure_optimizers(self):
        """Create an Adam optimizer over all parameters of this module."""
        return torch.optim.Adam(self.parameters(), lr=self.learning_rate)
    

In [107]:
# NOTE(review): this rebinds `model`, which an earlier cell bound to a bare
# T5ForConditionalGeneration; every later cell that uses `model` now sees
# this Lightning wrapper instead.
model = BioQAModel()

In [108]:
# Keep the three checkpoints with the lowest `val_loss`, the metric name that
# BioQAModel.validation_step logs.
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',  # or another metric that you want to monitor
    filename='checkpoint-{epoch:02d}-{val_loss:.2f}',
    save_top_k=3,  # number of best models to save
    mode='min',  # 'min' for minimizing the monitored metric, 'max' for maximizing
)

In [110]:
# Lightning trainer limited to three epochs, with the checkpoint callback attached.
trainer = pl.Trainer(
    max_epochs=3,
    callbacks=[checkpoint_callback],  # Pass as a list
)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/Users/lucasoliveira/miniconda3/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:67: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


In [112]:
# NOTE(review): BioQAModel.__init__ accepts no positional arguments besides
# self, so this call raises the TypeError recorded below. A separate
# data-module class (or plain DataLoaders) was presumably intended — confirm.
# NOTE(review): `model1` is not defined in any cell shown in this notebook.
data_module = BioQAModel(train_data, val_data, tokenizer)
trainer.fit(model1, data_module)

TypeError: BioQAModel.__init__() takes 1 positional argument but 4 were given

In [None]:
class CustomTextDataset(Dataset):
    """Dataset of ``"<source> </s> <target> </s>"`` strings, tokenized on access.

    Args:
        data: Sequence of formatted strings, one per example.
        tokenizer: Callable tokenizer that accepts ``max_length``,
            ``padding``, ``truncation`` and ``return_tensors`` and returns a
            mapping with ``input_ids`` / ``attention_mask`` tensors. Its
            ``pad_token_id`` attribute, when present, is used to mask labels.
        max_length: Length the target sequence is padded/truncated to.
        max_input_length: Length the source sequence is padded/truncated to.
    """

    # Separator placed between source and target when the strings are built.
    SEPARATOR = " </s> "
    # Literal end-of-sequence marker appended after the target text.
    EOS_MARKER = "</s>"
    # Label value for padding positions; see the T5 training docs for how the loss treats it.
    IGNORE_INDEX = -100

    def __init__(self, data, tokenizer, max_length=128, max_input_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.max_input_length = max_input_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.data)

    def _split_item(self, item):
        """Split one formatted string into ``(source_text, target_text)``.

        Raises:
            ValueError: If the separator is missing from ``item``.
        """
        # Split on the first separator only, so a target that happens to
        # contain the separator text stays in one piece.
        source_text, separator, target_text = item.partition(self.SEPARATOR)
        if not separator:
            raise ValueError(
                f"expected '<source>{self.SEPARATOR}<target>' but got: {item!r}"
            )
        # Drop the literal trailing marker: the tokenizer appends its own
        # end-of-sequence token, so keeping the text would duplicate it.
        target_text = target_text.strip()
        if target_text.endswith(self.EOS_MARKER):
            target_text = target_text[: -len(self.EOS_MARKER)].rstrip()
        return source_text, target_text

    def __getitem__(self, index):
        """Tokenize example ``index``.

        Returns:
            Dict with 1-D ``input_ids``, ``attention_mask`` and ``labels``
            tensors; padding positions in ``labels`` are set to -100.
        """
        source_text, target_text = self._split_item(self.data[index])

        source_tokenized = self.tokenizer(
            source_text,
            max_length=self.max_input_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
            return_attention_mask=True
        )

        target_tokenized = self.tokenizer(
            target_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        # squeeze(0) removes only the batch axis added by return_tensors="pt".
        labels = target_tokenized["input_ids"].squeeze(0)
        pad_token_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_token_id is not None:
            # Padding must not count as a prediction target.
            labels = labels.masked_fill(labels == pad_token_id, self.IGNORE_INDEX)

        return {
            "input_ids": source_tokenized["input_ids"].squeeze(0),
            "attention_mask": source_tokenized["attention_mask"].squeeze(0),
            "labels": labels
        }

# Custom collate function for dynamic padding
def collate_fn(batch, pad_token_id=0, label_pad_token_id=-100):
    """Stack a list of examples into one batch, right-padding to the longest.

    Args:
        batch: List of dicts with 1-D ``input_ids``, ``attention_mask`` and
            ``labels`` tensors.
        pad_token_id: Fill value for ``input_ids`` (0 by default, as before).
        label_pad_token_id: Fill value for ``labels``. Defaults to -100 so
            that padded label positions are not treated as targets; the
            previous fill value of 0 made them look like real pad tokens.

    Returns:
        Dict with the three batched 2-D tensors (batch first).
    """
    pad = torch.nn.utils.rnn.pad_sequence

    # Field name -> value used to fill the positions added by padding.
    fill_values = {
        "input_ids": pad_token_id,
        "attention_mask": 0,
        "labels": label_pad_token_id,
    }
    return {
        field: pad(
            [item[field] for item in batch],
            batch_first=True,
            padding_value=fill,
        )
        for field, fill in fill_values.items()
    }

# Usage
# dataset = CustomTextDataset(data, tokenizer)
# dataloader = DataLoader(dataset, batch_size=32, collate_fn=collate_fn)


5. Create Dataset and DataLoader

In [None]:
train_dataset = CustomTextDataset(train_data, tokenizer)
val_dataset = CustomTextDataset(val_data, tokenizer)
# Reshuffle the training examples every epoch so batches differ between
# epochs; the validation order stays fixed.
train_data_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_data_loader = DataLoader(val_dataset, batch_size=32)

# Print an example from the training dataset
print(train_dataset[100])


6. Initialize Data Collator

In [None]:
# Batch collator for the Hugging Face seq2seq trainer.
# NOTE(review): `model` is whatever the name is bound to when this cell runs;
# executed top to bottom it is the BioQAModel wrapper, not the bare T5 model
# — confirm that this is what the collator should receive.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


7. Initialize Trainer

In [None]:

# Hugging Face training configuration: one epoch, train batches of 8 per
# device, evaluation at the end of each epoch, at most two saved checkpoints.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    save_steps=1000,
    save_total_limit=2,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir='./logs',  # Directory the trainer writes its logs to
)

# NOTE(review): this rebinds `trainer`, which an earlier cell bound to a
# pl.Trainer, and passes `model`, which an earlier cell rebound to the
# BioQAModel wrapper — confirm the bare T5 model is not what was intended.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)


# Show the first tokenized example of each split.
print(train_dataset[0])
print(val_dataset[0])



8. Train the model

In [None]:
# Train the model
trainer.train()
eval_results = trainer.evaluate()
print(eval_results)


Test the model

In [None]:
def test_model(model, tokenizer, sample_input_text):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = model.to(device).eval()
    input_tokenized = tokenizer.encode_plus(
        sample_input_text, 
        max_length=512, 
        padding='max_length', 
        truncation=True, 
        return_tensors="pt"
    ).to(device)
    with torch.no_grad():
        output_ids = model.generate(**input_tokenized, max_length=50, min_length=5, temperature=1.0)

    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Try the model on one natural-language request and print the generated text.
sample_input_text = "Display current running kernel's compile-time config file."
print(test_model(model, tokenizer, sample_input_text))



9. Save model

In [None]:
# Save the model into ./model, reload it from that directory (rebinding
# `model` to a T5ForConditionalGeneration), and store the tokenizer files
# in the same directory.
# NOTE(review): `save_pretrained` is called on whatever `model` is bound to;
# if that is still the BioQAModel wrapper, presumably the wrapped
# `model.model` is the object to save — confirm.
model.save_pretrained("model")
model = T5ForConditionalGeneration.from_pretrained("model")
tokenizer.save_pretrained("model")

