In [1]:
_CITATION = """\
@inproceedings{LinWZE2018:NL2Bash, 
  author = {Xi Victoria Lin and Chenglong Wang and Luke Zettlemoyer and Michael D. Ernst}, 
  title = {NL2Bash: A Corpus and Semantic Parser for Natural Language Interface to the Linux Operating System}, 
  booktitle = {Proceedings of the Eleventh International Conference on Language Resources
               and Evaluation {LREC} 2018, Miyazaki (Japan), 7-12 May, 2018.},
  year = {2018} 
}
"""

_DESCRIPTION = """\
The dataset is constructed from
https://github.com/TellinaTool/nl2bash
"""

1. Import Libraries

In [2]:
import torch
from transformers import (T5ForConditionalGeneration, T5Tokenizer)
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
import json
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from nl2bash import CommandDataset
import os

os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'

  from .autonotebook import tqdm as notebook_tqdm


2. Load Data and Inspect
Load your JSON data and inspect its structure.

In [3]:
def load_data(file_path):
    with open(file_path, 'r') as f:
        return json.load(f)

data = load_data('data/nl2bash-data.json')


3. Preprocess Data
Convert your data into a format suitable for training. This might involve tokenization or other forms of preprocessing.

In [4]:
def preprocess_data(data):
    formatted_data = [f"{value['invocation']} </s> {value['cmd']} </s>" for key, value in data.items()]
    return train_test_split(formatted_data, test_size=0.2)

train_data, val_data = preprocess_data(data)

print(data['1']['invocation'])
print(data['1']['cmd'])

Copy loadable kernel module "mymodule.ko" to the drivers in modules directory matchig current kernel.
sudo cp mymodule.ko /lib/modules/$(uname -r)/kernel/drivers/


4. Initialize Model and Tokenizer

In [5]:
# Initialize the T5 base model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("t5-small")
tokenizer = T5Tokenizer.from_pretrained("t5-small")


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=True`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [6]:
sample_encoding = tokenizer(data['1']['invocation'],data['1']['cmd'])
sample_encoding.keys()

dict_keys(['input_ids', 'attention_mask'])

In [7]:
print(sample_encoding["input_ids"])

[20255, 4002, 179, 20563, 6008, 96, 2258, 22763, 15, 5, 157, 32, 121, 12, 8, 3863, 16, 10561, 8174, 1588, 23, 122, 750, 20563, 5, 1, 8411, 32, 3, 75, 102, 82, 22763, 15, 5, 157, 32, 3, 87, 6856, 87, 22763, 15, 7, 87, 3229, 599, 76, 4350, 3, 18, 52, 61, 87, 157, 11965, 40, 87, 13739, 52, 7, 87, 1]


In [8]:
print(sample_encoding["attention_mask"])

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [9]:
preds = [
    tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    for g in sample_encoding["input_ids"]
]
" ".join(preds)

'Copy load able kernel module " my modul e . k o " to the drivers in modules directory match i g current kernel . </s> sud o  c p my modul e . k o  / lib / modul e s / $ ( u name  - r ) / k erne l / drive r s / </s>'

In [10]:
input_test = tokenizer(
   data['1']['invocation'],
    data['1']['cmd'], 
    max_length=512, 
    padding="max_length", 
    return_tensors="pt"
)   

output_test = model(
    input_ids=input_test["input_ids"],
    attention_mask=input_test["attention_mask"],
    labels=input_test["input_ids"]
)

output_test.logits.shape


torch.Size([1, 512, 32128])

In [11]:
output_test.loss

tensor(15.8030, grad_fn=<NllLossBackward0>)

In [12]:
class NL2BashModel(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained("t5-small")
    def forward(self, input_ids, attention_mask, labels=None):
        output = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        return output.loss, output.logits
    def training_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, _ = self(input_ids, attention_mask, labels)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return loss
    def validation_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, _ = self(input_ids, attention_mask, labels)
        self.log("val_loss", loss)
        return loss
    def test_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, _ = self(input_ids, attention_mask, labels)
        self.log("test_loss", loss)
        return loss
    
    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.001)
    

In [13]:
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',  # or another metric that you want to monitor
    filename='checkpoint-{epoch:02d}-{val_loss:.2f}',
    save_top_k=3,  # number of best models to save
    mode='min',  # 'min' for minimizing the monitored metric, 'max' for maximizing
)

In [14]:
trainer = pl.Trainer(
    max_epochs=3,
    callbacks=[checkpoint_callback],  # Pass as a list
)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/Users/lucasoliveira/miniconda3/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:67: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


In [15]:
class NL2BashDataModel(pl.LightningDataModule):
    def __init__(self, train_data, val_data, tokenizer, batch_size=32, source_max_token_len=396, target_max_token_len=32):
        super().__init__()
        self.train_data = train_data
        self.val_data = val_data
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len

    def setup(self, stage=None):
        self.train_dataset = CommandDataset(self.train_data, self.tokenizer, self.source_max_token_len, self.target_max_token_len)
        self.val_dataset = CommandDataset(self.val_data, self.tokenizer, self.source_max_token_len, self.target_max_token_len)

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=4)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=4)

    def test_dataloader(self):
        # Assuming test data is same as val_data; if not, adjust accordingly
        return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=4)

In [16]:
model = NL2BashModel()

In [17]:
data_module = NL2BashDataModel(train_data, val_data, tokenizer)

In [18]:
trainer.fit(model, data_module)

Missing logger folder: /Users/lucasoliveira/Documents/tgs-model/lightning_logs

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/Users/lucasoliveira/miniconda3/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:436: Consider setting `persistent_workers=True` in 'val_dataloader' to speed up the dataloader worker initialization.


                                                                           

/Users/lucasoliveira/miniconda3/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:436: Consider setting `persistent_workers=True` in 'train_dataloader' to speed up the dataloader worker initialization.


Epoch 2: 100%|██████████| 259/259 [2:01:53<00:00,  0.04it/s, v_num=0, train_loss=0.802]  

`Trainer.fit` stopped: `max_epochs=3` reached.


Epoch 2: 100%|██████████| 259/259 [2:01:55<00:00,  0.04it/s, v_num=0, train_loss=0.802]


In [19]:
trainer.test(datamodule=data_module)

Restoring states from the checkpoint path at /Users/lucasoliveira/Documents/tgs-model/lightning_logs/version_0/checkpoints/checkpoint-epoch=02-val_loss=0.60.ckpt


Loaded model weights from the checkpoint at /Users/lucasoliveira/Documents/tgs-model/lightning_logs/version_0/checkpoints/checkpoint-epoch=02-val_loss=0.60.ckpt
/Users/lucasoliveira/miniconda3/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:436: Consider setting `persistent_workers=True` in 'test_dataloader' to speed up the dataloader worker initialization.


Testing DataLoader 0: 100%|██████████| 65/65 [02:42<00:00,  0.40it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_loss           0.6008015275001526
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 0.6008015275001526}]

In [20]:
trained_model = NL2BashModel.load_from_checkpoint(checkpoint_callback.best_model_path)
trained_model.freeze()

In [21]:
def generate_answer(question, nl2bash_model, tokenizer, device='cpu', max_length=50):
    # Ensure the model is in evaluation mode
    nl2bash_model.eval()
    nl2bash_model = nl2bash_model.to(device)

    # Tokenize the question and prepare the input tensor
    input_ids = tokenizer.encode(question, return_tensors='pt').to(device)

    # Generate the answer using the T5 model inside your NL2BashModel
    output = nl2bash_model.model.generate(input_ids, max_length=max_length)

    # Decode and return the answer
    answer = tokenizer.decode(output[0], skip_special_tokens=True)
    return answer




In [24]:
question = "List all files in the current directory"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
trained_model.to(device)

answer = generate_answer(question, trained_model, tokenizer, device)
print(answer)

find. -print


In [23]:
model.model.save_pretrained("model")
tokenizer.save_pretrained("model")

('model/tokenizer_config.json',
 'model/special_tokens_map.json',
 'model/spiece.model',
 'model/added_tokens.json')