### Install and import packages, helper functions

In [None]:
#!pip install datasets transformers[torch] accelerate bitsandbytes peft==0.6.2 -U -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.3/8.3 MB[0m [31m56.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.7/174.7 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import torch
import torch.nn as nn
from transformers import Trainer, TrainingArguments
from transformers import PretrainedConfig, PreTrainedModel
from transformers import BitsAndBytesConfig
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, PeftModel

In [3]:
def model_size_in_MB(model):
    """Print the memory footprint of ``model`` in MiB with three decimals.

    The footprint is the byte size of every parameter plus every registered
    buffer, each tensor contributing ``nelement() * element_size()`` bytes.
    Nothing is returned; the figure is only printed.
    """
    def _byte_count(tensors):
        # Total storage, in bytes, of an iterable of tensors.
        return sum(t.nelement() * t.element_size() for t in tensors)

    total_bytes = _byte_count(model.parameters()) + _byte_count(model.buffers())
    print('model size: {:.3f}MB'.format(total_bytes / 1024**2))

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Args:
        model: object exposing ``named_parameters()``; a parameter counts as
            trainable when its ``requires_grad`` flag is set.

    The printed line reports the trainable count, the total count and the
    trainable share as a percentage with two decimals. A model without any
    parameters reports 0.00% instead of raising ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the ratio: an empty model has nothing to divide by.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct:.2f}"
    )

### PretrainedConfig, PreTrainedModel and Dataset

In [4]:
class MLPConfig(PretrainedConfig):
    """Configuration object for the toy ``MLP`` model.

    Args:
        num_features: stored as ``self.num_features`` (default 256).
        num_classes: stored as ``self.num_classes`` (default 3).
        **kwargs: forwarded unchanged to ``PretrainedConfig.__init__``.
    """

    model_type = "screen_mlp"

    def __init__(self, num_features: int = 256, num_classes: int = 3, **kwargs):
        # The model-specific fields are set first, then the remaining keyword
        # arguments are handed to the base class.
        self.num_features = num_features
        self.num_classes = num_classes
        super().__init__(**kwargs)

In [5]:
class MLP(PreTrainedModel):
    """Two-layer perceptron exposed through the ``PreTrainedModel`` interface.

    Layers: ``fc1`` (num_features -> num_features), ReLU, ``fc2``
    (num_features -> num_classes). ``nn.CrossEntropyLoss`` is used as the
    training criterion.
    """

    config_class = MLPConfig

    def __init__(self, config):
        super().__init__(config)
        width = config.num_features
        self.fc1 = nn.Linear(width, width)
        self.activation = nn.ReLU()
        self.fc2 = nn.Linear(width, config.num_classes)
        self.criterion = nn.CrossEntropyLoss()
        # NOTE(review): presumably set so that loading with device_map='auto'
        # is accepted; confirm the expected contents of this attribute.
        self._no_split_modules = ['fc1', 'fc2']

    def forward(self, input_ids, labels=None):
        """Run the network on ``input_ids`` and optionally compute the loss.

        Returns a dict with key ``'logits'``; when ``labels`` is given the
        dict also carries ``'loss'`` (criterion applied to logits and labels)
        as its first entry.
        """
        logits = self.fc2(self.activation(self.fc1(input_ids)))
        if labels is not None:
            return {'loss': self.criterion(logits, labels), 'logits': logits}
        return {'logits': logits}

In [6]:
class MLPDataset(torch.utils.data.Dataset):
    """Synthetic dataset that yields freshly drawn uniform random vectors.

    Args:
        len: number of items the dataset reports via ``__len__``.
        num_features: length of the ``'input_ids'`` vector (default 20).
        num_classes: length of the ``'labels'`` vector (default 3).

    Items are generated on every access, so the same index returns different
    values on repeated calls.
    """

    def __init__(self, len, num_features=20, num_classes=3):
        self.len = len
        self.num_features = num_features
        self.num_classes = num_classes

    def __getitem__(self, idx):
        """Return a dict with float32 ``'input_ids'`` and ``'labels'`` tensors.

        Raises:
            IndexError: if ``idx`` lies outside ``[-len, len)``.
        """
        # Without this check, plain iteration (`for item in ds`, `list(ds)`)
        # never stops, because Python falls back to calling __getitem__ with
        # 0, 1, 2, ... until IndexError is raised.
        if not -self.len <= idx < self.len:
            raise IndexError(f"index {idx} is out of range for dataset of length {self.len}")
        return {
            'input_ids': torch.rand(self.num_features).float(),
            'labels': torch.rand(self.num_classes).float(),
        }

    def __len__(self):
        return self.len

### Train and Save custom model using HF Trainer

In [7]:
# Synthetic train / validation splits (256 and 128 random samples).
train_dataset = MLPDataset(256)
val_dataset = MLPDataset(128)

training_args = TrainingArguments(
        output_dir='./checkpoint',
        num_train_epochs=3,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        report_to='none',
        save_strategy='no',
        # Mixed-precision training needs a CUDA device; fall back to full
        # precision on CPU-only hosts instead of failing at construction.
        fp16=torch.cuda.is_available(),
        remove_unused_columns=False
    )

# 20 input features and 3 output classes, matching the vectors MLPDataset yields.
mlp_config = MLPConfig(num_features=20, num_classes=3)
model = MLP(mlp_config)

trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )


Using cuda_amp half precision backend


In [8]:
# Train the base MLP, then write its config and weights to ./output so the
# later cells can reload it with MLP.from_pretrained("./output").
trainer.train()
model.save_pretrained("./output")

***** Running training *****
  Num examples = 256
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 768
  Number of trainable parameters = 483
 73%|███████▎  | 561/768 [00:01<00:00, 572.58it/s]

{'loss': 1.6752, 'learning_rate': 1.7447916666666666e-05, 'epoch': 1.95}


 92%|█████████▏| 706/768 [00:01<00:00, 640.57it/s]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 768/768 [00:01<00:00, 394.69it/s]
Configuration saved in ./output\config.json
Model weights saved in ./output\pytorch_model.bin


{'train_runtime': 1.9461, 'train_samples_per_second': 394.644, 'train_steps_per_second': 394.644, 'train_loss': 1.6779466072718303, 'epoch': 3.0}


### Create LoRA Model

In [9]:
# LoRA settings: rank 16, alpha 16, dropout 0.1, applied to the two linear
# layers of MLP (fc1 and fc2); bias terms are not adapted.
lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["fc1", "fc2"],
    lora_dropout=0.1,
    bias="none",
    # NOTE(review): MLP defines no submodule named "classifier" (only fc1,
    # activation, fc2 and criterion) — presumably a leftover; confirm it has no effect.
    modules_to_save=["classifier"],
)
# Reload the checkpoint written by model.save_pretrained("./output").
base_model = MLP.from_pretrained("./output")
# NOTE(review): base_model is not quantized at this point; presumably this call
# is kept only to freeze the base weights — confirm against the PEFT docs.
base_model = prepare_model_for_kbit_training(base_model)

loading configuration file ./output\config.json
Model config MLPConfig {
  "architectures": [
    "MLP"
  ],
  "model_type": "screen_mlp",
  "num_classes": 3,
  "num_features": 20,
  "torch_dtype": "float32",
  "transformers_version": "4.25.1"
}

loading weights file ./output\pytorch_model.bin
All model checkpoint weights were used when initializing MLP.

All the weights of MLP were initialized from the model checkpoint at ./output.
If your task is similar to the task the model of the checkpoint was trained on, you can already use MLP for predictions without further training.


In [10]:
# First adapter: wrap the reloaded base model with the LoRA config, report how
# many parameters are trainable, fine-tune with the shared training arguments
# and datasets, then save the adapter to ./lora1.
peft_model1 = get_peft_model(base_model, lora_config)
print_trainable_parameters(peft_model1)
trainer1 = Trainer(
        model=peft_model1,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )
trainer1.train()
peft_model1.save_pretrained('./lora1')

Using cuda_amp half precision backend
***** Running training *****
  Num examples = 256
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 768
  Number of trainable parameters = 1008


trainable params: 1008 || all params: 1491 || trainable%: 67.61


 72%|███████▏  | 556/768 [00:01<00:00, 407.85it/s]

{'loss': 1.6732, 'learning_rate': 1.7447916666666666e-05, 'epoch': 1.95}


 97%|█████████▋| 748/768 [00:01<00:00, 454.31it/s]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 768/768 [00:01<00:00, 425.14it/s]

{'train_runtime': 1.8084, 'train_samples_per_second': 424.673, 'train_steps_per_second': 424.673, 'train_loss': 1.6763070821762085, 'epoch': 3.0}





In [11]:
# Second adapter, trained independently of the first one.
# get_peft_model injects the LoRA layers into the module it receives in place,
# so a fresh copy of the base checkpoint is loaded here rather than re-wrapping
# the already-adapted `base_model` that peft_model1 still owns.
base_model_adapter2 = prepare_model_for_kbit_training(MLP.from_pretrained("./output"))
peft_model2 = get_peft_model(base_model_adapter2, lora_config)
print_trainable_parameters(peft_model2)
trainer2 = Trainer(
        model=peft_model2,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )
trainer2.train()
# Save the adapter trained in this cell (previously peft_model1 was written to ./lora2).
peft_model2.save_pretrained('./lora2')

Using cuda_amp half precision backend
***** Running training *****
  Num examples = 256
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 768
  Number of trainable parameters = 1008


trainable params: 1008 || all params: 1491 || trainable%: 67.61


 74%|███████▎  | 566/768 [00:01<00:00, 438.24it/s]

{'loss': 1.6732, 'learning_rate': 1.7447916666666666e-05, 'epoch': 1.95}


 99%|█████████▊| 757/768 [00:01<00:00, 465.04it/s]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 768/768 [00:01<00:00, 452.25it/s]

{'train_runtime': 1.6972, 'train_samples_per_second': 452.515, 'train_steps_per_second': 452.515, 'train_loss': 1.6762625376383464, 'epoch': 3.0}





### Inference with the LoRA adapters

In [23]:
# Reload the base checkpoint, attach the adapter saved in ./lora1 under the
# name "adapter1", then register the one from ./lora2 as "adapter2" on the
# same wrapped model so the cells below can switch between them.
base_model = MLP.from_pretrained("./output", device_map='auto')
inf_model = PeftModel.from_pretrained(base_model, './lora1', adapter_name="adapter1", device_map='auto')
inf_model.load_adapter('./lora2', adapter_name="adapter2", device_map='auto')

loading configuration file ./output\config.json
Model config MLPConfig {
  "architectures": [
    "MLP"
  ],
  "model_type": "screen_mlp",
  "num_classes": 3,
  "num_features": 20,
  "torch_dtype": "float32",
  "transformers_version": "4.25.1"
}

loading weights file ./output\pytorch_model.bin
All model checkpoint weights were used when initializing MLP.

All the weights of MLP were initialized from the model checkpoint at ./output.
If your task is similar to the task the model of the checkpoint was trained on, you can already use MLP for predictions without further training.


_IncompatibleKeys(missing_keys=['base_model.model.fc1.base_layer.weight', 'base_model.model.fc1.base_layer.bias', 'base_model.model.fc1.lora_A.adapter1.weight', 'base_model.model.fc1.lora_B.adapter1.weight', 'base_model.model.fc2.base_layer.weight', 'base_model.model.fc2.base_layer.bias', 'base_model.model.fc2.lora_A.adapter1.weight', 'base_model.model.fc2.lora_B.adapter1.weight'], unexpected_keys=[])

In [25]:
# Route the forward pass through the first adapter.
inf_model.set_adapter("adapter1")

# Inference only: freeze every parameter and switch the model to eval mode.
inf_model.requires_grad_(False)
inf_model.eval()

# Random batch of 2 samples with 20 features each, moved to the first GPU.
inputs1 = {'input_ids': torch.rand(2, 20).float().to("cuda:0")}
output = inf_model(**inputs1)
print(output)

{'logits': tensor([[-0.0892,  0.1122, -0.0155],
        [ 0.0974,  0.0344,  0.0315]], device='cuda:0')}


In [26]:
# Route the forward pass through the second adapter.
inf_model.set_adapter("adapter2")

# Inference only: freeze every parameter and switch the model to eval mode.
inf_model.requires_grad_(False)
inf_model.eval()

# Random batch of 2 samples with 20 features each, moved to the first GPU.
inputs2 = {'input_ids': torch.rand(2, 20).float().to("cuda:0")}
output = inf_model(**inputs2)
print(output)

{'logits': tensor([[0.0214, 0.0763, 0.1080],
        [0.0904, 0.1173, 0.1598]], device='cuda:0')}


### Create QLoRA Model

In [None]:
# 8-bit alternative, kept for reference:
#bnb_config = BitsAndBytesConfig(
#    load_in_8bit = True,
#)
# 4-bit quantization settings: NF4 quant type, double quantization enabled,
# bfloat16 as the compute dtype.
nf4_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.bfloat16
)

In [None]:
# Reload the trained checkpoint from ./output with the 4-bit NF4 quantization config.
model_nf4 = MLP.from_pretrained("./output", quantization_config=nf4_config)

In [None]:
# LoRA settings for the QLoRA run: rank 16, alpha 16, dropout 0.1, applied to
# fc1 and fc2; bias terms are not adapted.
lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["fc1", "fc2"],
    lora_dropout=0.1,
    bias="none",
    # NOTE(review): MLP defines no submodule named "classifier" — presumably a
    # leftover; confirm it has no effect.
    modules_to_save=["classifier"],
)


### Train and Save QLoRA model using HF Trainer

In [None]:
# QLoRA adapter 1: train LoRA layers on top of the 4-bit quantized checkpoint
# (`model_nf4`), not the full-precision `model` trained earlier.
# Gradient checkpointing is disabled because prepare_model_for_kbit_training
# would otherwise hook the input embeddings, which this MLP does not have.
qlora_base1 = prepare_model_for_kbit_training(model_nf4, use_gradient_checkpointing=False)
peft_model1 = get_peft_model(qlora_base1, lora_config)
print_trainable_parameters(peft_model1)
trainer1 = Trainer(
        model=peft_model1,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )
trainer1.train()
peft_model1.save_pretrained('./lora1')


In [None]:
# QLoRA adapter 2: start from a freshly loaded 4-bit copy of the checkpoint.
# get_peft_model injects LoRA layers into the module it receives in place, so a
# model that was already wrapped once must not be wrapped a second time.
# Gradient checkpointing is disabled because prepare_model_for_kbit_training
# would otherwise hook the input embeddings, which this MLP does not have.
qlora_base2 = prepare_model_for_kbit_training(
    MLP.from_pretrained("./output", quantization_config=nf4_config),
    use_gradient_checkpointing=False,
)
peft_model2 = get_peft_model(qlora_base2, lora_config)
print_trainable_parameters(peft_model2)
trainer2 = Trainer(
        model=peft_model2,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )
trainer2.train()
peft_model2.save_pretrained('./lora2')

### Inference using the QLoRA Model

In [None]:
# 4-bit NF4 settings for inference, with float32 as the compute dtype.
inf_nf4_config = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_compute_dtype=torch.float32,
      bnb_4bit_use_double_quant=True,
      bnb_4bit_quant_type='nf4'
)

# 4-bit loading is requested through quantization_config alone: passing
# load_in_4bit=True as an extra keyword is redundant and is rejected by recent
# transformers releases when quantization_config is also given.
base_model = MLP.from_pretrained("./output",
                                torch_dtype=torch.float32,
                                quantization_config=inf_nf4_config, device_map='auto')
inf_model = PeftModel.from_pretrained(base_model, './lora1', adapter_name="adapter1", device_map='auto')

inf_model.load_adapter('./lora2', adapter_name="adapter2", device_map='auto')
# load_adapter can leave the second adapter's weights on a different device
# (cause not yet understood), so everything is moved onto one GPU explicitly.
inf_model.to("cuda:0")

In [None]:
# Route the forward pass through the first adapter.
inf_model.set_adapter("adapter1")

# Inference only: freeze every parameter and switch the model to eval mode.
inf_model.requires_grad_(False)
inf_model.eval()

# Random batch of 2 samples with 20 features each, moved to the first GPU.
inputs1 = {'input_ids': torch.rand(2, 20).float().to("cuda:0")}
output = inf_model(**inputs1)
print(output)

In [None]:
# Route the forward pass through the second adapter.
inf_model.set_adapter("adapter2")

# Inference only: freeze every parameter and switch the model to eval mode.
inf_model.requires_grad_(False)
inf_model.eval()

# Random batch of 2 samples with 20 features each, moved to the first GPU.
inputs2 = {'input_ids': torch.rand(2, 20).float().to("cuda:0")}
output = inf_model(**inputs2)
print(output)