In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import numpy as np
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from multilora import LoRAModel, MultiLoRALayerMaskingHom, MultiLoRALayerMaskingHomEfficient, MultiLoRALayerMasking, MultiLoRALayerSTK

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from multilora.benchmarking import MultiAdapterDataset, get_bitext_dataset, get_finetome_dataset, get_guanaco_dataset
N = 1000
model_id = "openai-community/gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

dataset = MultiAdapterDataset([get_bitext_dataset(N, tokenizer), get_finetome_dataset(N, tokenizer), get_guanaco_dataset(N, tokenizer)], tokenizer)
n_adapters = 99
n_datasets = 3

Map: 100%|██████████| 1000/1000 [00:00<00:00, 2627.97 examples/s]


In [6]:
def create_lora_hom(in_features, out_features, adapter_ids):
    return MultiLoRALayerMaskingHom(in_features, out_features, adapter_ids, n_adapters=n_adapters, rank=32)

def create_lora_hom_eff(in_features, out_features, adapter_ids):
    return MultiLoRALayerMaskingHomEfficient(in_features, out_features, adapter_ids, n_adapters=n_adapters, rank=32)

def create_lora_het(in_features, out_features, adapter_ids):
    return MultiLoRALayerMasking(in_features, out_features, adapter_ids, ranks=[32] * n_adapters)

def create_lora_het_stk(in_features, out_features, adapter_ids):
    return MultiLoRALayerSTK(in_features, out_features, adapter_ids, ranks=[32] * n_adapters)

## Homogenious LoRA Adapters

In [5]:
model = GPT2LMHeadModel.from_pretrained(model_id, device_map="auto").to(dtype=torch.bfloat16)
lora_model = LoRAModel(model, target_modules=["c_attn"], lora_factory=create_lora_hom).cuda().to(torch.bfloat16)
lora_model.freeze_base_model()

In [6]:
from torch.optim import AdamW
from transformers import get_scheduler

dataloader = DataLoader(dataset, batch_size=8, collate_fn=dataset.collate_fn)

optimizer = AdamW(lora_model.parameters(), lr=2e-4, weight_decay=0)
num_epochs = 1
num_training_steps = num_epochs * len(dataloader)

lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

In [7]:
device = "cuda"

loss_fn = nn.CrossEntropyLoss()

def train_step(data):
    ids, masks, labels, adapter_ids = data
    adapter_ids = adapter_ids + torch.randint_like(adapter_ids, low=0, high=n_adapters // n_datasets - 1) * n_datasets
    adapter_ids %= n_adapters
    logits = lora_model(input_ids=ids.to(device), attention_mask=masks.to(device), adapter_ids=adapter_ids.to(device))[0]
    
    loss = loss_fn(logits.view(-1, logits.size(-1)), labels.view(-1).to('cuda'))
    optimizer.zero_grad()
    loss.backward()

    optimizer.step()  
    lr_scheduler.step()

    return loss.item()

In [8]:
from tqdm import tqdm
from time import time
running_loss = None
alpha = 0.95
start = time()

iters = 0

for epoch in range(num_epochs):
    if iters >= 100:
        break
    for i, batch in tqdm(enumerate(dataloader)):
        loss = train_step(batch)
        if not running_loss:
            running_loss = loss
        else:
            running_loss = running_loss * alpha + loss * (1 - alpha)
        if iters % 20 == 19:
            print("AVG TIME:", (time() - start) / iters)
            print("LOSS:", running_loss)
        iters += 1
        if iters >= 100:
            break

0it [00:00, ?it/s]

20it [00:10,  1.97it/s]

AVG TIME: 0.5509742561139559
LOSS: 5.942682955053021


40it [00:20,  1.98it/s]

AVG TIME: 0.5269916424384484
LOSS: 6.373347408739958


60it [00:30,  1.98it/s]

AVG TIME: 0.5192565513869464
LOSS: 6.472410628159011


80it [00:40,  1.98it/s]

AVG TIME: 0.515574219860608
LOSS: 6.601495394466152


99it [00:50,  1.95it/s]

AVG TIME: 0.5131114010859017
LOSS: 6.56423440711488





## Homogenious Efficient

In [5]:
model = GPT2LMHeadModel.from_pretrained(model_id, device_map="auto").to(dtype=torch.bfloat16)
lora_model = LoRAModel(model, target_modules=["c_attn"], lora_factory=create_lora_hom_eff).cuda().to(torch.bfloat16)
lora_model.freeze_base_model()

In [6]:
from torch.optim import AdamW
from transformers import get_scheduler

dataloader = DataLoader(dataset, batch_size=8, collate_fn=dataset.collate_fn)

optimizer = AdamW(lora_model.parameters(), lr=2e-4, weight_decay=0)
num_epochs = 1
num_training_steps = num_epochs * len(dataloader)

lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

In [7]:
device = "cuda"

loss_fn = nn.CrossEntropyLoss()

def train_step(data):
    ids, masks, labels, adapter_ids = data
    adapter_ids = adapter_ids + torch.randint_like(adapter_ids, low=0, high=n_adapters // n_datasets - 1) * n_datasets
    adapter_ids %= n_adapters
    logits = lora_model(input_ids=ids.to(device), attention_mask=masks.to(device), adapter_ids=adapter_ids.to(device))[0]
    
    loss = loss_fn(logits.view(-1, logits.size(-1)), labels.view(-1).to('cuda'))
    optimizer.zero_grad()
    loss.backward()

    optimizer.step()  
    lr_scheduler.step()

    return loss.item()

In [8]:
from tqdm import tqdm
from time import time
running_loss = None
alpha = 0.95
start = time()

iters = 0

for epoch in range(num_epochs):
    if iters >= 100:
        break
    for i, batch in tqdm(enumerate(dataloader)):
        loss = train_step(batch)
        if not running_loss:
            running_loss = loss
        else:
            running_loss = running_loss * alpha + loss * (1 - alpha)
        if iters % 20 == 19:
            print("AVG TIME:", (time() - start) / iters)
            print("LOSS:", running_loss)
        iters += 1
        if iters >= 100:
            break

21it [00:02, 10.69it/s]

AVG TIME: 0.11871823511625591
LOSS: 5.942794941708233


41it [00:04, 10.94it/s]

AVG TIME: 0.10463229203835511
LOSS: 6.374149546534218


61it [00:06, 10.84it/s]

AVG TIME: 0.10051029415453895
LOSS: 6.475503611293823


81it [00:07, 10.90it/s]

AVG TIME: 0.09805604777758635
LOSS: 6.603067205769785


99it [00:09, 10.31it/s]

AVG TIME: 0.09706232764504173
LOSS: 6.566872642360667





## Heterogenious Naive

In [5]:
model = GPT2LMHeadModel.from_pretrained(model_id, device_map="auto").to(dtype=torch.bfloat16)
lora_model = LoRAModel(model, target_modules=["c_attn"], lora_factory=create_lora_het).cuda().to(torch.bfloat16)
lora_model.freeze_base_model()

In [6]:
from torch.optim import AdamW
from transformers import get_scheduler

dataloader = DataLoader(dataset, batch_size=8, collate_fn=dataset.collate_fn)

optimizer = AdamW(lora_model.parameters(), lr=2e-4, weight_decay=0)
num_epochs = 1
num_training_steps = num_epochs * len(dataloader)

lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

In [7]:
device = "cuda"

loss_fn = nn.CrossEntropyLoss()

def train_step(data):
    ids, masks, labels, adapter_ids = data
    adapter_ids = adapter_ids + torch.randint_like(adapter_ids, low=0, high=n_adapters // n_datasets - 1) * n_datasets
    adapter_ids %= n_adapters
    logits = lora_model(input_ids=ids.to(device), attention_mask=masks.to(device), adapter_ids=adapter_ids.to(device))[0]
    
    loss = loss_fn(logits.view(-1, logits.size(-1)), labels.view(-1).to('cuda'))
    optimizer.zero_grad()
    loss.backward()

    optimizer.step()  
    lr_scheduler.step()

    return loss.item()

In [8]:
from tqdm import tqdm
from time import time
running_loss = None
alpha = 0.95
start = time()

iters = 0

for epoch in range(num_epochs):
    if iters >= 100:
        break
    for i, batch in tqdm(enumerate(dataloader)):
        loss = train_step(batch)
        if not running_loss:
            running_loss = loss
        else:
            running_loss = running_loss * alpha + loss * (1 - alpha)
        if iters % 20 == 19:
            print("AVG TIME:", (time() - start) / iters)
            print("LOSS:", running_loss)
        iters += 1
        if iters >= 100:
            break

20it [00:06,  3.12it/s]

AVG TIME: 0.35765408214769867
LOSS: 5.941646360629132


40it [00:13,  3.12it/s]

AVG TIME: 0.3376512771997696
LOSS: 6.375575699811821


60it [00:19,  3.19it/s]

AVG TIME: 0.33037236989554714
LOSS: 6.478838890604542


80it [00:25,  3.13it/s]

AVG TIME: 0.3272677886335156
LOSS: 6.610806591793325


99it [00:32,  3.07it/s]

AVG TIME: 0.32538832317699085
LOSS: 6.583812531106318





## Heterogenious MegaBlocks

In [19]:
model = GPT2LMHeadModel.from_pretrained(model_id, device_map="auto").to(dtype=torch.bfloat16)
lora_model = LoRAModel(model, target_modules=["c_attn"], lora_factory=create_lora_het_stk).cuda()
lora_model.freeze_base_model()

In [20]:
from torch.optim import AdamW
from transformers import get_scheduler

dataloader = DataLoader(dataset, batch_size=8, collate_fn=dataset.collate_fn)

optimizer = AdamW(lora_model.parameters(), lr=2e-4, weight_decay=0)
num_epochs = 1
num_training_steps = num_epochs * len(dataloader)

lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

In [21]:
device = "cuda"

loss_fn = nn.CrossEntropyLoss()

def train_step(data):
    ids, masks, labels, adapter_ids = data
    adapter_ids = adapter_ids + torch.randint_like(adapter_ids, low=0, high=n_adapters // n_datasets - 1) * n_datasets
    adapter_ids %= n_adapters
    logits = lora_model(input_ids=ids.to(device), attention_mask=masks.to(device), adapter_ids=adapter_ids.to(device))[0]
    
    loss = loss_fn(logits.view(-1, logits.size(-1)), labels.view(-1).to('cuda'))
    optimizer.zero_grad()
    loss.backward()

    optimizer.step()  
    lr_scheduler.step()

    return loss.item()

In [22]:
from tqdm import tqdm
from time import time
running_loss = None
alpha = 0.95
start = time()

iters = 0

for epoch in range(num_epochs):
    if iters >= 100:
        break
    for i, batch in tqdm(enumerate(dataloader)):
        loss = train_step(batch)
        if not running_loss:
            running_loss = loss
        else:
            running_loss = running_loss * alpha + loss * (1 - alpha)
        if iters % 20 == 19:
            print("AVG TIME:", (time() - start) / iters)
            print("LOSS:", running_loss)
        iters += 1
        if iters >= 100:
            break

21it [00:02,  7.41it/s]

AVG TIME: 0.13420429982637105
LOSS: 5.939401582634258


41it [00:05,  8.03it/s]

AVG TIME: 0.1289974787296393
LOSS: 6.315966139261464


61it [00:07,  8.18it/s]

AVG TIME: 0.12772355241290592
LOSS: 6.213563603353293


81it [00:10,  7.87it/s]

AVG TIME: 0.12702260741704627
LOSS: 5.910232103905649


99it [00:12,  7.89it/s]

AVG TIME: 0.12677563079679854
LOSS: 5.276985791646863





In [23]:
n_adapters = 32
def create_lora_het_small(in_features, out_features, adapter_ids):
    return MultiLoRALayerSTK(in_features, out_features, adapter_ids, ranks=[32] * (n_adapters // 2) + [16] * (n_adapters // 2))

model = GPT2LMHeadModel.from_pretrained(model_id, device_map="auto").to(dtype=torch.bfloat16)
lora_model = LoRAModel(model, target_modules=["c_attn"], lora_factory=create_lora_het_small).cuda()
lora_model.freeze_base_model()

from torch.optim import AdamW
from transformers import get_scheduler

dataloader = DataLoader(dataset, batch_size=8, collate_fn=dataset.collate_fn)

optimizer = AdamW(lora_model.parameters(), lr=2e-4, weight_decay=0)
num_epochs = 1
num_training_steps = num_epochs * len(dataloader)

lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Megablock inference

In [11]:
import gc
from time import time
from tqdm import tqdm
def experiment_megablocks(n_adapters, batch_size):
    gc.collect()
    torch.cuda.empty_cache()

    def create_lora_het_small(in_features, out_features, adapter_ids):
        return MultiLoRALayerSTK(in_features, out_features, adapter_ids, ranks=[32] * (n_adapters // 2) + [16] * (n_adapters // 2))
    
    model = GPT2LMHeadModel.from_pretrained(model_id, device_map="auto").to(dtype=torch.bfloat16)
    lora_model = LoRAModel(model, target_modules=["c_attn"], lora_factory=create_lora_het_small).cuda()
    lora_model.freeze_base_model()
    lora_model.eval()
    dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=dataset.collate_fn)
    start = time()
    for i, batch in tqdm(enumerate(dataloader)):
        ids, masks, labels, adapter_ids = batch
        adapter_ids = adapter_ids + torch.randint_like(adapter_ids, low=0, high=n_adapters // n_datasets - 1) * n_datasets
        adapter_ids %= n_adapters
        lora_model.generate(ids.to('cuda'), adapter_ids=adapter_ids.to('cuda'), max_length=1024)

        print("AVG TIME:", (time() - start) / (i+1))
        if i == 1:
            return (time() - start) / (i+1)
        
def experiment_peft(n_adapters, batch_size):
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from peft import get_peft_model, LoraConfig, PeftModel
    import torch

    gc.collect()
    torch.cuda.empty_cache()

    # Configuration
    model_name = "openai-community/gpt2"
    target_modules = ["c_attn"]

    # Load base model & tokenizer
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto").to(dtype=torch.bfloat16)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Create the initial LoRA config
    first_adapter_name = f"adapter_0"
    first_config = LoraConfig(
        r=32,
        lora_alpha=32,
        target_modules=target_modules,
        task_type="CAUSAL_LM"
    )

    # Wrap the model with the first adapter
    model = get_peft_model(model, first_config, adapter_name=first_adapter_name)

    # Add the remaining adapters
    for i in range(1, n_adapters):
        r = 32 if i < n_adapters // 2 else 16
        config = LoraConfig(
            r=r,
            lora_alpha=r,
            target_modules=target_modules,
            task_type="CAUSAL_LM"
        )
        adapter_name = f"adapter_{i}"
        model.add_adapter(adapter_name, config)

    model.eval()
    dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=dataset.collate_fn)
    start = time()
    for i, batch in tqdm(enumerate(dataloader)):
        ids, masks, labels, adapter_ids = batch
        adapter_ids = adapter_ids + torch.randint_like(adapter_ids, low=0, high=n_adapters // n_datasets - 1) * n_datasets
        adapter_ids %= n_adapters

        adapter_names = [f"adapter_{i}" for i in adapter_ids]
        
        model.generate(ids.to('cuda'), max_length=1024, adapter_names=adapter_names)

        print("AVG TIME:", (time() - start) / (i+1))
        if i == 1:
            return (time() - start) / (i+1)

    

In [12]:
results = []
for batch_size in [4, 16, 64]:
    for n_adapters in [6, 18, 66]:
        results.append({
            "method": "megablocks",
            "batch_size": batch_size,
            "n_adapters": n_adapters,
            "time": experiment_megablocks(n_adapters=n_adapters, batch_size=batch_size)
        })
        results.append({
            "method": "peft",
            "batch_size": batch_size,
            "n_adapters": n_adapters,
            "time": experiment_peft(n_adapters=n_adapters, batch_size=batch_size)
        })

0it [00:00, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
1it [00:09,  9.71s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-pad

AVG TIME: 9.710920572280884


1it [00:18, 18.57s/it]

AVG TIME: 9.283583641052246



0it [00:00, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
1it [00:08,  8.48s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


AVG TIME: 8.482093811035156


1it [00:16, 16.54s/it]

AVG TIME: 8.272502422332764



0it [00:00, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
1it [00:08,  8.69s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


AVG TIME: 8.68769884109497


1it [00:17, 17.40s/it]

AVG TIME: 8.701053261756897



0it [00:00, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
1it [00:09,  9.19s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


AVG TIME: 9.188240766525269


1it [00:18, 18.48s/it]

AVG TIME: 9.241745710372925



0it [00:00, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
1it [00:08,  8.67s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


AVG TIME: 8.668257474899292


1it [00:17, 17.59s/it]

AVG TIME: 8.796303272247314



0it [00:00, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
1it [00:09,  9.34s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


AVG TIME: 9.339196681976318


1it [00:18, 18.56s/it]

AVG TIME: 9.27846908569336



0it [00:00, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
1it [00:09,  9.07s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


AVG TIME: 9.068512916564941


1it [00:18, 18.10s/it]

AVG TIME: 9.052245378494263



0it [00:00, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
1it [00:08,  8.22s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


AVG TIME: 8.22230315208435


1it [00:16, 16.74s/it]

AVG TIME: 8.3727947473526



0it [00:00, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
1it [00:09,  9.04s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


AVG TIME: 9.039153337478638


1it [00:17, 17.87s/it]

AVG TIME: 8.934510827064514



0it [00:00, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
1it [00:19, 19.05s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


AVG TIME: 19.05049705505371


1it [00:37, 37.28s/it]

AVG TIME: 18.641412258148193



0it [00:00, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
1it [00:08,  8.88s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


AVG TIME: 8.883249044418335


1it [00:17, 17.71s/it]

AVG TIME: 8.853596448898315



0it [00:00, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
1it [00:21, 21.56s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


AVG TIME: 21.556719303131104


1it [00:47, 47.34s/it]

AVG TIME: 23.672595143318176



0it [00:00, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
1it [00:11, 11.81s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


AVG TIME: 11.815491437911987


1it [00:23, 23.55s/it]

AVG TIME: 11.774531722068787



0it [00:00, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
1it [00:10, 10.40s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


AVG TIME: 10.399140119552612


1it [00:20, 20.86s/it]

AVG TIME: 10.429537892341614



0it [00:00, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
1it [00:11, 11.97s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


AVG TIME: 11.967139720916748


1it [00:23, 23.70s/it]

AVG TIME: 11.852453589439392



0it [00:00, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
1it [00:28, 28.03s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


AVG TIME: 28.035893201828003


1it [00:56, 56.07s/it]

AVG TIME: 28.036820888519287



0it [00:00, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
1it [00:11, 11.61s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


AVG TIME: 11.60667872428894


1it [00:23, 23.22s/it]

AVG TIME: 11.611173510551453



0it [00:00, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
1it [01:03, 63.30s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


AVG TIME: 63.30027890205383


1it [02:06, 126.94s/it]

AVG TIME: 63.46920549869537





In [13]:
import pandas as pd
df = pd.DataFrame(results)
df.to_csv("results_inference.csv")

In [16]:
df.head(10)

Unnamed: 0,method,batch_size,n_adapters,time
0,megablocks,4,6,9.2837
1,peft,4,6,8.272603
2,megablocks,4,18,8.701163
3,peft,4,18,9.241866
4,megablocks,4,66,8.796409
5,peft,4,66,9.278574
6,megablocks,16,6,9.052353
7,peft,16,6,8.372927
8,megablocks,16,18,8.934639
9,peft,16,18,18.641531


In [38]:
import seaborn as sns
viz = df.groupby(['batch_size', 'n_adapters']).apply(
    lambda x: f"{x[x.method=='megablocks']['time'].item():.2f}/{x[x.method=='peft']['time'].item():.2f}"
).reset_index()
viz.pivot(columns='batch_size', index='n_adapters').reset_index()

  viz = df.groupby(['batch_size', 'n_adapters']).apply(


Unnamed: 0_level_0,n_adapters,0,0,0
batch_size,Unnamed: 1_level_1,4,16,64
0,6,9.28/8.27,9.05/8.37,11.77/10.43
1,18,8.70/9.24,8.93/18.64,11.85/28.04
2,66,8.80/9.28,8.85/23.67,11.61/63.47


In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from multilora.benchmarking import MultiAdapterDataset, get_bitext_dataset, get_finetome_dataset, get_guanaco_dataset
N = 1000
model_id = "openai-community/gpt2-medium"
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

In [8]:
from torch.utils.data import DataLoader
dataset = MultiAdapterDataset([get_bitext_dataset(N, tokenizer), get_finetome_dataset(N, tokenizer), get_guanaco_dataset(N, tokenizer)], tokenizer)
n_adapters = 99
n_datasets = 3

dataloader = DataLoader(dataset, batch_size=8, collate_fn=dataset.collate_fn)

dataloaderbig = DataLoader(dataset, batch_size=16, collate_fn=dataset.collate_fn)

Map: 100%|██████████| 1000/1000 [00:00<00:00, 4450.78 examples/s]


In [13]:
from time import time
from tqdm import trange
import torch

d_iter = iter(dataloader)
start = time()

for i in trange(200):
    batch = next(d_iter)
    ids, masks, labels, adapter_ids = batch
    with torch.no_grad():
        _ = model(input_ids = ids.to('cuda'))

print(time() - start)

100%|██████████| 200/200 [00:37<00:00,  5.37it/s]

37.276371717453





In [14]:
from time import time
from tqdm import trange

d_iter = iter(dataloaderbig)
start = time()

for i in trange(100):
    batch = next(d_iter)
    ids, masks, labels, adapter_ids = batch
    with torch.no_grad():
        _ = model(input_ids = ids.to('cuda'))

print(time() - start)

100%|██████████| 100/100 [00:36<00:00,  2.71it/s]

36.87265968322754



