In [1]:
# !yes y |pip uninstall torch torchvision
# !yes y | pip install --pre torch -f https://download.pytorch.org/whl/nightly/cu101/torch_nightly.html


# Fine-tune Bert MRPC

Tutorial : https://huggingface.co/docs/transformers/training

In [2]:
from transformers import BertTokenizer, BertModel,BertForSequenceClassification
tokenizer = BertTokenizer.from_pretrained('Intel/bert-base-uncased-mrpc')
model = BertForSequenceClassification.from_pretrained("Intel/bert-base-uncased-mrpc")
text = "The inspector analyzed the soundness in the building."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)



In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset("glue","mrpc")
tokenizer = BertTokenizer.from_pretrained('Intel/bert-base-uncased-mrpc')

def tokenize_function(examples):
    return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
print(tokenized_datasets)
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1","sentence2","idx"])
tokenized_datasets = tokenized_datasets.rename_column("label","labels")
tokenized_datasets.set_format("torch")

data_collator = DataCollatorWithPadding(tokenizer)



DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})


In [4]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle = True, batch_size=8, collate_fn = data_collator
)
train2_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=8, collate_fn = data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["test"], batch_size=8, collate_fn = data_collator
)


In [5]:
from transformers import AdamW
optimizer = AdamW(model.parameters(),lr=5e-5)




In [6]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)


cuda


In [7]:
from evaluate import load
import time
import os
metric = load("glue",config_name="mrpc")

model.to(device)
model.eval()

start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # encode_input = {}

    # for i in ['input_ids', 'token_type_ids', 'attention_mask']:
    #     encode_input[i] = batch[i].to(device)
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
res = metric.compute()
res[f"{device} time"] = end-start
res


{'accuracy': 0.8307246376811595,
 'f1': 0.87943848059455,
 'cuda time': 5.3341896533966064}

In [8]:
from evaluate import load
import time
import os
metric = load("glue",config_name="mrpc")

device = torch.device("cpu")
model.to(device)
model.eval()

start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
res[f"cpu time"] = end-start

res

{'accuracy': 0.8307246376811595,
 'f1': 0.87943848059455,
 'cuda time': 5.3341896533966064,
 'cpu time': 64.74835252761841}

In [9]:
import pickle
# with open("./models/bert_mrpc.pkl", "wb") as f:
#     pickle.dump(model, f)


In [10]:
import torch
def get_model_size(model):
    param_size = sum(p.numel() * p.element_size() for p in model.parameters())
    buffer_size = sum(b.numel() * b.element_size() for b in model.buffers())
    total_size = param_size + buffer_size  # Total size in bytes
    return total_size / (1024 ** 2)  # Convert to MB

size_in_mb = get_model_size(model)
res["size"] = size_in_mb
res

{'accuracy': 0.8307246376811595,
 'f1': 0.87943848059455,
 'cuda time': 5.3341896533966064,
 'cpu time': 64.74835252761841,
 'size': 417.65528106689453}

In [11]:
import json
with open("results/bert_mrpc.json", "w") as json_file:
    json.dump(res, json_file, indent=4)


### Quantization
Tutorial: https://pytorch.org/tutorials/recipes/quantization.html

#### dynamic quantization

In [12]:
model = BertForSequenceClassification.from_pretrained("Intel/bert-base-uncased-mrpc")

device = "cpu"
model_dynamic_quantized_int8 = torch.quantization.quantize_dynamic(
    model, qconfig_spec={torch.nn.Linear}, dtype=torch.qint8
)

In [13]:
device = "cpu"
model_dynamic_quantized_int8.to(device)

model_dynamic_quantized_int8.eval()
model_dynamic_quantized_int8.to(device)

start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():

        outputs = model_dynamic_quantized_int8(**batch)
        # print(outputs)
        # break
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
res2 = metric.compute()
res2["cpu time"] = end - start
res2

{'accuracy': 0.8234782608695652,
 'f1': 0.874510611992582,
 'cpu time': 41.94166564941406}

In [14]:
res2["cuda time"] = None
size_in_mb = get_model_size(model_dynamic_quantized_int8)
res2["size"] = size_in_mb
res2

{'accuracy': 0.8234782608695652,
 'f1': 0.874510611992582,
 'cpu time': 41.94166564941406,
 'cuda time': None,
 'size': 91.080078125}

In [15]:
import json
with open("results/bert_mrpc_dynamic_qint8.json", "w") as json_file:
    json.dump(res2, json_file, indent=4)
# torch.save(model_dynamic_quantized, "./models/bert_mrpc_dynamic_qint8")

# with open("./models/bert_int8.pkl", "wb") as f:
#     pickle.dump(model, f)

In [16]:

device = "cpu"
model_dynamic_quantized_float16 = torch.quantization.quantize_dynamic(
    model, qconfig_spec={torch.nn.Linear}, dtype=torch.float16
)

In [17]:
device = "cpu"
model_dynamic_quantized_float16.to(device)

model_dynamic_quantized_float16.eval()
start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():

        outputs = model_dynamic_quantized_float16(**batch)
        # print(outputs)
        # break
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
res2 = metric.compute()
res2["cpu time"] = end - start
res2

{'accuracy': 0.831304347826087,
 'f1': 0.8798017348203222,
 'cpu time': 66.82392764091492}

In [18]:
res2["cuda time"] = None
size_in_mb = get_model_size(model_dynamic_quantized_float16)
res2["size"] = size_in_mb
res2

{'accuracy': 0.831304347826087,
 'f1': 0.8798017348203222,
 'cpu time': 66.82392764091492,
 'cuda time': None,
 'size': 91.080078125}

In [19]:
import json
with open("results/bert_mrpc_dynamic_float16.json", "w") as json_file:
    json.dump(res2, json_file, indent=4)
# torch.save(model_dynamic_quantized, "./models/bert_mrpc_dynamic_qint8")

# with open("./models/bert_float16.pkl", "wb") as f:
#     pickle.dump(model, f)

### Model Prunning
Tutorial: https://pytorch.org/tutorials/intermediate/pruning_tutorial.html

##### L1-Norm Unstructure Prunning

In [20]:
import torch.nn.utils.prune as prune
model_prun_unstructure = BertForSequenceClassification.from_pretrained("Intel/bert-base-uncased-mrpc")
# model.bert.embeddings

In [21]:
from evaluate import load
import time
import os

prun_data = {"percent":[],"f1":[],"cuda time":[],"cpu time":[],"accuracy":[],"type":[],}
for i in range(1,10):
    print(f"prun percent {i*10}%")
    metric = load("glue",config_name="mrpc")

    model_prun_unstructure = BertForSequenceClassification.from_pretrained("Intel/bert-base-uncased-mrpc")

    for layer_idx in range(12):
        # Access attention layers (query, key, value)
        amt = i/10
        prune.l1_unstructured(model_prun_unstructure.bert.encoder.layer[layer_idx].attention.self.query, name="weight", amount=amt)
        prune.l1_unstructured(model_prun_unstructure.bert.encoder.layer[layer_idx].attention.self.key, name="weight", amount=amt)
        prune.l1_unstructured(model_prun_unstructure.bert.encoder.layer[layer_idx].attention.self.value, name="weight", amount=amt)
        
        # Access feed-forward layers (intermediate dense layer)
        prune.l1_unstructured(model_prun_unstructure.bert.encoder.layer[layer_idx].intermediate.dense, name="weight", amount=amt)
        
        # Optionally, prune the output dense layer (if desired)
        prune.l1_unstructured(model_prun_unstructure.bert.encoder.layer[layer_idx].output.dense, name="weight", amount=amt)
    
    
        # Access attention layers (query, key, value)
        prune.remove(model_prun_unstructure.bert.encoder.layer[layer_idx].attention.self.query, name="weight")
        prune.remove(model_prun_unstructure.bert.encoder.layer[layer_idx].attention.self.key, name="weight")
        prune.remove(model_prun_unstructure.bert.encoder.layer[layer_idx].attention.self.value, name="weight")
        
        # Access feed-forward layers (intermediate dense layer)
        prune.remove(model_prun_unstructure.bert.encoder.layer[layer_idx].intermediate.dense, name="weight")
        
        # Optionally, prune the output dense layer (if desired)
        prune.remove(model_prun_unstructure.bert.encoder.layer[layer_idx].output.dense, name="weight")

    
    device = "cuda"
    model_prun_unstructure.to(device)
    
    model_prun_unstructure.eval()
    start = time.time()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
    
            outputs = model_prun_unstructure(**batch)
            
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
    end = time.time()
    metric_res = metric.compute()
    print(metric_res)
    prun_data["cuda time"].append(end - start)
    prun_data["f1"].append(metric_res["f1"])
    prun_data["accuracy"].append(metric_res["accuracy"])
    prun_data["type"].append("unstructure")
    prun_data["percent"].append(i*10)


    device = "cpu"
    model_prun_unstructure.to(device)
    
    model_prun_unstructure.eval()
    start = time.time()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
    
            outputs = model_prun_unstructure(**batch)
            
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
    end = time.time()
    prun_data["cpu time"].append(end - start)

prun_data



prun percent 10%
{'accuracy': 0.8284057971014492, 'f1': 0.8777869529314616}
prun percent 20%
{'accuracy': 0.8301449275362319, 'f1': 0.8797702092736972}
prun percent 30%
{'accuracy': 0.8330434782608696, 'f1': 0.8795986622073578}
prun percent 40%
{'accuracy': 0.8249275362318841, 'f1': 0.8743760399334443}
prun percent 50%
{'accuracy': 0.7582608695652174, 'f1': 0.8003829583532791}
prun percent 60%
{'accuracy': 0.3408695652173913, 'f1': 0.01728608470181504}
prun percent 70%
{'accuracy': 0.33507246376811595, 'f1': 0.0}
prun percent 80%
{'accuracy': 0.3356521739130435, 'f1': 0.0017421602787456446}
prun percent 90%
{'accuracy': 0.6608695652173913, 'f1': 0.7953830010493179}


{'percent': [10, 20, 30, 40, 50, 60, 70, 80, 90],
 'f1': [0.8777869529314616,
  0.8797702092736972,
  0.8795986622073578,
  0.8743760399334443,
  0.8003829583532791,
  0.01728608470181504,
  0.0,
  0.0017421602787456446,
  0.7953830010493179],
 'cuda time': [4.745062589645386,
  5.883496284484863,
  5.7819695472717285,
  5.738783836364746,
  5.856553077697754,
  5.715821743011475,
  5.7710862159729,
  5.618096590042114,
  5.6026082038879395],
 'cpu time': [69.57550406455994,
  65.29263353347778,
  66.0870943069458,
  65.65327739715576,
  66.06130146980286,
  65.49623966217041,
  66.73524117469788,
  65.88913536071777,
  65.4354636669159],
 'accuracy': [0.8284057971014492,
  0.8301449275362319,
  0.8330434782608696,
  0.8249275362318841,
  0.7582608695652174,
  0.3408695652173913,
  0.33507246376811595,
  0.3356521739130435,
  0.6608695652173913],
 'type': ['unstructure',
  'unstructure',
  'unstructure',
  'unstructure',
  'unstructure',
  'unstructure',
  'unstructure',
  'unstructure

In [22]:
import json
with open("results/bert_mrpc_prun_unstructure.json", "w") as json_file:
    json.dump(prun_data, json_file, indent=4)
# torch.save(model_dynamic_quantized, "./models/bert_mrpc_dynamic_qint8")


##### Prun structure 


In [23]:

prun_data = {"percent":[],"f1":[],"cuda time":[],"cpu time":[],"accuracy":[],"type":[],}
for i in range(1,10):
    print(f"prun percent {i*10}%")
    metric = load("glue",config_name="mrpc")

    model_prun_structure = BertForSequenceClassification.from_pretrained("Intel/bert-base-uncased-mrpc")
    amt = i/10

    for layer_idx in range(12):
        # Access attention layers (query, key, value)
        prune.ln_structured(model_prun_structure.bert.encoder.layer[layer_idx].attention.self.query, name="weight", amount=amt,n=2,dim=0)
        prune.ln_structured(model_prun_structure.bert.encoder.layer[layer_idx].attention.self.key, name="weight", amount=amt,n=2,dim=0)
        prune.ln_structured(model_prun_structure.bert.encoder.layer[layer_idx].attention.self.value, name="weight", amount=amt,n=2,dim=0)
        
        # Access feed-forward layers (intermediate dense layer)
        prune.ln_structured(model_prun_structure.bert.encoder.layer[layer_idx].intermediate.dense, name="weight", amount=amt,n=2,dim=0)
        
        # Optionally, prune the output dense layer (if desired)
        prune.ln_structured(model_prun_structure.bert.encoder.layer[layer_idx].output.dense, name="weight", amount=amt,n=2,dim=0)
    
    
        # Access attention layers (query, key, value)
        prune.remove(model_prun_structure.bert.encoder.layer[layer_idx].attention.self.query, name="weight")
        prune.remove(model_prun_structure.bert.encoder.layer[layer_idx].attention.self.key, name="weight")
        prune.remove(model_prun_structure.bert.encoder.layer[layer_idx].attention.self.value, name="weight")
        
        # Access feed-forward layers (intermediate dense layer)
        prune.remove(model_prun_structure.bert.encoder.layer[layer_idx].intermediate.dense, name="weight")
        
        # Optionally, prune the output dense layer (if desired)
        prune.remove(model_prun_structure.bert.encoder.layer[layer_idx].output.dense, name="weight")

    # print(model_prun_structure.bert.encoder.layer[layer_idx].attention.self.query.weight)
    
    device = "cuda"
    model_prun_structure.to(device)
    
    model_prun_structure.eval()
    start = time.time()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
    
            outputs = model_prun_structure(**batch)
            
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
    end = time.time()
    metric_res = metric.compute()
    print(metric_res)
    prun_data["cuda time"].append(end - start)
    prun_data["f1"].append(metric_res["f1"])
    prun_data["accuracy"].append(metric_res["accuracy"])
    prun_data["type"].append("ln_structure")
    prun_data["percent"].append(i*10)


    device = "cpu"
    model_prun_unstructure.to(device)
    
    model_prun_unstructure.eval()
    start = time.time()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
    
            outputs = model_prun_unstructure(**batch)
            
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
    end = time.time()
    prun_data["cpu time"].append(end - start)

prun_data

prun percent 10%
{'accuracy': 0.6707246376811594, 'f1': 0.7916360968451944}
prun percent 20%
{'accuracy': 0.5907246376811595, 'f1': 0.7080231596360629}
prun percent 30%
{'accuracy': 0.5124637681159421, 'f1': 0.5552617662612375}
prun percent 40%
{'accuracy': 0.5669565217391305, 'f1': 0.6633618747183416}
prun percent 50%
{'accuracy': 0.5837681159420289, 'f1': 0.6973018549747049}
prun percent 60%
{'accuracy': 0.6202898550724638, 'f1': 0.7265135699373695}
prun percent 70%
{'accuracy': 0.6655072463768116, 'f1': 0.7990247300592128}
prun percent 80%
{'accuracy': 0.664927536231884, 'f1': 0.7987465181058496}
prun percent 90%
{'accuracy': 0.664927536231884, 'f1': 0.7987465181058496}


{'percent': [10, 20, 30, 40, 50, 60, 70, 80, 90],
 'f1': [0.7916360968451944,
  0.7080231596360629,
  0.5552617662612375,
  0.6633618747183416,
  0.6973018549747049,
  0.7265135699373695,
  0.7990247300592128,
  0.7987465181058496,
  0.7987465181058496],
 'cuda time': [5.699037790298462,
  5.705916404724121,
  5.637936353683472,
  5.716999292373657,
  6.346923828125,
  5.590200901031494,
  5.57944655418396,
  5.609919786453247,
  5.4983556270599365],
 'cpu time': [65.66443920135498,
  65.95729923248291,
  68.50572896003723,
  69.47684454917908,
  66.88780951499939,
  68.19040107727051,
  66.8096559047699,
  66.91059947013855,
  66.54937934875488],
 'accuracy': [0.6707246376811594,
  0.5907246376811595,
  0.5124637681159421,
  0.5669565217391305,
  0.5837681159420289,
  0.6202898550724638,
  0.6655072463768116,
  0.664927536231884,
  0.664927536231884],
 'type': ['ln_structure',
  'ln_structure',
  'ln_structure',
  'ln_structure',
  'ln_structure',
  'ln_structure',
  'ln_structure',
 

In [24]:
import json
with open("results/bert_mprc_prun_structure.json", "w") as json_file:
    json.dump(prun_data, json_file, indent=4)
# torch.save(model_dynamic_quantized, "./models/bert_mrpc_dynamic_qint8")


In [25]:
# prun_data

### Flash Attention 

In [None]:
import time
import torch
from evaluate import load

model_sdpa = BertForSequenceClassification.from_pretrained("Intel/bert-base-uncased-mrpc" ,attn_implementation="sdpa")
metric = load("glue","mrpc")

device = "cpu"
model_sdpa.to(device)

model_sdpa.eval()
start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # with torch.no_grad():
    with torch.inference_mode():
        # raise error if no optimized kernel is available
        with torch.backends.cuda.sdp_kernel(
            enable_flash=True, enable_math=True, enable_mem_efficient=True
        ):
            outputs = model_sdpa(**batch)
        # print(outputs)
        # break
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
res2 = metric.compute()
res2["cpu time"] = end - start
res2



In [None]:

device = "cuda"
metric = load("glue","mrpc")

model_sdpa.to(device)

model_sdpa.eval()
start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # with torch.no_grad():
    with torch.inference_mode():
        # raise error if no optimized kernel is available
        with torch.backends.cuda.sdp_kernel(
            enable_flash=True, enable_math=True, enable_mem_efficient=True
        ):
            outputs = model_sdpa(**batch)
        # print(outputs)
        # break
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
# res2 = metric.compute()
res2["cuda time"] = end - start
res2
size_in_mb = get_model_size(model_qat)
res2["size"] = size_in_mb
res2

In [None]:
import json
import pickle
with open("results/bert_mrpc_sdpa.json", "w") as json_file:
    json.dump(res2, json_file, indent=4)
# torch.save(model_dynamic_quantized, "./models/bert_mrpc_dynamic_qint8")

# with open("./models/bert_sdpa.pkl", "wb") as f:
#     pickle.dump(model_sdpa, f)

In [31]:
model_eager = BertForSequenceClassification.from_pretrained("Intel/bert-base-uncased-mrpc" ,attn_implementation="eager")

device = "cpu"
model_eager.to(device)
metric = load("glue","mrpc")

model_eager.eval()
start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # with torch.no_grad():
    with torch.inference_mode():
        # raise error if no optimized kernel is available
        with torch.backends.cuda.sdp_kernel(
            enable_flash=True, enable_math=True, enable_mem_efficient=True
        ):
            outputs = model_eager(**batch)
        # print(outputs)
        # break
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
res2 = metric.compute()
res2["cpu time"] = end - start
res2



{'accuracy': 0.8307246376811595,
 'f1': 0.87943848059455,
 'cpu time': 67.77916312217712}

In [32]:

device = "cuda"
model_eager.to(device)
metric = load("glue","mrpc")

model_eager.eval()
start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # with torch.no_grad():
    with torch.inference_mode():
        # raise error if no optimized kernel is available
        with torch.backends.cuda.sdp_kernel(
            enable_flash=True, enable_math=True, enable_mem_efficient=True
        ):
            outputs = model_eager(**batch)
        # print(outputs)
        # break
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
# res2 = metric.compute()
res2["cuda time"] = end - start
res2


  self.gen = func(*args, **kwds)


{'accuracy': 0.8307246376811595,
 'f1': 0.87943848059455,
 'cpu time': 67.77916312217712,
 'cuda time': 6.1416239738464355}

In [33]:
import json
with open("results/bert_mrpc_eager.json", "w") as json_file:
    json.dump(res2, json_file, indent=4)
# torch.save(model_dynamic_quantized, "./models/bert_mrpc_dynamic_qint8")

# with open("./models/bert_eager.pkl", "wb") as f:
#     pickle.dump(model_eager, f)