In [1]:
# !yes y |pip uninstall torch torchvision
# !yes y | pip install --pre torch -f https://download.pytorch.org/whl/nightly/cu101/torch_nightly.html


# Fine-tune Bert cola

Tutorial : https://huggingface.co/docs/transformers/training

In [2]:
from transformers import BertTokenizer, BertModel,BertForSequenceClassification
tokenizer = BertTokenizer.from_pretrained('pmthangk09/bert-base-uncased-glue-cola')
model = BertForSequenceClassification.from_pretrained("pmthangk09/bert-base-uncased-glue-cola")
text = "The inspector analyzed the soundness in the building."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/678 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset("glue","cola")
tokenizer = BertTokenizer.from_pretrained('pmthangk09/bert-base-uncased-glue-cola')

def tokenize_function(examples):
    return tokenizer(examples["sentence"], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
print(tokenized_datasets)
tokenized_datasets = tokenized_datasets.remove_columns(["sentence","idx"])
tokenized_datasets = tokenized_datasets.rename_column("label","labels")
tokenized_datasets.set_format("torch")

data_collator = DataCollatorWithPadding(tokenizer)



Map:   0%|          | 0/8551 [00:00<?, ? examples/s]

Map:   0%|          | 0/1043 [00:00<?, ? examples/s]

Map:   0%|          | 0/1063 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1063
    })
})


In [4]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle = True, batch_size=8, collate_fn = data_collator
)
train2_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=8, collate_fn = data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=8, collate_fn = data_collator
)


In [5]:
from transformers import AdamW
optimizer = AdamW(model.parameters(),lr=5e-5)




In [6]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)


cuda


In [7]:
from evaluate import load
import time
import os
metric = load("glue",config_name="mrpc")

model.to(device)
model.eval()

start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # encode_input = {}

    # for i in ['input_ids', 'token_type_ids', 'attention_mask']:
    #     encode_input[i] = batch[i].to(device)
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
res = metric.compute()
res[f"{device} time"] = end-start
res


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

{'accuracy': 0.8283796740172579,
 'f1': 0.8844415752098128,
 'cuda time': 1.8768351078033447}

In [8]:
from evaluate import load
import time
import os
metric = load("glue",config_name="mrpc")

device = torch.device("cpu")
model.to(device)
model.eval()

start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
res[f"cpu time"] = end-start

res

{'accuracy': 0.8283796740172579,
 'f1': 0.8844415752098128,
 'cuda time': 1.8768351078033447,
 'cpu time': 11.699313879013062}

In [9]:
import pickle
# with open("./models/bert_cola.pkl", "wb") as f:
#     pickle.dump(model, f)


In [10]:
import torch
def get_model_size(model):
    param_size = sum(p.numel() * p.element_size() for p in model.parameters())
    buffer_size = sum(b.numel() * b.element_size() for b in model.buffers())
    total_size = param_size + buffer_size  # Total size in bytes
    return total_size / (1024 ** 2)  # Convert to MB

size_in_mb = get_model_size(model)
res["size"] = size_in_mb
res

{'accuracy': 0.8283796740172579,
 'f1': 0.8844415752098128,
 'cuda time': 1.8768351078033447,
 'cpu time': 11.699313879013062,
 'size': 417.65528106689453}

In [11]:
import json
with open("results/bert_cola.json", "w") as json_file:
    json.dump(res, json_file, indent=4)


### Quantization
Tutorial: https://pytorch.org/tutorials/recipes/quantization.html

#### dynamic quantization

In [12]:
model = BertForSequenceClassification.from_pretrained("pmthangk09/bert-base-uncased-glue-cola")

device = "cpu"
model_dynamic_quantized_int8 = torch.quantization.quantize_dynamic(
    model, qconfig_spec={torch.nn.Linear}, dtype=torch.qint8
)

In [13]:
device = "cpu"
model_dynamic_quantized_int8.to(device)

model_dynamic_quantized_int8.eval()
model_dynamic_quantized_int8.to(device)

start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():

        outputs = model_dynamic_quantized_int8(**batch)
        # print(outputs)
        # break
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
res2 = metric.compute()
res2["cpu time"] = end - start
res2

{'accuracy': 0.824065196548418,
 'f1': 0.881498224087827,
 'cpu time': 7.190145492553711}

In [14]:
res2["cuda time"] = None
size_in_mb = get_model_size(model_dynamic_quantized_int8)
res2["size"] = size_in_mb
res2

{'accuracy': 0.824065196548418,
 'f1': 0.881498224087827,
 'cpu time': 7.190145492553711,
 'cuda time': None,
 'size': 91.080078125}

In [15]:
import json
with open("results/bert_cola_dynamic_qint8.json", "w") as json_file:
    json.dump(res2, json_file, indent=4)
# torch.save(model_dynamic_quantized, "./models/bert_cola_dynamic_qint8")

# with open("./models/bert_int8.pkl", "wb") as f:
#     pickle.dump(model, f)

In [16]:

device = "cpu"
model_dynamic_quantized_float16 = torch.quantization.quantize_dynamic(
    model, qconfig_spec={torch.nn.Linear}, dtype=torch.float16
)

In [17]:
device = "cpu"
model_dynamic_quantized_float16.to(device)

model_dynamic_quantized_float16.eval()
start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():

        outputs = model_dynamic_quantized_float16(**batch)
        # print(outputs)
        # break
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
res2 = metric.compute()
res2["cpu time"] = end - start
res2

{'accuracy': 0.8283796740172579,
 'f1': 0.8844415752098128,
 'cpu time': 11.00797176361084}

In [18]:
res2["cuda time"] = None
size_in_mb = get_model_size(model_dynamic_quantized_float16)
res2["size"] = size_in_mb
res2

{'accuracy': 0.8283796740172579,
 'f1': 0.8844415752098128,
 'cpu time': 11.00797176361084,
 'cuda time': None,
 'size': 91.080078125}

In [19]:
import json
with open("results/bert_cola_dynamic_float16.json", "w") as json_file:
    json.dump(res2, json_file, indent=4)
# torch.save(model_dynamic_quantized, "./models/bert_cola_dynamic_qint8")

# with open("./models/bert_float16.pkl", "wb") as f:
#     pickle.dump(model, f)

### Model Prunning
Tutorial: https://pytorch.org/tutorials/intermediate/pruning_tutorial.html

##### L1-Norm Unstructure Prunning

In [20]:
import torch.nn.utils.prune as prune
model_prun_unstructure = BertForSequenceClassification.from_pretrained("pmthangk09/bert-base-uncased-glue-cola")
# model.bert.embeddings

In [21]:
from evaluate import load
import time
import os

prun_data = {"percent":[],"f1":[],"cuda time":[],"cpu time":[],"accuracy":[],"type":[],}
for i in range(1,10):
    print(f"prun percent {i*10}%")
    metric = load("glue",config_name="mrpc")

    model_prun_unstructure = BertForSequenceClassification.from_pretrained("pmthangk09/bert-base-uncased-glue-cola")

    for layer_idx in range(12):
        # Access attention layers (query, key, value)
        amt = i/10
        prune.l1_unstructured(model_prun_unstructure.bert.encoder.layer[layer_idx].attention.self.query, name="weight", amount=amt)
        prune.l1_unstructured(model_prun_unstructure.bert.encoder.layer[layer_idx].attention.self.key, name="weight", amount=amt)
        prune.l1_unstructured(model_prun_unstructure.bert.encoder.layer[layer_idx].attention.self.value, name="weight", amount=amt)
        
        # Access feed-forward layers (intermediate dense layer)
        prune.l1_unstructured(model_prun_unstructure.bert.encoder.layer[layer_idx].intermediate.dense, name="weight", amount=amt)
        
        # Optionally, prune the output dense layer (if desired)
        prune.l1_unstructured(model_prun_unstructure.bert.encoder.layer[layer_idx].output.dense, name="weight", amount=amt)
    
    
        # Access attention layers (query, key, value)
        prune.remove(model_prun_unstructure.bert.encoder.layer[layer_idx].attention.self.query, name="weight")
        prune.remove(model_prun_unstructure.bert.encoder.layer[layer_idx].attention.self.key, name="weight")
        prune.remove(model_prun_unstructure.bert.encoder.layer[layer_idx].attention.self.value, name="weight")
        
        # Access feed-forward layers (intermediate dense layer)
        prune.remove(model_prun_unstructure.bert.encoder.layer[layer_idx].intermediate.dense, name="weight")
        
        # Optionally, prune the output dense layer (if desired)
        prune.remove(model_prun_unstructure.bert.encoder.layer[layer_idx].output.dense, name="weight")

    
    device = "cuda"
    model_prun_unstructure.to(device)
    
    model_prun_unstructure.eval()
    start = time.time()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
    
            outputs = model_prun_unstructure(**batch)
            
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
    end = time.time()
    metric_res = metric.compute()
    print(metric_res)
    prun_data["cuda time"].append(end - start)
    prun_data["f1"].append(metric_res["f1"])
    prun_data["accuracy"].append(metric_res["accuracy"])
    prun_data["type"].append("unstructure")
    prun_data["percent"].append(i*10)


    device = "cpu"
    model_prun_unstructure.to(device)
    
    model_prun_unstructure.eval()
    start = time.time()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
    
            outputs = model_prun_unstructure(**batch)
            
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
    end = time.time()
    prun_data["cpu time"].append(end - start)

prun_data



prun percent 10%
{'accuracy': 0.8293384467881112, 'f1': 0.8853092783505154}
prun percent 20%
{'accuracy': 0.822627037392138, 'f1': 0.880722114764668}
prun percent 30%
{'accuracy': 0.8293384467881112, 'f1': 0.8850129198966409}
prun percent 40%
{'accuracy': 0.8216682646212847, 'f1': 0.8785900783289817}
prun percent 50%
{'accuracy': 0.7267497603068073, 'f1': 0.7806004618937644}
prun percent 60%
{'accuracy': 0.436241610738255, 'f1': 0.3495575221238938}
prun percent 70%
{'accuracy': 0.3700862895493768, 'f1': 0.2351571594877765}
prun percent 80%
{'accuracy': 0.40651965484180247, 'f1': 0.3913470993117011}
prun percent 90%
{'accuracy': 0.6883988494726749, 'f1': 0.8154457694491766}


{'percent': [10, 20, 30, 40, 50, 60, 70, 80, 90],
 'f1': [0.8853092783505154,
  0.880722114764668,
  0.8850129198966409,
  0.8785900783289817,
  0.7806004618937644,
  0.3495575221238938,
  0.2351571594877765,
  0.3913470993117011,
  0.8154457694491766],
 'cuda time': [1.6929144859313965,
  2.6742303371429443,
  2.776714324951172,
  2.7129549980163574,
  2.696964740753174,
  2.504974842071533,
  2.6727609634399414,
  2.817477226257324,
  3.0889394283294678],
 'cpu time': [11.906315803527832,
  13.240327596664429,
  12.857508897781372,
  13.130394220352173,
  13.263718605041504,
  12.832561731338501,
  13.317347049713135,
  13.237804889678955,
  18.374518156051636],
 'accuracy': [0.8293384467881112,
  0.822627037392138,
  0.8293384467881112,
  0.8216682646212847,
  0.7267497603068073,
  0.436241610738255,
  0.3700862895493768,
  0.40651965484180247,
  0.6883988494726749],
 'type': ['unstructure',
  'unstructure',
  'unstructure',
  'unstructure',
  'unstructure',
  'unstructure',
  'unst

In [22]:
import json
with open("results/bert_cola_prun_unstructure.json", "w") as json_file:
    json.dump(prun_data, json_file, indent=4)
# torch.save(model_dynamic_quantized, "./models/bert_cola_dynamic_qint8")


##### Prun structure 


In [23]:

prun_data = {"percent":[],"f1":[],"cuda time":[],"cpu time":[],"accuracy":[],"type":[],}
for i in range(1,10):
    print(f"prun percent {i*10}%")
    metric = load("glue",config_name="mrpc")

    model_prun_structure = BertForSequenceClassification.from_pretrained("pmthangk09/bert-base-uncased-glue-cola")
    amt = i/10

    for layer_idx in range(12):
        # Access attention layers (query, key, value)
        prune.ln_structured(model_prun_structure.bert.encoder.layer[layer_idx].attention.self.query, name="weight", amount=amt,n=2,dim=0)
        prune.ln_structured(model_prun_structure.bert.encoder.layer[layer_idx].attention.self.key, name="weight", amount=amt,n=2,dim=0)
        prune.ln_structured(model_prun_structure.bert.encoder.layer[layer_idx].attention.self.value, name="weight", amount=amt,n=2,dim=0)
        
        # Access feed-forward layers (intermediate dense layer)
        prune.ln_structured(model_prun_structure.bert.encoder.layer[layer_idx].intermediate.dense, name="weight", amount=amt,n=2,dim=0)
        
        # Optionally, prune the output dense layer (if desired)
        prune.ln_structured(model_prun_structure.bert.encoder.layer[layer_idx].output.dense, name="weight", amount=amt,n=2,dim=0)
    
    
        # Access attention layers (query, key, value)
        prune.remove(model_prun_structure.bert.encoder.layer[layer_idx].attention.self.query, name="weight")
        prune.remove(model_prun_structure.bert.encoder.layer[layer_idx].attention.self.key, name="weight")
        prune.remove(model_prun_structure.bert.encoder.layer[layer_idx].attention.self.value, name="weight")
        
        # Access feed-forward layers (intermediate dense layer)
        prune.remove(model_prun_structure.bert.encoder.layer[layer_idx].intermediate.dense, name="weight")
        
        # Optionally, prune the output dense layer (if desired)
        prune.remove(model_prun_structure.bert.encoder.layer[layer_idx].output.dense, name="weight")

    # print(model_prun_structure.bert.encoder.layer[layer_idx].attention.self.query.weight)
    
    device = "cuda"
    model_prun_structure.to(device)
    
    model_prun_structure.eval()
    start = time.time()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
    
            outputs = model_prun_structure(**batch)
            
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
    end = time.time()
    metric_res = metric.compute()
    print(metric_res)
    prun_data["cuda time"].append(end - start)
    prun_data["f1"].append(metric_res["f1"])
    prun_data["accuracy"].append(metric_res["accuracy"])
    prun_data["type"].append("ln_structure")
    prun_data["percent"].append(i*10)


    device = "cpu"
    model_prun_unstructure.to(device)
    
    model_prun_unstructure.eval()
    start = time.time()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
    
            outputs = model_prun_unstructure(**batch)
            
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
    end = time.time()
    prun_data["cpu time"].append(end - start)

prun_data

prun percent 10%
{'accuracy': 0.6903163950143816, 'f1': 0.8167895632444696}
prun percent 20%
{'accuracy': 0.6653883029721956, 'f1': 0.7967384973791497}
prun percent 30%
{'accuracy': 0.6903163950143816, 'f1': 0.816372939169983}
prun percent 40%
{'accuracy': 0.6912751677852349, 'f1': 0.8174603174603174}
prun percent 50%
{'accuracy': 0.6912751677852349, 'f1': 0.8174603174603174}
prun percent 60%
{'accuracy': 0.6903163950143816, 'f1': 0.8167895632444696}
prun percent 70%
{'accuracy': 0.6912751677852349, 'f1': 0.8174603174603174}
prun percent 80%
{'accuracy': 0.6912751677852349, 'f1': 0.8174603174603174}
prun percent 90%
{'accuracy': 0.6912751677852349, 'f1': 0.8174603174603174}


{'percent': [10, 20, 30, 40, 50, 60, 70, 80, 90],
 'f1': [0.8167895632444696,
  0.7967384973791497,
  0.816372939169983,
  0.8174603174603174,
  0.8174603174603174,
  0.8167895632444696,
  0.8174603174603174,
  0.8174603174603174,
  0.8174603174603174],
 'cuda time': [4.134546518325806,
  2.836993455886841,
  2.8719277381896973,
  2.8896429538726807,
  2.6702089309692383,
  4.033512353897095,
  2.570136070251465,
  2.840125560760498,
  2.6193416118621826],
 'cpu time': [16.202006340026855,
  17.76843285560608,
  15.903268575668335,
  15.23781418800354,
  14.755957841873169,
  15.233280420303345,
  13.369174242019653,
  13.28218388557434,
  13.610204935073853],
 'accuracy': [0.6903163950143816,
  0.6653883029721956,
  0.6903163950143816,
  0.6912751677852349,
  0.6912751677852349,
  0.6903163950143816,
  0.6912751677852349,
  0.6912751677852349,
  0.6912751677852349],
 'type': ['ln_structure',
  'ln_structure',
  'ln_structure',
  'ln_structure',
  'ln_structure',
  'ln_structure',
  'l

In [1]:
# prun_data = {'percent': [10, 20, 30, 40, 50, 60, 70, 80, 90],
#  'f1': [0.8167895632444696,
#   0.7967384973791497,
#   0.816372939169983,
#   0.8174603174603174,
#   0.8174603174603174,
#   0.8167895632444696,
#   0.8174603174603174,
#   0.8174603174603174,
#   0.8174603174603174],
#  'cuda time': [4.134546518325806,
#   2.836993455886841,
#   2.8719277381896973,
#   2.8896429538726807,
#   2.6702089309692383,
#   4.033512353897095,
#   2.570136070251465,
#   2.840125560760498,
#   2.6193416118621826],
#  'cpu time': [16.202006340026855,
#   17.76843285560608,
#   15.903268575668335,
#   15.23781418800354,
#   14.755957841873169,
#   15.233280420303345,
#   13.369174242019653,
#   13.28218388557434,
#   13.610204935073853],
#  'accuracy': [0.6903163950143816,
#   0.6653883029721956,
#   0.6903163950143816,
#   0.6912751677852349,
#   0.6912751677852349,
#   0.6903163950143816,
#   0.6912751677852349,
#   0.6912751677852349,
#   0.6912751677852349],
#  'type': ['ln_structure',
#   'ln_structure',
#   'ln_structure',
#   'ln_structure',
#   'ln_structure',
#   'ln_structure',
#   'ln_structure',
#   'ln_structure',
#   'ln_structure']}


In [2]:
import json
with open("results/bert_cola_prun_structure.json", "w") as json_file:
    json.dump(prun_data, json_file, indent=4)
# torch.save(model_dynamic_quantized, "./models/bert_cola_dynamic_qint8")


In [25]:
# prun_data

### Flash Attention 

In [28]:
import time
import torch
from evaluate import load

model_sdpa = BertForSequenceClassification.from_pretrained("pmthangk09/bert-base-uncased-glue-cola" ,attn_implementation="sdpa")
metric = load("glue","mrpc")

device = "cpu"
model_sdpa.to(device)

model_sdpa.eval()
start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # with torch.no_grad():
    with torch.inference_mode():
        # raise error if no optimized kernel is available
        with torch.backends.cuda.sdp_kernel(
            enable_flash=True, enable_math=True, enable_mem_efficient=True
        ):
            outputs = model_sdpa(**batch)
        # print(outputs)
        # break
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
res2 = metric.compute()
res2["cpu time"] = end - start
res2



{'accuracy': 0.8283796740172579,
 'f1': 0.8844415752098128,
 'cpu time': 13.959874153137207}

In [29]:

device = "cuda"
metric = load("glue","cola")

model_sdpa.to(device)

model_sdpa.eval()
start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # with torch.no_grad():
    with torch.inference_mode():
        # raise error if no optimized kernel is available
        with torch.backends.cuda.sdp_kernel(
            enable_flash=True, enable_math=True, enable_mem_efficient=True
        ):
            outputs = model_sdpa(**batch)
        # print(outputs)
        # break
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
# res2 = metric.compute()
res2["cuda time"] = end - start
res2
size_in_mb = get_model_size(model_sdpa)
res2["size"] = size_in_mb
res2

  self.gen = func(*args, **kwds)


{'accuracy': 0.8283796740172579,
 'f1': 0.8844415752098128,
 'cpu time': 13.959874153137207,
 'cuda time': 2.6849753856658936,
 'size': 417.65528106689453}

In [30]:
import json
import pickle
with open("results/bert_cola_sdpa.json", "w") as json_file:
    json.dump(res2, json_file, indent=4)
# torch.save(model_dynamic_quantized, "./models/bert_cola_dynamic_qint8")

# with open("./models/bert_sdpa.pkl", "wb") as f:
#     pickle.dump(model_sdpa, f)

In [31]:
model_eager = BertForSequenceClassification.from_pretrained("pmthangk09/bert-base-uncased-glue-cola" ,attn_implementation="eager")

device = "cpu"
model_eager.to(device)
metric = load("glue","mrpc")

model_eager.eval()
start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # with torch.no_grad():
    with torch.inference_mode():
        # raise error if no optimized kernel is available
        with torch.backends.cuda.sdp_kernel(
            enable_flash=True, enable_math=True, enable_mem_efficient=True
        ):
            outputs = model_eager(**batch)
        # print(outputs)
        # break
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
res2 = metric.compute()
res2["cpu time"] = end - start
res2



{'accuracy': 0.8283796740172579,
 'f1': 0.8844415752098128,
 'cpu time': 14.042388916015625}

In [32]:

device = "cuda"
model_eager.to(device)
metric = load("glue","cola")

model_eager.eval()
start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # with torch.no_grad():
    with torch.inference_mode():
        # raise error if no optimized kernel is available
        with torch.backends.cuda.sdp_kernel(
            enable_flash=True, enable_math=True, enable_mem_efficient=True
        ):
            outputs = model_eager(**batch)
        # print(outputs)
        # break
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
# res2 = metric.compute()
res2["cuda time"] = end - start
res2


  self.gen = func(*args, **kwds)


{'accuracy': 0.8283796740172579,
 'f1': 0.8844415752098128,
 'cpu time': 14.042388916015625,
 'cuda time': 2.837186574935913}

In [33]:
import json
with open("results/bert_cola_eager.json", "w") as json_file:
    json.dump(res2, json_file, indent=4)
# torch.save(model_dynamic_quantized, "./models/bert_cola_dynamic_qint8")

# with open("./models/bert_eager.pkl", "wb") as f:
#     pickle.dump(model_eager, f)