In [1]:
# !yes y |pip uninstall torch torchvision
# !yes y | pip install --pre torch -f https://download.pytorch.org/whl/nightly/cu101/torch_nightly.html


# Fine-tune GPT mrpc

Tutorial : https://huggingface.co/docs/transformers/training

In [1]:
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification,AutoModelForSequenceClassification

# Load tokenizer and GPT2 model with sequence classification head
tokenizer = GPT2Tokenizer.from_pretrained('PavanNeerudu/gpt2-finetuned-mrpc')
model = GPT2ForSequenceClassification.from_pretrained('PavanNeerudu/gpt2-finetuned-mrpc', num_labels=2,torch_dtype="auto") 
text = "The inspector analyzed the soundness in the building."
encoded_input = tokenizer(text, return_tensors='pt')
model.config.pad_token_id = tokenizer.eos_token_id
output = model(**encoded_input)
# output

In [3]:
# from transformers import GPT2Tokenizer, GPT2ForSequenceClassification,AutoModelForSequenceClassification
# from transformers import GPTNeoForCausalLM, GPT2Tokenizer

# model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
# tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
# text = "The inspector analyzed the soundness in the building."
# encoded_input = tokenizer(text, return_tensors='pt')

# output = model(**encoded_input)
# # output

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset("glue","mrpc")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token = tokenizer.eos_token
def tokenize_function(examples):
    # print(examples.keys())
    return tokenizer(examples["sentence1"],examples["sentence2"],truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
print(tokenized_datasets)
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1","sentence2","idx"])
tokenized_datasets = tokenized_datasets.rename_column("label","labels")
tokenized_datasets.set_format("torch")
data_collator = DataCollatorWithPadding(tokenizer)



DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 1725
    })
})


In [5]:
# small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))


In [3]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle = True, batch_size=8, collate_fn = data_collator
)
# train_dataloader = DataLoader(
#     small_train_dataset, shuffle = True, batch_size=8, collate_fn = data_collator
# )
train2_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=8, collate_fn = data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["test"], batch_size=8, collate_fn = data_collator
)


In [4]:
from transformers import AdamW
optimizer = AdamW(model.parameters(),lr=5e-5)




In [5]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)


cuda


In [9]:
# from transformers import get_scheduler

# num_epochs = 3
# num_training_steps = num_epochs * len(train_dataloader)
# lr_scheduler = get_scheduler(
#     "linear",
#     optimizer=optimizer,
#     num_warmup_steps=0,
#     num_training_steps=num_training_steps
# )

In [10]:
# from tqdm.auto import tqdm

# progress_bar = tqdm(range(num_training_steps))
# device = "cpu"
# model.to(device)
# model.train()
# for epoch in range(num_epochs):
#     for batch in train_dataloader:
#         batch = {k: v.to(device) for k, v in batch.items()}
#         outputs = model(**batch)
#         loss = outputs.loss
#         loss.backward()

#         optimizer.step()
#         lr_scheduler.step()
#         optimizer.zero_grad()
#         progress_bar.update(1)

In [6]:
def get_model_size(model):
    param_size = sum(p.numel() * p.element_size() for p in model.parameters())
    buffer_size = sum(b.numel() * b.element_size() for b in model.buffers())
    total_size = param_size + buffer_size  # Total size in bytes
    return total_size / (1024 ** 2)  # Convert to MB

In [12]:
from evaluate import load
import time
import os

cp = 1
gp = 1
metric = load("glue",config_name="mrpc")
res = {}
if cp:
    device = "cpu"
    model.eval()
    # i=0
    model.to(device)

    start = time.time()
    for batch in eval_dataloader:

        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
    
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
    end = time.time()
    cpu_time = end-start
    met = metric.compute()
    res["f1"]=met["f1"]
    res["accuracy"]=met["accuracy"]
else:
    cpu_time = None
    
if gp:
    device = "cuda"
    model.to(device)
    model.eval()
    
    start = time.time()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        encoded_input = {k:batch[k] for k in ['input_ids', 'attention_mask']}
        with torch.no_grad():
            outputs = model(**encoded_input)
            
    
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
    end = time.time()
    cuda_time = end-start
    if not cp:
        met = metric.compute()
        res["f1"]=met["f1"]
        res["accuracy"]=met["accuracy"]
else:
    cuda_time = None
res[f"cpu time"] = cpu_time
res[f"cuda time"] = cuda_time
res[f"size"] = get_model_size(model)

res

{'f1': 0.7591026112541376,
 'accuracy': 0.6202898550724638,
 'cpu time': 118.14539122581482,
 'cuda time': 8.824100017547607,
 'size': 486.7061004638672}

In [7]:
import pickle
# with open("./models/gpt2_mrpc.pkl", "wb") as f:
#     pickle.dump(model, f)


In [14]:
import json
with open("results/gpt2_mrpc.json", "w") as json_file:
    json.dump(res, json_file, indent=4)


### Quantization
Tutorial: https://pytorch.org/tutorials/recipes/quantization.html

#### dynamic quantization

In [15]:
import torch
model = GPT2ForSequenceClassification.from_pretrained('PavanNeerudu/gpt2-finetuned-mrpc', num_labels=2) 

device = "cpu"
model_dynamic_quantized_int8 = torch.quantization.quantize_dynamic(
    model, qconfig_spec={torch.nn.Linear}, dtype=torch.qint8
)

In [16]:
device = "cpu"
model_dynamic_quantized_int8.to(device)

model_dynamic_quantized_int8.eval()
model_dynamic_quantized_int8.to(device)

start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():

        outputs = model_dynamic_quantized_int8(**batch)
        # print(outputs)
        # break
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
res2 = metric.compute()
res2["cpu time"] = end - start
res2

{'accuracy': 0.621159420289855,
 'f1': 0.7599632690541781,
 'cpu time': 111.0577871799469}

In [17]:
res2["cuda time"] = None
size_in_mb = get_model_size(model_dynamic_quantized_int8)
res2["size"] = size_in_mb
res2

{'accuracy': 0.621159420289855,
 'f1': 0.7599632690541781,
 'cpu time': 111.0577871799469,
 'cuda time': None,
 'size': 486.7002410888672}

In [18]:
import json
with open("results/gpt2_mrpc_dynamic_qint8.json", "w") as json_file:
    json.dump(res2, json_file, indent=4)
# torch.save(model_dynamic_quantized, "./models/bert_mrpc_dynamic_qint8")

# with open("./models/gpt2_int8.pkl", "wb") as f:
#     pickle.dump(model, f)

In [19]:
device = "cpu"
model_dynamic_quantized_float16 = torch.quantization.quantize_dynamic(
    model, qconfig_spec={torch.nn.Linear}, dtype=torch.float16
)

In [20]:
device = "cpu"
model_dynamic_quantized_float16.to(device)

model_dynamic_quantized_float16.eval()
start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():

        outputs = model_dynamic_quantized_float16(**batch)
        # print(outputs)
        # break
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
res2 = metric.compute()
res2["cpu time"] = end - start
res2

{'accuracy': 0.6202898550724638,
 'f1': 0.7591026112541376,
 'cpu time': 107.34287166595459}

In [21]:
res2["cuda time"] = None
size_in_mb = get_model_size(model_dynamic_quantized_float16)
res2["size"] = size_in_mb
res2

{'accuracy': 0.6202898550724638,
 'f1': 0.7591026112541376,
 'cpu time': 107.34287166595459,
 'cuda time': None,
 'size': 486.7002410888672}

In [22]:
import json
with open("results/gpt2_mrpc_dynamic_float16.json", "w") as json_file:
    json.dump(res2, json_file, indent=4)
# torch.save(model_dynamic_quantized, "./models/bert_mrpc_dynamic_qint8")

# with open("./models/gpt2_float16.pkl", "wb") as f:
#     pickle.dump(model, f)

### Model Prunning
Tutorial: https://pytorch.org/tutorials/intermediate/pruning_tutorial.html

##### L1-Norm Unstructure Prunning

In [8]:
import torch.nn.utils.prune as prune
model_prun_unstructure = GPT2ForSequenceClassification.from_pretrained('PavanNeerudu/gpt2-finetuned-mrpc', num_labels=2) 

# model.bert.embeddings

In [9]:
model.transformer.h[0].attn.c_attn


Conv1D(nf=2304, nx=768)

In [10]:
model.transformer.h[0].attn.c_attn
model.transformer.h[0].attn.c_proj
model.transformer.h[0].mlp.c_fc
model.transformer.h[0].mlp.c_proj

Conv1D(nf=768, nx=3072)

In [11]:
from evaluate import load
import time
import os
import torch

prun_data = {"percent":[],"f1":[],"cuda time":[],"cpu time":[],"accuracy":[],"type":[],}
for i in range(1,10):
    print(f"prun percent {i*10}%")
    metric = load("glue",config_name="mrpc")

    model_prun_unstructure = GPT2ForSequenceClassification.from_pretrained('PavanNeerudu/gpt2-finetuned-mrpc', num_labels=2) 
    amt = i/10
    prune.l1_unstructured(model_prun_unstructure.transformer.wte, name="weight", amount=amt)
    prune.l1_unstructured(model_prun_unstructure.transformer.wpe, name="weight", amount=amt)

    for layer_idx in range(12):
        prune.l1_unstructured(model_prun_unstructure.transformer.h[layer_idx].attn.c_attn, name="weight", amount=amt)
        prune.l1_unstructured(model_prun_unstructure.transformer.h[layer_idx].attn.c_proj, name="weight", amount=amt)
        prune.l1_unstructured(model_prun_unstructure.transformer.h[layer_idx].mlp.c_fc, name="weight", amount=amt)
        prune.l1_unstructured(model_prun_unstructure.transformer.h[layer_idx].mlp.c_proj, name="weight", amount=amt)
        prune.l1_unstructured(model_prun_unstructure.transformer.h[layer_idx].attn.c_attn, name="bias", amount=amt)
        prune.l1_unstructured(model_prun_unstructure.transformer.h[layer_idx].attn.c_proj, name="bias", amount=amt)
        prune.l1_unstructured(model_prun_unstructure.transformer.h[layer_idx].mlp.c_fc, name="bias", amount=amt)
        prune.l1_unstructured(model_prun_unstructure.transformer.h[layer_idx].mlp.c_proj, name="bias", amount=amt)

        prune.remove(model_prun_unstructure.transformer.h[layer_idx].attn.c_attn, name="weight")
        prune.remove(model_prun_unstructure.transformer.h[layer_idx].attn.c_proj, name="weight")
        prune.remove(model_prun_unstructure.transformer.h[layer_idx].mlp.c_fc, name="weight")
        prune.remove(model_prun_unstructure.transformer.h[layer_idx].mlp.c_proj, name="weight")

    prune.l1_unstructured(model_prun_unstructure.transformer.ln_f, name="weight", amount=amt)

    
    device = "cuda"
    model_prun_unstructure.to(device)
    
    model_prun_unstructure.eval()
    start = time.time()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
    
            outputs = model_prun_unstructure(**batch)
            
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
    end = time.time()
    metric_res = metric.compute()
    print(metric_res)
    prun_data["cuda time"].append(end - start)
    prun_data["f1"].append(metric_res["f1"])
    prun_data["accuracy"].append(metric_res["accuracy"])
    prun_data["type"].append("unstructure")
    prun_data["percent"].append(i*10)


    device = "cpu"
    model_prun_unstructure.to(device)
    
    model_prun_unstructure.eval()
    start = time.time()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
    
            outputs = model_prun_unstructure(**batch)
            
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
    end = time.time()
    prun_data["cpu time"].append(end - start)

prun_data



prun percent 10%
{'accuracy': 0.6255072463768115, 'f1': 0.7626745040411462}
prun percent 20%
{'accuracy': 0.6347826086956522, 'f1': 0.7719044170890659}
prun percent 30%
{'accuracy': 0.6614492753623188, 'f1': 0.7917261055634808}
prun percent 40%
{'accuracy': 0.6823188405797102, 'f1': 0.8007272727272727}
prun percent 50%
{'accuracy': 0.6672463768115942, 'f1': 0.7997208653175157}
prun percent 60%
{'accuracy': 0.5228985507246376, 'f1': 0.5742369374030005}
prun percent 70%
{'accuracy': 0.6643478260869565, 'f1': 0.7976232086682978}
prun percent 80%
{'accuracy': 0.6608695652173913, 'f1': 0.7950963222416813}
prun percent 90%
{'accuracy': 0.664927536231884, 'f1': 0.7987465181058496}


{'percent': [10, 20, 30, 40, 50, 60, 70, 80, 90],
 'f1': [0.7626745040411462,
  0.7719044170890659,
  0.7917261055634808,
  0.8007272727272727,
  0.7997208653175157,
  0.5742369374030005,
  0.7976232086682978,
  0.7950963222416813,
  0.7987465181058496],
 'cuda time': [11.368586778640747,
  12.052385568618774,
  12.01824426651001,
  12.015639781951904,
  11.878888845443726,
  11.771916389465332,
  11.626808166503906,
  11.434561252593994,
  11.254689455032349],
 'cpu time': [78.88730478286743,
  81.01330900192261,
  81.42294549942017,
  85.88344049453735,
  85.4019193649292,
  80.67635560035706,
  80.66290831565857,
  84.56308674812317,
  83.76370739936829],
 'accuracy': [0.6255072463768115,
  0.6347826086956522,
  0.6614492753623188,
  0.6823188405797102,
  0.6672463768115942,
  0.5228985507246376,
  0.6643478260869565,
  0.6608695652173913,
  0.664927536231884],
 'type': ['unstructure',
  'unstructure',
  'unstructure',
  'unstructure',
  'unstructure',
  'unstructure',
  'unstructur

In [12]:
import json
with open("results/gpt2_mrpc_prun_unstructure.json", "w") as json_file:
    json.dump(prun_data, json_file, indent=4)
# torch.save(model_dynamic_quantized, "./models/bert_mrpc_dynamic_qint8")


##### Prun structure 


In [13]:

prun_data = {"percent":[],"f1":[],"cuda time":[],"cpu time":[],"accuracy":[],"type":[],}
for i in range(1,10):
    print(f"prun percent {i*10}%")
    metric = load("glue",config_name="mrpc")

    model_prun_structure = GPT2ForSequenceClassification.from_pretrained('PavanNeerudu/gpt2-finetuned-mrpc', num_labels=2) 
    amt = i/10

    # print(model_prun_structure.bert.encoder.layer[layer_idx].attention.self.query.weight)
    
    prune.ln_structured(model_prun_structure.transformer.wte, name="weight", amount=amt,n=1,dim=0)
    prune.ln_structured(model_prun_structure.transformer.wpe, name="weight", amount=amt,n=1,dim=0)

    for layer_idx in range(12):
        # Access attention layers (query, key, value)
        prune.ln_structured(model_prun_structure.transformer.h[layer_idx].attn.c_attn, name="weight", amount=amt,n=1,dim=0)
        prune.ln_structured(model_prun_structure.transformer.h[layer_idx].attn.c_proj, name="weight", amount=amt,n=1,dim=0)
        prune.ln_structured(model_prun_structure.transformer.h[layer_idx].mlp.c_fc, name="weight", amount=amt,n=1,dim=0)
        prune.ln_structured(model_prun_structure.transformer.h[layer_idx].mlp.c_proj, name="weight", amount=amt,n=1,dim=0)
        # prune.ln_structured(model_prun_unstructure.transformer.h[layer_idx].attn.c_attn, name="bias", amount=amt,n=1,dim=0)
        # prune.ln_structured(model_prun_unstructure.transformer.h[layer_idx].attn.c_proj, name="bias", amount=amt,n=1,dim=0)
        # prune.ln_structured(model_prun_unstructure.transformer.h[layer_idx].mlp.c_fc, name="bias", amount=amt,n=1,dim=0)
        # prune.ln_structured(model_prun_unstructure.transformer.h[layer_idx].mlp.c_proj, name="bias", amount=amt,n=1,dim=0)

        prune.remove(model_prun_structure.transformer.h[layer_idx].attn.c_attn, name="weight")
        prune.remove(model_prun_structure.transformer.h[layer_idx].attn.c_proj, name="weight")
        prune.remove(model_prun_structure.transformer.h[layer_idx].mlp.c_fc, name="weight")
        prune.remove(model_prun_structure.transformer.h[layer_idx].mlp.c_proj, name="weight")

    # prune.ln_structured(model_prun_unstructure.transformer.ln_f, name="weight", amount=amt,n=1,dim=0)

    
    device = "cuda"
    model_prun_structure.to(device)
    
    model_prun_structure.eval()
    start = time.time()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
    
            outputs = model_prun_structure(**batch)
            
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
    end = time.time()
    metric_res = metric.compute()
    print(metric_res)
    prun_data["cuda time"].append(end - start)
    prun_data["f1"].append(metric_res["f1"])
    prun_data["accuracy"].append(metric_res["accuracy"])
    prun_data["type"].append("ln_structure")
    prun_data["percent"].append(i*10)


    device = "cpu"
    model_prun_unstructure.to(device)
    
    model_prun_unstructure.eval()
    start = time.time()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
    
            outputs = model_prun_unstructure(**batch)
            
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
    end = time.time()
    prun_data["cpu time"].append(end - start)

prun_data

prun percent 10%
{'accuracy': 0.6591304347826087, 'f1': 0.7938288920056101}
prun percent 20%
{'accuracy': 0.6382608695652174, 'f1': 0.773090909090909}
prun percent 30%
{'accuracy': 0.6469565217391304, 'f1': 0.7751937984496124}
prun percent 40%
{'accuracy': 0.33507246376811595, 'f1': 0.0}
prun percent 50%
{'accuracy': 0.33507246376811595, 'f1': 0.0}
prun percent 60%
{'accuracy': 0.33507246376811595, 'f1': 0.0}
prun percent 70%
{'accuracy': 0.33507246376811595, 'f1': 0.0}
prun percent 80%
{'accuracy': 0.33507246376811595, 'f1': 0.0}
prun percent 90%
{'accuracy': 0.33507246376811595, 'f1': 0.0}


{'percent': [10, 20, 30, 40, 50, 60, 70, 80, 90],
 'f1': [0.7938288920056101,
  0.773090909090909,
  0.7751937984496124,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 'cuda time': [11.851957559585571,
  11.783653736114502,
  11.686673402786255,
  11.566436052322388,
  11.456430912017822,
  6.491225242614746,
  6.416776180267334,
  6.347260236740112,
  6.346456527709961],
 'cpu time': [87.80204749107361,
  86.68856310844421,
  85.34108471870422,
  85.26702213287354,
  89.34380006790161,
  88.1209614276886,
  84.40968799591064,
  86.49781703948975,
  84.90899467468262],
 'accuracy': [0.6591304347826087,
  0.6382608695652174,
  0.6469565217391304,
  0.33507246376811595,
  0.33507246376811595,
  0.33507246376811595,
  0.33507246376811595,
  0.33507246376811595,
  0.33507246376811595],
 'type': ['ln_structure',
  'ln_structure',
  'ln_structure',
  'ln_structure',
  'ln_structure',
  'ln_structure',
  'ln_structure',
  'ln_structure',
  'ln_structure']}

In [14]:
import json
with open("results/gpt2_mrpc_prun_structure.json", "w") as json_file:
    json.dump(prun_data, json_file, indent=4)
# torch.save(model_dynamic_quantized, "./models/bert_mrpc_dynamic_qint8")


In [None]:
# prun_data

### Flash Attention 

In [15]:
model_sdpa = GPT2ForSequenceClassification.from_pretrained('PavanNeerudu/gpt2-finetuned-mrpc', num_labels=2,attn_implementation="sdpa") 

device = "cpu"
model_sdpa.to(device)

model_sdpa.eval()
start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # with torch.no_grad():
    with torch.inference_mode():
        # raise error if no optimized kernel is available
        with torch.backends.cuda.sdp_kernel(
            enable_flash=True, enable_math=True, enable_mem_efficient=True
        ):
            outputs = model_sdpa(**batch)
        # print(outputs)
        # break
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
res2 = metric.compute()
res2["cpu time"] = end - start


device = "cuda"
model_sdpa.to(device)

model_sdpa.eval()
start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # with torch.no_grad():
    with torch.inference_mode():
        # raise error if no optimized kernel is available
        with torch.backends.cuda.sdp_kernel(
            enable_flash=True, enable_math=True, enable_mem_efficient=True
        ):
            outputs = model_sdpa(**batch)
        # print(outputs)
        # break
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()

res2["cuda time"] = end - start

res2



  self.gen = func(*args, **kwds)
  self.gen = func(*args, **kwds)


{'accuracy': 0.6426086956521739,
 'f1': 0.7794670005365767,
 'cpu time': 81.572429895401,
 'cuda time': 6.185762882232666}

In [16]:
# res2["cuda time"] = None
size_in_mb = get_model_size(model_sdpa)
res2["size"] = size_in_mb
res2

# device = "cuda"
# model_flash_attention.to(device)

# model_flash_attention.eval()
# start = time.time()
# for batch in eval_dataloader:
#     batch = {k: v.to(device) for k, v in batch.items()}
#     # with torch.no_grad():
#     with torch.inference_mode():
#         # raise error if no optimized kernel is available
#         with torch.backends.cuda.sdp_kernel(
#             enable_flash=True, enable_math=True, enable_mem_efficient=True
#         ):
#             outputs = model_flash_attention(**batch)
#         # print(outputs)
#         # break
#     logits = outputs.logits
#     predictions = torch.argmax(logits, dim=-1)
#     metric.add_batch(predictions=predictions, references=batch["labels"])
# end = time.time()
# # res2 = metric.compute()
# res2["cuda time"] = end - start
# res2


{'accuracy': 0.6426086956521739,
 'f1': 0.7794670005365767,
 'cpu time': 81.572429895401,
 'cuda time': 6.185762882232666,
 'size': 486.7061004638672}

In [17]:
import json
with open("results/gpt2_mrpc_sdpa.json", "w") as json_file:
    json.dump(res2, json_file, indent=4)
# torch.save(model_dynamic_quantized, "./models/bert_mrpc_dynamic_qint8")

# with open("./models/bert_sdpa.pkl", "wb") as f:
#     pickle.dump(model_sdpa, f)

In [18]:
model_eager = GPT2ForSequenceClassification.from_pretrained('PavanNeerudu/gpt2-finetuned-mrpc', num_labels=2,attn_implementation="sdpa") 

device = "cpu"
model_eager.to(device)

model_eager.eval()
start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # with torch.no_grad():
    with torch.inference_mode():
        # raise error if no optimized kernel is available
        with torch.backends.cuda.sdp_kernel(
            enable_flash=True, enable_math=True, enable_mem_efficient=True
        ):
            outputs = model_eager(**batch)
        # print(outputs)
        # break
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
res2 = metric.compute()
res2["cpu time"] = end - start



device = "cuda"
model_eager.to(device)

model_eager.eval()
start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # with torch.no_grad():
    with torch.inference_mode():
        # raise error if no optimized kernel is available
        with torch.backends.cuda.sdp_kernel(
            enable_flash=True, enable_math=True, enable_mem_efficient=True
        ):
            outputs = model_eager(**batch)
        # print(outputs)
        # break
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
res2["cuda time"] = end - start


res2



  self.gen = func(*args, **kwds)


{'accuracy': 0.6202898550724638,
 'f1': 0.7591026112541376,
 'cpu time': 81.90395927429199,
 'cuda time': 6.1522276401519775}

In [19]:
# res2["cuda time"] = None
size_in_mb = get_model_size(model_eager)
res2["size"] = size_in_mb
res2

# device = "cuda"
# model_eager.to(device)

# model_eager.eval()
# start = time.time()
# for batch in eval_dataloader:
#     batch = {k: v.to(device) for k, v in batch.items()}
#     # with torch.no_grad():
#     with torch.inference_mode():
#         # raise error if no optimized kernel is available
#         with torch.backends.cuda.sdp_kernel(
#             enable_flash=True, enable_math=True, enable_mem_efficient=True
#         ):
#             outputs = model_eager(**batch)
#         # print(outputs)
#         # break
#     logits = outputs.logits
#     predictions = torch.argmax(logits, dim=-1)
#     metric.add_batch(predictions=predictions, references=batch["labels"])
# end = time.time()
# # res2 = metric.compute()
# res2["cuda time"] = end - start
# res2


{'accuracy': 0.6202898550724638,
 'f1': 0.7591026112541376,
 'cpu time': 81.90395927429199,
 'cuda time': 6.1522276401519775,
 'size': 486.7061004638672}

In [20]:
import json
with open("results/gpt2_mrpc_eager.json", "w") as json_file:
    json.dump(res2, json_file, indent=4)
# torch.save(model_dynamic_quantized, "./models/bert_mrpc_dynamic_qint8")

# with open("./models/bert_eager.pkl", "wb") as f:
#     pickle.dump(model_eager, f)

In [25]:
model_flash = GPT2ForSequenceClassification.from_pretrained('PavanNeerudu/gpt2-finetuned-mrpc', num_labels=2,attn_implementation="flash_attention_2") 
model_flash.half()
# device = "cpu"
# model_flash.to(device)

# model_flash.eval()
# start = time.time()
# for batch in eval_dataloader:
#     batch = {k: v.to(device) for k, v in batch.items()}
#     # with torch.no_grad():
#     with torch.inference_mode():
#         # raise error if no optimized kernel is available
#         with torch.backends.cuda.sdp_kernel(
#             enable_flash=True, enable_math=True, enable_mem_efficient=True
#         ):
#             outputs = model_flash(**batch)
#         # print(outputs)
#         # break
#     logits = outputs.logits
#     predictions = torch.argmax(logits, dim=-1)
#     metric.add_batch(predictions=predictions, references=batch["labels"])
# end = time.time()
# res2 = metric.compute()
# res2["cpu time"] = end - start



device = "cuda"
model_flash.to(device)

model_flash.eval()
start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
    # with torch.inference_mode():
    #     # raise error if no optimized kernel is available
    #     with torch.backends.cuda.sdp_kernel(
    #         enable_flash=True, enable_math=True, enable_mem_efficient=True
    #     ):
        outputs = model_flash(**batch)
        # print(outputs)
        # break
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
res2["cuda time"] = end - start


res2


{'accuracy': 0.6202898550724638,
 'f1': 0.7591026112541376,
 'cpu time': 81.90395927429199,
 'cuda time': 5.336472272872925,
 'size': 486.7061004638672}

In [26]:
import json
with open("results/gpt2_mrpc_flash.json", "w") as json_file:
    json.dump(res2, json_file, indent=4)