In [1]:
# !yes y |pip uninstall torch torchvision
# !yes y | pip install --pre torch -f https://download.pytorch.org/whl/nightly/cu101/torch_nightly.html


# Fine-tune GPT cola

Tutorial : https://huggingface.co/docs/transformers/training

In [1]:
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification,AutoModelForSequenceClassification

# Load tokenizer and GPT2 model with sequence classification head
tokenizer = GPT2Tokenizer.from_pretrained('PavanNeerudu/gpt2-finetuned-mrpc')
model = GPT2ForSequenceClassification.from_pretrained('PavanNeerudu/gpt2-finetuned-mrpc', num_labels=2,torch_dtype="auto") 
text = "The inspector analyzed the soundness in the building."
encoded_input = tokenizer(text, return_tensors='pt')
model.config.pad_token_id = tokenizer.eos_token_id
output = model(**encoded_input)
# output

In [None]:
# from transformers import GPT2Tokenizer, GPT2ForSequenceClassification,AutoModelForSequenceClassification
# from transformers import GPTNeoForCausalLM, GPT2Tokenizer

# model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
# tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
# text = "The inspector analyzed the soundness in the building."
# encoded_input = tokenizer(text, return_tensors='pt')

# output = model(**encoded_input)
# # output

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset("glue","cola")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token = tokenizer.eos_token
def tokenize_function(examples):
    return tokenizer(examples["sentence"],truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
print(tokenized_datasets)
tokenized_datasets = tokenized_datasets.remove_columns(["sentence","idx"])
tokenized_datasets = tokenized_datasets.rename_column("label","labels")
tokenized_datasets.set_format("torch")
data_collator = DataCollatorWithPadding(tokenizer)



DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 1063
    })
})


In [3]:
# small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))


In [3]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle = True, batch_size=8, collate_fn = data_collator
)
# train_dataloader = DataLoader(
#     small_train_dataset, shuffle = True, batch_size=8, collate_fn = data_collator
# )
train2_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=8, collate_fn = data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=8, collate_fn = data_collator
)


In [4]:
from transformers import AdamW
optimizer = AdamW(model.parameters(),lr=5e-5)




In [5]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)


cuda


In [49]:
# from transformers import get_scheduler

# num_epochs = 3
# num_training_steps = num_epochs * len(train_dataloader)
# lr_scheduler = get_scheduler(
#     "linear",
#     optimizer=optimizer,
#     num_warmup_steps=0,
#     num_training_steps=num_training_steps
# )

In [None]:
# from tqdm.auto import tqdm

# progress_bar = tqdm(range(num_training_steps))
# device = "cpu"
# model.to(device)
# model.train()
# for epoch in range(num_epochs):
#     for batch in train_dataloader:
#         batch = {k: v.to(device) for k, v in batch.items()}
#         outputs = model(**batch)
#         loss = outputs.loss
#         loss.backward()

#         optimizer.step()
#         lr_scheduler.step()
#         optimizer.zero_grad()
#         progress_bar.update(1)

In [13]:
def get_model_size(model):
    param_size = sum(p.numel() * p.element_size() for p in model.parameters())
    buffer_size = sum(b.numel() * b.element_size() for b in model.buffers())
    total_size = param_size + buffer_size  # Total size in bytes
    return total_size / (1024 ** 2)  # Convert to MB

In [25]:
from evaluate import load
import time
import os

cp = 1
gp = 1
metric = load("glue",config_name="mrpc")
res = {}
if cp:
    device = "cpu"
    model.eval()
    # i=0
    model.to(device)

    start = time.time()
    for batch in eval_dataloader:

        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
    
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
    end = time.time()
    cpu_time = end-start
    met = metric.compute()
    res["f1"]=met["f1"]
    res["accuracy"]=met["accuracy"]
else:
    cpu_time = None
    
if gp:
    device = "cuda"
    model.to(device)
    model.eval()
    
    start = time.time()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        encoded_input = {k:batch[k] for k in ['input_ids', 'attention_mask']}
        with torch.no_grad():
            outputs = model(**encoded_input)
            
    
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
    end = time.time()
    cuda_time = end-start
    if not cp:
        met = metric.compute()
        res["f1"]=met["f1"]
        res["accuracy"]=met["accuracy"]
else:
    cuda_time = None
res[f"cpu time"] = cpu_time
res[f"cuda time"] = cuda_time
res[f"size"] = get_model_size(model)

res

{'f1': 0.826540414436668,
 'accuracy': 0.704362062916618,
 'cpu time': 98.90361881256104,
 'cuda time': 13.57767128944397,
 'size': 486.7061004638672}

In [22]:
import pickle
# with open("./models/gpt2_mrpc.pkl", "wb") as f:
#     pickle.dump(model, f)


In [23]:
import json
with open("results/gpt2_mrpc.json", "w") as json_file:
    json.dump(res, json_file, indent=4)


### Quantization
Tutorial: https://pytorch.org/tutorials/recipes/quantization.html

#### dynamic quantization

In [5]:
import torch
model = GPT2ForSequenceClassification.from_pretrained('PavanNeerudu/gpt2-finetuned-mrpc', num_labels=2) 

device = "cpu"
model_dynamic_quantized_int8 = torch.quantization.quantize_dynamic(
    model, qconfig_spec={torch.nn.Linear}, dtype=torch.qint8
)

In [30]:
device = "cpu"
model_dynamic_quantized_int8.to(device)

model_dynamic_quantized_int8.eval()
model_dynamic_quantized_int8.to(device)

start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():

        outputs = model_dynamic_quantized_int8(**batch)
        # print(outputs)
        # break
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
res2 = metric.compute()
res2["cpu time"] = end - start
res2

{'accuracy': 0.7029393370856786,
 'f1': 0.8255600440690415,
 'cpu time': 11.698429822921753}

In [31]:
res2["cuda time"] = None
size_in_mb = get_model_size(model_dynamic_quantized_int8)
res2["size"] = size_in_mb
res2

{'accuracy': 0.7029393370856786,
 'f1': 0.8255600440690415,
 'cpu time': 11.698429822921753,
 'cuda time': None,
 'size': 486.7002410888672}

In [32]:
import json
with open("results/gpt2_mrpc_dynamic_qint8.json", "w") as json_file:
    json.dump(res2, json_file, indent=4)
# torch.save(model_dynamic_quantized, "./models/bert_mrpc_dynamic_qint8")

# with open("./models/gpt2_int8.pkl", "wb") as f:
#     pickle.dump(model, f)

In [33]:
device = "cpu"
model_dynamic_quantized_float16 = torch.quantization.quantize_dynamic(
    model, qconfig_spec={torch.nn.Linear}, dtype=torch.float16
)

In [34]:
device = "cpu"
model_dynamic_quantized_float16.to(device)

model_dynamic_quantized_float16.eval()
start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():

        outputs = model_dynamic_quantized_float16(**batch)
        # print(outputs)
        # break
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
res2 = metric.compute()
res2["cpu time"] = end - start
res2

{'accuracy': 0.6912751677852349,
 'f1': 0.8174603174603174,
 'cpu time': 11.756733894348145}

In [35]:
res2["cuda time"] = None
size_in_mb = get_model_size(model_dynamic_quantized_float16)
res2["size"] = size_in_mb
res2

{'accuracy': 0.6912751677852349,
 'f1': 0.8174603174603174,
 'cpu time': 11.756733894348145,
 'cuda time': None,
 'size': 486.7002410888672}

In [36]:
import json
with open("results/gpt2_mrpc_dynamic_float16.json", "w") as json_file:
    json.dump(res2, json_file, indent=4)
# torch.save(model_dynamic_quantized, "./models/bert_mrpc_dynamic_qint8")

# with open("./models/gpt2_float16.pkl", "wb") as f:
#     pickle.dump(model, f)

### Model Prunning
Tutorial: https://pytorch.org/tutorials/intermediate/pruning_tutorial.html

##### L1-Norm Unstructure Prunning

In [6]:
import torch.nn.utils.prune as prune
model_prun_unstructure = GPT2ForSequenceClassification.from_pretrained('PavanNeerudu/gpt2-finetuned-mrpc', num_labels=2) 

# model.bert.embeddings

In [11]:
model.transformer.h[0].attn.c_attn


Conv1D(nf=2304, nx=768)

In [45]:
model.transformer.h[0].attn.c_attn
model.transformer.h[0].attn.c_proj
model.transformer.h[0].mlp.c_fc
model.transformer.h[0].mlp.c_proj

GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2SdpaAttention(
        (c_attn): Conv1D(nf=2304, nx=768)
        (c_proj): Conv1D(nf=768, nx=768)
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D(nf=3072, nx=768)
        (c_proj): Conv1D(nf=768, nx=3072)
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)

In [7]:
from evaluate import load
import time
import os
import torch

prun_data = {"percent":[],"f1":[],"cuda time":[],"cpu time":[],"accuracy":[],"type":[],}
for i in range(1,10):
    print(f"prun percent {i*10}%")
    metric = load("glue",config_name="mrpc")

    model_prun_unstructure = GPT2ForSequenceClassification.from_pretrained('PavanNeerudu/gpt2-finetuned-mrpc', num_labels=2) 
    amt = i/10
    prune.l1_unstructured(model_prun_unstructure.transformer.wte, name="weight", amount=amt)
    prune.l1_unstructured(model_prun_unstructure.transformer.wpe, name="weight", amount=amt)

    for layer_idx in range(12):
        prune.l1_unstructured(model_prun_unstructure.transformer.h[layer_idx].attn.c_attn, name="weight", amount=amt)
        prune.l1_unstructured(model_prun_unstructure.transformer.h[layer_idx].attn.c_proj, name="weight", amount=amt)
        prune.l1_unstructured(model_prun_unstructure.transformer.h[layer_idx].mlp.c_fc, name="weight", amount=amt)
        prune.l1_unstructured(model_prun_unstructure.transformer.h[layer_idx].mlp.c_proj, name="weight", amount=amt)
        prune.l1_unstructured(model_prun_unstructure.transformer.h[layer_idx].attn.c_attn, name="bias", amount=amt)
        prune.l1_unstructured(model_prun_unstructure.transformer.h[layer_idx].attn.c_proj, name="bias", amount=amt)
        prune.l1_unstructured(model_prun_unstructure.transformer.h[layer_idx].mlp.c_fc, name="bias", amount=amt)
        prune.l1_unstructured(model_prun_unstructure.transformer.h[layer_idx].mlp.c_proj, name="bias", amount=amt)

        prune.remove(model_prun_unstructure.transformer.h[layer_idx].attn.c_attn, name="weight")
        prune.remove(model_prun_unstructure.transformer.h[layer_idx].attn.c_proj, name="weight")
        prune.remove(model_prun_unstructure.transformer.h[layer_idx].mlp.c_fc, name="weight")
        prune.remove(model_prun_unstructure.transformer.h[layer_idx].mlp.c_proj, name="weight")

    prune.l1_unstructured(model_prun_unstructure.transformer.ln_f, name="weight", amount=amt)

    
    device = "cuda"
    model_prun_unstructure.to(device)
    
    model_prun_unstructure.eval()
    start = time.time()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
    
            outputs = model_prun_unstructure(**batch)
            
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
    end = time.time()
    metric_res = metric.compute()
    print(metric_res)
    prun_data["cuda time"].append(end - start)
    prun_data["f1"].append(metric_res["f1"])
    prun_data["accuracy"].append(metric_res["accuracy"])
    prun_data["type"].append("unstructure")
    prun_data["percent"].append(i*10)


    device = "cpu"
    model_prun_unstructure.to(device)
    
    model_prun_unstructure.eval()
    start = time.time()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
    
            outputs = model_prun_unstructure(**batch)
            
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
    end = time.time()
    prun_data["cpu time"].append(end - start)

prun_data



prun percent 10%
{'accuracy': 0.6912751677852349, 'f1': 0.8174603174603174}
prun percent 20%
{'accuracy': 0.6912751677852349, 'f1': 0.8174603174603174}
prun percent 30%
{'accuracy': 0.6912751677852349, 'f1': 0.8174603174603174}
prun percent 40%
{'accuracy': 0.6912751677852349, 'f1': 0.8174603174603174}
prun percent 50%
{'accuracy': 0.6912751677852349, 'f1': 0.8174603174603174}
prun percent 60%
{'accuracy': 0.6807286673058485, 'f1': 0.8080691642651296}
prun percent 70%
{'accuracy': 0.6912751677852349, 'f1': 0.8174603174603174}
prun percent 80%
{'accuracy': 0.6912751677852349, 'f1': 0.8174603174603174}
prun percent 90%
{'accuracy': 0.6912751677852349, 'f1': 0.8174603174603174}


{'percent': [10, 20, 30, 40, 50, 60, 70, 80, 90],
 'f1': [0.8174603174603174,
  0.8174603174603174,
  0.8174603174603174,
  0.8174603174603174,
  0.8174603174603174,
  0.8080691642651296,
  0.8174603174603174,
  0.8174603174603174,
  0.8174603174603174],
 'cuda time': [2.7447926998138428,
  3.4886314868927,
  3.293457269668579,
  3.4192774295806885,
  3.3696861267089844,
  3.312985897064209,
  3.0935909748077393,
  3.1403775215148926,
  3.037839651107788],
 'cpu time': [18.81718349456787,
  17.8518967628479,
  22.070685148239136,
  17.866071224212646,
  17.143295764923096,
  16.89777159690857,
  16.774181604385376,
  15.539551973342896,
  15.600200176239014],
 'accuracy': [0.6912751677852349,
  0.6912751677852349,
  0.6912751677852349,
  0.6912751677852349,
  0.6912751677852349,
  0.6807286673058485,
  0.6912751677852349,
  0.6912751677852349,
  0.6912751677852349],
 'type': ['unstructure',
  'unstructure',
  'unstructure',
  'unstructure',
  'unstructure',
  'unstructure',
  'unstruct

In [8]:
import json
with open("results/gpt2_mrpc_prun_unstructure.json", "w") as json_file:
    json.dump(prun_data, json_file, indent=4)
# torch.save(model_dynamic_quantized, "./models/bert_mrpc_dynamic_qint8")


##### Prun structure 


In [9]:

prun_data = {"percent":[],"f1":[],"cuda time":[],"cpu time":[],"accuracy":[],"type":[],}
for i in range(1,10):
    print(f"prun percent {i*10}%")
    metric = load("glue",config_name="mrpc")

    model_prun_structure = GPT2ForSequenceClassification.from_pretrained('PavanNeerudu/gpt2-finetuned-mrpc', num_labels=2) 
    amt = i/10

    # print(model_prun_structure.bert.encoder.layer[layer_idx].attention.self.query.weight)
    
    prune.ln_structured(model_prun_structure.transformer.wte, name="weight", amount=amt,n=1,dim=0)
    prune.ln_structured(model_prun_structure.transformer.wpe, name="weight", amount=amt,n=1,dim=0)

    for layer_idx in range(12):
        # Access attention layers (query, key, value)
        prune.ln_structured(model_prun_structure.transformer.h[layer_idx].attn.c_attn, name="weight", amount=amt,n=1,dim=0)
        prune.ln_structured(model_prun_structure.transformer.h[layer_idx].attn.c_proj, name="weight", amount=amt,n=1,dim=0)
        prune.ln_structured(model_prun_structure.transformer.h[layer_idx].mlp.c_fc, name="weight", amount=amt,n=1,dim=0)
        prune.ln_structured(model_prun_structure.transformer.h[layer_idx].mlp.c_proj, name="weight", amount=amt,n=1,dim=0)
        # prune.ln_structured(model_prun_unstructure.transformer.h[layer_idx].attn.c_attn, name="bias", amount=amt,n=1,dim=0)
        # prune.ln_structured(model_prun_unstructure.transformer.h[layer_idx].attn.c_proj, name="bias", amount=amt,n=1,dim=0)
        # prune.ln_structured(model_prun_unstructure.transformer.h[layer_idx].mlp.c_fc, name="bias", amount=amt,n=1,dim=0)
        # prune.ln_structured(model_prun_unstructure.transformer.h[layer_idx].mlp.c_proj, name="bias", amount=amt,n=1,dim=0)

        prune.remove(model_prun_structure.transformer.h[layer_idx].attn.c_attn, name="weight")
        prune.remove(model_prun_structure.transformer.h[layer_idx].attn.c_proj, name="weight")
        prune.remove(model_prun_structure.transformer.h[layer_idx].mlp.c_fc, name="weight")
        prune.remove(model_prun_structure.transformer.h[layer_idx].mlp.c_proj, name="weight")

    # prune.ln_structured(model_prun_unstructure.transformer.ln_f, name="weight", amount=amt,n=1,dim=0)

    
    device = "cuda"
    model_prun_structure.to(device)
    
    model_prun_structure.eval()
    start = time.time()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
    
            outputs = model_prun_structure(**batch)
            
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
    end = time.time()
    metric_res = metric.compute()
    print(metric_res)
    prun_data["cuda time"].append(end - start)
    prun_data["f1"].append(metric_res["f1"])
    prun_data["accuracy"].append(metric_res["accuracy"])
    prun_data["type"].append("ln_structure")
    prun_data["percent"].append(i*10)


    device = "cpu"
    model_prun_unstructure.to(device)
    
    model_prun_unstructure.eval()
    start = time.time()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
    
            outputs = model_prun_unstructure(**batch)
            
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
    end = time.time()
    prun_data["cpu time"].append(end - start)

prun_data

prun percent 10%
{'accuracy': 0.6874400767018217, 'f1': 0.8145620022753128}
prun percent 20%
{'accuracy': 0.6558005752636625, 'f1': 0.7882005899705015}
prun percent 30%
{'accuracy': 0.6510067114093959, 'f1': 0.773067331670823}
prun percent 40%
{'accuracy': 0.3096836049856184, 'f1': 0.008264462809917356}
prun percent 50%
{'accuracy': 0.311601150527325, 'f1': 0.008287292817679558}
prun percent 60%
{'accuracy': 0.3096836049856184, 'f1': 0.002770083102493075}
prun percent 70%
{'accuracy': 0.3087248322147651, 'f1': 0.0}
prun percent 80%
{'accuracy': 0.311601150527325, 'f1': 0.008287292817679558}
prun percent 90%
{'accuracy': 0.3087248322147651, 'f1': 0.0}


{'percent': [10, 20, 30, 40, 50, 60, 70, 80, 90],
 'f1': [0.8145620022753128,
  0.7882005899705015,
  0.773067331670823,
  0.008264462809917356,
  0.008287292817679558,
  0.002770083102493075,
  0.0,
  0.008287292817679558,
  0.0],
 'cuda time': [2.8779895305633545,
  2.766774892807007,
  2.8566083908081055,
  2.7322964668273926,
  2.8842616081237793,
  2.8876256942749023,
  3.004941701889038,
  2.828578472137451,
  2.759298324584961],
 'cpu time': [15.589048385620117,
  15.770520210266113,
  15.961583614349365,
  15.930171489715576,
  15.857511281967163,
  15.787281513214111,
  15.796875,
  17.08060312271118,
  16.890757083892822],
 'accuracy': [0.6874400767018217,
  0.6558005752636625,
  0.6510067114093959,
  0.3096836049856184,
  0.311601150527325,
  0.3096836049856184,
  0.3087248322147651,
  0.311601150527325,
  0.3087248322147651],
 'type': ['ln_structure',
  'ln_structure',
  'ln_structure',
  'ln_structure',
  'ln_structure',
  'ln_structure',
  'ln_structure',
  'ln_structure'

In [10]:
import json
with open("results/gpt2_mrpc_prun_structure.json", "w") as json_file:
    json.dump(prun_data, json_file, indent=4)
# torch.save(model_dynamic_quantized, "./models/bert_mrpc_dynamic_qint8")


In [25]:
# prun_data

### Flash Attention 

In [11]:
model_sdpa = GPT2ForSequenceClassification.from_pretrained('PavanNeerudu/gpt2-finetuned-mrpc', num_labels=2,attn_implementation="sdpa") 

device = "cpu"
model_sdpa.to(device)

model_sdpa.eval()
start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # with torch.no_grad():
    with torch.inference_mode():
        # raise error if no optimized kernel is available
        with torch.backends.cuda.sdp_kernel(
            enable_flash=True, enable_math=True, enable_mem_efficient=True
        ):
            outputs = model_sdpa(**batch)
        # print(outputs)
        # break
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
res2 = metric.compute()
res2["cpu time"] = end - start


device = "cuda"
model_sdpa.to(device)

model_sdpa.eval()
start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # with torch.no_grad():
    with torch.inference_mode():
        # raise error if no optimized kernel is available
        with torch.backends.cuda.sdp_kernel(
            enable_flash=True, enable_math=True, enable_mem_efficient=True
        ):
            outputs = model_sdpa(**batch)
        # print(outputs)
        # break
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()

res2["cuda time"] = end - start

res2



  self.gen = func(*args, **kwds)
  self.gen = func(*args, **kwds)


{'accuracy': 0.6912751677852349,
 'f1': 0.8174603174603174,
 'cpu time': 13.1011962890625,
 'cuda time': 2.553781032562256}

In [14]:
# res2["cuda time"] = None
size_in_mb = get_model_size(model_sdpa)
res2["size"] = size_in_mb
res2

# device = "cuda"
# model_flash_attention.to(device)

# model_flash_attention.eval()
# start = time.time()
# for batch in eval_dataloader:
#     batch = {k: v.to(device) for k, v in batch.items()}
#     # with torch.no_grad():
#     with torch.inference_mode():
#         # raise error if no optimized kernel is available
#         with torch.backends.cuda.sdp_kernel(
#             enable_flash=True, enable_math=True, enable_mem_efficient=True
#         ):
#             outputs = model_flash_attention(**batch)
#         # print(outputs)
#         # break
#     logits = outputs.logits
#     predictions = torch.argmax(logits, dim=-1)
#     metric.add_batch(predictions=predictions, references=batch["labels"])
# end = time.time()
# # res2 = metric.compute()
# res2["cuda time"] = end - start
# res2


{'accuracy': 0.6912751677852349,
 'f1': 0.8174603174603174,
 'cpu time': 13.1011962890625,
 'cuda time': 2.553781032562256,
 'size': 486.7061004638672}

In [15]:
import json
with open("results/gpt2_mrpc_sdpa.json", "w") as json_file:
    json.dump(res2, json_file, indent=4)
# torch.save(model_dynamic_quantized, "./models/bert_mrpc_dynamic_qint8")

# with open("./models/bert_sdpa.pkl", "wb") as f:
#     pickle.dump(model_sdpa, f)

In [17]:
model_eager = GPT2ForSequenceClassification.from_pretrained('PavanNeerudu/gpt2-finetuned-mrpc', num_labels=2,attn_implementation="sdpa") 

device = "cpu"
model_eager.to(device)

model_eager.eval()
start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # with torch.no_grad():
    with torch.inference_mode():
        # raise error if no optimized kernel is available
        with torch.backends.cuda.sdp_kernel(
            enable_flash=True, enable_math=True, enable_mem_efficient=True
        ):
            outputs = model_eager(**batch)
        # print(outputs)
        # break
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
res2 = metric.compute()
res2["cpu time"] = end - start



device = "cuda"
model_eager.to(device)

model_eager.eval()
start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # with torch.no_grad():
    with torch.inference_mode():
        # raise error if no optimized kernel is available
        with torch.backends.cuda.sdp_kernel(
            enable_flash=True, enable_math=True, enable_mem_efficient=True
        ):
            outputs = model_eager(**batch)
        # print(outputs)
        # break
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
res2["cuda time"] = end - start


res2



  self.gen = func(*args, **kwds)


{'accuracy': 0.6912751677852349,
 'f1': 0.8174603174603174,
 'cpu time': 11.96101689338684,
 'cuda time': 2.392256498336792}

In [18]:
# res2["cuda time"] = None
size_in_mb = get_model_size(model_eager)
res2["size"] = size_in_mb
res2

# device = "cuda"
# model_eager.to(device)

# model_eager.eval()
# start = time.time()
# for batch in eval_dataloader:
#     batch = {k: v.to(device) for k, v in batch.items()}
#     # with torch.no_grad():
#     with torch.inference_mode():
#         # raise error if no optimized kernel is available
#         with torch.backends.cuda.sdp_kernel(
#             enable_flash=True, enable_math=True, enable_mem_efficient=True
#         ):
#             outputs = model_eager(**batch)
#         # print(outputs)
#         # break
#     logits = outputs.logits
#     predictions = torch.argmax(logits, dim=-1)
#     metric.add_batch(predictions=predictions, references=batch["labels"])
# end = time.time()
# # res2 = metric.compute()
# res2["cuda time"] = end - start
# res2


{'accuracy': 0.6912751677852349,
 'f1': 0.8174603174603174,
 'cpu time': 11.96101689338684,
 'cuda time': 2.392256498336792,
 'size': 486.7061004638672}

In [19]:
import json
with open("results/gpt2_mrpc_eager.json", "w") as json_file:
    json.dump(res2, json_file, indent=4)
# torch.save(model_dynamic_quantized, "./models/bert_mrpc_dynamic_qint8")

# with open("./models/bert_eager.pkl", "wb") as f:
#     pickle.dump(model_eager, f)

In [28]:
model_flash = GPT2ForSequenceClassification.from_pretrained('PavanNeerudu/gpt2-finetuned-mrpc', num_labels=2,attn_implementation="flash_attention_2") 
model_flash.half()
# device = "cpu"
# model_flash.to(device)

# model_flash.eval()
# start = time.time()
# for batch in eval_dataloader:
#     batch = {k: v.to(device) for k, v in batch.items()}
#     # with torch.no_grad():
#     with torch.inference_mode():
#         # raise error if no optimized kernel is available
#         with torch.backends.cuda.sdp_kernel(
#             enable_flash=True, enable_math=True, enable_mem_efficient=True
#         ):
#             outputs = model_flash(**batch)
#         # print(outputs)
#         # break
#     logits = outputs.logits
#     predictions = torch.argmax(logits, dim=-1)
#     metric.add_batch(predictions=predictions, references=batch["labels"])
# end = time.time()
# res2 = metric.compute()
# res2["cpu time"] = end - start



device = "cuda"
model_flash.to(device)

model_flash.eval()
start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # with torch.no_grad():
    with torch.inference_mode():
        # raise error if no optimized kernel is available
        with torch.backends.cuda.sdp_kernel(
            enable_flash=True, enable_math=True, enable_mem_efficient=True
        ):
            outputs = model_flash(**batch)
        # print(outputs)
        # break
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
res2["cuda time"] = end - start


res2


{'accuracy': 0.6912751677852349,
 'f1': 0.8174603174603174,
 'cpu time': 11.96101689338684,
 'cuda time': 2.8964767456054688,
 'size': 486.7061004638672}

In [29]:
import json
with open("results/gpt2_mrpc_flash.json", "w") as json_file:
    json.dump(res2, json_file, indent=4)