In [1]:
# !yes y |pip uninstall torch torchvision
# !yes y | pip install --pre torch -f https://download.pytorch.org/whl/nightly/cu101/torch_nightly.html


# Fine-tune GPT cola

Tutorial : https://huggingface.co/docs/transformers/training

In [2]:
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification,AutoModelForSequenceClassification

# Load tokenizer and GPT2 model with sequence classification head
tokenizer = GPT2Tokenizer.from_pretrained('PavanNeerudu/gpt2-finetuned-cola')
model = GPT2ForSequenceClassification.from_pretrained('PavanNeerudu/gpt2-finetuned-cola', num_labels=2,torch_dtype="auto") 
text = "The inspector analyzed the soundness in the building."
encoded_input = tokenizer(text, return_tensors='pt')
model.config.pad_token_id = tokenizer.eos_token_id
output = model(**encoded_input)
# output

tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/999k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/470 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/510M [00:00<?, ?B/s]

In [3]:
# from transformers import GPT2Tokenizer, GPT2ForSequenceClassification,AutoModelForSequenceClassification
# from transformers import GPTNeoForCausalLM, GPT2Tokenizer

# model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
# tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
# text = "The inspector analyzed the soundness in the building."
# encoded_input = tokenizer(text, return_tensors='pt')

# output = model(**encoded_input)
# # output

In [4]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset("glue","cola")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token = tokenizer.eos_token
def tokenize_function(examples):
    return tokenizer(examples["sentence"],truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
print(tokenized_datasets)
tokenized_datasets = tokenized_datasets.remove_columns(["sentence","idx"])
tokenized_datasets = tokenized_datasets.rename_column("label","labels")
tokenized_datasets.set_format("torch")
data_collator = DataCollatorWithPadding(tokenizer)



model.safetensors:   0%|          | 0.00/510M [00:00<?, ?B/s]

Map:   0%|          | 0/8551 [00:00<?, ? examples/s]

Map:   0%|          | 0/1043 [00:00<?, ? examples/s]

Map:   0%|          | 0/1063 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 1063
    })
})


In [5]:
# small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))


In [6]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle = True, batch_size=8, collate_fn = data_collator
)
# train_dataloader = DataLoader(
#     small_train_dataset, shuffle = True, batch_size=8, collate_fn = data_collator
# )
train2_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=8, collate_fn = data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=8, collate_fn = data_collator
)


In [7]:
from transformers import AdamW
optimizer = AdamW(model.parameters(),lr=5e-5)




In [8]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)


cuda


In [9]:
# from transformers import get_scheduler

# num_epochs = 3
# num_training_steps = num_epochs * len(train_dataloader)
# lr_scheduler = get_scheduler(
#     "linear",
#     optimizer=optimizer,
#     num_warmup_steps=0,
#     num_training_steps=num_training_steps
# )

In [10]:
# from tqdm.auto import tqdm

# progress_bar = tqdm(range(num_training_steps))
# device = "cpu"
# model.to(device)
# model.train()
# for epoch in range(num_epochs):
#     for batch in train_dataloader:
#         batch = {k: v.to(device) for k, v in batch.items()}
#         outputs = model(**batch)
#         loss = outputs.loss
#         loss.backward()

#         optimizer.step()
#         lr_scheduler.step()
#         optimizer.zero_grad()
#         progress_bar.update(1)

In [11]:
def get_model_size(model):
    param_size = sum(p.numel() * p.element_size() for p in model.parameters())
    buffer_size = sum(b.numel() * b.element_size() for b in model.buffers())
    total_size = param_size + buffer_size  # Total size in bytes
    return total_size / (1024 ** 2)  # Convert to MB

In [14]:
from evaluate import load
import time
import os

cp = 1
gp = 1
metric = load("glue",config_name="mrpc")
res = {}
if cp:
    device = "cpu"
    model.eval()
    # i=0
    model.to(device)

    start = time.time()
    for batch in eval_dataloader:

        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
    
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
    end = time.time()
    cpu_time = end-start
    met = metric.compute()
    res["f1"]=met["f1"]
    res["accuracy"]=met["accuracy"]
else:
    cpu_time = None
    
if gp:
    device = "cuda"
    model.to(device)
    model.eval()
    
    start = time.time()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        encoded_input = {k:batch[k] for k in ['input_ids', 'attention_mask']}
        with torch.no_grad():
            outputs = model(**encoded_input)
            
    
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
    end = time.time()
    cuda_time = end-start
    if not cp:
        met = metric.compute()
        res["f1"]=met["f1"]
        res["accuracy"]=met["accuracy"]
else:
    cuda_time = None
res[f"cpu time"] = cpu_time
res[f"cuda time"] = cuda_time
res[f"size"] = get_model_size(model)

res

{'f1': 0.8555417185554172,
 'accuracy': 0.7775647171620326,
 'cpu time': 12.731175661087036,
 'cuda time': 3.880117177963257,
 'size': 486.7061004638672}

{'matthews_correlation': np.float64(0.43544209947737617)}

In [15]:
import pickle
# with open("./models/gpt2_cola.pkl", "wb") as f:
#     pickle.dump(model, f)


In [16]:
import json
with open("results/gpt2_cola.json", "w") as json_file:
    json.dump(res, json_file, indent=4)


### Quantization
Tutorial: https://pytorch.org/tutorials/recipes/quantization.html

#### dynamic quantization

In [17]:
import torch
model = GPT2ForSequenceClassification.from_pretrained('PavanNeerudu/gpt2-finetuned-cola', num_labels=2) 

device = "cpu"
model_dynamic_quantized_int8 = torch.quantization.quantize_dynamic(
    model, qconfig_spec={torch.nn.Linear}, dtype=torch.qint8
)

In [18]:
device = "cpu"
model_dynamic_quantized_int8.to(device)

model_dynamic_quantized_int8.eval()
model_dynamic_quantized_int8.to(device)

start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():

        outputs = model_dynamic_quantized_int8(**batch)
        # print(outputs)
        # break
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
res2 = metric.compute()
res2["cpu time"] = end - start
res2

{'accuracy': 0.7746883988494727,
 'f1': 0.848093083387201,
 'cpu time': 12.549953937530518}

In [19]:
res2["cuda time"] = None
size_in_mb = get_model_size(model_dynamic_quantized_int8)
res2["size"] = size_in_mb
res2

{'accuracy': 0.7746883988494727,
 'f1': 0.848093083387201,
 'cpu time': 12.549953937530518,
 'cuda time': None,
 'size': 486.7002410888672}

In [20]:
import json
with open("results/gpt2_cola_dynamic_qint8.json", "w") as json_file:
    json.dump(res2, json_file, indent=4)
# torch.save(model_dynamic_quantized, "./models/bert_cola_dynamic_qint8")

# with open("./models/gpt2_int8.pkl", "wb") as f:
#     pickle.dump(model, f)

In [21]:
device = "cpu"
model_dynamic_quantized_float16 = torch.quantization.quantize_dynamic(
    model, qconfig_spec={torch.nn.Linear}, dtype=torch.float16
)

In [22]:
device = "cpu"
model_dynamic_quantized_float16.to(device)

model_dynamic_quantized_float16.eval()
start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():

        outputs = model_dynamic_quantized_float16(**batch)
        # print(outputs)
        # break
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
res2 = metric.compute()
res2["cpu time"] = end - start
res2

{'accuracy': 0.7775647171620326,
 'f1': 0.8555417185554172,
 'cpu time': 12.764209270477295}

In [23]:
res2["cuda time"] = None
size_in_mb = get_model_size(model_dynamic_quantized_float16)
res2["size"] = size_in_mb
res2

{'accuracy': 0.7775647171620326,
 'f1': 0.8555417185554172,
 'cpu time': 12.764209270477295,
 'cuda time': None,
 'size': 486.7002410888672}

In [24]:
import json
with open("results/gpt2_cola_dynamic_float16.json", "w") as json_file:
    json.dump(res2, json_file, indent=4)
# torch.save(model_dynamic_quantized, "./models/bert_cola_dynamic_qint8")

# with open("./models/gpt2_float16.pkl", "wb") as f:
#     pickle.dump(model, f)

### Model Prunning
Tutorial: https://pytorch.org/tutorials/intermediate/pruning_tutorial.html

##### L1-Norm Unstructure Prunning

In [25]:
import torch.nn.utils.prune as prune
model_prun_unstructure = GPT2ForSequenceClassification.from_pretrained('PavanNeerudu/gpt2-finetuned-cola', num_labels=2) 

# model.bert.embeddings

In [26]:
model.transformer.h[0].attn.c_attn


Conv1D(nf=2304, nx=768)

In [27]:
model.transformer.h[0].attn.c_attn
model.transformer.h[0].attn.c_proj
model.transformer.h[0].mlp.c_fc
model.transformer.h[0].mlp.c_proj

Conv1D(nf=768, nx=3072)

In [28]:
from evaluate import load
import time
import os
import torch

prun_data = {"percent":[],"f1":[],"cuda time":[],"cpu time":[],"accuracy":[],"type":[],}
for i in range(1,10):
    print(f"prun percent {i*10}%")
    metric = load("glue",config_name="mrpc")

    model_prun_unstructure = GPT2ForSequenceClassification.from_pretrained('PavanNeerudu/gpt2-finetuned-cola', num_labels=2) 
    amt = i/10
    prune.l1_unstructured(model_prun_unstructure.transformer.wte, name="weight", amount=amt)
    prune.l1_unstructured(model_prun_unstructure.transformer.wpe, name="weight", amount=amt)

    for layer_idx in range(12):
        prune.l1_unstructured(model_prun_unstructure.transformer.h[layer_idx].attn.c_attn, name="weight", amount=amt)
        prune.l1_unstructured(model_prun_unstructure.transformer.h[layer_idx].attn.c_proj, name="weight", amount=amt)
        prune.l1_unstructured(model_prun_unstructure.transformer.h[layer_idx].mlp.c_fc, name="weight", amount=amt)
        prune.l1_unstructured(model_prun_unstructure.transformer.h[layer_idx].mlp.c_proj, name="weight", amount=amt)
        prune.l1_unstructured(model_prun_unstructure.transformer.h[layer_idx].attn.c_attn, name="bias", amount=amt)
        prune.l1_unstructured(model_prun_unstructure.transformer.h[layer_idx].attn.c_proj, name="bias", amount=amt)
        prune.l1_unstructured(model_prun_unstructure.transformer.h[layer_idx].mlp.c_fc, name="bias", amount=amt)
        prune.l1_unstructured(model_prun_unstructure.transformer.h[layer_idx].mlp.c_proj, name="bias", amount=amt)

        prune.remove(model_prun_unstructure.transformer.h[layer_idx].attn.c_attn, name="weight")
        prune.remove(model_prun_unstructure.transformer.h[layer_idx].attn.c_proj, name="weight")
        prune.remove(model_prun_unstructure.transformer.h[layer_idx].mlp.c_fc, name="weight")
        prune.remove(model_prun_unstructure.transformer.h[layer_idx].mlp.c_proj, name="weight")

    prune.l1_unstructured(model_prun_unstructure.transformer.ln_f, name="weight", amount=amt)

    
    device = "cuda"
    model_prun_unstructure.to(device)
    
    model_prun_unstructure.eval()
    start = time.time()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
    
            outputs = model_prun_unstructure(**batch)
            
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
    end = time.time()
    metric_res = metric.compute()
    print(metric_res)
    prun_data["cuda time"].append(end - start)
    prun_data["f1"].append(metric_res["f1"])
    prun_data["accuracy"].append(metric_res["accuracy"])
    prun_data["type"].append("unstructure")
    prun_data["percent"].append(i*10)


    device = "cpu"
    model_prun_unstructure.to(device)
    
    model_prun_unstructure.eval()
    start = time.time()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
    
            outputs = model_prun_unstructure(**batch)
            
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
    end = time.time()
    prun_data["cpu time"].append(end - start)

prun_data



prun percent 10%
{'accuracy': 0.7727708533077661, 'f1': 0.8493324856961221}
prun percent 20%
{'accuracy': 0.7737296260786194, 'f1': 0.8451443569553806}
prun percent 30%
{'accuracy': 0.716203259827421, 'f1': 0.7764350453172205}
prun percent 40%
{'accuracy': 0.34132310642377756, 'f1': 0.10894941634241245}
prun percent 50%
{'accuracy': 0.3087248322147651, 'f1': 0.0027662517289073307}
prun percent 60%
{'accuracy': 0.3288590604026846, 'f1': 0.08376963350785341}
prun percent 70%
{'accuracy': 0.42665388302972196, 'f1': 0.4369114877589454}
prun percent 80%
{'accuracy': 0.311601150527325, 'f1': 0.008287292817679558}
prun percent 90%
{'accuracy': 0.3087248322147651, 'f1': 0.0}


{'percent': [10, 20, 30, 40, 50, 60, 70, 80, 90],
 'f1': [0.8493324856961221,
  0.8451443569553806,
  0.7764350453172205,
  0.10894941634241245,
  0.0027662517289073307,
  0.08376963350785341,
  0.4369114877589454,
  0.008287292817679558,
  0.0],
 'cuda time': [2.058180809020996,
  1.9286153316497803,
  3.3593568801879883,
  3.1907591819763184,
  3.2097573280334473,
  2.3099653720855713,
  3.252368688583374,
  3.1319401264190674,
  3.0337045192718506],
 'cpu time': [16.185962438583374,
  17.00124979019165,
  17.301599502563477,
  16.694705486297607,
  16.470436811447144,
  15.371254205703735,
  16.349833726882935,
  15.964494705200195,
  16.079296588897705],
 'accuracy': [0.7727708533077661,
  0.7737296260786194,
  0.716203259827421,
  0.34132310642377756,
  0.3087248322147651,
  0.3288590604026846,
  0.42665388302972196,
  0.311601150527325,
  0.3087248322147651],
 'type': ['unstructure',
  'unstructure',
  'unstructure',
  'unstructure',
  'unstructure',
  'unstructure',
  'unstructu

In [29]:
import json
with open("results/gpt2_cola_prun_unstructure.json", "w") as json_file:
    json.dump(prun_data, json_file, indent=4)
# torch.save(model_dynamic_quantized, "./models/bert_cola_dynamic_qint8")


##### Prun structure 


In [30]:

prun_data = {"percent":[],"f1":[],"cuda time":[],"cpu time":[],"accuracy":[],"type":[],}
for i in range(1,10):
    print(f"prun percent {i*10}%")
    metric = load("glue",config_name="mrpc")

    model_prun_structure = GPT2ForSequenceClassification.from_pretrained('PavanNeerudu/gpt2-finetuned-cola', num_labels=2) 
    amt = i/10

    # print(model_prun_structure.bert.encoder.layer[layer_idx].attention.self.query.weight)
    
    prune.ln_structured(model_prun_structure.transformer.wte, name="weight", amount=amt,n=1,dim=0)
    prune.ln_structured(model_prun_structure.transformer.wpe, name="weight", amount=amt,n=1,dim=0)

    for layer_idx in range(12):
        # Access attention layers (query, key, value)
        prune.ln_structured(model_prun_structure.transformer.h[layer_idx].attn.c_attn, name="weight", amount=amt,n=1,dim=0)
        prune.ln_structured(model_prun_structure.transformer.h[layer_idx].attn.c_proj, name="weight", amount=amt,n=1,dim=0)
        prune.ln_structured(model_prun_structure.transformer.h[layer_idx].mlp.c_fc, name="weight", amount=amt,n=1,dim=0)
        prune.ln_structured(model_prun_structure.transformer.h[layer_idx].mlp.c_proj, name="weight", amount=amt,n=1,dim=0)
        # prune.ln_structured(model_prun_unstructure.transformer.h[layer_idx].attn.c_attn, name="bias", amount=amt,n=1,dim=0)
        # prune.ln_structured(model_prun_unstructure.transformer.h[layer_idx].attn.c_proj, name="bias", amount=amt,n=1,dim=0)
        # prune.ln_structured(model_prun_unstructure.transformer.h[layer_idx].mlp.c_fc, name="bias", amount=amt,n=1,dim=0)
        # prune.ln_structured(model_prun_unstructure.transformer.h[layer_idx].mlp.c_proj, name="bias", amount=amt,n=1,dim=0)

        prune.remove(model_prun_structure.transformer.h[layer_idx].attn.c_attn, name="weight")
        prune.remove(model_prun_structure.transformer.h[layer_idx].attn.c_proj, name="weight")
        prune.remove(model_prun_structure.transformer.h[layer_idx].mlp.c_fc, name="weight")
        prune.remove(model_prun_structure.transformer.h[layer_idx].mlp.c_proj, name="weight")

    # prune.ln_structured(model_prun_unstructure.transformer.ln_f, name="weight", amount=amt,n=1,dim=0)

    
    device = "cuda"
    model_prun_structure.to(device)
    
    model_prun_structure.eval()
    start = time.time()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
    
            outputs = model_prun_structure(**batch)
            
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
    end = time.time()
    metric_res = metric.compute()
    print(metric_res)
    prun_data["cuda time"].append(end - start)
    prun_data["f1"].append(metric_res["f1"])
    prun_data["accuracy"].append(metric_res["accuracy"])
    prun_data["type"].append("ln_structure")
    prun_data["percent"].append(i*10)


    device = "cpu"
    model_prun_unstructure.to(device)
    
    model_prun_unstructure.eval()
    start = time.time()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
    
            outputs = model_prun_unstructure(**batch)
            
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
    end = time.time()
    prun_data["cpu time"].append(end - start)

prun_data

prun percent 10%
{'accuracy': 0.3096836049856184, 'f1': 0.002770083102493075}
prun percent 20%
{'accuracy': 0.3087248322147651, 'f1': 0.0}
prun percent 30%
{'accuracy': 0.3087248322147651, 'f1': 0.0}
prun percent 40%
{'accuracy': 0.3087248322147651, 'f1': 0.0}
prun percent 50%
{'accuracy': 0.3087248322147651, 'f1': 0.0}
prun percent 60%
{'accuracy': 0.3087248322147651, 'f1': 0.0}
prun percent 70%
{'accuracy': 0.41227229146692235, 'f1': 0.3789260385005066}
prun percent 80%
{'accuracy': 0.3087248322147651, 'f1': 0.0}
prun percent 90%
{'accuracy': 0.3087248322147651, 'f1': 0.0}


{'percent': [10, 20, 30, 40, 50, 60, 70, 80, 90],
 'f1': [0.002770083102493075,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.3789260385005066,
  0.0,
  0.0],
 'cuda time': [2.8374617099761963,
  2.929257869720459,
  2.9206697940826416,
  2.7994093894958496,
  3.038015365600586,
  3.015244960784912,
  2.8812475204467773,
  2.7551920413970947,
  3.0102686882019043],
 'cpu time': [16.874932289123535,
  17.035096168518066,
  16.986795663833618,
  16.74958300590515,
  17.316445112228394,
  16.89370632171631,
  17.10787296295166,
  17.24095106124878,
  16.980558395385742],
 'accuracy': [0.3096836049856184,
  0.3087248322147651,
  0.3087248322147651,
  0.3087248322147651,
  0.3087248322147651,
  0.3087248322147651,
  0.41227229146692235,
  0.3087248322147651,
  0.3087248322147651],
 'type': ['ln_structure',
  'ln_structure',
  'ln_structure',
  'ln_structure',
  'ln_structure',
  'ln_structure',
  'ln_structure',
  'ln_structure',
  'ln_structure']}

In [31]:
import json
with open("results/gpt2_cola_prun_structure.json", "w") as json_file:
    json.dump(prun_data, json_file, indent=4)
# torch.save(model_dynamic_quantized, "./models/bert_cola_dynamic_qint8")


In [32]:
# prun_data

### Flash Attention 

In [33]:
model_sdpa = GPT2ForSequenceClassification.from_pretrained('PavanNeerudu/gpt2-finetuned-cola', num_labels=2,attn_implementation="sdpa") 

device = "cpu"
model_sdpa.to(device)

model_sdpa.eval()
start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # with torch.no_grad():
    with torch.inference_mode():
        # raise error if no optimized kernel is available
        with torch.backends.cuda.sdp_kernel(
            enable_flash=True, enable_math=True, enable_mem_efficient=True
        ):
            outputs = model_sdpa(**batch)
        # print(outputs)
        # break
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
res2 = metric.compute()
res2["cpu time"] = end - start


device = "cuda"
model_sdpa.to(device)

model_sdpa.eval()
start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # with torch.no_grad():
    with torch.inference_mode():
        # raise error if no optimized kernel is available
        with torch.backends.cuda.sdp_kernel(
            enable_flash=True, enable_math=True, enable_mem_efficient=True
        ):
            outputs = model_sdpa(**batch)
        # print(outputs)
        # break
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()

res2["cuda time"] = end - start

res2



  self.gen = func(*args, **kwds)
  self.gen = func(*args, **kwds)


{'accuracy': 0.5431447746883988,
 'f1': 0.5904598195100988,
 'cpu time': 13.992931842803955,
 'cuda time': 2.488251209259033}

In [34]:
# res2["cuda time"] = None
size_in_mb = get_model_size(model_sdpa)
res2["size"] = size_in_mb
res2

# device = "cuda"
# model_flash_attention.to(device)

# model_flash_attention.eval()
# start = time.time()
# for batch in eval_dataloader:
#     batch = {k: v.to(device) for k, v in batch.items()}
#     # with torch.no_grad():
#     with torch.inference_mode():
#         # raise error if no optimized kernel is available
#         with torch.backends.cuda.sdp_kernel(
#             enable_flash=True, enable_math=True, enable_mem_efficient=True
#         ):
#             outputs = model_flash_attention(**batch)
#         # print(outputs)
#         # break
#     logits = outputs.logits
#     predictions = torch.argmax(logits, dim=-1)
#     metric.add_batch(predictions=predictions, references=batch["labels"])
# end = time.time()
# # res2 = metric.compute()
# res2["cuda time"] = end - start
# res2


{'accuracy': 0.5431447746883988,
 'f1': 0.5904598195100988,
 'cpu time': 13.992931842803955,
 'cuda time': 2.488251209259033,
 'size': 486.7061004638672}

In [35]:
import json
with open("results/gpt2_cola_sdpa.json", "w") as json_file:
    json.dump(res2, json_file, indent=4)
# torch.save(model_dynamic_quantized, "./models/bert_cola_dynamic_qint8")

# with open("./models/bert_sdpa.pkl", "wb") as f:
#     pickle.dump(model_sdpa, f)

In [36]:
model_eager = GPT2ForSequenceClassification.from_pretrained('PavanNeerudu/gpt2-finetuned-cola', num_labels=2,attn_implementation="sdpa") 

device = "cpu"
model_eager.to(device)

model_eager.eval()
start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # with torch.no_grad():
    with torch.inference_mode():
        # raise error if no optimized kernel is available
        with torch.backends.cuda.sdp_kernel(
            enable_flash=True, enable_math=True, enable_mem_efficient=True
        ):
            outputs = model_eager(**batch)
        # print(outputs)
        # break
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
res2 = metric.compute()
res2["cpu time"] = end - start



device = "cuda"
model_eager.to(device)

model_eager.eval()
start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # with torch.no_grad():
    with torch.inference_mode():
        # raise error if no optimized kernel is available
        with torch.backends.cuda.sdp_kernel(
            enable_flash=True, enable_math=True, enable_mem_efficient=True
        ):
            outputs = model_eager(**batch)
        # print(outputs)
        # break
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
res2["cuda time"] = end - start


res2



  self.gen = func(*args, **kwds)


{'accuracy': 0.7775647171620326,
 'f1': 0.8555417185554172,
 'cpu time': 13.519189357757568,
 'cuda time': 2.547826051712036}

In [37]:
# res2["cuda time"] = None
size_in_mb = get_model_size(model_eager)
res2["size"] = size_in_mb
res2

# device = "cuda"
# model_eager.to(device)

# model_eager.eval()
# start = time.time()
# for batch in eval_dataloader:
#     batch = {k: v.to(device) for k, v in batch.items()}
#     # with torch.no_grad():
#     with torch.inference_mode():
#         # raise error if no optimized kernel is available
#         with torch.backends.cuda.sdp_kernel(
#             enable_flash=True, enable_math=True, enable_mem_efficient=True
#         ):
#             outputs = model_eager(**batch)
#         # print(outputs)
#         # break
#     logits = outputs.logits
#     predictions = torch.argmax(logits, dim=-1)
#     metric.add_batch(predictions=predictions, references=batch["labels"])
# end = time.time()
# # res2 = metric.compute()
# res2["cuda time"] = end - start
# res2


{'accuracy': 0.7775647171620326,
 'f1': 0.8555417185554172,
 'cpu time': 13.519189357757568,
 'cuda time': 2.547826051712036,
 'size': 486.7061004638672}

In [38]:
import json
with open("results/gpt2_cola_eager.json", "w") as json_file:
    json.dump(res2, json_file, indent=4)
# torch.save(model_dynamic_quantized, "./models/bert_cola_dynamic_qint8")

# with open("./models/bert_eager.pkl", "wb") as f:
#     pickle.dump(model_eager, f)

In [39]:
model_flash = GPT2ForSequenceClassification.from_pretrained('PavanNeerudu/gpt2-finetuned-cola', num_labels=2,attn_implementation="flash_attention_2") 
model_flash.half()
# device = "cpu"
# model_flash.to(device)

# model_flash.eval()
# start = time.time()
# for batch in eval_dataloader:
#     batch = {k: v.to(device) for k, v in batch.items()}
#     # with torch.no_grad():
#     with torch.inference_mode():
#         # raise error if no optimized kernel is available
#         with torch.backends.cuda.sdp_kernel(
#             enable_flash=True, enable_math=True, enable_mem_efficient=True
#         ):
#             outputs = model_flash(**batch)
#         # print(outputs)
#         # break
#     logits = outputs.logits
#     predictions = torch.argmax(logits, dim=-1)
#     metric.add_batch(predictions=predictions, references=batch["labels"])
# end = time.time()
# res2 = metric.compute()
# res2["cpu time"] = end - start



device = "cuda"
model_flash.to(device)

model_flash.eval()
start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # with torch.no_grad():
    with torch.inference_mode():
        # raise error if no optimized kernel is available
        with torch.backends.cuda.sdp_kernel(
            enable_flash=True, enable_math=True, enable_mem_efficient=True
        ):
            outputs = model_flash(**batch)
        # print(outputs)
        # break
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
res2["cuda time"] = end - start


res2


You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


{'accuracy': 0.7775647171620326,
 'f1': 0.8555417185554172,
 'cpu time': 13.519189357757568,
 'cuda time': 3.119647264480591,
 'size': 486.7061004638672}

In [40]:
import json
with open("results/gpt2_cola_flash.json", "w") as json_file:
    json.dump(res2, json_file, indent=4)