In [1]:
# !yes y |pip uninstall torch torchvision
# !yes y | pip install --pre torch -f https://download.pytorch.org/whl/nightly/cu101/torch_nightly.html


# Fine-tune T5 MRPC

Tutorial : https://huggingface.co/docs/transformers/training

In [14]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
tokenizer = T5Tokenizer.from_pretrained('PavanNeerudu/t5-base-finetuned-mrpc')
model = T5ForConditionalGeneration.from_pretrained("PavanNeerudu/t5-base-finetuned-mrpc")



In [15]:
text = "mrpc sentence1: " + "testing" + "sentence 2: " + "checking"
encoded_input = tokenizer(text, return_tensors='pt')
generated_ids = model.generate(**encoded_input)
output_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(generated_ids[0])




tensor([   0, 7072,    1])


In [16]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset("glue","mrpc")
tokenizer = T5Tokenizer.from_pretrained('PavanNeerudu/t5-base-finetuned-mrpc')

def tokenize_function(examples):
    text_list = ["mrpc sentence1: " + examples["sentence1"][i]+ "sentence 2: "+examples["sentence2"][i] for i in range(len(examples["sentence1"]))]
    return tokenizer(text_list, truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
print(tokenized_datasets)
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1","sentence2","idx"])
tokenized_datasets = tokenized_datasets.rename_column("label","labels")
tokenized_datasets.set_format("torch")

data_collator = DataCollatorWithPadding(tokenizer)



DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 1725
    })
})


In [17]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle = True, batch_size=8, collate_fn = data_collator
)
train2_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=8, collate_fn = data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["test"], batch_size=8, collate_fn = data_collator
)


In [5]:
# from transformers import AdamW
# optimizer = AdamW(model.parameters(),lr=5e-5)




In [18]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)


cuda


In [40]:
from evaluate import load
import time
import os
metric = load("glue",config_name="mrpc")

model.to(device)
model.eval()

start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # encode_input = {}

    # for i in ['input_ids', 'token_type_ids', 'attention_mask']:
    #     encode_input[i] = batch[i].to(device)
    with torch.no_grad():
        generated_ids = model.generate(**batch)
        output_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        
        # print(output_text)
    
    # logits = outputs.logits
    # predictions = torch.argmax(logits, dim=-1)
    predictions = [1 if output_text[i] == "equivalent" else 0 for i in range(len(output_text))]
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
res = metric.compute()
res[f"{device} time"] = end-start
res


{'accuracy': 0.8597101449275363,
 'f1': 0.8976311336717429,
 'cuda time': 22.146294116973877}

In [34]:
from evaluate import load
import time
import os
metric = load("glue",config_name="mrpc")

device = torch.device("cpu")
model.to(device)
model.eval()

start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        generated_ids = model.generate(**batch)
        output_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

        # output_text = [tokenizer.decode(generated_ids[i], skip_special_tokens=True) for i in range(len(generated_ids))]
        
        # print(output_text)
    
    # logits = outputs.logits
    # predictions = torch.argmax(logits, dim=-1)
    predictions = [1 if output_text[i] == "equivalent" else 0 for i in range(len(output_text))]
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
res[f"cpu time"] = end-start

res



{'accuracy': 0.8597101449275363,
 'f1': 0.8976311336717429,
 'cuda time': 23.378018856048584,
 'cpu time': 159.26379823684692}

In [35]:
import pickle
# with open("./models/t5_mrpc.pkl", "wb") as f:
#     pickle.dump(model, f)


In [19]:
def get_model_size(model):
    param_size = sum(p.numel() * p.element_size() for p in model.parameters())
    buffer_size = sum(b.numel() * b.element_size() for b in model.buffers())
    total_size = param_size + buffer_size  # Total size in bytes
    return total_size / (1024 ** 2)  # Convert to MB


In [6]:
import torch
def get_model_size(model):
    param_size = sum(p.numel() * p.element_size() for p in model.parameters())
    buffer_size = sum(b.numel() * b.element_size() for b in model.buffers())
    total_size = param_size + buffer_size  # Total size in bytes
    return total_size / (1024 ** 2)  # Convert to MB

size_in_mb = get_model_size(model)
res["size"] = size_in_mb
res

NameError: name 'res' is not defined

In [37]:
import json
with open("results/t5_mrpc.json", "w") as json_file:
    json.dump(res, json_file, indent=4)


### Quantization
Tutorial: https://pytorch.org/tutorials/recipes/quantization.html

#### dynamic quantization

In [44]:
model = T5ForConditionalGeneration.from_pretrained("PavanNeerudu/t5-base-finetuned-mrpc")

device = "cpu"
model_dynamic_quantized_int8 = torch.quantization.quantize_dynamic(
    model, qconfig_spec={torch.nn.Linear}, dtype=torch.qint8
)

In [45]:
device = "cpu"
model_dynamic_quantized_int8.to(device)

model_dynamic_quantized_int8.eval()
model_dynamic_quantized_int8.to(device)

start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        generated_ids = model_dynamic_quantized_int8.generate(**batch)
        output_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

        # output_text = [tokenizer.decode(generated_ids[i], skip_special_tokens=True) for i in range(len(generated_ids))]
        
        # print(output_text)
    
    # logits = outputs.logits
    # predictions = torch.argmax(logits, dim=-1)
    predictions = [1 if output_text[i] == "equivalent" else 0 for i in range(len(output_text))]
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
res2 = metric.compute()
res2["cpu time"] = end - start
res2

{'accuracy': 0.8525482434438397,
 'f1': 0.8845855925639039,
 'cpu time': 90.7253794670105}

In [46]:
res2["cuda time"] = None
size_in_mb = get_model_size(model_dynamic_quantized_int8)
res2["size"] = size_in_mb
res2

{'accuracy': 0.8525482434438397,
 'f1': 0.8845855925639039,
 'cpu time': 90.7253794670105,
 'cuda time': None,
 'size': 94.3095703125}

In [47]:
import json
with open("results/t5_mrpc_dynamic_qint8.json", "w") as json_file:
    json.dump(res2, json_file, indent=4)
# torch.save(model_dynamic_quantized, "./models/t5_mrpc_dynamic_qint8")

# with open("./models/bert_int8.pkl", "wb") as f:
#     pickle.dump(model, f)

In [48]:

device = "cpu"
model_dynamic_quantized_float16 = torch.quantization.quantize_dynamic(
    model, qconfig_spec={torch.nn.Linear}, dtype=torch.float16
)

In [50]:
device = "cpu"
model_dynamic_quantized_float16.to(device)

model_dynamic_quantized_float16.eval()
start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        generated_ids = model_dynamic_quantized_float16.generate(**batch)
        output_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

        # output_text = [tokenizer.decode(generated_ids[i], skip_special_tokens=True) for i in range(len(generated_ids))]
        
        # print(output_text)
    
    # logits = outputs.logits
    # predictions = torch.argmax(logits, dim=-1)
    predictions = [1 if output_text[i] == "equivalent" else 0 for i in range(len(output_text))] 
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
res2 = metric.compute()
res2["cpu time"] = end - start
res2



{'accuracy': 0.8597101449275363,
 'f1': 0.8976311336717429,
 'cpu time': 135.09448504447937}

In [51]:
res2["cuda time"] = None
size_in_mb = get_model_size(model_dynamic_quantized_float16)
res2["size"] = size_in_mb
res2

{'accuracy': 0.8597101449275363,
 'f1': 0.8976311336717429,
 'cpu time': 135.09448504447937,
 'cuda time': None,
 'size': 94.3095703125}

In [52]:
import json
with open("results/t5_mrpc_dynamic_float16.json", "w") as json_file:
    json.dump(res2, json_file, indent=4)
# torch.save(model_dynamic_quantized, "./models/t5_mrpc_dynamic_qint8")

# with open("./models/bert_float16.pkl", "wb") as f:
#     pickle.dump(model, f)

### Model Prunning
Tutorial: https://pytorch.org/tutorials/intermediate/pruning_tutorial.html

##### L1-Norm Unstructure Prunning

In [20]:
import torch.nn.utils.prune as prune
model_prun_unstructure = T5ForConditionalGeneration.from_pretrained("PavanNeerudu/t5-base-finetuned-mrpc")
# model.bert.embeddings

In [54]:
model_prun_unstructure

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [10]:
from evaluate import load
import time
import os

prun_data = {"percent":[],"f1":[],"cuda time":[],"cpu time":[],"accuracy":[],"type":[],}
for i in range(1,4):
    print(f"prun percent {i*10}%")
    metric = load("glue",config_name="mrpc")
    pruning_amount=i/10
    model_prun_unstructure = T5ForConditionalGeneration.from_pretrained("PavanNeerudu/t5-base-finetuned-mrpc")
    
    for name, module in model_prun_unstructure.named_modules():
        if hasattr(module, "weight"):  # Check if the module has a weight parameter
            prune.l1_unstructured(module, name="weight", amount=pruning_amount)
            prune.remove(module, name="weight")

    
    device = "cuda"
    model_prun_unstructure.to(device)
    
    model_prun_unstructure.eval()
    start = time.time()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
    
            generated_ids = model_prun_unstructure.generate(**batch)
            output_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    
            # output_text = [tokenizer.decode(generated_ids[i], skip_special_tokens=True) for i in range(len(generated_ids))]
            
            # print(output_text)
        
        # logits = outputs.logits
        # predictions = torch.argmax(logits, dim=-1)
        predictions = [1 if output_text[i] == "equivalent" else 0 for i in range(len(output_text))] 
        
        metric.add_batch(predictions=predictions, references=batch["labels"])
    end = time.time()
    metric_res = metric.compute()
    print(metric_res)
    prun_data["cuda time"].append(end - start)
    prun_data["f1"].append(metric_res["f1"])
    prun_data["accuracy"].append(metric_res["accuracy"])
    prun_data["type"].append("unstructure")
    prun_data["percent"].append(i*10)


    # device = "cpu"
    # model_prun_unstructure.to(device)
    
    # model_prun_unstructure.eval()
    # start = time.time()
    # for batch in eval_dataloader:
    #     batch = {k: v.to(device) for k, v in batch.items()}
    #     with torch.no_grad():
    
    #         generated_ids = model_prun_unstructure.generate(**batch)
    #         output_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    
    #         # output_text = [tokenizer.decode(generated_ids[i], skip_special_tokens=True) for i in range(len(generated_ids))]
            
    #         # print(output_text)
        
    #     # logits = outputs.logits
    #     # predictions = torch.argmax(logits, dim=-1)
    #     predictions = [1 if output_text[i] == "equivalent" else 0 for i in range(len(output_text))] 
        
    #     metric.add_batch(predictions=predictions, references=batch["labels"])
    # end = time.time()
    # prun_data["cpu time"].append(end - start)

prun_data



prun percent 10%




model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

{'accuracy': 0.7501449275362319, 'f1': 0.8088691796008869}
prun percent 20%




{'accuracy': 0.392463768115942, 'f1': 0.1838006230529595}
prun percent 30%




{'accuracy': 0.33507246376811595, 'f1': 0.0}


{'percent': [10, 20, 30],
 'f1': [0.8088691796008869, 0.1838006230529595, 0.0],
 'cuda time': [24.78835391998291, 27.711055517196655, 102.74153995513916],
 'cpu time': [],
 'accuracy': [0.7501449275362319, 0.392463768115942, 0.33507246376811595],
 'type': ['unstructure', 'unstructure', 'unstructure']}

In [None]:
prun_data

In [11]:
import json
with open("results/t5_mrpc_prun_unstructure.json", "w") as json_file:
    json.dump(prun_data, json_file, indent=4)
# torch.save(model_dynamic_quantized, "./models/t5_mrpc_dynamic_qint8")


##### Prun structure 


In [15]:

prun_data = {"percent":[],"f1":[],"cuda time":[],"cpu time":[],"accuracy":[],"type":[],}
for i in range(1,4):
    print(f"prun percent {i*10}%")
    metric = load("glue",config_name="mrpc")

    model_prun_structure = T5ForConditionalGeneration.from_pretrained("PavanNeerudu/t5-base-finetuned-mrpc")
    amt = i/10

    for name, module in model_prun_structure.named_modules():
        if hasattr(module, "weight") and module.weight.ndim > 1:  # Check if the module has a weight parameter
            prune.ln_structured(module, name="weight", amount=amt, n=2,dim=0)
            prune.remove(module, name="weight")

    
    device = "cuda"
    model_prun_structure.to(device)
    
    model_prun_structure.eval()
    start = time.time()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
    
            generated_ids = model_prun_structure.generate(**batch)
            output_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    
            # output_text = [tokenizer.decode(generated_ids[i], skip_special_tokens=True) for i in range(len(generated_ids))]
            
            # print(output_text)
        
        # logits = outputs.logits
        # predictions = torch.argmax(logits, dim=-1)
        predictions = [1 if output_text[i] == "equivalent" else 0 for i in range(len(output_text))] 
        
        metric.add_batch(predictions=predictions, references=batch["labels"])
    end = time.time()
    metric_res = metric.compute()
    print(metric_res)
    prun_data["cuda time"].append(end - start)
    prun_data["f1"].append(metric_res["f1"])
    prun_data["accuracy"].append(metric_res["accuracy"])
    prun_data["type"].append("ln_structure")
    prun_data["percent"].append(i*10)


    
    # device = "cpu"
    # model_prun_structure.to(device)
    
    # model_prun_structure.eval()
    # start = time.time()
    # for batch in eval_dataloader:
    #     batch = {k: v.to(device) for k, v in batch.items()}
    #     with torch.no_grad():
    
    #         generated_ids = model_prun_structure.generate(**batch)
    #         output_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    
    #         # output_text = [tokenizer.decode(generated_ids[i], skip_special_tokens=True) for i in range(len(generated_ids))]
            
    #         # print(output_text)
        
    #     # logits = outputs.logits
    #     # predictions = torch.argmax(logits, dim=-1)
    #     predictions = [1 if output_text[i] == "equivalent" else 0 for i in range(len(output_text))] 
        
    #     metric.add_batch(predictions=predictions, references=batch["labels"])
    # end = time.time()
    # prun_data["cpu time"].append(end - start)

prun_data

prun percent 10%
{'accuracy': 0.33507246376811595, 'f1': 0.0}
prun percent 20%




{'accuracy': 0.33507246376811595, 'f1': 0.0}
prun percent 30%




{'accuracy': 0.33507246376811595, 'f1': 0.0}


{'percent': [10, 20, 30],
 'f1': [0.0, 0.0, 0.0],
 'cuda time': [12.68371295928955, 32.76889371871948, 72.02653193473816],
 'cpu time': [],
 'accuracy': [0.33507246376811595, 0.33507246376811595, 0.33507246376811595],
 'type': ['ln_structure', 'ln_structure', 'ln_structure']}

In [None]:
import json
with open("results/bert_mprc_prun_structure.json", "w") as json_file:
    json.dump(prun_data, json_file, indent=4)
# torch.save(model_dynamic_quantized, "./models/t5_mrpc_dynamic_qint8")


In [25]:
# prun_data

### Flash Attention 

In [22]:
import time
import torch
from evaluate import load

# model_sdpa = T5ForConditionalGeneration.from_pretrained("PavanNeerudu/t5-base-finetuned-mrpc" ,attn_implementation="sdpa")
# metric = load("glue","mrpc")

# device = "cpu"
# model_sdpa.to(device)

# model_sdpa.eval()
# start = time.time()
# for batch in eval_dataloader:
#     batch = {k: v.to(device) for k, v in batch.items()}
#     # with torch.no_grad():
#     with torch.inference_mode():
#         # raise error if no optimized kernel is available
#         with torch.backends.cuda.sdp_kernel(
#             enable_flash=True, enable_math=True, enable_mem_efficient=True
#         ):
#             outputs = model_sdpa(**batch)
#         # print(outputs)
#         # break
#     logits = outputs.logits
#     predictions = torch.argmax(logits, dim=-1)
#     metric.add_batch(predictions=predictions, references=batch["labels"])
# end = time.time()
# res2 = metric.compute()
# res2["cpu time"] = end - start
# res2



In [35]:

# device = "cuda"
# metric = load("glue","mrpc")

# model_sdpa.to(device)

# model_sdpa.eval()
# start = time.time()
# for batch in eval_dataloader:
#     batch = {k: v.to(device) for k, v in batch.items()}
#     # with torch.no_grad():
#     with torch.inference_mode():
#         # raise error if no optimized kernel is available
#         with torch.backends.cuda.sdp_kernel(
#             enable_flash=True, enable_math=True, enable_mem_efficient=True
#         ):
#             outputs = model_sdpa(**batch)
#         # print(outputs)
#         # break
#     logits = outputs.logits
#     predictions = torch.argmax(logits, dim=-1)
#     metric.add_batch(predictions=predictions, references=batch["labels"])
# end = time.time()
# # res2 = metric.compute()
# res2["cuda time"] = end - start
# res2
# size_in_mb = get_model_size(model_sdpa)
# res2["size"] = size_in_mb
# res2

  self.gen = func(*args, **kwds)


NameError: name 'model_qat' is not defined

In [None]:
# import json
# import pickle
# with open("results/t5_mrpc_sdpa.json", "w") as json_file:
#     json.dump(res2, json_file, indent=4)
# # torch.save(model_dynamic_quantized, "./models/t5_mrpc_dynamic_qint8")

# # with open("./models/bert_sdpa.pkl", "wb") as f:
# #     pickle.dump(model_sdpa, f)

In [10]:
model_eager = T5ForConditionalGeneration.from_pretrained("PavanNeerudu/t5-base-finetuned-mrpc" ,attn_implementation="eager")

device = "cpu"
model_eager.to(device)
metric = load("glue","mrpc")

model_eager.eval()
start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # with torch.no_grad():
    with torch.inference_mode():
        # raise error if no optimized kernel is available
        with torch.backends.cuda.sdp_kernel(
            enable_flash=True, enable_math=True, enable_mem_efficient=True
        ):
            
            generated_ids = model_eager.generate(**batch)
            output_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    
    predictions = [1 if output_text[i] == "equivalent" else 0 for i in range(len(output_text))] 
        
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
res2 = metric.compute()
res2["cpu time"] = end - start
res2





{'accuracy': 0.8597101449275363,
 'f1': 0.8976311336717429,
 'cpu time': 142.18140983581543}

In [12]:

device = "cuda"
model_eager.to(device)
metric = load("glue","mrpc")

model_eager.eval()
start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # with torch.no_grad():
    with torch.inference_mode():
        # raise error if no optimized kernel is available
        with torch.backends.cuda.sdp_kernel(
            enable_flash=True, enable_math=True, enable_mem_efficient=True
        ):
 
            generated_ids = model_eager.generate(**batch)
            output_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    
    predictions = [1 if output_text[i] == "equivalent" else 0 for i in range(len(output_text))] 
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
# res2 = metric.compute()
res2["cuda time"] = end - start
res2


  self.gen = func(*args, **kwds)


{'accuracy': 0.8597101449275363,
 'f1': 0.8976311336717429,
 'cpu time': 142.18140983581543,
 'cuda time': 19.224152326583862}

In [13]:
import json
with open("results/t5_mrpc_eager.json", "w") as json_file:
    json.dump(res2, json_file, indent=4)
# torch.save(model_dynamic_quantized, "./models/t5_mrpc_dynamic_qint8")

# with open("./models/bert_eager.pkl", "wb") as f:
#     pickle.dump(model_eager, f)