In [1]:
# !yes y |pip uninstall torch torchvision
# !yes y | pip install --pre torch -f https://download.pytorch.org/whl/nightly/cu101/torch_nightly.html


# Fine-tune T5 cola

Tutorial : https://huggingface.co/docs/transformers/training

In [1]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
tokenizer = T5Tokenizer.from_pretrained('PavanNeerudu/t5-base-finetuned-cola')
model = T5ForConditionalGeneration.from_pretrained("PavanNeerudu/t5-base-finetuned-cola")



You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [2]:
text = "cola sentence: " + "The book on the table is mine."
encoded_input = tokenizer(text, return_tensors='pt')
generated_ids = model.generate(**encoded_input)
output_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(generated_ids[0])






tensor([   0, 9961,    1])


In [5]:
output_text

'acceptable'

In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset("glue","cola")
tokenizer = T5Tokenizer.from_pretrained('PavanNeerudu/t5-base-finetuned-cola')

def tokenize_function(examples):
    text_list = ["cola sentence1: " + examples["sentence"][i] for i in range(len(examples["sentence"]))]
    return tokenizer(text_list, truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
print(tokenized_datasets)
tokenized_datasets = tokenized_datasets.remove_columns(["sentence","idx"])
tokenized_datasets = tokenized_datasets.rename_column("label","labels")
tokenized_datasets.set_format("torch")

data_collator = DataCollatorWithPadding(tokenizer)



model.safetensors:  33%|###2      | 294M/892M [00:00<?, ?B/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 1063
    })
})


In [4]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle = True, batch_size=8, collate_fn = data_collator
)
train2_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=8, collate_fn = data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=8, collate_fn = data_collator
)


In [5]:
# from transformers import AdamW
# optimizer = AdamW(model.parameters(),lr=5e-5)




In [5]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)


cuda


In [11]:
from evaluate import load
import time
import os
metric = load("glue",config_name="mrpc")

model.to(device)
model.eval()

start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # encode_input = {}

    # for i in ['input_ids', 'token_type_ids', 'attention_mask']:
    #     encode_input[i] = batch[i].to(device)
    with torch.no_grad():
        generated_ids = model.generate(**batch)
        output_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        
        # print(output_text)
    
    # logits = outputs.logits
    # predictions = torch.argmax(logits, dim=-1)
    predictions = [1 if output_text[i] == "acceptable" else 0 for i in range(len(output_text))]
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
res = metric.compute()
res[f"{device} time"] = end-start
res




{'accuracy': 0.8312559923298178,
 'f1': 0.8828229027962716,
 'cuda time': 11.242987871170044}

In [12]:
from evaluate import load
import time
import os
metric = load("glue",config_name="mrpc")

device = torch.device("cpu")
model.to(device)
model.eval()

start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        generated_ids = model.generate(**batch)
        output_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

        # output_text = [tokenizer.decode(generated_ids[i], skip_special_tokens=True) for i in range(len(generated_ids))]
        
        # print(output_text)
    
    # logits = outputs.logits
    # predictions = torch.argmax(logits, dim=-1)
    predictions = [1 if output_text[i] == "acceptable" else 0 for i in range(len(output_text))]
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
res[f"cpu time"] = end-start

res



{'accuracy': 0.8312559923298178,
 'f1': 0.8828229027962716,
 'cuda time': 11.242987871170044,
 'cpu time': 38.5801739692688}

In [13]:
import pickle
# with open("./models/t5_cola.pkl", "wb") as f:
#     pickle.dump(model, f)


In [6]:

def get_model_size(model):
    param_size = sum(p.numel() * p.element_size() for p in model.parameters())
    buffer_size = sum(b.numel() * b.element_size() for b in model.buffers())
    total_size = param_size + buffer_size  # Total size in bytes
    return total_size / (1024 ** 2)  # Convert to MB


In [14]:
import torch
def get_model_size(model):
    param_size = sum(p.numel() * p.element_size() for p in model.parameters())
    buffer_size = sum(b.numel() * b.element_size() for b in model.buffers())
    total_size = param_size + buffer_size  # Total size in bytes
    return total_size / (1024 ** 2)  # Convert to MB

size_in_mb = get_model_size(model)
res["size"] = size_in_mb
res

{'accuracy': 0.8312559923298178,
 'f1': 0.8828229027962716,
 'cuda time': 11.242987871170044,
 'cpu time': 38.5801739692688,
 'size': 850.3095703125}

In [15]:
import json
with open("results/t5_cola.json", "w") as json_file:
    json.dump(res, json_file, indent=4)


### Quantization
Tutorial: https://pytorch.org/tutorials/recipes/quantization.html

#### dynamic quantization

In [16]:
model = T5ForConditionalGeneration.from_pretrained("PavanNeerudu/t5-base-finetuned-cola")

device = "cpu"
model_dynamic_quantized_int8 = torch.quantization.quantize_dynamic(
    model, qconfig_spec={torch.nn.Linear}, dtype=torch.qint8
)

In [17]:
device = "cpu"
model_dynamic_quantized_int8.to(device)

model_dynamic_quantized_int8.eval()
model_dynamic_quantized_int8.to(device)

start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        generated_ids = model_dynamic_quantized_int8.generate(**batch)
        output_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

        # output_text = [tokenizer.decode(generated_ids[i], skip_special_tokens=True) for i in range(len(generated_ids))]
        
        # print(output_text)
    
    # logits = outputs.logits
    # predictions = torch.argmax(logits, dim=-1)
    predictions = [1 if output_text[i] == "acceptable" else 0 for i in range(len(output_text))]
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
res2 = metric.compute()
res2["cpu time"] = end - start
res2

{'accuracy': 0.8125599232981783,
 'f1': 0.8753586228881096,
 'cpu time': 16.79375648498535}

In [18]:
res2["cuda time"] = None
size_in_mb = get_model_size(model_dynamic_quantized_int8)
res2["size"] = size_in_mb
res2

{'accuracy': 0.8125599232981783,
 'f1': 0.8753586228881096,
 'cpu time': 16.79375648498535,
 'cuda time': None,
 'size': 94.3095703125}

In [19]:
import json
with open("results/t5_cola_dynamic_qint8.json", "w") as json_file:
    json.dump(res2, json_file, indent=4)
# torch.save(model_dynamic_quantized, "./models/t5_cola_dynamic_qint8")

# with open("./models/bert_int8.pkl", "wb") as f:
#     pickle.dump(model, f)

In [20]:

device = "cpu"
model_dynamic_quantized_float16 = torch.quantization.quantize_dynamic(
    model, qconfig_spec={torch.nn.Linear}, dtype=torch.float16
)

In [22]:
device = "cpu"
model_dynamic_quantized_float16.to(device)

model_dynamic_quantized_float16.eval()
start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        generated_ids = model_dynamic_quantized_float16.generate(**batch)
        output_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

        # output_text = [tokenizer.decode(generated_ids[i], skip_special_tokens=True) for i in range(len(generated_ids))]
        
        # print(output_text)
    
    # logits = outputs.logits
    # predictions = torch.argmax(logits, dim=-1)
    predictions = [1 if output_text[i] == "acceptable" else 0 for i in range(len(output_text))] 
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
res2 = metric.compute()
res2["cpu time"] = end - start
res2



{'accuracy': 0.8312559923298178,
 'f1': 0.8828229027962716,
 'cpu time': 26.936930179595947}

In [23]:
res2["cuda time"] = None
size_in_mb = get_model_size(model_dynamic_quantized_float16)
res2["size"] = size_in_mb
res2

{'accuracy': 0.8312559923298178,
 'f1': 0.8828229027962716,
 'cpu time': 26.936930179595947,
 'cuda time': None,
 'size': 94.3095703125}

In [24]:
import json
with open("results/t5_cola_dynamic_float16.json", "w") as json_file:
    json.dump(res2, json_file, indent=4)
# torch.save(model_dynamic_quantized, "./models/t5_cola_dynamic_qint8")

# with open("./models/bert_float16.pkl", "wb") as f:
#     pickle.dump(model, f)

### Model Prunning
Tutorial: https://pytorch.org/tutorials/intermediate/pruning_tutorial.html

##### L1-Norm Unstructure Prunning

In [7]:
import torch.nn.utils.prune as prune
model_prun_unstructure = T5ForConditionalGeneration.from_pretrained("PavanNeerudu/t5-base-finetuned-cola")
# model.bert.embeddings

In [26]:
# model_prun_unstructure

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [8]:
from evaluate import load
import time
import os

prun_data = {"percent":[],"f1":[],"cuda time":[],"cpu time":[],"accuracy":[],"type":[],}
for i in range(1,4):
    print(f"prun percent {i*10}%")
    metric = load("glue",config_name="mrpc")
    pruning_amount=i/10
    model_prun_unstructure = T5ForConditionalGeneration.from_pretrained("PavanNeerudu/t5-base-finetuned-cola")
    
    for name, module in model_prun_unstructure.named_modules():
        if hasattr(module, "weight"):  # Check if the module has a weight parameter
            prune.l1_unstructured(module, name="weight", amount=pruning_amount)
            prune.remove(module, name="weight")

    
    device = "cuda"
    model_prun_unstructure.to(device)
    
    model_prun_unstructure.eval()
    start = time.time()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
    
            generated_ids = model_prun_unstructure.generate(**batch)
            output_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    
            # output_text = [tokenizer.decode(generated_ids[i], skip_special_tokens=True) for i in range(len(generated_ids))]
            
            # print(output_text)
        
        # logits = outputs.logits
        # predictions = torch.argmax(logits, dim=-1)
        predictions = [1 if output_text[i] == "acceptable" else 0 for i in range(len(output_text))] 
        
        metric.add_batch(predictions=predictions, references=batch["labels"])
    end = time.time()
    metric_res = metric.compute()
    print(metric_res)
    prun_data["cuda time"].append(end - start)
    prun_data["f1"].append(metric_res["f1"])
    prun_data["accuracy"].append(metric_res["accuracy"])
    prun_data["type"].append("unstructure")
    prun_data["percent"].append(i*10)


    # device = "cpu"
    # model_prun_unstructure.to(device)
    
    # model_prun_unstructure.eval()
    # start = time.time()
    # for batch in eval_dataloader:
    #     batch = {k: v.to(device) for k, v in batch.items()}
    #     with torch.no_grad():
    
    #         generated_ids = model_prun_unstructure.generate(**batch)
    #         output_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    
    #         # output_text = [tokenizer.decode(generated_ids[i], skip_special_tokens=True) for i in range(len(generated_ids))]
            
    #         # print(output_text)
        
    #     # logits = outputs.logits
    #     # predictions = torch.argmax(logits, dim=-1)
    #     predictions = [1 if output_text[i] == "acceptable" else 0 for i in range(len(output_text))] 
        
    #     metric.add_batch(predictions=predictions, references=batch["labels"])
    # end = time.time()
    # prun_data["cpu time"].append(end - start)

prun_data



prun percent 10%




{'accuracy': 0.6740172579098753, 'f1': 0.7680763983628922}
prun percent 20%




{'accuracy': 0.5407478427612655, 'f1': 0.6266562743569758}
prun percent 30%




{'accuracy': 0.3796740172579099, 'f1': 0.30802139037433157}


{'percent': [10, 20, 30],
 'f1': [0.7680763983628922, 0.6266562743569758, 0.30802139037433157],
 'cuda time': [20.210673093795776, 41.29215908050537, 53.83593821525574],
 'cpu time': [],
 'accuracy': [0.6740172579098753, 0.5407478427612655, 0.3796740172579099],
 'type': ['unstructure', 'unstructure', 'unstructure']}

In [None]:
prun_data

In [9]:
import json
with open("results/t5_cola_prun_unstructure.json", "w") as json_file:
    json.dump(prun_data, json_file, indent=4)
# torch.save(model_dynamic_quantized, "./models/t5_cola_dynamic_qint8")


##### Prun structure 


In [10]:

prun_data = {"percent":[],"f1":[],"cuda time":[],"cpu time":[],"accuracy":[],"type":[],}
for i in range(1,4):
    print(f"prun percent {i*10}%")
    metric = load("glue",config_name="mrpc")

    model_prun_structure = T5ForConditionalGeneration.from_pretrained("PavanNeerudu/t5-base-finetuned-cola")
    amt = i/10

    for name, module in model_prun_structure.named_modules():
        if hasattr(module, "weight") and module.weight.ndim > 1:  # Check if the module has a weight parameter
            prune.ln_structured(module, name="weight", amount=amt, n=2,dim=0)
            prune.remove(module, name="weight")

    
    device = "cuda"
    model_prun_structure.to(device)
    
    model_prun_structure.eval()
    start = time.time()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
    
            generated_ids = model_prun_structure.generate(**batch)
            output_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    
            # output_text = [tokenizer.decode(generated_ids[i], skip_special_tokens=True) for i in range(len(generated_ids))]
            
            # print(output_text)
        
        # logits = outputs.logits
        # predictions = torch.argmax(logits, dim=-1)
        predictions = [1 if output_text[i] == "acceptable" else 0 for i in range(len(output_text))] 
        
        metric.add_batch(predictions=predictions, references=batch["labels"])
    end = time.time()
    metric_res = metric.compute()
    print(metric_res)
    prun_data["cuda time"].append(end - start)
    prun_data["f1"].append(metric_res["f1"])
    prun_data["accuracy"].append(metric_res["accuracy"])
    prun_data["type"].append("ln_structure")
    prun_data["percent"].append(i*10)


    
    # device = "cpu"
    # model_prun_structure.to(device)
    
    # model_prun_structure.eval()
    # start = time.time()
    # for batch in eval_dataloader:
    #     batch = {k: v.to(device) for k, v in batch.items()}
    #     with torch.no_grad():
    
    #         generated_ids = model_prun_structure.generate(**batch)
    #         output_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    
    #         # output_text = [tokenizer.decode(generated_ids[i], skip_special_tokens=True) for i in range(len(generated_ids))]
            
    #         # print(output_text)
        
    #     # logits = outputs.logits
    #     # predictions = torch.argmax(logits, dim=-1)
    #     predictions = [1 if output_text[i] == "acceptable" else 0 for i in range(len(output_text))] 
        
    #     metric.add_batch(predictions=predictions, references=batch["labels"])
    # end = time.time()
    # prun_data["cpu time"].append(end - start)

prun_data

prun percent 10%




{'accuracy': 0.3087248322147651, 'f1': 0.0}
prun percent 20%




{'accuracy': 0.3087248322147651, 'f1': 0.0}
prun percent 30%




{'accuracy': 0.3087248322147651, 'f1': 0.0}


{'percent': [10, 20, 30],
 'f1': [0.0, 0.0, 0.0],
 'cuda time': [6.5053088665008545, 14.838896989822388, 39.54865026473999],
 'cpu time': [],
 'accuracy': [0.3087248322147651, 0.3087248322147651, 0.3087248322147651],
 'type': ['ln_structure', 'ln_structure', 'ln_structure']}

In [11]:
import json
with open("results/bert_mprc_prun_structure.json", "w") as json_file:
    json.dump(prun_data, json_file, indent=4)
# torch.save(model_dynamic_quantized, "./models/t5_cola_dynamic_qint8")


In [25]:
# prun_data

### Flash Attention 

In [12]:
import time
import torch
from evaluate import load

# model_sdpa = T5ForConditionalGeneration.from_pretrained("PavanNeerudu/t5-base-finetuned-cola" ,attn_implementation="sdpa")
# metric = load("glue","cola")

# device = "cpu"
# model_sdpa.to(device)

# model_sdpa.eval()
# start = time.time()
# for batch in eval_dataloader:
#     batch = {k: v.to(device) for k, v in batch.items()}
#     # with torch.no_grad():
#     with torch.inference_mode():
#         # raise error if no optimized kernel is available
#         with torch.backends.cuda.sdp_kernel(
#             enable_flash=True, enable_math=True, enable_mem_efficient=True
#         ):
#             outputs = model_sdpa(**batch)
#         # print(outputs)
#         # break
#     logits = outputs.logits
#     predictions = torch.argmax(logits, dim=-1)
#     metric.add_batch(predictions=predictions, references=batch["labels"])
# end = time.time()
# res2 = metric.compute()
# res2["cpu time"] = end - start
# res2



In [35]:

# device = "cuda"
# metric = load("glue","cola")

# model_sdpa.to(device)

# model_sdpa.eval()
# start = time.time()
# for batch in eval_dataloader:
#     batch = {k: v.to(device) for k, v in batch.items()}
#     # with torch.no_grad():
#     with torch.inference_mode():
#         # raise error if no optimized kernel is available
#         with torch.backends.cuda.sdp_kernel(
#             enable_flash=True, enable_math=True, enable_mem_efficient=True
#         ):
#             outputs = model_sdpa(**batch)
#         # print(outputs)
#         # break
#     logits = outputs.logits
#     predictions = torch.argmax(logits, dim=-1)
#     metric.add_batch(predictions=predictions, references=batch["labels"])
# end = time.time()
# # res2 = metric.compute()
# res2["cuda time"] = end - start
# res2
# size_in_mb = get_model_size(model_qat)
# res2["size"] = size_in_mb
# res2

  self.gen = func(*args, **kwds)


NameError: name 'model_qat' is not defined

In [None]:
# import json
# import pickle
# with open("results/t5_cola_sdpa.json", "w") as json_file:
#     json.dump(res2, json_file, indent=4)
# # torch.save(model_dynamic_quantized, "./models/t5_cola_dynamic_qint8")

# # with open("./models/bert_sdpa.pkl", "wb") as f:
# #     pickle.dump(model_sdpa, f)

In [13]:
model_eager = T5ForConditionalGeneration.from_pretrained("PavanNeerudu/t5-base-finetuned-cola" ,attn_implementation="eager")

device = "cpu"
model_eager.to(device)
metric = load("glue","cola")

model_eager.eval()
start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # with torch.no_grad():
    with torch.inference_mode():
        # raise error if no optimized kernel is available
        with torch.backends.cuda.sdp_kernel(
            enable_flash=True, enable_math=True, enable_mem_efficient=True
        ):
            
            generated_ids = model_eager.generate(**batch)
            output_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    
    predictions = [1 if output_text[i] == "acceptable" else 0 for i in range(len(output_text))] 
        
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
res2 = metric.compute()
res2["cpu time"] = end - start
res2



  self.gen = func(*args, **kwds)


{'matthews_correlation': np.float64(0.5891424967516642),
 'cpu time': 33.72944641113281}

In [14]:

device = "cuda"
model_eager.to(device)
metric = load("glue","cola")

model_eager.eval()
start = time.time()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # with torch.no_grad():
    with torch.inference_mode():
        # raise error if no optimized kernel is available
        with torch.backends.cuda.sdp_kernel(
            enable_flash=True, enable_math=True, enable_mem_efficient=True
        ):
 
            generated_ids = model_eager.generate(**batch)
            output_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    
    predictions = [1 if output_text[i] == "acceptable" else 0 for i in range(len(output_text))] 
    metric.add_batch(predictions=predictions, references=batch["labels"])
end = time.time()
# res2 = metric.compute()
res2["cuda time"] = end - start
res2


  self.gen = func(*args, **kwds)


{'matthews_correlation': np.float64(0.5891424967516642),
 'cpu time': 33.72944641113281,
 'cuda time': 7.704435586929321}

In [15]:
import json
with open("results/t5_cola_eager.json", "w") as json_file:
    json.dump(res2, json_file, indent=4)
# torch.save(model_dynamic_quantized, "./models/t5_cola_dynamic_qint8")

# with open("./models/bert_eager.pkl", "wb") as f:
#     pickle.dump(model_eager, f)