In [1]:
#!pip install datasets==1.4.1

In [2]:
#!pip install transformers

In [3]:
#!pip install accelerate -U

In [4]:
#!python -m pip install -U nn_pruning

In [5]:
#!nvidia-smi

In [6]:
#!sudo kill -9 30689

In [7]:
import torch
import datasets
import transformers
datasets.logging.set_verbosity_error()
transformers.logging.set_verbosity_error()
print(f"Using transformers v{transformers.__version__} and datasets v{datasets.__version__} and torch v{torch.__version__}")

Using transformers v4.35.0 and datasets v1.4.1 and torch v2.1.0+cu121


In [8]:
from datasets import load_dataset

sst = load_dataset("glue", "sst2")


In [9]:
sst['train'][0]

{'idx': 0,
 'label': 0,
 'sentence': 'hide new secretions from the parental units '}

In [10]:
sst.rename_column("label", "labels")

DatasetDict({
    train: Dataset({
        features: ['sentence', 'idx', 'labels'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'idx', 'labels'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'idx', 'labels'],
        num_rows: 1821
    })
})

In [11]:
#!pip install --upgrade --quiet jupyter_client ipywidgets

In [12]:
from transformers import AutoTokenizer

bert_ckpt = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(bert_ckpt)

In [13]:
def tokenize_and_encode(examples):
    return tokenizer(examples['sentence'], truncation="only_second")

sst_enc = sst.map(tokenize_and_encode, batched=True)

In [14]:
#!pip install nn_pruning

In [15]:
from transformers import Trainer
from nn_pruning.sparse_trainer import SparseTrainer

class PruningTrainer(SparseTrainer, Trainer):
    def __init__(self, sparse_args, *args, **kwargs):
        Trainer.__init__(self, *args, **kwargs)
        SparseTrainer.__init__(self, sparse_args)

    def compute_loss(self, model, inputs, return_outputs=False):
        outputs = model(**inputs)
        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]
        loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
        self.metrics["ce_loss"] += float(loss)
        self.loss_counter += 1
        return (loss, outputs) if return_outputs else loss

In [16]:
from nn_pruning.patch_coordinator import SparseTrainingArguments

sparse_args = SparseTrainingArguments()
sparse_args

SparseTrainingArguments(mask_scores_learning_rate=0.01, dense_pruning_method='topK', attention_pruning_method='topK', ampere_pruning_method='disabled', attention_output_with_dense=True, bias_mask=True, mask_init='constant', mask_scale=0.0, dense_block_rows=1, dense_block_cols=1, attention_block_rows=1, attention_block_cols=1, initial_threshold=1.0, final_threshold=0.5, initial_warmup=1, final_warmup=2, initial_ampere_temperature=0.0, final_ampere_temperature=20.0, regularization='disabled', regularization_final_lambda=0.0, attention_lambda=1.0, dense_lambda=1.0, distil_teacher_name_or_path=None, distil_alpha_ce=0.5, distil_alpha_teacher=0.5, distil_temperature=2.0, final_finetune=False, layer_norm_patch=False, layer_norm_patch_steps=50000, layer_norm_patch_start_delta=0.99, gelu_patch=False, gelu_patch_steps=50000, linear_min_parameters=0.005, rewind_model_name_or_path=None)

In [17]:
hyperparams = {
    "dense_pruning_method": "topK:1d_alt",
    "attention_pruning_method": "topK",
    "initial_threshold": 1.0,
    "final_threshold": 0.5,
    "initial_warmup": 1,
    "final_warmup": 3,
    "attention_block_rows":32,
    "attention_block_cols":32,
    "attention_output_with_dense": 0
}

for k,v in hyperparams.items():
    if hasattr(sparse_args, k):
        setattr(sparse_args, k, v)
    else:
        print(f"sparse_args does not have argument {k}")

In [18]:
from transformers import TrainingArguments

batch_size = 16
learning_rate = 2e-5
num_train_epochs = 6
logging_steps = len(sst_enc["train"]) // batch_size
warmup_steps = logging_steps * num_train_epochs * 0.1

args = TrainingArguments(
    output_dir="checkpoints",
    evaluation_strategy="epoch",
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=learning_rate,
    weight_decay=0.01,
    logging_steps=logging_steps,
    save_strategy="epoch",
    disable_tqdm=False,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to=None
)

In [19]:
import torch
from transformers import AutoModelForSequenceClassification
from nn_pruning.patch_coordinator import ModelPatchingCoordinator

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

mpc = ModelPatchingCoordinator(
    sparse_args=sparse_args,
    device=device,
    cache_dir="checkpoints",
    logit_names="logits",
    teacher_constructor=None)

In [20]:
bert_model = AutoModelForSequenceClassification.from_pretrained(bert_ckpt).to(device)
mpc.patch_model(bert_model)

bert_model.save_pretrained("models/patched")

In [21]:
import numpy as np
from datasets import load_metric

accuracy_score = load_metric('accuracy')

def compute_metrics(pred):
    predictions, labels = pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy_score.compute(predictions=predictions, references=labels)

In [22]:
trainer = PruningTrainer(
    sparse_args=sparse_args,
    args=args,
    model=bert_model,
    train_dataset=sst_enc["train"],
    eval_dataset=sst_enc["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [23]:
trainer.set_patch_coordinator(mpc)

In [24]:
trainer.train();



Epoch,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second,Steps Per Second,Threshold,Regu Lambda,Ampere Temperature
1,0.2247,0.569062,0.768349,6.988,124.785,7.871,0.5,0.0,20.0
2,0.1473,0.330965,0.847477,6.967,125.162,7.894,0.5,0.0,20.0
3,0.1181,0.347085,0.888761,7.0027,124.523,7.854,0.5,0.0,20.0
4,0.0938,0.398867,0.897936,6.9928,124.701,7.865,0.5,0.0,20.0
5,0.0724,0.412107,0.901376,7.0057,124.47,7.851,0.5,0.0,20.0
6,0.0535,0.460766,0.905963,6.9901,124.747,7.868,0.5,0.0,20.0


In [25]:
output_model_path = "models/bert-base-uncased-finepruned-sst"
trainer.save_model(output_model_path)

In [26]:
mpc.compile_model(trainer.model)

(0, 144)

In [29]:
from nn_pruning.inference_model_patcher import optimize_model

prunebert_model = optimize_model(trainer.model, "dense")

removed heads 0, total_heads=144, percentage removed=0.0
bert.encoder.layer.0.intermediate.dense, sparsity = 50.00
bert.encoder.layer.0.output.dense, sparsity = 50.00
bert.encoder.layer.1.intermediate.dense, sparsity = 50.00
bert.encoder.layer.1.output.dense, sparsity = 50.00
bert.encoder.layer.2.intermediate.dense, sparsity = 50.00
bert.encoder.layer.2.output.dense, sparsity = 50.00
bert.encoder.layer.3.intermediate.dense, sparsity = 50.00
bert.encoder.layer.3.output.dense, sparsity = 50.00
bert.encoder.layer.4.intermediate.dense, sparsity = 50.00
bert.encoder.layer.4.output.dense, sparsity = 50.00
bert.encoder.layer.5.intermediate.dense, sparsity = 50.00
bert.encoder.layer.5.output.dense, sparsity = 50.00
bert.encoder.layer.6.intermediate.dense, sparsity = 50.00
bert.encoder.layer.6.output.dense, sparsity = 50.00
bert.encoder.layer.7.intermediate.dense, sparsity = 50.00
bert.encoder.layer.7.output.dense, sparsity = 50.00
bert.encoder.layer.8.intermediate.dense, sparsity = 50.00
bert.

In [30]:
prunebert_model.num_parameters()

81153794

In [31]:
bert_model.num_parameters()

109483778

In [32]:
prunebert_model.num_parameters() / bert_model.num_parameters()

0.7412403506937804

In [33]:
from time import perf_counter

def compute_latencies(model,
                      question="Is Saving Private Ryan based on a book?",
                      passage="""In 1994, Robert Rodat wrote the script for the film. Rodat’s script was submitted to
                      producer Mark Gordon, who liked it and in turn passed it along to Spielberg to direct. The film is
                      loosely based on the World War II life stories of the Niland brothers. A shooting date was set for
                      June 27, 1997"""):
    inputs = tokenizer(question, passage, truncation="only_second", return_tensors="pt")
    latencies = []

    # Warmup
    for _ in range(10):
        _ = model(**inputs)

    for _ in range(100):
        start_time = perf_counter()
        _ = model(**inputs)
        latency = perf_counter() - start_time
        latencies.append(latency)
        # Compute run statistics
        time_avg_ms = 1000 * np.mean(latencies)
        time_std_ms = 1000 * np.std(latencies)
    print(f"Average latency (ms) - {time_avg_ms:.2f} +\- {time_std_ms:.2f}")
    return {"time_avg_ms": time_avg_ms, "time_std_ms": time_std_ms}

In [34]:
latencies = {}
latencies["prunebert"] = compute_latencies(prunebert_model.to("cpu"))

Average latency (ms) - 59.39 +\- 0.31


In [35]:
bert_unpruned = AutoModelForSequenceClassification.from_pretrained("doyoungkim/bert-base-uncased-finetuned-sst2").to("cpu")

latencies["bert-base"] = compute_latencies(bert_unpruned.to("cpu"))

Average latency (ms) - 84.14 +\- 0.20


In [None]:
#ref: https://github.com/huggingface/nn_pruning