In [20]:
from torch.nn.utils import prune
import copy

# Sparsity levels to sweep; each value is handed to prune_weight_parameters
# as the pruning amount and also appears in the output directory name.
SPARSITY_LIST = [0.1, 0.5, 0.9, 0.95, 0.99]

In [3]:
def get_weight_parameters(layer):
    '''
    Collect every (module, 'weight') pair found in a module tree.

    Visits `layer` itself and all of its descendants and records each module
    that directly owns a parameter registered under the exact name 'weight'.
    Parameters with any other name (e.g. 'bias', 'weight_orig') are ignored.

    Args:
        layer: root torch.nn.Module to search.

    Returns:
        A list of (module, 'weight') tuples in depth-first order. Each module
        object appears at most once, even when it is registered under several
        parents (shared submodules), so callers that rank or prune the
        returned parameters do not process the same tensor repeatedly.
    '''
    weight_parameters = []
    # modules() yields the root followed by every descendant, depth-first, and
    # skips module objects it has already produced.
    for module in layer.modules():
        # recurse=False limits the scan to parameters owned by this module;
        # nested ones (e.g. 'dense.weight') are picked up when their owning
        # module is visited, so nothing is scanned more than once.
        for param_name, _ in module.named_parameters(recurse=False):
            if param_name == 'weight':
                weight_parameters.append((module, param_name))
    return weight_parameters


def prune_weight_parameters(model, prune_amount):
    '''
    Global unstructured L1 pruning of every 'weight' parameter in `model`.

    All parameters returned by get_weight_parameters(model) are ranked
    together by prune.global_unstructured with the L1Unstructured method, and
    the pruning is then made permanent with prune.remove so the model is left
    without pruning re-parametrization on those modules.

    Args:
        model: torch.nn.Module to prune. It is modified in place.
        prune_amount: forwarded unchanged as the `amount` argument of
            prune.global_unstructured.

    Returns:
        The same `model` object that was passed in, after pruning.

    Raises:
        Any exception raised by prune.global_unstructured or prune.remove is
        propagated, so a model whose pruning could not be finalized is never
        returned as if it had succeeded.
    '''
    # Keep only the first occurrence of each (module, name) pair, preserving
    # order. A repeated entry would be counted more than once in the global
    # ranking and would make the second prune.remove call on it fail.
    params_to_prune = list(dict.fromkeys(get_weight_parameters(model)))

    prune.global_unstructured(
        params_to_prune,
        pruning_method=prune.L1Unstructured,
        amount=prune_amount,
    )

    # Finalize the pruning on every module that was just pruned.
    for module, name in params_to_prune:
        prune.remove(module, name)
    return model

## Sparsifying RoBERTa, GPT-2, BART

In [1]:
from transformers import RobertaModel, GPT2Model, BartModel
# Load the pretrained base models that are pruned further down.
# For roberta-large the checkpoint's lm_head.* weights are not used by
# RobertaModel (see the load warning printed below this cell).
roberta_model = RobertaModel.from_pretrained("roberta-large")
gpt2_model = GPT2Model.from_pretrained("gpt2")
bart_model = BartModel.from_pretrained("facebook/bart-large")

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
# Pair each base model with the short name used in its output directory.
model_list = [("roberta", roberta_model), ("gpt2", gpt2_model), ("bart", bart_model)]

for name, model in model_list:
    for sparsity in SPARSITY_LIST:
        # prune_weight_parameters modifies and returns the model it is given,
        # so prune a fresh deep copy and keep the base model intact for the
        # next sparsity level.
        model_to_prune = copy.deepcopy(model)
        pruned_model = prune_weight_parameters(model_to_prune, sparsity)
        # One checkpoint directory per (model, sparsity), e.g. models/roberta_0.5
        pruned_model.save_pretrained(f"models/{name}_{sparsity}")


## Sparsifying GPT-2 for GLUE

In [22]:
from transformers import GPT2ForSequenceClassification, GPT2Tokenizer

# Define a padding token for the GPT-2 tokenizer by reusing its end-of-sequence
# token, then save the tokenizer so it can be reloaded with the models below.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.save_pretrained("models/gpt2-glue-tokenizer")

# Build the sequence-classification variant of GPT-2 for the GLUE task.
# num_labels=2 because GLUE-MRPC is a binary classification dataset.
# NOTE(review): score.weight is newly initialized at load time (see the warning
# printed below this cell) and no seed is set here, so the saved checkpoints
# presumably differ between runs -- confirm whether a fixed seed is wanted.
gpt2_glue = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=2)
# Tell the model which token id is padding.
gpt2_glue.config.pad_token_id = tokenizer.pad_token_id
# Save the unpruned model with the padding id configured.
gpt2_glue.save_pretrained("models/gpt2-glue")

for sparsity in SPARSITY_LIST:
    # prune_weight_parameters modifies the model it is given, so each sparsity
    # level starts from a fresh deep copy of the unpruned model.
    model_to_prune = copy.deepcopy(gpt2_glue)
    pruned_model = prune_weight_parameters(model_to_prune, sparsity)
    pruned_model.save_pretrained(f"models/gpt2-glue_{sparsity}")

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
