In [4]:
import torch
import argparse
from dataclasses import dataclass, field
from sklearn.metrics import accuracy_score
from transformers import (
    DistilBertForSequenceClassification, DistilBertTokenizerFast, Trainer, TrainingArguments,
    AutoTokenizer,PreTrainedTokenizerFast,
    AutoModel,
    BertForSequenceClassification,
    AutoModelForSequenceClassification,
    AutoConfig,
    EvalPrediction,
    HfArgumentParser,
    default_data_collator,
    set_seed,
    )
from trainer_supernet import SupernetTrainer
from transformers import RobertaTokenizerFast, T5Tokenizer
from transformers import DistilBertForSequenceClassification, RobertaForSequenceClassification, T5ForConditionalGeneration
import numpy as np
from datasets import load_dataset, concatenate_datasets, load_from_disk
import logging
import sys
import copy
import os
from scipy.stats import pearsonr
from nncf import NNCFConfig


# 0. Create Argument

In [None]:
@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Path to directory to store the pretrained models downloaded from huggingface.co"},
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": (
                "Will use the token generated when running `huggingface-cli login` (necessary to use this script "
                "with private models)."
            )
        },
    )

@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.
    """

    dataset_name: Optional[str] = field(
        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
    train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
    validation_file: Optional[str] = field(
        default=None,
        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
    )
    test_file: Optional[str] = field(
        default=None,
        metadata={"help": "An optional input test data file to evaluate the perplexity on (a text file)."},
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    max_seq_length: int = field(
        default=384,
        metadata={
            "help": (
                "The maximum total input sequence length after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        },
    )
    pad_to_max_length: bool = field(
        default=True,
        metadata={
            "help": (
                "Whether to pad all samples to `max_seq_length`. If False, will pad the samples dynamically when"
                " batching to the maximum length in the batch (which can be faster on GPU but will be slower on TPU)."
            )
        },
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of training examples to this "
                "value if set."
            )
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
                "value if set."
            )
        },
    )
    max_predict_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of prediction examples to this "
                "value if set."
            )
        },
    )
    version_2_with_negative: bool = field(
        default=False, metadata={"help": "If true, some of the examples do not have an answer."}
    )
    null_score_diff_threshold: float = field(
        default=0.0,
        metadata={
            "help": (
                "The threshold used to select the null answer: if the best answer has a score that is less than "
                "the score of the null answer minus this threshold, the null answer is selected for this example. "
                "Only useful when `version_2_with_negative=True`."
            )
        },
    )
    doc_stride: int = field(
        default=128,
        metadata={"help": "When splitting up a long document into chunks, how much stride to take between chunks."},
    )
    n_best_size: int = field(
        default=20,
        metadata={"help": "The total number of n-best predictions to generate when looking for an answer."},
    )
    max_answer_length: int = field(
        default=30,
        metadata={
            "help": (
                "The maximum length of an answer that can be generated. This is needed because the start "
                "and end predictions are not conditioned on one another."
            )
        },
    )

    def __post_init__(self):
        if (
            self.dataset_name is None
            and self.train_file is None
            and self.validation_file is None
            and self.test_file is None
        ):
            raise ValueError("Need either a dataset name or a training/validation file/test_file.")
        else:
            if self.train_file is not None:
                extension = self.train_file.split(".")[-1]
                assert extension in ["csv", "json"], "`train_file` should be a csv or a json file."
            if self.validation_file is not None:
                extension = self.validation_file.split(".")[-1]
                assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
            if self.test_file is not None:
                extension = self.test_file.split(".")[-1]
                assert extension in ["csv", "json"], "`test_file` should be a csv or a json file."


## 1. Processing training data
### 1.1 Define a tikenize function

In [5]:
def tokenize_function(examples, tokenizer, dataset,model=None):
    if dataset in ["sst2", "cola"]:

        return tokenizer(examples['sentence'], padding="max_length", truncation=True,return_tensors="pt")

    elif dataset == "mnli":
        return tokenizer(examples["premise"], examples["hypothesis"], padding="max_length", truncation=True,return_tensors="pt")
    elif dataset == "qqp":
        return tokenizer(examples["question1"], examples["question2"], padding="max_length", truncation=True,return_tensors="pt")
    elif dataset == "qnli":
        return tokenizer(examples["question"], examples["sentence"], padding="max_length", truncation=True,return_tensors="pt")

    elif dataset in ["mrpc", "stsb", "rte"]:
        return tokenizer(examples["sentence1"], examples["sentence2"], padding="max_length", truncation=True,return_tensors="pt")


### 1.2 Load the dataset

In [6]:
dataset_name = 'sst2'
    
num_classes = {
    "mnli": 3,
    "qqp": 2,
    "qnli": 2,
    "sst2": 2,
    "stsb": 1,
    "mrpc": 2,
    "rte": 2,
    "cola": 2,
    }

dataset = load_dataset("glue", dataset_name)
train_dataset = dataset["train"]
test_dataset = dataset["validation"]

Found cached dataset glue (/homes/yusx/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
100%|██████████| 3/3 [00:00<00:00, 129.82it/s]


## 2. Load Model & Tokenizer
### 2.1 Get model configuration

In [7]:
model_name = 'bert-large-uncased-whole-word-masking'

config = AutoConfig.from_pretrained(model_name)

config.num_labels = num_classes[dataset_name]

print(config)


BertConfig {
  "_name_or_path": "bert-large-uncased-whole-word-masking",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.29.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [8]:
# from transformers import AutoModelForQuestionAnswering
# model_qa = AutoModelForQuestionAnswering.from_pretrained(model_name, config = config)
# print(model_qa)
# model_qa = None

### 2.2 Initialize pre-trained model

In [9]:
model = BertForSequenceClassification.from_pretrained(model_name, config=config)

# kd_teacher_model = copy.deepcopy(model)
print(model)

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-un

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,

### 2.3 Define the tokenizer

In [10]:
tokenizer = AutoTokenizer.from_pretrained(
   model_name,
   use_fast=True,
)
print(tokenizer)
if not isinstance(tokenizer, PreTrainedTokenizerFast):
    raise ValueError(
        "This example script only works for models that have a fast tokenizer. Checkout the big table of models at"
        " https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet"
        " this requirement"
    )

BertTokenizerFast(name_or_path='bert-large-uncased-whole-word-masking', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)


### 2.4 Tokenize the dataset

In [11]:
tokenized_train_dataset = train_dataset.map(lambda examples: tokenize_function(examples, tokenizer, dataset_name), batched=True)
tokenized_test_dataset = test_dataset.map(lambda examples: tokenize_function(examples, tokenizer, dataset_name), batched=True)
# logging.info("=====> train_dataset size: {}".format(len(tokenized_train_dataset)))
print("=====> train_dataset size: {}".format(len(tokenized_train_dataset)))

Loading cached processed dataset at /homes/yusx/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-a4ff0bfa4db56cf0.arrow
Loading cached processed dataset at /homes/yusx/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-19ce2cdb4180a76a.arrow


=====> train_dataset size: 67349


### 2.5 Define Evaluation Function

In [12]:
def compute_metrics(eval_pred, task):
    predictions, labels = eval_pred
    if task == "stsb":
        pearson_corr, _ = pearsonr(predictions.squeeze(), labels)
        return {"pearson_corr": pearson_corr}
    else:
    
        predictions = predictions.argmax(-1)
        return {"accuracy": accuracy_score(labels, predictions)}

## 3. Create NNCF Supernet from Base Foundation Model

In [13]:
# Manually setup some args to align with the arguments in BoostrapNAS demo
class Args:
    pass

training_args = Args()
training_args.__setattr__('kd_teacher_model',model_name)


In [14]:
kd_teacher_model = None
if training_args.kd_teacher_model:
    kd_teacher_model = BertForSequenceClassification.from_pretrained(
        training_args.kd_teacher_model,
        from_tf=bool(".ckpt" in training_args.kd_teacher_model),
        # cache_dir=model_args.cache_dir,
    )
    kd_teacher_model.eval()

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-un

In [15]:
# Manually setup some args to align with the arguments in BoostrapNAS demo
training_args.__setattr__('nncf_config','./nncf_config.json')
training_args.__setattr__('do_train',True)
training_args.__setattr__('do_eval',True)
training_args.__setattr__('output_dir','./log_dir/')

In [16]:
# Manually setup some args to align with the arguments in BoostrapNAS demo
model_args = Args()
model_args.__setattr__('model_revision','model_revision')
model_args.__setattr__('use_auth_token',False)
model_args.__setattr__('model_name_or_path',model_name)

In [18]:
nncf_config = None
create_importance_mask_fn = None
if training_args.nncf_config is not None:
    nncf_config = NNCFConfig.from_json(training_args.nncf_config)
    if nncf_config.get("log_dir") is None:
        nncf_config["log_dir"] = training_args.output_dir
    if not os.path.exists(training_args.output_dir) and training_args.local_rank in [-1, 0]:
        os.makedirs(nncf_config["log_dir"])

    # split nncf_config --> bnas + movement sparsity
    compression_algo_names = [algo['algorithm'] for algo in nncf_config.get('compression', [])]
    if 'movement_sparsity' in compression_algo_names:
        sparsity_nncf_config = copy.deepcopy(nncf_config)
        if 'bootstrapNAS' in sparsity_nncf_config:
            del sparsity_nncf_config['bootstrapNAS']

        def generate_importance_mask_weight(model_state_dict, debug_mode=False, save_folder='movement_sparsity',
                                            resume_model='/data2/yzheng/bert_squad_result/weight_reorg/checkpoint-29509/pytorch_model.bin'):
            # create movement sparsity ctrl
            sparsity_ctrl, sparsity_model = BertForSequenceClassification.from_pretrained(
                model_name,
                from_tf=bool(".ckpt" in model_name),
                config=config,
                # cache_dir=model_args.cache_dir,
                revision=model_args.model_revision,
                use_auth_token=True if model_args.use_auth_token else None,
                nncf_config=sparsity_nncf_config,
                nncf_eval=nncf_config is not None and training_args.do_eval and not training_args.do_train,
            )
            sparsity_model.load_state_dict(model_state_dict, strict=False) # use current bnas model state dict

            sparsity_trainer = SupernetTrainer(
                model=sparsity_model,
                args=training_args,
                train_dataset=tokenized_train_dataset if training_args.do_train else None,
                eval_dataset=tokenized_train_dataset if training_args.do_eval else None,
                eval_examples=test_dataset if training_args.do_eval else None,
                tokenizer=tokenizer,
                # data_collator=data_collator,
                # post_process_function=post_processing_function,
                compute_metrics=lambda eval_pred: compute_metrics(eval_pred, dataset_name),
                compression_ctrl=sparsity_ctrl,
                kd_teacher_model=kd_teacher_model,
                debug_mode=debug_mode,
                resume_model=resume_model
            )

            sparsity_trainer.train()
            sparsity_trainer.save_model(os.path.join(training_args.output_dir, save_folder))
            return sparsity_model

        create_importance_mask_fn = generate_importance_mask_weight

retval = BertForSequenceClassification.from_pretrained(
    model_args.model_name_or_path,
    from_tf=bool(".ckpt" in model_args.model_name_or_path),
    config=config,
    # cache_dir=model_args.cache_dir,
    # revision=model_args.model_revision,
    # use_auth_token=True if model_args.use_auth_token else None,
    nncf_config=nncf_config,
    nncf_eval=nncf_config is not None and training_args.do_eval and not training_args.do_train,
    )

if nncf_config is None:
    model = retval
    compression_ctrl = None
else:
    # if movement sparsity is in config,
    # then compression_ctrl = [bnas_ctrl, movement_sparsity_ctrl]
    # model = [bnas_model, movement_sparsity_model]
    compression_ctrl, model = retval
    compression_ctrl.multi_elasticity_handler.width_handler.create_importance_mask_fn = create_importance_mask_fn


Some weights of the model checkpoint at bert-large-uncased-whole-word-masking were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-un

In [20]:
print(model)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): NNCFEmbedding(
        30522, 1024, padding_idx=0
        (pre_ops): ModuleDict()
        (post_ops): ModuleDict()
      )
      (position_embeddings): NNCFEmbedding(
        512, 1024
        (pre_ops): ModuleDict()
        (post_ops): ModuleDict()
      )
      (token_type_embeddings): NNCFEmbedding(
        2, 1024
        (pre_ops): ModuleDict()
        (post_ops): ModuleDict()
      )
      (LayerNorm): NNCFLayerNorm(
        (1024,), eps=1e-12, elementwise_affine=True
        (pre_ops): ModuleDict(
          (0): UpdateLayerNormParams(
            (op): ElasticInputWidthLayerNormOp()
          )
        )
        (post_ops): ModuleDict()
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
            

## Training

In [26]:
training_args.__setattr__('full_determinism',False)
training_args.__setattr__('seed',0)

In [27]:
trainer = SupernetTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset if training_args.do_train else None,
    eval_dataset=tokenized_test_dataset if training_args.do_eval else None,
    eval_examples=test_dataset if training_args.do_eval else None,
    tokenizer=tokenizer,
    # data_collator=data_collator,
    # post_process_function=post_processing_function,
    compute_metrics=lambda eval_pred: compute_metrics(eval_pred, dataset_name),
    compression_ctrl=compression_ctrl,
    kd_teacher_model=kd_teacher_model
)

AttributeError: 'Args' object has no attribute 'skip_memory_metrics'

In [None]:
# Training
if training_args.do_train:
    checkpoint = None
    if training_args.resume_from_checkpoint is not None:
        checkpoint = training_args.resume_from_checkpoint
    elif last_checkpoint is not None:
        checkpoint = last_checkpoint
    train_result = trainer.train(resume_from_checkpoint=checkpoint)
    if nncf_config is not None:
        train_result, model, elasticity_ctrl = train_result
    trainer.save_model()  # Saves the tokenizer too for easy upload

    metrics = train_result.metrics
    max_train_samples = (
        data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
    )
    metrics["train_samples"] = min(max_train_samples, len(train_dataset))

    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()

    if nncf_config is not None and training_args.do_search:
        resuming_checkpoint_path = None
        if resuming_checkpoint_path is None:
            search_algo = SearchAlgorithm.from_config(model, elasticity_ctrl, nncf_config)
        else:
            search_algo = SearchAlgorithm.from_checkpoint(model, elasticity_ctrl, None, resuming_checkpoint_path)

        def validate_model_func(model_, dataset_):
            #trainer.model will be used to evaluate(trainer.model = model)
            metrics = trainer.evaluate(eval_dataset=dataset_)
            return metrics['eval_f1']

        elasticity_ctrl, best_config, performance_metrics = search_algo.run(validate_model_func,
                                                                            eval_dataset,
                                                                            training_args.output_dir)
        logger.info("Best config: {best_config}".format(best_config=best_config))
        logger.info("Performance metrics: {performance_metrics}".format(performance_metrics=performance_metrics))

        search_algo.visualize_search_progression()
        search_algo.search_progression_to_csv()
        search_algo.evaluators_to_csv()