In [7]:
import copy, time, os, random

import numpy as np
import matplotlib.pyplot as plt
import tqdm

import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)

import torch.nn.utils.prune as prune                              

from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding
from transformers import glue_compute_metrics as compute_metrics
from transformers import glue_output_modes as output_modes
from transformers import glue_processors as processors
from transformers import glue_convert_examples_to_features as convert_examples_to_features

from datasets import load_dataset, load_metric



In [8]:
# --- Task and model selection ------------------------------------------------
TASK = "mrpc"
MODEL_TYPE = "roberta"
MODEL_NAME_OR_PATH = "roberta-base"
# Looked up from the GLUE output-mode table for the selected task.
OUTPUT_MODE = output_modes[TASK]

# --- Filesystem locations ----------------------------------------------------
# Directory the tokenizer and the model are loaded from.
MODEL_DIR = "./roberta-base-tuned/"
# Directory that receives eval_results.txt.
OUTPUT_DIR = "./quant-roberta/"
DATA_DIR = "./data/glue_data/MRPC/"

# --- Runtime -----------------------------------------------------------------
DEVICE = "cpu"
N_GPU = 0
# NOTE(review): presumably -1 means "not distributed"; confirm against any
# cell that actually reads it.
LOCAL_RANK = -1
SEED = 24

# --- Tokenization and batching -----------------------------------------------
# The maximum length of an input sequence
MAX_SEQ_LENGTH = 128
BATCH_SIZE = 8

In [9]:
def seed_torch(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Also requests deterministic cuDNN behaviour so that repeated runs with the
    same seed are as reproducible as the backend allows.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # Hash randomization of the running interpreter is fixed at start-up, so
    # this only affects child processes spawned afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every CUDA device, not just the current one; this is a harmless
    # no-op on CPU-only machines.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may pick different kernels from run to run, which defeats
    # the deterministic flag above.
    torch.backends.cudnn.benchmark = False

# Apply the notebook-wide seed before any data or model is touched.
seed_torch(seed=SEED)

In [10]:
# The "mnli-mm" task name is mapped to "mnli" when loading the dataset and
# the metric; every other task name is used as-is.
if TASK == "mnli-mm":
    actual_task = "mnli"
else:
    actual_task = TASK

raw_datasets = load_dataset("glue", actual_task)
metric = load_metric("glue", actual_task)

Reusing dataset glue (C:\Users\Anton\.cache\huggingface\datasets\glue\mrpc\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


In [11]:
    
# Tokenizer and classification model are both restored from MODEL_DIR.
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)

# Kept as the last expression of the cell so the moved model is displayed.
model.to(DEVICE)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [12]:

def preprocess_function(examples):
    """Tokenize the sentence column(s) of ``examples`` and copy the labels.

    Reads the module-level ``sentence1_key``, ``sentence2_key``, ``tokenizer``
    and ``MAX_SEQ_LENGTH``.

    Args:
        examples: Mapping indexed by column name; must contain
            ``sentence1_key`` and, when ``sentence2_key`` is not ``None``,
            that column as well.

    Returns:
        The tokenizer output, with a ``"labels"`` entry added whenever
        ``examples`` has a ``"label"`` column.
    """
    columns = [examples[sentence1_key]]
    if sentence2_key is not None:
        columns.append(examples[sentence2_key])

    # No padding is applied here; inputs are only truncated to MAX_SEQ_LENGTH.
    encoded = tokenizer(*columns, padding=False, max_length=MAX_SEQ_LENGTH, truncation=True)

    if "label" in examples:
        encoded["labels"] = examples["label"]
    return encoded

# Names of the columns holding the two sentences of every example.
sentence1_key = "sentence1"
sentence2_key = "sentence2"

# Tokenize every split in batches; the original columns are dropped so only
# the fields produced by preprocess_function remain.
processed_datasets = raw_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
    desc="Running tokenizer on dataset",
)

train_dataset, eval_dataset = processed_datasets["train"], processed_datasets["test"]

# Batches are assembled (and padded) by the collator built from the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer)
eval_dataloader = DataLoader(
    eval_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=data_collator,
)

Loading cached processed dataset at C:\Users\Anton\.cache\huggingface\datasets\glue\mrpc\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-e40ab1967a34dd5c.arrow
Running tokenizer on dataset: 100%|██████████| 1/1 [00:00<00:00, 11.37ba/s]
Loading cached processed dataset at C:\Users\Anton\.cache\huggingface\datasets\glue\mrpc\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-98c3efa1bc7ee736.arrow


In [29]:
def evaluate(model, eval_dataloader, prefix=""):
    """Run ``model`` over ``eval_dataloader`` and compute the task metric.

    Relies on the module-level ``DEVICE``, ``MODEL_TYPE``, ``OUTPUT_MODE``,
    ``OUTPUT_DIR`` and ``metric``. The metric values are also written, one
    ``key = value`` line each, to ``OUTPUT_DIR/prefix/eval_results.txt``; the
    directory is created when it does not exist yet.

    Args:
        model: Callable as ``model(**inputs)``. Because every batch carries
            labels, its output is indexed as ``(loss, logits, ...)``. The
            model is switched to eval mode before the loop.
        eval_dataloader: Iterable of batches that support ``.to(DEVICE)`` and
            ``.items()`` and contain a ``'labels'`` entry.
        prefix: Optional sub-directory of ``OUTPUT_DIR`` for the results file.

    Returns:
        dict: The values returned by ``metric.compute``.

    Raises:
        ValueError: If ``eval_dataloader`` yields no batches.
    """
    # Only these architectures consume segment ids; for every other model
    # type the entry is dropped, which is equivalent to passing None.
    uses_segment_ids = MODEL_TYPE in ('bert', 'xlnet')

    # Make sure dropout and similar layers are disabled during scoring.
    model.eval()

    logits_per_batch = []
    labels_per_batch = []
    for inputs in tqdm.tqdm(eval_dataloader, desc="Evaluating"):
        inputs = inputs.to(DEVICE)
        model_inputs = {
            name: value
            for name, value in inputs.items()
            if uses_segment_ids or name != 'token_type_ids'
        }

        with torch.no_grad():
            outputs = model(**model_inputs)

        # Index 0 is the loss (labels are part of the inputs), index 1 the logits.
        logits = outputs[1]
        logits_per_batch.append(logits.detach().cpu().numpy())
        labels_per_batch.append(inputs['labels'].detach().cpu().numpy())

    if not logits_per_batch:
        raise ValueError("eval_dataloader yielded no batches; nothing to evaluate")

    # Concatenate once instead of re-allocating the arrays on every batch.
    preds = np.concatenate(logits_per_batch, axis=0)
    out_label_ids = np.concatenate(labels_per_batch, axis=0)

    if OUTPUT_MODE == "classification":
        preds = np.argmax(preds, axis=1)
    elif OUTPUT_MODE == "regression":
        preds = np.squeeze(preds)
    result = metric.compute(predictions=preds, references=out_label_ids)

    output_dir = os.path.join(OUTPUT_DIR, prefix)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    output_eval_file = os.path.join(output_dir, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        for key in sorted(result.keys()):
            writer.write("%s = %s\n" % (key, str(result[key])))

    return dict(result)




In [14]:
# Dynamic quantization restricted to torch.nn.Linear modules, using qint8.
quantized_model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)


In [15]:
def print_size_of_model(model):
    """Print the size, in megabytes, of ``model``'s serialized ``state_dict``.

    The state dict is saved into a private temporary directory, so no file in
    the working directory is overwritten or deleted, and the scratch file is
    removed even when saving or measuring fails.

    Args:
        model: Object exposing ``state_dict()`` whose result ``torch.save``
            can serialize.
    """
    import tempfile  # local import: keeps this cell self-contained

    with tempfile.TemporaryDirectory() as scratch_dir:
        # Same base name as before, only relocated into the scratch directory.
        scratch_path = os.path.join(scratch_dir, "temp.p")
        torch.save(model.state_dict(), scratch_path)
        size_mb = os.path.getsize(scratch_path) / 1e6
    print('Size (MB):', size_mb)

# Report the serialized size of the original model, then of the quantized one.
for candidate in (model, quantized_model):
    print_size_of_model(candidate)

Size (MB): 498.676031
Size (MB): 242.157535


In [30]:
def time_model_evaluation(model, dataloader):
    """Evaluate ``model`` on ``dataloader`` and print the metrics and elapsed time.

    The duration is measured with ``time.perf_counter`` so that it is not
    affected by adjustments of the system clock during the run.

    Args:
        model: Model forwarded to ``evaluate``.
        dataloader: Evaluation dataloader forwarded to ``evaluate``.
    """
    eval_start_time = time.perf_counter()
    result = evaluate(model, dataloader, prefix="")
    eval_duration_time = time.perf_counter() - eval_start_time
    print(result)
    print("Evaluate total time (seconds): {0:.1f}".format(eval_duration_time))

# Benchmark the original FP32 RoBERTa model first, then the INT8 model
# produced by dynamic quantization, on the same evaluation dataloader.
for candidate in (model, quantized_model):
    time_model_evaluation(candidate, eval_dataloader)

Evaluating: 100%|██████████| 216/216 [02:23<00:00,  1.51it/s]
Evaluating:   0%|          | 0/216 [00:00<?, ?it/s]{'accuracy': 0.871304347826087, 'f1': 0.9056924384027187}
Evaluate total time (seconds): 143.2
Evaluating: 100%|██████████| 216/216 [01:52<00:00,  1.91it/s]{'accuracy': 0.8556521739130435, 'f1': 0.894177645558861}
Evaluate total time (seconds): 112.9

