In [1]:
import torch
import torch.nn as nn
import numpy as np
import sys
import datetime
import time

import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit

# from sklearn.metrics import f1_score

from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset

# Seed NumPy's global RNG and PyTorch's default generator with the same
# fixed value so any randomised step in this notebook is repeatable.
np.random.seed(42)
torch.manual_seed(42)

  from .autonotebook import tqdm as notebook_tqdm


<torch._C.Generator at 0x7f0e5df2e610>

In [2]:
# --- Run configuration ------------------------------------------------------
# GLUE task to evaluate. Must be one of the keys of the task table defined in
# the next cell: cola, mnli, qnli, qqp.
task_name = "mnli" # cola, mnli, qnli, qqp
# Number of examples sent to the engine per execution.
# NOTE(review): presumably has to match the batch dimension the serialized
# engine was built with - confirm against the engine build script.
batch_size = 32
# Token length every example is truncated / padded to by the tokenizer.
max_length = 128
# Path of the serialized TensorRT engine that is read and deserialized below.
rt_filename = 'bert-base.plan'

In [3]:
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
saved_path = f'../ignore/task/bert-base_{task_name}.pt'


def _column_tokenizer(*columns):
    """Build a callable that tokenizes the named dataset columns.

    The returned function takes a dataset-like mapping, pulls out the given
    columns (one column for single-sentence tasks, two for sentence-pair
    tasks) and passes them positionally to the module-level ``tokenizer``.
    ``tokenizer`` and ``max_length`` are looked up at call time, exactly as
    a lambda written inline would do.
    """
    def _tokenize(data):
        texts = [data[column] for column in columns]
        return tokenizer(*texts, truncation=True, max_length=max_length, padding='max_length')
    return _tokenize


# Per-task settings: number of classifier labels, the GLUE split used for
# evaluation, and the tokenization callable for that task's text columns.
task = {
    "qnli": dict(
        num_labels=2,
        test_dataset_name="validation",
        tokenize=_column_tokenizer('question', 'sentence'),
    ),
    "mnli": dict(
        num_labels=3,
        test_dataset_name="validation_matched",
        tokenize=_column_tokenizer('premise', 'hypothesis'),
    ),
    "qqp": dict(
        num_labels=2,
        test_dataset_name="validation",
        tokenize=_column_tokenizer('question1', 'question2'),
    ),
    "cola": dict(
        num_labels=2,
        test_dataset_name="validation",
        tokenize=_column_tokenizer('sentence'),
    ),
}
# From here on ``task`` is the settings dict of the selected task only.
task = task[task_name]
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=task["num_labels"])

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Load the selected GLUE task and tokenize its evaluation split in one call.
dataset = load_dataset('glue', task_name)
tokenize = task['tokenize']
eval_split = dataset[task['test_dataset_name']]
tokenized_dataset = tokenize(eval_split)

input_ids = np.array(tokenized_dataset['input_ids'])
attention_masks = np.array(tokenized_dataset['attention_mask'])
labels = np.array(eval_split['label'])

# Trim to a whole number of batches so every batch fed to the engine is full.
# Uses the configured ``batch_size`` rather than a hard-coded 32, so changing
# the batch size in the config cell keeps this trimming consistent.
max_size = labels.shape[0] - (labels.shape[0] % batch_size)
test_dataset = {
    'input_ids': input_ids[:max_size, :],
    'attention_masks': attention_masks[:max_size, :],
    'labels': labels[:max_size],
    'size': max_size,
}

Downloading data: 100%|██████████| 313M/313M [00:10<00:00, 29.0MB/s] 
Generating train split: 100%|██████████| 392702/392702 [00:27<00:00, 14258.24 examples/s]
Generating validation_matched split: 100%|██████████| 9815/9815 [00:00<00:00, 17068.61 examples/s]
Generating validation_mismatched split: 100%|██████████| 9832/9832 [00:00<00:00, 15524.43 examples/s]
Generating test_matched split: 100%|██████████| 9796/9796 [00:00<00:00, 16400.62 examples/s]
Generating test_mismatched split: 100%|██████████| 9847/9847 [00:00<00:00, 17985.00 examples/s]
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed

In [5]:
# TensorRT logger (warnings and above) and the runtime used to load the engine.
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
# Network-creation flag bit; defined here but not used anywhere in this cell.
EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
trt_runtime = trt.Runtime(TRT_LOGGER)

# Read the serialized engine from disk and deserialize it.
with open(rt_filename, 'rb') as f:
    engine_data = f.read()
engine = trt_runtime.deserialize_cuda_engine(engine_data)

# deserialize_cuda_engine reports failure by returning None; fail here with a
# clear message instead of a confusing error when the engine is first used.
if engine is None:
    raise RuntimeError(
        f"Failed to deserialize TensorRT engine from '{rt_filename}'. "
        "Check that the file is a valid engine built with this TensorRT version and GPU."
    )

class HostDeviceMem:
    """Pair a host-side buffer with its device-side counterpart.

    Attributes:
        host: the host buffer object passed to the constructor.
        device: the device allocation object passed to the constructor.
    """

    def __init__(self, host_mem, device_mem):
        self.host, self.device = host_mem, device_mem

    def __str__(self):
        # Two labelled sections, each followed by str() of the stored object.
        parts = ("Host:", str(self.host), "Device:", str(self.device))
        return "\n".join(parts)

    def __repr__(self):
        # The repr is deliberately the same text as the str form.
        return self.__str__()
 
# Allocate one host/device buffer pair per engine binding.
#   inputs   - HostDeviceMem pairs for bindings the engine reports as inputs
#   outputs  - HostDeviceMem pairs for all other bindings
#   bindings - device addresses (as ints) in engine binding order; this list is
#              what eval() passes to context.execute_async_v2
#   stream   - only initialised to an empty list here; eval() creates its own
#              cuda.Stream and does not read this global
inputs, outputs, bindings, stream = [], [], [], []
for binding in engine:
    # Element count of the buffer: volume of the binding shape multiplied by
    # the engine's max_batch_size.
    # NOTE(review): assumes the binding shape is fully static (no -1 dims) -
    # confirm for engines built with dynamic shapes.
    size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
    # NumPy dtype corresponding to the binding's TensorRT dtype.
    dtype = trt.nptype(engine.get_binding_dtype(binding))
    
    # Allocate host and device buffers
    host_mem = cuda.pagelocked_empty(size, dtype)
    device_mem = cuda.mem_alloc_like(host_mem)
    # Record the device address for every binding, input or output.
    bindings.append(int(device_mem))
    if engine.binding_is_input(binding):
        inputs.append(HostDeviceMem(host_mem, device_mem))
    else:
        outputs.append(HostDeviceMem(host_mem, device_mem))
# Execution context that eval() uses to run the engine.
context = engine.create_execution_context()

In [6]:
def update_progress(progress):
    """Overwrite the current stdout line with ``progress`` as an integer percent.

    Args:
        progress: numeric percentage; formatted with ``%d``, so any
            fractional part is dropped.
    """
    out = sys.stdout
    text = '\r%d%%' % progress
    out.write(text)
    # Flush right away so the carriage-return update is visible immediately.
    out.flush()

def format_time(time):
    """Render a duration in seconds as the text form of a ``timedelta``.

    Args:
        time: elapsed seconds (int or float); rounded to the nearest whole
            second before formatting.

    Returns:
        The ``str()`` of the corresponding ``datetime.timedelta``,
        e.g. ``'0:00:37'``.
    """
    whole_seconds = int(round(time))
    elapsed = datetime.timedelta(seconds=whole_seconds)
    return str(elapsed)

def eval():
    """Run the TensorRT engine over ``test_dataset`` and print the accuracy.

    Reads the notebook globals ``test_dataset``, ``batch_size``, ``inputs``,
    ``outputs``, ``bindings`` and ``context``. For every batch it copies the
    token ids and attention masks into the input host buffers, transfers them
    to the device, executes the engine asynchronously on a private CUDA
    stream, copies the outputs back and takes the argmax over each row as the
    predicted class.

    A final batch with fewer than ``batch_size`` rows is zero-padded in the
    host buffers and the predictions for the padding rows are discarded. An
    empty dataset is reported as ``0 / 0`` instead of raising.

    NOTE(review): assumes the first two input bindings are, in order, the
    token ids and the attention mask, and that both are int32 - confirm
    against the engine's binding order.
    NOTE(review): the name shadows the builtin ``eval``; it is kept because
    the notebook calls it by this name.

    Prints the number of correct predictions out of the total and the
    elapsed wall-clock time. Returns ``None``.
    """
    batch_labels = []
    batch_predictions = []

    stream = cuda.Stream()
    num_data = test_dataset['size']
    input_dtype = trt.nptype(trt.int32)
    start_time = time.time()

    for start in range(0, num_data, batch_size):
        stop = min(start + batch_size, num_data)
        rows = stop - start

        batch_inputs = (
            test_dataset['input_ids'][start:stop, :],
            test_dataset['attention_masks'][start:stop, :],
        )

        # Stage the flattened inputs in the page-locked host buffers. A short
        # final batch fills only the front of the buffer; the tail is zeroed
        # so no stale rows from the previous batch are sent to the engine.
        for batch_array, buffer in zip(batch_inputs, inputs):
            flat = np.asarray(batch_array).astype(input_dtype).ravel()
            np.copyto(buffer.host[:flat.size], flat)
            buffer.host[flat.size:] = 0

        # Host -> device, execute, device -> host, all queued on one stream.
        for buffer in inputs:
            cuda.memcpy_htod_async(buffer.device, buffer.host, stream)
        context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
        for buffer in outputs:
            cuda.memcpy_dtoh_async(buffer.host, buffer.device, stream)
        # Wait for the copies to finish before reading the host buffers.
        stream.synchronize()

        # One row of scores per example; keep only the rows that hold real data.
        scores = np.array([buffer.host for buffer in outputs]).reshape(batch_size, -1)
        batch_predictions.append(scores[:rows].argmax(1))
        batch_labels.append(np.array(test_dataset['labels'][start:stop]))

        update_progress(stop / num_data * 100)

    if batch_predictions:
        predictions = np.concatenate(batch_predictions)
        labels = np.concatenate(batch_labels)
    else:
        # np.concatenate rejects an empty list; report an empty result instead.
        predictions = np.empty(0, dtype=np.int64)
        labels = np.empty(0, dtype=np.int64)

    print(f' {np.sum(predictions == labels)} / {predictions.shape[0]} ')
    print(f' --- evaluation finished {format_time(time.time() - start_time)}')

In [7]:
# Run the TensorRT evaluation loop over the prepared test split; it prints the
# count of correct predictions and the elapsed time.
eval()

100% 8155 / 9792 
 --- evaluation finished 0:00:37
