In [1]:
import torch
import tqdm

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


In [2]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
import random

In [3]:
## Loading Dataset
print("_______________________________________________________________")
print("Loading Data...........")
cols = ["sentiment", "id", "date", "query", "user", "text"]
data = pd.read_csv(
    r"test.csv",
    header=None,
    names=cols,
    engine="python",
    encoding="latin1"
)
data.drop(["id", "date", "query", "user"],
          axis=1,
          inplace=True)

_______________________________________________________________
Loading Data...........


In [4]:
print("_______________________________________________________________")
print("Data Pre-processing...........")
def clean_tweet(tweet):
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    # Removing the @
    tweet = re.sub(r"@[A-Za-z0-9]+", ' ', tweet)
    # Removing the URL links
    tweet = re.sub(r"https?://[A-Za-z0-9./]+", ' ', tweet)
    # Keeping only letters
    tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
    # Removing additional whitespaces
    tweet = re.sub(r" +", ' ', tweet)
    return tweet

_______________________________________________________________
Data Pre-processing...........


In [5]:
data_clean = [clean_tweet(tweet) for tweet in data.text]
labels = data.sentiment.values
labels[labels == 4] = 1

In [6]:
from transformers import BertForSequenceClassification, AdamW, BertConfig
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import get_linear_schedule_with_warmup
from transformers import BertTokenizer
from torch.utils.data import TensorDataset, random_split

output_dir = r'C:\Users\tapojyoti.paul\Documents\Intel\BERT\Pytorch Model Bert'
# Load a trained model and vocabulary that you have fine-tuned
model = BertForSequenceClassification.from_pretrained(output_dir)
tokenizer = BertTokenizer.from_pretrained(output_dir)
# Copy the model to the GPU.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [7]:
X = data_clean
def get_test_data(size: int = 1):
    """Generates a test dataset of the specified size""" 
    num_rows = len(X)
    test_df = X.copy()

    while num_rows < size:
        test_df = test_df + test_df
        num_rows = len(test_df)

    return test_df[:size]

In [10]:
def calculate_stats(time_list):
    """Calculate mean and standard deviation of a list"""
    time_array = np.array(time_list)

    median = np.median(time_array)
    mean = np.mean(time_array)
    std_dev = np.std(time_array)
    max_time = np.amax(time_array)
    min_time = np.amin(time_array)
    quantile_10 = np.quantile(time_array, 0.1)
    quantile_90 = np.quantile(time_array, 0.9)

    basic_key = ["median","mean","std_dev","min_time","max_time","quantile_10","quantile_90"]
    basic_value = [median,mean,std_dev,min_time,max_time,quantile_10,quantile_90]

    dict_basic = dict(zip(basic_key, basic_value))
    
    return pd.DataFrame(dict_basic, index = [0])

import argparse
import logging

from pathlib import Path
from timeit import default_timer as timer

NUM_LOOPS = 50

def run_inference(num_observations:int = 1000):
    """Run xgboost for specified number of observations"""
    # Load data
    test_twt = get_test_data(num_observations)
    num_rows = len(test_twt)
    print(f"running data prep and inference for {num_rows} sentence(s)..")
    
    run_times = []
    bert_times = []
    prep_time_wo_berts = []
    prep_time_alls = []
    prep_inf_times = []
    inference_times = []
    
    for _ in range(NUM_LOOPS):
        
#######################################################################################################################
        st_tm_bert = timer()
        input_ids = []
        attention_masks = []
        # For every sentence...
        for sent in tqdm.tqdm(test_twt,desc ="Progress.."):
            encoded_dict = tokenizer.encode_plus(
                                sent,                      # Sentence to encode.
                                add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                                max_length = 64,           # Pad & truncate all sentences.
                                pad_to_max_length = True,
                                return_attention_mask = True,   # Construct attn. masks.
                                return_tensors = 'pt',     # Return pytorch tensors.
                           )
            # Add the encoded sentence to the list.    
            input_ids.append(encoded_dict['input_ids'])
            # And its attention mask (simply differentiates padding from non-padding).
            attention_masks.append(encoded_dict['attention_mask'])
        # Convert the lists into tensors.
        input_ids = torch.cat(input_ids, dim=0)
        attention_masks = torch.cat(attention_masks, dim=0)
        batch_size = 32  
        # Create the DataLoader.
        prediction_data = TensorDataset(input_ids, attention_masks,)
        prediction_dataloader = DataLoader(prediction_data,batch_size=num_rows)
        
        end_tm_bert = timer()
        
        for batch in prediction_dataloader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
#             print("/////////////")
#             print(len(b_input_ids))
#             print("/////////////")
#######################################################################################################################
        start_time = timer()
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = outputs[0]
        end_time = timer()
#######################################################################################################################

        total_time = end_time - start_time
        run_times.append(total_time*10e3)
        
        bert_time = (end_tm_bert-st_tm_bert)*(10e6)/num_rows
        prep_time_wo_bert = (start_time-end_tm_bert)*(10e6)/num_rows
        prep_time_all = (start_time-st_tm_bert)*(10e6)/num_rows
        inference_time = total_time*(10e6)/num_rows
        prep_inf_time = (end_time-st_tm_bert)*(10e6)/num_rows
        
        bert_times.append(bert_time)
        prep_time_wo_berts.append(prep_time_wo_bert)
        prep_time_alls.append(prep_time_all)
        prep_inf_times.append(prep_inf_time)
        inference_times.append(inference_time)
        
    print("length of predicted df", len(logits))
    
    df1 = calculate_stats(bert_times)
    df1["Flag"] = "Only Bert"
    df2 = calculate_stats(prep_time_wo_berts)
    df2["Flag"] = "Prep w/o Bert"
    df3 = calculate_stats(prep_time_alls)
    df3["Flag"] = "Prep with Bert"
    df4 = calculate_stats(prep_inf_times)
    df4["Flag"] = "Prep & Inf Time Total"
    df5 = calculate_stats(inference_times)
    df5["Flag"] = "Inference Time"

    dfs = pd.concat([df1,df2,df3,df5,df4])
    
    print(num_observations, ", ", dfs)
    return dfs

STATS = '#, median, mean, std_dev, min_time, max_time, quantile_10, quantile_90'

print("_______________________________________________________________")
print("Inferencing Started...........")
if __name__=='__main__':
    ob_ct = 1  # Start with a single observation
    logging.info(STATS)
    temp_df = pd.DataFrame()
    while ob_ct <= 10:
        temp = run_inference(ob_ct)
        temp["No_of_Observation"] = ob_ct
        temp_df = temp_df.append(temp)
        ob_ct *= 10
    print("Summary........")
    print(temp_df)

_______________________________________________________________
Inferencing Started...........


I1229 16:05:56.932788 16196 <ipython-input-10-84962f535538>:124] #, median, mean, std_dev, min_time, max_time, quantile_10, quantile_90


running data prep and inference for 1 sentence(s)..


Progress..: 100%|███████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 501.47it/s]
Progress..: 100%|████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<?, ?it/s]
Progress..: 100%|████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<?, ?it/s]
Progress..: 100%|████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 91.26it/s]
Progress..: 100%|████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<?, ?it/s]
Progress..: 100%|████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<?, ?it/s]
Progress..: 100%|████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<?, ?it/s]
Progress..: 100%|████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<?, ?it/s]
Progress..: 100%|███████████████████████

length of predicted df 1
1 ,        median        mean        std_dev   min_time   max_time  quantile_10  \
0   100158.0   107537.86   29012.722779    73773.0   233599.0      83734.9   
0     6356.5     6490.48    2665.935560     1627.0    15007.0       2749.8   
0   107232.5   114028.34   29113.247762    84157.0   236266.0      88599.5   
0  2041719.5  2170719.98  416322.361473  1511379.0  3410131.0    1781055.1   
0  2151914.0  2284748.32  428045.406190  1597367.0  3505439.0    1892838.2   

   quantile_90                   Flag  
0     136129.0              Only Bert  
0      10084.2          Prep w/o Bert  
0     140867.5         Prep with Bert  
0    2719548.4         Inference Time  
0    2938582.8  Prep & Inf Time Total  
running data prep and inference for 10 sentence(s)..


Progress..: 100%|████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 1114.08it/s]
Progress..: 100%|█████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 983.22it/s]
Progress..: 100%|█████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 527.70it/s]
Progress..: 100%|█████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 551.37it/s]
Progress..: 100%|█████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 820.32it/s]
Progress..: 100%|█████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 495.36it/s]
Progress..: 100%|█████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 798.28it/s]
Progress..: 100%|████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 1009.56it/s]
Progress..: 100%|███████████████████████

length of predicted df 10
10 ,         median         mean        std_dev   min_time   max_time  quantile_10  \
0    20809.50    23462.314   14732.811130    14530.9   122598.4     16232.73   
0      869.40     1015.836    1035.982739      317.7     7702.7       558.22   
0    21701.90    24478.150   15680.559572    15408.7   130301.1     17200.68   
0  1491818.20  1535361.300  189451.418500  1283627.8  2269201.7   1392124.93   
0  1514233.05  1559839.450  198718.635588  1301332.1  2399502.8   1412945.16   

   quantile_90                   Flag  
0     26774.61              Only Bert  
0      1260.33          Prep w/o Bert  
0     28117.88         Prep with Bert  
0   1678581.61         Inference Time  
0   1700216.76  Prep & Inf Time Total  
Summary........
       median         mean        std_dev   min_time   max_time  quantile_10  \
0   100158.00   107537.860   29012.722779    73773.0   233599.0     83734.90   
0     6356.50     6490.480    2665.935560     1627.0    15007.0      27