# Testing a BERT-Based Hate Speech Binary Classifier

In [1]:
from tqdm.auto import tqdm
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Locations of the evaluation data and of the fine-tuned checkpoint.
model_path = "./training_output/checkpoint-14000"
file_path = 'df_2020C_07_01_to_02_testdata.feather'

# Texts to classify; later cells read the 'body' column of this frame.
data_frame = pd.read_feather(file_path)

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# NOTE(review): the tokenizer is loaded from the base 'bert-base-uncased'
# vocabulary, not from model_path -- confirm it matches the one used when the
# checkpoint was trained.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Two-label classification head restored from the checkpoint, placed on the
# chosen device and switched to evaluation mode for inference.
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2)
model = model.to(device)
model.eval()

def tokenize_function(examples):
    """Tokenize *examples* with the module-level tokenizer.

    Every sequence is padded to, and truncated at, 128 tokens, so all
    encoded rows have the same length.

    Args:
        examples: The text (or list of texts) handed to the tokenizer.

    Returns:
        Whatever the tokenizer call returns; the calling code reads its
        'input_ids' and 'attention_mask' entries.
    """
    fixed_length_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 128,
    }
    return tokenizer(examples, **fixed_length_options)

# Tokenize the 'body' column in fixed-size chunks so tqdm can report progress,
# accumulating the per-chunk results into one dict of lists.
chunk_size = 100
texts = data_frame['body'].astype(str).tolist()
tokenized_inputs = {'input_ids': [], 'attention_mask': []}
chunk_starts = range(0, len(texts), chunk_size)
for start in tqdm(chunk_starts, desc="Tokenizing"):
    encoded_chunk = tokenize_function(texts[start:start + chunk_size])
    # Append this chunk's rows to each collected field.
    for field in ('input_ids', 'attention_mask'):
        tokenized_inputs[field].extend(encoded_chunk[field])

Tokenizing:   0%|          | 0/63134 [00:00<?, ?it/s]

In [None]:
class PredictionDataset(Dataset):
    """Map-style dataset over pre-tokenized encodings, for inference only.

    The encodings are held as CPU tensors. Holding the whole corpus on the
    accelerator would pin every row in device memory for the lifetime of the
    dataset; the code that consumes the batches is expected to move each
    batch to the target device instead.
    """

    def __init__(self, encodings):
        """Store one tensor per encoding field.

        Args:
            encodings: Mapping from field name to a rectangular sequence of
                rows (nested lists, arrays or tensors). It must contain an
                'input_ids' entry, which defines the dataset length.
        """
        # as_tensor converts lists exactly like torch.tensor, but does not
        # copy (or warn) when a field is already a tensor.
        self.encodings = {key: torch.as_tensor(val) for key, val in encodings.items()}

    def __getitem__(self, idx):
        """Return the row(s) selected by *idx* for every field.

        Args:
            idx: An int for a single row, or a list of ints for several rows.

        Returns:
            Dict mapping each field name to a tensor: one row for an int
            index, or the selected rows stacked along a new first dimension
            for a list index (zero rows for an empty list).

        Raises:
            TypeError: If *idx* is neither an int nor a list.
        """
        if isinstance(idx, int):
            # clone() so callers can modify the item without touching the
            # stored encodings.
            return {key: val[idx].clone() for key, val in self.encodings.items()}
        if isinstance(idx, list):
            # Indexing with a long tensor copies the selected rows, accepts
            # negative indices, and handles an empty list.
            rows = torch.tensor(idx, dtype=torch.long)
            return {key: val[rows] for key, val in self.encodings.items()}
        raise TypeError(f"Unsupported index type: {type(idx)}")

    def __len__(self):
        """Return the number of encoded rows (length of 'input_ids')."""
        return len(self.encodings['input_ids'])

# Minimum softmax probability of class index 1 for a text to be labelled 1
# (presumably the hate-speech class -- confirm against the training labels).
# Defined once, since it does not change between batches.
threshold = 0.33

pred_dataset = PredictionDataset(tokenized_inputs)
# shuffle=False keeps batch order equal to row order, which the positional
# assignment to data_frame['predicted_label'] below relies on.
pred_dataloader = DataLoader(pred_dataset, batch_size = 32, shuffle = False)
predictions = []
# Inference only: no autograd graph is needed for any batch.
with torch.no_grad():
    for batch in tqdm(pred_dataloader, desc="Predicting"):
        # Only the current batch is placed on the device; .to() is a no-op
        # for tensors that are already there.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        softmax_scores = torch.nn.functional.softmax(outputs.logits, dim = -1).cpu().numpy()
        preds = (softmax_scores[:, 1] >= threshold).astype(int)
        predictions.extend(preds.tolist())

# One prediction per input row, in the original row order.
data_frame['predicted_label'] = predictions
data_frame.to_csv('Data_Frame_Of_Texts_And_Predictions_By_BBHSBC_For_2020-07-01_To_2020-07-02.csv', index = False)

Predicting:   0%|          | 0/197293 [00:00<?, ?it/s]

In [None]:
# Show the (rows, columns) dimensions of the frame.
data_frame.shape

In [None]:
# List the column labels of the frame.
data_frame.columns

In [None]:
# Preview the first rows of the frame (pandas' default for head() is 5).
data_frame.head()