In [1]:
import sys

import numpy as np

import torch
from torch.utils.data import DataLoader

from transformers import DistilBertTokenizer

import pickle as pkl

sys.path.insert(0, '..')
from src.data_collection import get_data
from src.models import (
    HateDataset, DistilBERTMultiClass, get_distil_hyperparams, predict_distilbert,
    predict_bert_tweet, prepare_dataloaders
)

In [2]:
hate_speech_ucb = get_data()
HYPERPARAMS = get_distil_hyperparams()

model_path = HYPERPARAMS["MODEL_PATH"]
vocab_path = HYPERPARAMS["VOCAB_PATH"]

TOKENIZER = DistilBertTokenizer.from_pretrained(vocab_path)
MAX_LEN = HYPERPARAMS["MAX_LEN"]
device = HYPERPARAMS["DEVICE"]

N_SAMPLES = 20

Fetching data...


Using custom data configuration ucberkeley-dlab--measuring-hate-speech-7cb9b0b8e4d0e1dd
Reusing dataset parquet (C:\Users\UTKARSH\.cache\huggingface\datasets\parquet\ucberkeley-dlab--measuring-hate-speech-7cb9b0b8e4d0e1dd\0.0.0\0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


  0%|          | 0/1 [00:00<?, ?it/s]

Processing...
Done!




In [3]:
hate_speech_sample = hate_speech_ucb[:N_SAMPLES]#.sample(N_SAMPLES, random_state=585)
hate_speech_sample

Unnamed: 0,text,hatespeech
0,! thank u! im transmasc and generally present ...,0
1,!Go fuck yourself faggot!,1
2,!flair [I love women and minorities],0
3,!flair [death to all niggers and gays],1
4,""" 'convoluted' genealogy of Jesus""; was that c...",0
5,""" *gulhfg* that's the sound of your mom suckin...",1
6,""" Did you notice a sign out front that said de...",1
7,""" F those ugly idiots "" "" I cant stand useless...",0
8,""" Fuck you niggas ion give a fuck no more nigg...",1
9,""" Happy Independence Day to all my fellow sout...",0


In [4]:
dataset = HateDataset(hate_speech_sample, TOKENIZER, MAX_LEN)

In [5]:
sample_params = HYPERPARAMS["TEST_PARAMS"]

sample_loader = DataLoader(dataset, **sample_params)

In [6]:
N_CLASSES = hate_speech_sample["hatespeech"].nunique()

In [7]:
distil_model = DistilBERTMultiClass(n_classes=N_CLASSES)
distil_model.load_state_dict(torch.load(model_path))

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [8]:
with open("../models/bertweet_large_pkl",'rb') as f:
    bertweet_large_model = pkl.load(f)

In [9]:
BERTWEET_MODEL = "vinai/bertweet-large"
batch_size = 1

In [10]:
bertweet_dataloader = prepare_dataloaders(hate_speech_sample, BERTWEET_MODEL, batch_size)

In [11]:
bertweet_outputs = predict_bert_tweet(bertweet_large_model, bertweet_dataloader, device)
bertweet_outputs = np.argmax(bertweet_outputs, axis = 1)

In [12]:
distil_outputs = predict_distilbert(distil_model, sample_loader, device)
distil_outputs = np.argmax(distil_outputs, axis=1)

0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
20it [00:00, 94.76it/s]


In [13]:
hate_speech_sample["DistilBERT_pred"] = distil_outputs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hate_speech_sample["DistilBERT_pred"] = distil_outputs


In [14]:
hate_speech_sample["BERTweet_Large_pred"] = bertweet_outputs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hate_speech_sample["BERTweet_Large_pred"] = bertweet_outputs


In [15]:
hate_speech_sample

Unnamed: 0,text,hatespeech,DistilBERT_pred,BERTweet_Large_pred
0,! thank u! im transmasc and generally present ...,0,0,0
1,!Go fuck yourself faggot!,1,1,1
2,!flair [I love women and minorities],0,0,0
3,!flair [death to all niggers and gays],1,1,1
4,""" 'convoluted' genealogy of Jesus""; was that c...",0,0,0
5,""" *gulhfg* that's the sound of your mom suckin...",1,1,1
6,""" Did you notice a sign out front that said de...",1,1,1
7,""" F those ugly idiots "" "" I cant stand useless...",0,0,0
8,""" Fuck you niggas ion give a fuck no more nigg...",1,1,1
9,""" Happy Independence Day to all my fellow sout...",0,0,0
