In [1]:
# Full classification example from: https://huggingface.co/ruanchaves/bert-large-portuguese-cased-hatebr


# from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
# import numpy as np
# import torch
# from scipy.special import softmax

# model_name = "ruanchaves/bert-large-portuguese-cased-hatebr"
# s1 = "Quem não deve não teme!!"
# model = AutoModelForSequenceClassification.from_pretrained(model_name)
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# config = AutoConfig.from_pretrained(model_name)
# model_input = tokenizer(*([s1],), padding=True, return_tensors="pt")
# with torch.no_grad():
#     output = model(**model_input)
#     scores = output[0][0].detach().numpy()
#     scores = softmax(scores)
#     ranking = np.argsort(scores)
#     ranking = ranking[::-1]
#     for i in range(scores.shape[0]):
#         l = config.id2label[ranking[i]]
#         s = scores[ranking[i]]
#         print(f"{i+1}) Label: {l} Score: {np.round(float(s), 4)}")


In [2]:
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
from scipy.special import softmax


In [3]:
model_name = "ruanchaves/bert-large-portuguese-cased-hatebr"

config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def get_class(logit):
    scores = softmax(logit)
    label = config.id2label[np.argmax(scores)]
    return label


In [4]:
file_path = "../data/raw/ErikakHilton-tweets.csv"
raw_df = pd.read_csv(file_path, on_bad_lines="skip", sep=";", encoding="utf-8")

duplicated_indexes = raw_df[raw_df.duplicated()].index
df = raw_df.drop(duplicated_indexes)

print(raw_df.shape, df.shape)

(22505, 49) (20505, 49)


  raw_df = pd.read_csv(file_path, on_bad_lines="skip", sep=";", encoding="utf-8")


In [5]:
sample_df = df[:50]
sample_df.shape

(50, 49)

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

Device: cuda


### Testing with CPU

In [7]:
cpu_model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [8]:
%%timeit -n 10
model_input = tokenizer(
    *(list(sample_df["rawContent"]),), padding=True, return_tensors="pt"
)

with torch.no_grad():
    outputs = cpu_model(**model_input)
    logits = outputs.logits.detach().numpy()

6.74 s ± 44.5 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [13]:
classes = []
for logit in tqdm(logits):
    classes.append(get_class(logit))

sample_df["BertL-offense"] = classes

NameError: name 'logits' is not defined

### Testing with GPU

In [14]:
gpu_model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)

In [22]:
%%timeit -n 10

model_input = tokenizer(
    *(list(sample_df["rawContent"]),), padding=True, return_tensors="pt"
).to(device)

with torch.no_grad():
    outputs = gpu_model(**model_input)
    logits = outputs.logits.detach().cpu().numpy()

431 ms ± 6.27 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [16]:
classes = []
for logit in tqdm(logits):
    classes.append(get_class(logit))

sample_df["BertL-offense"] = classes

NameError: name 'logits' is not defined

### Checking results

In [None]:
pd.options.display.max_colwidth = 300
sample_df[sample_df["BertL-offense"] == True][['rawContent']]