In [1]:
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
from scipy.special import softmax

# Let text columns render up to 300 characters wide in DataFrame output.
pd.set_option("display.max_colwidth", 300)

# Use CUDA when PyTorch reports it as available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)

# Identifier passed to the `from_pretrained` loaders further down.
model_name = "ruanchaves/bert-large-portuguese-cased-hatebr"

Device: cuda


In [2]:
# Semicolon-separated UTF-8 export of the collected tweets. Rows the parser
# cannot read are skipped (on_bad_lines="skip") instead of aborting the load.
file_path = "../data/raw/ErikakHilton-tweets.csv"
raw_df = pd.read_csv(file_path, on_bad_lines="skip", sep=";", encoding="utf-8")

# Drop rows that are exact repeats of an earlier row (first occurrence kept).
# drop_duplicates() works on row content, so it stays correct even if index
# labels are not unique, unlike dropping by the labels of duplicated rows.
df = raw_df.drop_duplicates()

# Shapes before and after de-duplication.
print(raw_df.shape, df.shape)

(22505, 49) (20505, 49)


  raw_df = pd.read_csv(file_path, on_bad_lines="skip", sep=";", encoding="utf-8")


In [3]:
# First 100 rows (by position) as an independent copy: a plain slice of `df`
# makes pandas emit SettingWithCopyWarning when a column is added to it later.
sample_df = df.iloc[:100].copy()
sample_df.shape

(100, 49)

In [4]:
# Model configuration; its `id2label` table is read by `get_class` below.
config = AutoConfig.from_pretrained(model_name)
# Tokenizer published under the same model identifier.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Classification head function
def get_class(logit, id2label=None):
    """Return the label of the highest-scoring class for one example.

    Parameters
    ----------
    logit : array-like of float
        Raw classifier scores for a single example, one entry per class.
    id2label : mapping, optional
        Lookup table from class index (int) to label. When omitted, the
        notebook-level ``config.id2label`` is used.

    Returns
    -------
    object
        The value stored in ``id2label`` for the arg-max class index.

    Raises
    ------
    KeyError
        If the arg-max index is not a key of ``id2label``.
    """
    if id2label is None:
        id2label = config.id2label
    # Softmax is strictly increasing, so the arg-max of the raw logits is the
    # arg-max of the probabilities; normalising first is unnecessary.
    # Cast to a built-in int so the lookup key is not a NumPy scalar.
    best_index = int(np.argmax(logit))
    return id2label[best_index]

In [5]:
%%time
if device == 'cuda':  # GPU
    model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
    
else:  # CPU
    model = AutoModelForSequenceClassification.from_pretrained(model_name)

CPU times: user 2.54 s, sys: 1.18 s, total: 3.72 s
Wall time: 3.67 s


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(29794, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,

In [6]:
# TODO: switch to batched processing (see the dynamic-padding guide linked below)
# https://huggingface.co/learn/nlp-course/chapter3/2?fw=pt#dynamic-padding

In [13]:
%%time
if device == 'cuda':  # GPU
    model_input = tokenizer(
        *(list(sample_df["rawContent"]),), padding=True, truncation=True, return_tensors="pt"
    ).to(device)
else:  # CPU
    model_input = tokenizer(
        *(list(sample_df["rawContent"]),), padding=True, truncation=True, return_tensors="pt"
    )    

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


CPU times: user 8.68 ms, sys: 28.3 ms, total: 37 ms
Wall time: 14.7 ms


In [14]:
%%time
if device == 'cuda':  # GPU
    with torch.no_grad():
        outputs = model(**model_input)
        logits = outputs.logits.detach().cpu().numpy()

else:  # CPU
    with torch.no_grad():
        outputs = model(**model_input)
        logits = outputs.logits.detach().numpy()

CPU times: user 1min 30s, sys: 16.3 s, total: 1min 47s
Wall time: 13.4 s


In [15]:
%%time
classes = []
for logit in logits:
    classes.append(get_class(logit))

sample_df["BertL-offense"] = classes
sample_df.shape

CPU times: user 0 ns, sys: 6.43 ms, total: 6.43 ms
Wall time: 5.77 ms


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


(100, 50)

### Checking results

In [16]:
# Keep only the rows whose predicted label equals True.
is_offensive = sample_df["BertL-offense"].eq(True)
offensive_df = sample_df.loc[is_offensive]
print(offensive_df.shape)
offensive_df.head()

(18, 50)


Unnamed: 0,url,date,rawContent,renderedContent,id,user,replyCount,retweetCount,likeCount,quoteCount,...,user_favouritesCount,user_listedCount,user_mediaCount,user_location,user_protected,user_link,user_profileImageUrl,user_profileBannerUrl,user_label,BertL-offense
17,https://twitter.com/jorgeg_89/status/1587218453924122625,2022-10-31 23:02:23+00:00,@ErikakHilton É de uma vergonha sem tamanho... PRF tá praticamente apoiando esse circo do mi-mi-mi. https://t.co/zmaUHAOW2J,@ErikakHilton É de uma vergonha sem tamanho... PRF tá praticamente apoiando esse circo do mi-mi-mi. https://t.co/zmaUHAOW2J,1587218453924122625,https://twitter.com/jorgeg_89,1,0,10,0,...,4332,0,89,Santa Catarina,False,,https://pbs.twimg.com/profile_images/1597375984109584384/SMHOUF1b_normal.jpg,https://pbs.twimg.com/profile_banners/1246716975918788609/1666725131,,True
18,https://twitter.com/betomach/status/1587218933056258049,2022-10-31 23:04:17+00:00,"@ErikakHilton Pior de tudo é q eles não estão fazendo nada, estão encostados do lado dos caminhominios","@ErikakHilton Pior de tudo é q eles não estão fazendo nada, estão encostados do lado dos caminhominios",1587218933056258049,https://twitter.com/betomach,1,0,3,0,...,23027,0,138,sp,False,,https://pbs.twimg.com/profile_images/1533453602471104512/lCVyXw2w_normal.png,https://pbs.twimg.com/profile_banners/56263519/1653266803,,True
25,https://twitter.com/Dakota301022/status/1587220763832098819,2022-10-31 23:11:33+00:00,@ErikakHilton Acho q o aparelhado não se interessa em transparência,@ErikakHilton Acho q o aparelhado não se interessa em transparência,1587220763832098819,https://twitter.com/Dakota301022,0,0,1,0,...,694,0,51,"São Paulo, Brasil",False,,https://pbs.twimg.com/profile_images/1578011297316491266/hK6jHxqW_normal.jpg,https://pbs.twimg.com/profile_banners/1578010935616217089/1665062756,,True
29,https://twitter.com/LucianaMignoni/status/1587223004248956928,2022-10-31 23:20:28+00:00,@ErikakHilton Criancinhas birrentas kkk,@ErikakHilton Criancinhas birrentas kkk,1587223004248956928,https://twitter.com/LucianaMignoni,0,0,2,0,...,35786,0,835,"Sao Paulo, Brazil",False,,https://pbs.twimg.com/profile_images/1624566447325294594/jlVAqN93_normal.jpg,https://pbs.twimg.com/profile_banners/777954046753857536/1636937459,,True
32,https://twitter.com/NunesN21668663/status/1587223840018649090,2022-10-31 23:23:47+00:00,@ErikakHilton @Beleza1965Pura #PRFvergonhanacional 👈,@ErikakHilton @Beleza1965Pura #PRFvergonhanacional 👈,1587223840018649090,https://twitter.com/NunesN21668663,0,0,0,0,...,187482,1,60,,False,,https://pbs.twimg.com/profile_images/1601398382186094596/bE1EqVfa_normal.jpg,https://pbs.twimg.com/profile_banners/1423072401496158212/1670634257,,True


In [12]:
# Text of every row kept in `offensive_df`, shown as an array.
flagged_texts = offensive_df["rawContent"]
flagged_texts.values

array(['@ErikakHilton É de uma vergonha sem tamanho... PRF tá praticamente apoiando esse circo do mi-mi-mi. https://t.co/zmaUHAOW2J',
       '@ErikakHilton Pior de tudo é q eles não estão fazendo nada, estão encostados do lado dos caminhominios',
       '@ErikakHilton Acho q o aparelhado não se interessa em transparência',
       '@ErikakHilton Criancinhas birrentas kkk',
       '@ErikakHilton @Beleza1965Pura #PRFvergonhanacional 👈',
       '@ErikakHilton a mama Erika mal chegou e já ta botando ordem em tudo',
       '@ErikakHilton Eu vou perder um dia de trabalho e se bobear tomar uma medida por causa desse bloqueio onde a PRF tá apoiando os mimizentos',
       '@ErikakHilton #PRFvergonhanacional #PRFprevaricando',
       '@ErikakHilton Estou a 1h aqui no Rio por conta dessa bomba \U0001f972',
       '@ErikakHilton #LulaPresidente2023 #BolsonaroNaCadeia',
       '@ErikakHilton Gatan, toca esses véio de Guarulhos!\n\nEu quero ir dormir mermã kkkkkkk',
       '@ErikakHilton Pega eles ma