In [1]:
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
from scipy.special import softmax

# Show up to 300 characters per cell when DataFrames are displayed.
pd.set_option("display.max_colwidth", 300)

# Checkpoint identifier passed to the from_pretrained loaders.
model_name = "ruanchaves/bert-large-portuguese-cased-hatebr"

# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Device: {device}")


Device: cuda


In [2]:
# Semicolon-separated export of the tweets; malformed lines are skipped.
file_path = "../data/raw/ErikakHilton-tweets.csv"
raw_df = pd.read_csv(file_path, sep=";", encoding="utf-8", on_bad_lines="skip")

# Index labels of every row that is an exact repeat of an earlier row.
duplicated_indexes = raw_df.loc[raw_df.duplicated()].index
# Working frame without those repeats; raw_df itself is left untouched.
df = raw_df.drop(index=duplicated_indexes)

print(raw_df.shape, df.shape)


(22505, 49) (20505, 49)


  raw_df = pd.read_csv(file_path, on_bad_lines="skip", sep=";", encoding="utf-8")


In [3]:
# Model configuration; get_class reads its id2label mapping.
config = AutoConfig.from_pretrained(model_name)
# Tokenizer loaded from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)


# Classification head function
def get_class(logit):
    """Return the label for one row of logits.

    The logits are passed through softmax, the position of the highest
    score is taken, and that position is looked up in ``config.id2label``.

    Args:
        logit: Logits for a single input (one entry per class).

    Returns:
        The ``config.id2label`` entry for the highest-scoring class.
    """
    probabilities = softmax(logit)
    best_index = np.argmax(probabilities)
    return config.id2label[best_index]


In [4]:
def process_batch_gpu(batch):
    """Tokenize one batch of texts and return the model's logits.

    The tokenized inputs are moved to ``device`` before the forward pass and
    the resulting logits are copied back to host memory.

    Args:
        batch: DataFrame slice whose "rawContent" column holds the texts.

    Returns:
        NumPy array with the logits produced by the model for this batch.
    """
    texts = list(batch["rawContent"])
    # padding=True pads to the longest text in the batch; truncation=True caps
    # every text at the tokenizer's maximum length so an unusually long input
    # is shortened instead of being sent to the model at full length.
    model_input = tokenizer(
        texts, padding=True, truncation=True, return_tensors="pt"
    ).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**model_input)
    return outputs.logits.detach().cpu().numpy()


def process_batch_cpu(batch):
    """Tokenize one batch of texts and return the model's logits.

    Unlike ``process_batch_gpu``, the tokenized inputs are not moved to
    another device before the forward pass.

    Args:
        batch: DataFrame slice whose "rawContent" column holds the texts.

    Returns:
        NumPy array with the logits produced by the model for this batch.
    """
    texts = list(batch["rawContent"])
    # padding=True pads to the longest text in the batch; truncation=True caps
    # every text at the tokenizer's maximum length so an unusually long input
    # is shortened instead of being sent to the model at full length.
    model_input = tokenizer(
        texts, padding=True, truncation=True, return_tensors="pt"
    )

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**model_input)
    return outputs.logits.detach().numpy()


In [5]:
# Load the classifier once, then pick the batch function that matches the device.
model = AutoModelForSequenceClassification.from_pretrained(model_name)

if device.type == "cuda":
    # GPU: move the weights to the device and use the GPU batch function.
    model = model.to(device)
    process_batch = process_batch_gpu
else:
    # CPU: the model stays where it was loaded.
    process_batch = process_batch_cpu


CPU times: user 3.35 s, sys: 2.14 s, total: 5.5 s
Wall time: 5.93 s


In [6]:
# Number of rows sent through the model per forward pass.
BATCH_SIZE = 50
# One logits array per batch, in row order; concatenated after the loop.
logits_list = []

# iloc slices clamp at the end of the frame, so the final (possibly shorter)
# batch needs no special case, and process_batch only reads the slice, so no
# copy is required.
for start in tqdm(range(0, len(df), BATCH_SIZE), desc="Classifying"):
    batch = df.iloc[start : start + BATCH_SIZE]
    logits_list.append(process_batch(batch))


In [8]:
# Join the per-batch arrays into a single array of logits.
logits = np.concatenate(logits_list)

# Label each row of logits and store the labels as a new column on df.
classes = [get_class(row) for row in logits]
df["BertL-offense"] = classes
df.shape


CPU times: user 370 ms, sys: 30.1 ms, total: 400 ms
Wall time: 396 ms


(20505, 50)

### Checking results

In [13]:
# Rows whose "BertL-offense" label equals True.
is_offensive = df["BertL-offense"].eq(True)
offensive_df = df.loc[is_offensive]
print(offensive_df.shape)
offensive_df.head()


(4482, 50)


Unnamed: 0,url,date,rawContent,renderedContent,id,user,replyCount,retweetCount,likeCount,quoteCount,...,user_favouritesCount,user_listedCount,user_mediaCount,user_location,user_protected,user_link,user_profileImageUrl,user_profileBannerUrl,user_label,BertL-offense
17,https://twitter.com/jorgeg_89/status/1587218453924122625,2022-10-31 23:02:23+00:00,@ErikakHilton É de uma vergonha sem tamanho... PRF tá praticamente apoiando esse circo do mi-mi-mi. https://t.co/zmaUHAOW2J,@ErikakHilton É de uma vergonha sem tamanho... PRF tá praticamente apoiando esse circo do mi-mi-mi. https://t.co/zmaUHAOW2J,1587218453924122625,https://twitter.com/jorgeg_89,1,0,10,0,...,4332,0,89,Santa Catarina,False,,https://pbs.twimg.com/profile_images/1597375984109584384/SMHOUF1b_normal.jpg,https://pbs.twimg.com/profile_banners/1246716975918788609/1666725131,,True
18,https://twitter.com/betomach/status/1587218933056258049,2022-10-31 23:04:17+00:00,"@ErikakHilton Pior de tudo é q eles não estão fazendo nada, estão encostados do lado dos caminhominios","@ErikakHilton Pior de tudo é q eles não estão fazendo nada, estão encostados do lado dos caminhominios",1587218933056258049,https://twitter.com/betomach,1,0,3,0,...,23027,0,138,sp,False,,https://pbs.twimg.com/profile_images/1533453602471104512/lCVyXw2w_normal.png,https://pbs.twimg.com/profile_banners/56263519/1653266803,,True
25,https://twitter.com/Dakota301022/status/1587220763832098819,2022-10-31 23:11:33+00:00,@ErikakHilton Acho q o aparelhado não se interessa em transparência,@ErikakHilton Acho q o aparelhado não se interessa em transparência,1587220763832098819,https://twitter.com/Dakota301022,0,0,1,0,...,694,0,51,"São Paulo, Brasil",False,,https://pbs.twimg.com/profile_images/1578011297316491266/hK6jHxqW_normal.jpg,https://pbs.twimg.com/profile_banners/1578010935616217089/1665062756,,True
29,https://twitter.com/LucianaMignoni/status/1587223004248956928,2022-10-31 23:20:28+00:00,@ErikakHilton Criancinhas birrentas kkk,@ErikakHilton Criancinhas birrentas kkk,1587223004248956928,https://twitter.com/LucianaMignoni,0,0,2,0,...,35786,0,835,"Sao Paulo, Brazil",False,,https://pbs.twimg.com/profile_images/1624566447325294594/jlVAqN93_normal.jpg,https://pbs.twimg.com/profile_banners/777954046753857536/1636937459,,True
32,https://twitter.com/NunesN21668663/status/1587223840018649090,2022-10-31 23:23:47+00:00,@ErikakHilton @Beleza1965Pura #PRFvergonhanacional 👈,@ErikakHilton @Beleza1965Pura #PRFvergonhanacional 👈,1587223840018649090,https://twitter.com/NunesN21668663,0,0,0,0,...,187482,1,60,,False,,https://pbs.twimg.com/profile_images/1601398382186094596/bE1EqVfa_normal.jpg,https://pbs.twimg.com/profile_banners/1423072401496158212/1670634257,,True


In [14]:
# Raw text of every row in offensive_df, shown as an array for a quick read-through.
offensive_df.loc[:, "rawContent"].values


array(['@ErikakHilton É de uma vergonha sem tamanho... PRF tá praticamente apoiando esse circo do mi-mi-mi. https://t.co/zmaUHAOW2J',
       '@ErikakHilton Pior de tudo é q eles não estão fazendo nada, estão encostados do lado dos caminhominios',
       '@ErikakHilton Acho q o aparelhado não se interessa em transparência',
       ...,
       '@Filthiness @LulaOficial @Haddad_Fernando @marciofrancasp @Cortez50005 💜💜💜💜',
       '@marconythe1 @pedrorhuas \U0001faf6🏾\U0001faf6🏾',
       '#ovotoésecreto eu:\n- deputada estadual: @neon_cunha 50700;\n- deputada federal: @ErikakHilton 5070;\n- senador: @marciofrancasp 400;\n- governador: @Haddad_Fernando 13;\n- Presidente: @LulaOficial 13;\nPela democracia #LulaNo1ºTurno ! ☆☆☆☆☆☆☆☆☆☆☆☆☆☆'],
      dtype=object)

In [15]:
# Destination directory and file name for the labelled data.
output_path = "../data/processed/"
# Plain string literal: the name contains no placeholders, so no f-prefix.
output_file = "erika_bert-large-portuguese-cased-hatebr_output.csv"

# To save the data, uncomment the line below
# df.to_csv(f"{output_path}{output_file}", sep=";", encoding="utf-8", index=False)
