In [1]:
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, pipeline
from scipy.special import softmax

# Let displayed frames show up to 300 characters per value before truncating.
pd.set_option("display.max_colwidth", 300)

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
print(f"Device: {device}")

Device: cuda


In [2]:
# Semicolon-separated, UTF-8 export; malformed lines are skipped rather than
# aborting the load.
file_path = "../data/processed/sp_elected_state_deputies_tweets.csv"
raw_df = pd.read_csv(file_path, sep=";", encoding="utf-8", on_bad_lines="skip")

# Index labels of rows that exactly repeat an earlier row (all columns equal);
# the first occurrence of each row is kept.
duplicated_indexes = raw_df.index[raw_df.duplicated()]
df = raw_df.drop(index=duplicated_indexes)

# Shapes before and after de-duplication.
print(raw_df.shape, df.shape)


(102558, 14) (102558, 14)


# Pipeline classification

In [3]:
# Short key -> model identifier handed to `pipeline(..., model=...)`.
# The keys double as column prefixes (`<key>_label`, `<key>_score`) when the
# predictions are attached to the frame, and the insertion order is the order
# in which the models are run.
OFFENSE_MODELS = dict(
    rc_bert_base="ruanchaves/bert-base-portuguese-cased-hatebr",
    rc_mdeberta_base="ruanchaves/mdeberta-v3-base-hatebr",
    cl_distilbert_base="citizenlab/distilbert-base-multilingual-cased-toxicity",
)

In [5]:
# Run every model in OFFENSE_MODELS over df[col_name] in fixed-size batches.
# `results[model_key]` ends up as one list of prediction dicts (each carrying
# "label" and "score"), in the same order as the rows of `df`.
col_name = "content"
batch_size = 50
# Ceiling division: `len(df) // batch_size + 1` would schedule an extra, empty
# trailing batch whenever len(df) is an exact multiple of batch_size.
num_batches = (len(df) + batch_size - 1) // batch_size
results = {}

# Only pin the pipeline to GPU 0 when CUDA was selected; on CPU the pipeline's
# own default device is used.
pipeline_kwargs = {"device": 0} if device.type == "cuda" else {}

# The texts do not change between models, so materialise them once.
texts = df[col_name].tolist()

for model_key, model_name in OFFENSE_MODELS.items():
    classifier = pipeline("sentiment-analysis", model=model_name, **pipeline_kwargs)

    results[model_key] = []
    for i in tqdm(range(num_batches)):
        batch_start = i * batch_size
        # List slicing clamps at the end, so the last batch is simply shorter.
        batch_texts = texts[batch_start : batch_start + batch_size]
        results[model_key].extend(classifier(batch_texts))

100%|████████████████████████████████████████████████████████████████████████████████████| 2052/2052 [13:18<00:00,  2.57it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 2052/2052 [28:35<00:00,  1.20it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 2052/2052 [08:27<00:00,  4.05it/s]


In [24]:
# Attach each model's predictions to the frame as `<key>_label` / `<key>_score`.
for key in OFFENSE_MODELS:
    print(key)
    predictions = results[key]
    df[f"{key}_label"] = [prediction["label"] for prediction in predictions]
    df[f"{key}_score"] = [prediction["score"] for prediction in predictions]

df.info()

rc_bert_base
rc_mdeberta_base
cl_distilbert_base
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102558 entries, 0 to 102557
Data columns (total 20 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   url                       102558 non-null  object 
 1   date                      102558 non-null  object 
 2   content                   102558 non-null  object 
 3   user                      102558 non-null  object 
 4   reply_count               102558 non-null  int64  
 5   retweet_count             102558 non-null  int64  
 6   like_count                102558 non-null  int64  
 7   quote_count               102558 non-null  int64  
 8   in_reply_to_id            94389 non-null   float64
 9   in_reply_to_user          94122 non-null   object 
 10  conversation_id           102558 non-null  object 
 11  conversation_user         102558 non-null  object 
 12  class_label               102558 non-null  bool   


# Viewing data

In [26]:
# Count how many rows received each combination of the three models' labels.
# (Prepend "content" to the list to count per tweet text instead.)
label_columns = ["rc_bert_base_label", "rc_mdeberta_base_label", "cl_distilbert_base_label"]
df[label_columns].value_counts()

rc_bert_base_label  rc_mdeberta_base_label  cl_distilbert_base_label
False               False                   False                       69168
True                True                    False                       17416
False               True                    False                        7253
True                False                   False                        4557
                    True                    True                         2006
False               False                   True                         1597
                    True                    True                          378
True                False                   True                          183
dtype: int64

In [27]:
# Summary statistics (count / unique / top / freq) of each model's label column.
label_summary_columns = [
    f"{prefix}_label"
    for prefix in ("rc_bert_base", "rc_mdeberta_base", "cl_distilbert_base")
]
df[label_summary_columns].describe()

Unnamed: 0,rc_bert_base_label,rc_mdeberta_base_label,cl_distilbert_base_label
count,102558,102558,102558
unique,2,2,2
top,False,False,False
freq,78396,75505,98394


# Exporting data

In [30]:
output_path = "../data/processed/"
output_file = "classified.csv"

# Writes the classified frame as a semicolon-separated UTF-8 CSV without the
# index. Comment out the line below to skip saving.
df.to_csv(output_path + output_file, sep=";", encoding="utf-8", index=False)

# Step by step classification

In [10]:
# Bind the model for the step-by-step section explicitly instead of relying on
# `model_name` leaking out of the loop in the pipeline cell. This is the value
# that loop leaves behind on a top-to-bottom run (the last OFFENSE_MODELS entry).
# NOTE(review): the output filename used at the end of this section mentions
# "bert-large-portuguese-cased-hatebr" — confirm which model is intended here.
model_name = OFFENSE_MODELS["cl_distilbert_base"]

tokenizer = AutoTokenizer.from_pretrained(model_name)
# The config supplies `id2label`, which get_class() uses to name the classes.
config = AutoConfig.from_pretrained(model_name)


# Classification head function
def get_class(logit, id2label=None):
    """Map the logits of a single example to its predicted label.

    Parameters
    ----------
    logit : array-like
        Logits for one example, one value per class.
    id2label : mapping or sequence, optional
        Lookup from class index to label. When omitted, the notebook-level
        ``config.id2label`` is used, which is the original behaviour.

    Returns
    -------
    The entry of ``id2label`` stored under the index of the highest softmax
    score.
    """
    if id2label is None:
        # Fall back to the globally loaded model config.
        id2label = config.id2label
    scores = softmax(logit)
    label = id2label[np.argmax(scores)]
    return label


In [11]:
def _batch_logits(batch, text_col, target_device=None):
    """Tokenize one batch of texts and return the model's logits as a NumPy array.

    Uses the notebook-level ``tokenizer`` and ``model``. When ``target_device``
    is given, the tokenized inputs are moved there before the forward pass.
    """
    texts = list(batch[text_col])
    # truncation=True keeps over-long texts within the tokenizer's maximum
    # length instead of feeding the model a sequence it cannot handle.
    model_input = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    if target_device is not None:
        model_input = model_input.to(target_device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**model_input)
    # .cpu() is a no-op for tensors that already live on the CPU.
    return outputs.logits.detach().cpu().numpy()


def process_batch_gpu(batch, text_col="rawContent"):
    """Return logits for ``batch[text_col]``, running the inputs on ``device``.

    Parameters
    ----------
    batch : DataFrame-like
        Rows to classify; ``batch[text_col]`` must yield the texts.
    text_col : str, optional
        Column holding the text. Defaults to ``"rawContent"``.
        NOTE(review): the pipeline section reads the ``"content"`` column —
        confirm which column the frame actually has.

    Returns
    -------
    numpy.ndarray
        The model's logits for the batch (presumably one row per text).
    """
    return _batch_logits(batch, text_col, target_device=device)


def process_batch_cpu(batch, text_col="rawContent"):
    """Return logits for ``batch[text_col]`` without moving the inputs.

    Same contract as ``process_batch_gpu``, but the tokenized inputs stay
    where the tokenizer created them.
    """
    return _batch_logits(batch, text_col)


In [12]:
# Load the sequence-classification model and pick the batch function that
# matches the selected device.
use_gpu = device.type == "cuda"

model = AutoModelForSequenceClassification.from_pretrained(model_name)
if use_gpu:
    # Only the GPU path moves the model off the device it was loaded on.
    model = model.to(device)

process_batch = process_batch_gpu if use_gpu else process_batch_cpu


In [13]:
BATCH_SIZE = 50
logits_list = []

# Walk the frame in consecutive windows of BATCH_SIZE rows and collect one
# logits array per window; the final window is clamped to the end of the frame.
# (Wrap the range in tqdm(...) to get a progress bar.)
for i in range(0, len(df), BATCH_SIZE):
    window_end = min(i + BATCH_SIZE, len(df))
    batch = df.iloc[i:window_end].copy()

    logits = process_batch(batch)
    logits_list.append(logits)


KeyboardInterrupt: 

In [None]:
# Stack the per-batch arrays into a single array, one row per example.
logits = np.concatenate(logits_list)

# Turn each row of logits into its predicted label.
classes = [get_class(row) for row in logits]

df["BertL-offense"] = classes
df.shape


### Checking results

In [None]:
# Keep only the rows whose predicted label equals True.
is_offensive = df["BertL-offense"] == True
offensive_df = df[is_offensive]
print(offensive_df.shape)
offensive_df.head()


In [None]:
# Text of every row flagged as offensive, as an array.
# NOTE(review): the pipeline section reads the "content" column — confirm the
# frame really has a "rawContent" column.
offensive_df.loc[:, "rawContent"].values


In [None]:
# Destination of the step-by-step classification output.
output_file = "erika_bert-large-portuguese-cased-hatebr_output.csv"
output_path = "../data/processed/"

# To save the data, uncomment the line below
# df.to_csv(f"{output_path}{output_file}", sep=";", encoding="utf-8", index=False)
