In [1]:
from datasets import Dataset
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, pipeline
from transformers.pipelines.pt_utils import KeyDataset
from scipy.special import softmax

pd.options.display.max_colwidth = 300

In [35]:
#file_path = "../data/processed/sp_elected_state_deputies_tweets.csv"
file_path = "../data/processed/classified.csv"
raw_df = pd.read_csv(file_path, on_bad_lines="skip", sep=";", encoding="utf-8")

duplicated_indexes = raw_df[raw_df.duplicated()].index
df = raw_df.drop(duplicated_indexes)

print(f'{raw_df.shape=}')
print(f'{df.shape=}')

raw_df.shape=(102558, 18)
df.shape=(102558, 18)


# Pipeline classification

In [36]:
OFFENSE_MODELS = {
    "rc_bert_base": "ruanchaves/bert-base-portuguese-cased-hatebr",
    "rc_mdeberta_base": "ruanchaves/mdeberta-v3-base-hatebr",
    "cl_distilbert_base": "citizenlab/distilbert-base-multilingual-cased-toxicity",
}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

Device: cuda


In [37]:
indexes = df.content.str.len().sort_values().index
tweets = df.reindex(indexes)
tweets = tweets.reset_index(drop=True)

In [None]:
%%time
col_name = "content"
batch_size = 64
num_batches = len(tweets) // batch_size + 1
results = {}

for model_key, model_name in OFFENSE_MODELS.items():
    if device.type == "cuda":
        classifier = pipeline("sentiment-analysis", model=model_name, device=0)
    else:
        classifier = pipeline("sentiment-analysis", model=model_name)

    results[model_key] = []
    for i in tqdm(range(num_batches)):
       batch_start = i * batch_size
       batch_end = min((i + 1) * batch_size, len(tweets))
       batch_texts = tweets[col_name][batch_start:batch_end].tolist()
       batch_results = classifier(batch_texts)
       results[model_key] += batch_results

In [33]:
# batch_size = 32
# 100%|██████████████████████████████████████████████████████████████████████████████████| 3205/3205 [12:29<00:00,  4.28it/s]
# 100%|████████████████████████████████████████████████████████████████████████████████████████| 3205/3205 [29:32<00:00,  1.81it/s]
# 100%|████████████████████████████████████████████████████████████████████████████████████████| 3205/3205 [08:17<00:00,  6.44it/s]
# CPU times: user 44min 22s, sys: 6min 8s, total: 50min 30s
# Wall time: 50min 30s

In [9]:
for key in OFFENSE_MODELS.keys():
    print(key)
    df[f'{key}_label'] = [result["label"] for result in results[key]]
    df[f'{key}_score'] = [result["score"] for result in results[key]]

df.info()

rc_bert_base
rc_mdeberta_base
cl_distilbert_base
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102558 entries, 0 to 102557
Data columns (total 18 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   url                       102558 non-null  object 
 1   date                      102558 non-null  object 
 2   content                   102558 non-null  object 
 3   user                      102558 non-null  object 
 4   reply_count               102558 non-null  int64  
 5   retweet_count             102558 non-null  int64  
 6   like_count                102558 non-null  int64  
 7   quote_count               102558 non-null  int64  
 8   in_reply_to_id            94389 non-null   float64
 9   in_reply_to_user          94122 non-null   object 
 10  conversation_id           102558 non-null  object 
 11  conversation_user         102558 non-null  object 
 12  rc_bert_base_label        102558 non-null  bool   


In [15]:
df.cl_distilbert_base_label.value_counts()

not_toxic    98394
toxic         4164
Name: cl_distilbert_base_label, dtype: int64

In [None]:
# Convertendo labels do distilbert para booleano
def get_distilbert_label(label):
    if label == "toxic":
        return True
    return False
    
df['cl_distilbert_base_label'] = df.cl_distilbert_base_label.apply(get_distilbert_label)

In [21]:
def get_offense_label_sum(row):
    count = 0
    if row['rc_bert_base_label'] == True:
        count += 1
    if row['rc_mdeberta_base_label'] == True:
        count += 1
    if row['cl_distilbert_base_label'] == True:
        count += 1
    return count

df['label_sum'] = df.apply(get_offense_label_sum, axis=1)
df['label_sum'].value_counts()

0    69168
2    17977
1    13407
3     2006
Name: label_sum, dtype: int64

# Viewing data

In [23]:
# df[['content', 'rc_bert_base_label', 'rc_mdeberta_base_label', 'cl_distilbert_base_label']].value_counts()
df[['rc_bert_base_label', 'rc_mdeberta_base_label', 'cl_distilbert_base_label']].value_counts()

rc_bert_base_label  rc_mdeberta_base_label  cl_distilbert_base_label
False               False                   False                       69168
True                True                    False                       17416
False               True                    False                        7253
True                False                   False                        4557
                    True                    True                         2006
False               False                   True                         1597
                    True                    True                          378
True                False                   True                          183
dtype: int64

In [24]:
df[['rc_bert_base_label', 'rc_mdeberta_base_label', 'cl_distilbert_base_label']].describe()

Unnamed: 0,rc_bert_base_label,rc_mdeberta_base_label,cl_distilbert_base_label
count,102558,102558,102558
unique,2,2,2
top,False,False,False
freq,78396,75505,98394


In [32]:
df[df["label_sum"] == 3][['content', 'rc_bert_base_label', 'rc_mdeberta_base_label', 'cl_distilbert_base_label']]

Unnamed: 0,content,rc_bert_base_label,rc_mdeberta_base_label,cl_distilbert_base_label
391,"@augustodeAB Obrigada, amigo!",True,True,True
827,@andreawerner_ Ainda não sei.,True,True,True
1977,@andreawerner_ Sergio Moro bandido vagabundo,True,True,True
2936,@andreawerner_ Eita p@rra!\nSó faltava!,True,True,True
3463,@andreawerner_ @BaixadorDeVideo,True,True,True
...,...,...,...,...
102462,"A Defesa Civil está realizando seu trabalho e Governo do Estado está dando total assistência às prefeituras para atender aos cidadãos, cuidando da população, contem sempre comigo!",True,True,True
102471,Catracas do metro de São Paulo vão aceitar cartão de crédito e débito com NFC!\n\nProjeto-piloto da Autopass e da Secretaria de Transportes Metropolitanos vai colocar máquinas com função contactless no embarque.,True,True,True
102474,"@xerifedoconsum por favor deputado ,averigue a situação da aprovação que o STF ,aprovou de que os inadimplentes com dívidas iram fica sem sua CNH e passaporte , por favor https://t.co/bZiuNlwwFq",True,True,True
102479,"O instituo Adolfo Lutz, juntamente com o Departamento de Nutrição da Faculdade de Saúde Pública da Universidade de São Paulo.",True,True,True


# Exporting data

In [25]:
output_path = "../data/processed/"
output_file = f"classified-tweets.csv"

# Para salvar os dados, descomente as linhas abaixo
df.to_csv(f"{output_path}{output_file}", sep=";", encoding="utf-8", index=False)

# Step by step classification

In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name)


# Classification head function
def get_class(logit):
    scores = softmax(logit)
    label = config.id2label[np.argmax(scores)]
    # print(f"get_class(): {logit=}, {scores=}, {label=}")
    return label


In [11]:
def process_batch_gpu(batch):
    model_input = tokenizer(
        *(list(batch["rawContent"]),), padding=True, return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        outputs = model(**model_input)
        logits = outputs.logits.detach().cpu().numpy()
    return logits


def process_batch_cpu(batch):
    model_input = tokenizer(
        *(list(batch["rawContent"]),), padding=True, return_tensors="pt"
    )

    with torch.no_grad():
        outputs = model(**model_input)
        logits = outputs.logits.detach().numpy()
    return logits


In [12]:
if device.type == "cuda":  # GPU
    model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
    process_batch = process_batch_gpu

else:  # CPU
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    process_batch = process_batch_cpu


In [None]:
BATCH_SIZE = 50
logits_list = []

# for i in tqdm(range(0, len(df), BATCH_SIZE)):
for i in range(0, len(df), BATCH_SIZE):
    if i + BATCH_SIZE < len(df):
        batch = df.iloc[i : i + BATCH_SIZE].copy()
    else:
        batch = df.iloc[i : len(df)].copy()

    logits = process_batch(batch)
    logits_list.append(logits)


In [None]:
logits = np.concatenate(logits_list)
logits.shape

classes = []
for logit in logits:
    classes.append(get_class(logit))

df["BertL-offense"] = classes
df.shape


### Checking results

In [None]:
offensive_df = df[df["BertL-offense"] == True]
print(offensive_df.shape)
offensive_df.head()


In [None]:
offensive_df["rawContent"].values
