In [1]:
from datasets import Dataset
from datetime import datetime
import numpy as np
import pandas as pd
import pytz
import torch
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, pipeline
from transformers.pipelines.pt_utils import KeyDataset
from scipy.special import softmax

pd.options.display.max_colwidth = 300

In [2]:
# file_path = "../data/processed/sp_elected_stdep_tweets.csv"
# raw_df = pd.read_csv(file_path, on_bad_lines="skip", sep=";", encoding="utf-8")

file_name = "sp_elected_feddep_tweets"
file_format = "parquet"
file_path = f"../data/processed/{file_name}.{file_format}"
raw_df = pd.read_parquet(file_path)

print(f'{raw_df.shape=}')
raw_df.info()

raw_df.shape=(215622, 13)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215622 entries, 0 to 215621
Data columns (total 13 columns):
 #   Column             Non-Null Count   Dtype              
---  ------             --------------   -----              
 0   id                 215622 non-null  object             
 1   date               215622 non-null  datetime64[ns, UTC]
 2   user               215622 non-null  object             
 3   content            215622 non-null  object             
 4   in_reply_to_id     207130 non-null  object             
 5   in_reply_to_user   206356 non-null  object             
 6   conversation_id    215622 non-null  object             
 7   conversation_user  215622 non-null  object             
 8   reply_count        215622 non-null  int64              
 9   retweet_count      215622 non-null  int64              
 10  like_count         215622 non-null  int64              
 11  quote_count        215622 non-null  int64              
 12  view

In [3]:
# Criando cópia antes de aplicar os filtros
df = raw_df

In [4]:
# Filtrando somente os tweets do período desejado
local_tz = pytz.timezone("America/Sao_Paulo")
since = datetime(year=2022, month=9, day=1, tzinfo=local_tz)
until = datetime(year=2022, month=11, day=1, tzinfo=local_tz)

df = df[
    (df['date'] >= since ) &
    (df['date'] <= until )
]
df.shape

(78226, 13)

In [5]:
# Removendo tweets com informação faltando de reply
# Se 'user' é diferente de 'conversation_user', então o tweet deveria ter também 'in_reply_to_user'
# Se não tiver, é por que o tweet respondido foi apagado, ou houve algum erro durante o scraping
df = df[
    ~((df['in_reply_to_user'].isnull()) &
    (df['user'] != df['conversation_user']))
]
df.shape

(78193, 13)

In [6]:
# Avaliando a distribuição dos tweets entre os candidatos
print(f"df['conversation_user'] mais comuns: \n{(df['conversation_user'].value_counts(normalize=True)[:5])}\n")
print(f"df['user'] mais comuns: \n{(df['user'].value_counts(normalize=True)[:5])}\n")

df['conversation_user'] mais comuns: 
samiabomfim        0.504163
rsallesmma         0.197076
luizaerundina      0.107554
marcofeliciano     0.085992
pauloteixeira13    0.072027
Name: conversation_user, dtype: float64

df['user'] mais comuns: 
samiabomfim        0.010500
rosangelamorosp    0.008505
julianapt          0.005064
luizaerundina      0.003658
felipebecari       0.003415
Name: user, dtype: float64



# Pipeline classification

In [7]:
OFFENSE_MODELS = {
    "rc_bert_base": "ruanchaves/bert-base-portuguese-cased-hatebr",
    "rc_mdeberta_base": "ruanchaves/mdeberta-v3-base-hatebr",
    "cl_distilbert_base": "citizenlab/distilbert-base-multilingual-cased-toxicity",
}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

Device: cuda


In [8]:
indexes = df.content.str.len().sort_values().index
tweets = df.reindex(indexes)
tweets = tweets.reset_index(drop=True)

In [9]:
col_name = "content"
batch_size = 32
num_batches = len(tweets) // batch_size + 1
results = {}

def classify(model_name, tweets, col_name="content", batch_size=32):
    num_batches = len(tweets) // batch_size + 1

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if device.type == "cuda":
        classifier = pipeline("sentiment-analysis", model=model_name, device=0)
    else:
        classifier = pipeline("sentiment-analysis", model=model_name)
    
    results = []
    for i in tqdm(range(num_batches)):
       batch_start = i * batch_size
       batch_end = min((i + 1) * batch_size, len(tweets))
       batch_texts = tweets[col_name][batch_start:batch_end].tolist()
       batch_results = classifier(batch_texts)
       results += batch_results
    return results

# for model_key in OFFENSE_MODELS.keys():
#     results[model_key] = classify(OFFENSE_MODELS[model_key], tweets)

In [10]:
%%time
model_key = 'rc_bert_base'
results[model_key] = classify(OFFENSE_MODELS[model_key], tweets)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2444/2444 [11:44<00:00,  3.47it/s]

CPU times: user 10min 26s, sys: 1min 20s, total: 11min 46s
Wall time: 11min 47s





In [11]:
%%time
model_key = 'rc_mdeberta_base'
results[model_key] = classify(OFFENSE_MODELS[model_key], tweets)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2444/2444 [21:38<00:00,  1.88it/s]

CPU times: user 19min 3s, sys: 2min 38s, total: 21min 42s
Wall time: 21min 41s





In [18]:
%%time
model_key = 'cl_distilbert_base'
results[model_key] = classify(OFFENSE_MODELS[model_key], tweets)

100%|█████████████████████████████████████████████████████████████████████████████| 2444/2444 [07:01<00:00,  5.79it/s]

CPU times: user 6min 15s, sys: 49.6 s, total: 7min 4s
Wall time: 7min 5s





In [19]:
# for key in OFFENSE_MODELS.keys():
for key in results.keys():
    print(key)
    df[f'{key}_label'] = [result["label"] for result in results[key]]
    df[f'{key}_score'] = [result["score"] for result in results[key]]

df.info()

rc_bert_base
rc_mdeberta_base
cl_distilbert_base
<class 'pandas.core.frame.DataFrame'>
Int64Index: 78193 entries, 220 to 215619
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype              
---  ------                    --------------  -----              
 0   id                        78193 non-null  object             
 1   date                      78193 non-null  datetime64[ns, UTC]
 2   user                      78193 non-null  object             
 3   content                   78193 non-null  object             
 4   in_reply_to_id            75049 non-null  object             
 5   in_reply_to_user          74540 non-null  object             
 6   conversation_id           78193 non-null  object             
 7   conversation_user         78193 non-null  object             
 8   reply_count               78193 non-null  int64              
 9   retweet_count             78193 non-null  int64              
 10  like_count                7819

In [20]:
df.cl_distilbert_base_label.value_counts()

not_toxic    74638
toxic         3555
Name: cl_distilbert_base_label, dtype: int64

In [21]:
# Convertendo labels do distilbert para booleano
def get_distilbert_label(label):
    if label == "toxic":
        return True
    return False
    
df['cl_distilbert_base_label'] = df.cl_distilbert_base_label.apply(get_distilbert_label)

In [22]:
def get_offense_label_sum(row):
    count = 0
    if row['rc_bert_base_label'] == True:
        count += 1
    if row['rc_mdeberta_base_label'] == True:
        count += 1
    if row['cl_distilbert_base_label'] == True:
        count += 1
    return count

df['label_sum'] = df.apply(get_offense_label_sum, axis=1)
df['label_sum'].value_counts()

0    39119
1    20390
2    16737
3     1947
Name: label_sum, dtype: int64

# Viewing data

In [23]:
# df[['content', 'rc_bert_base_label', 'rc_mdeberta_base_label', 'cl_distilbert_base_label']].value_counts()
df[['rc_bert_base_label', 'rc_mdeberta_base_label', 'cl_distilbert_base_label']].value_counts()

rc_bert_base_label  rc_mdeberta_base_label  cl_distilbert_base_label
False               False                   False                       39119
                    True                    False                       16143
True                True                    False                       16128
                    False                   False                        3248
                    True                    True                         1947
False               False                   True                          999
                    True                    True                          442
True                False                   True                          167
dtype: int64

In [24]:
df[['rc_bert_base_label', 'rc_mdeberta_base_label', 'cl_distilbert_base_label']].describe()

Unnamed: 0,rc_bert_base_label,rc_mdeberta_base_label,cl_distilbert_base_label
count,78193,78193,78193
unique,2,2,2
top,False,False,False
freq,56703,43533,74638


In [25]:
df[df["label_sum"] == 3][['content', 'rc_bert_base_label', 'rc_mdeberta_base_label', 'cl_distilbert_base_label']]

Unnamed: 0,content,rc_bert_base_label,rc_mdeberta_base_label,cl_distilbert_base_label
667,VEJA COMO A CORRUPÇÃO PETISTA DESTRUIU A SAÚDE E A EDUCAÇÃO! https://t.co/BKpepJjLGV,True,True,True
4329,"Chegamos pra plenária de arrancada da vitória, com @LulaOficial presidente e @Haddad_Fernando governador ⭐️🚩 https://t.co/1XzCPKHXM3",True,True,True
13742,@luizaerundina Tu e mentirosa,True,True,True
14182,"@luizaerundina Bem esculachado e Bolsonaro será ex presidente sim, em 2027",True,True,True
14325,@luizaerundina https://t.co/83MUXC2LDv,True,True,True
...,...,...,...,...
209671,@samiabomfim Você ficou estarrecida com a igreja do Chile quando pegou fogo? Ou é por que um institudo têm mais facilidade de extorquir dinheiro do orçamento? que é diferente de pedir (dismo). TODO INSTRUMENTO DE DEPREDAÇÃO DA RENDA DO TRABALHADOR É O QUE VOCÊ DEFENDE. Vai um boleto aí?,True,True,True
209674,@samiabomfim ETA pessoa ignorante .,True,True,True
209675,@samiabomfim Fake,True,True,True
209676,@samiabomfim O direito já foi evento de golpe militar https://t.co/OZKAiSXBp8,True,True,True


# Exporting data

In [26]:
output_path = "../data/processed/"
output_suffix = "hf_classified"
output_file = f"{output_path}{file_name}-hf_classified.{file_format}"

print(output_file)

# Para salvar os dados, descomente as linhas abaixo
if file_format == 'csv':
    df.to_csv(output_file, sep=";", encoding="utf-8", index=False)

if file_format == 'parquet':
    df.to_parquet(output_file)
    

../data/processed/sp_elected_feddep_tweets-hf_classified.parquet


# Step by step classification

Código experimental abaixo, ainda precisa de mais testes e refinamento

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name)


# Classification head function
def get_class(logit):
    scores = softmax(logit)
    label = config.id2label[np.argmax(scores)]
    # print(f"get_class(): {logit=}, {scores=}, {label=}")
    return label


In [None]:
def process_batch_gpu(batch):
    model_input = tokenizer(
        *(list(batch["rawContent"]),), padding=True, return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        outputs = model(**model_input)
        logits = outputs.logits.detach().cpu().numpy()
    return logits


def process_batch_cpu(batch):
    model_input = tokenizer(
        *(list(batch["rawContent"]),), padding=True, return_tensors="pt"
    )

    with torch.no_grad():
        outputs = model(**model_input)
        logits = outputs.logits.detach().numpy()
    return logits


In [None]:
if device.type == "cuda":  # GPU
    model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
    process_batch = process_batch_gpu

else:  # CPU
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    process_batch = process_batch_cpu


In [None]:
BATCH_SIZE = 50
logits_list = []

# for i in tqdm(range(0, len(df), BATCH_SIZE)):
for i in range(0, len(df), BATCH_SIZE):
    if i + BATCH_SIZE < len(df):
        batch = df.iloc[i : i + BATCH_SIZE].copy()
    else:
        batch = df.iloc[i : len(df)].copy()

    logits = process_batch(batch)
    logits_list.append(logits)


In [None]:
logits = np.concatenate(logits_list)
logits.shape

classes = []
for logit in logits:
    classes.append(get_class(logit))

df["BertL-offense"] = classes
df.shape


### Checking results

In [None]:
offensive_df = df[df["BertL-offense"] == True]
print(offensive_df.shape)
offensive_df.head()


In [None]:
offensive_df["rawContent"].values
