In [1]:
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import torch
from transformers import pipeline
from transformers import BertTokenizer, BertForSequenceClassification

# Plot styling: load matplotlib's "ggplot" style sheet, then apply seaborn's
# "ticks" theme with rc overrides that disable the right and top axes spines.
plt.style.use("ggplot")
custom_params = dict.fromkeys(("axes.spines.right", "axes.spines.top"), False)
sns.set_theme(style="ticks", rc=custom_params)

# Optional pandas display settings (uncomment to show full column contents
# and every row when rendering DataFrames).
# pd.set_option('display.max_colwidth', None)
# pd.set_option('display.max_rows', None)

import warnings

# Suppress all warnings for the remainder of the session.
warnings.filterwarnings("ignore")

In [2]:
# Load the data.
# Alternative CSV source, kept for reference:
# file_path = "../data/processed/sp_elected_stdep_tweets.csv"
# df = pd.read_csv(file_path, sep=";", encoding="utf-8")

# The name and format are kept as separate variables because the export step
# reuses them to derive the output file name.
file_name, file_format = "sp_elected_feddep_tweets-hf_classified", "parquet"
file_path = "../data/processed/" + file_name + "." + file_format
df = pd.read_parquet(file_path)

# Quick sanity check: dimensions followed by the first rows.
print(df.shape)
df.head()

(78193, 20)


Unnamed: 0,id,date,user,content,in_reply_to_id,in_reply_to_user,conversation_id,conversation_user,reply_count,retweet_count,like_count,quote_count,view_count,rc_bert_base_label,rc_bert_base_score,rc_mdeberta_base_label,rc_mdeberta_base_score,cl_distilbert_base_label,cl_distilbert_base_score,label_sum
220,1587223793109504000,2022-10-31 23:23:36+00:00,adriventurasp,"O presidente precisa se pronunciar logo, aceit...",,,1587223793109504000,adriventurasp,309,22,444,5,,False,0.814559,False,0.999961,False,0.952127,0
221,1587106021759139840,2022-10-31 15:35:37+00:00,adriventurasp,Desejo que o presidente eleito nos surpreenda ...,,,1587106021759139840,adriventurasp,57,11,136,1,,False,0.814559,False,0.999972,False,0.952127,0
222,1585424916853264389,2022-10-27 00:15:30+00:00,adriventurasp,"Além disso, as emendas de relator, chamadas de...",1.585424914886316e+18,adriventurasp,1585424914886316032,adriventurasp,1,1,26,0,,False,0.814559,False,0.999975,False,0.952127,0
223,1585424914886316032,2022-10-27 00:15:30+00:00,adriventurasp,"As emendas de relator, ou Orçamento Secreto, d...",,,1585424914886316032,adriventurasp,5,5,41,1,,False,0.814559,False,0.99996,False,0.952127,0
224,1585345930450329600,2022-10-26 19:01:38+00:00,adriventurasp,O Supremo respaldou ontem a decisão do TSE que...,,,1585345930450329600,adriventurasp,0,8,56,0,,False,0.814559,False,0.999961,False,0.952127,0


In [8]:
# Directory where the fine-tuned model was saved
model_path = "fine_tuned_model"

# Load the tokenizer and the model from that directory
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)

# Pick the device: GPU when CUDA is available, otherwise CPU
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
model.to(device)

# The pipeline takes an integer device index: the CUDA ordinal, or -1 for CPU.
# torch.cuda.current_device() must only be called when CUDA is available;
# calling it unconditionally raises on CPU-only machines.
pipeline_device = torch.cuda.current_device() if use_cuda else -1

# Build the custom sentiment-analysis pipeline
# NOTE: the pipeline can also be instantiated without the tokenizer and device
classifier = pipeline(
    task="sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device=pipeline_device,
)
print(classifier)

<transformers.pipelines.text_classification.TextClassificationPipeline object at 0x7fe5a15db640>


In [9]:
%%time
# Classificando tweets com modelo refinado
col_name = 'content'
results = df[col_name].apply(lambda x: classifier(x))

CPU times: user 8min 54s, sys: 1min 4s, total: 9min 58s
Wall time: 9min 59s


In [10]:
# Split label and score: take the first prediction of every result once,
# then store its two fields in separate columns.
first_predictions = [prediction[0] for prediction in results]
df["ft_rc_bert_base_label"] = [top["label"] for top in first_predictions]
df["ft_rc_bert_base_score"] = [top["score"] for top in first_predictions]

## Exportando dados

In [13]:
# Build the output path: keep the part of the input name before the first "-"
# and append a suffix identifying the fine-tuned classification.
base_file_name = file_name.split('-')[0]
output_path = "../data/processed/"
output_suffix = "ft_hf_classified"
output_file = f"{output_path}{base_file_name}-{output_suffix}.{file_format}"

print(output_file)

# Write the DataFrame using the configured format. An unknown format is
# reported explicitly instead of silently writing nothing.
if file_format == 'csv':
    df.to_csv(output_file, sep=";", encoding="utf-8", index=False)
elif file_format == 'parquet':
    df.to_parquet(output_file)
else:
    raise ValueError(f"Unsupported file_format: {file_format!r}")

../data/processed/sp_elected_feddep_tweets-ft_hf_classified.parquet
