In [5]:
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import seaborn as sns
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import torch
from transformers import pipeline
from transformers import BertTokenizer, BertForSequenceClassification

# Select the matplotlib base style, then configure the seaborn theme.
plt.style.use('ggplot')

# rc overrides handed to seaborn: switch off the right and top axes spines.
custom_params = dict.fromkeys(("axes.spines.right", "axes.spines.top"), False)
sns.set_theme(style="ticks", rc=custom_params)

# pd.set_option('display.max_colwidth', None)
# pd.set_option('display.max_rows', None)

import warnings
warnings.filterwarnings('ignore')

In [6]:
# Load the data: a semicolon-separated, UTF-8 encoded CSV of tweets.
file_path = "../data/processed/sp_elected_stdep_tweets.csv"
read_options = {"sep": ";", "encoding": "utf-8"}
df = pd.read_csv(file_path, **read_options)

# Quick sanity check: print (rows, columns), then preview the first rows.
print(df.shape)
df.head()

(46973, 21)


Unnamed: 0,url,date,content,user,reply_count,retweet_count,like_count,quote_count,in_reply_to_id,in_reply_to_user,...,conversation_user,ru_bert_base_label,ru_bert_base_score,ru_bert_large_label,ru_bert_large_score,ru_mdeberta_base_label,ru_mdeberta_base_score,ci_distilbert_base_label,ci_distilbert_base_score,label_sum
0,https://twitter.com/anaperugini/status/1575873...,2022-09-30 15:42:19+00:00,Na reta final nos encontramos novamente! @Hadd...,anaperugini,5,10,50,1,,,...,anaperugini,False,0.999979,False,0.999991,False,0.999988,False,0.947417,0
1,https://twitter.com/anaperugini/status/1575873...,2022-09-30 15:42:20+00:00,A caminhada será no centro de Hortolândia com ...,anaperugini,1,0,2,0,1.575874e+18,anaperugini,...,anaperugini,False,0.999958,False,0.999988,False,0.999986,False,0.98247,0
2,https://twitter.com/DeAquilini/status/15759437...,2022-09-30 20:20:32+00:00,@anaperugini vou votar em você pela causa do t...,DeAquilini,0,0,0,0,1.575874e+18,anaperugini,...,anaperugini,False,0.999974,False,0.999992,False,0.999993,False,0.979572,0
3,https://twitter.com/CliaMariaCardo1/status/157...,2022-09-30 15:53:12+00:00,@anaperugini @Haddad_Fernando @geraldoalckmin ...,CliaMariaCardo1,0,0,0,0,1.575874e+18,anaperugini,...,anaperugini,False,0.999922,False,0.999749,False,0.999989,False,0.985927,0
4,https://twitter.com/Veracbrgc/status/157588042...,2022-09-30 16:09:06+00:00,@anaperugini @fmvasques53 @Haddad_Fernando @ge...,Veracbrgc,0,0,0,0,1.575874e+18,anaperugini,...,anaperugini,False,0.999964,False,0.999987,False,0.999991,False,0.960931,0


In [7]:
# Directory where the fine-tuned model was saved
model_path = "./fine_tuned_model"

# Load the tokenizer and the model from that directory
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)

# Pick the device: GPU when CUDA is available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# The pipeline takes an integer device: -1 means CPU, a non-negative value is
# the CUDA device ordinal. torch.cuda.current_device() must only be queried
# when CUDA was actually selected; calling it unconditionally breaks this cell
# on CPU-only machines despite the fallback computed above.
pipeline_device = torch.cuda.current_device() if device.type == "cuda" else -1

# Build the custom sentiment-analysis pipeline
# NOTE: the pipeline can also be instantiated without the tokenizer and device
classifier = pipeline(
    task="sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device=pipeline_device,
)

In [8]:
%%time
# Classificando tweets com modelo refinado
col_name = 'content'
results = df[col_name].apply(lambda x: classifier(x))

CPU times: user 5min 12s, sys: 36.9 s, total: 5min 49s
Wall time: 5min 53s


In [None]:
# Split the predicted label and its score into separate columns.
# Each entry of `results` is indexed at [0] to get the prediction dict.
top_predictions = [prediction[0] for prediction in results]
df['ft_rc_bert_base_label'] = [top['label'] for top in top_predictions]
df['ft_rc_bert_base_score'] = [top['score'] for top in top_predictions]

## Exportando dados

In [None]:
output_path = "../data/processed/"
# Plain string literal: there is nothing to interpolate, so no f-prefix.
output_file = "sp_elected_stdep_tweets_all_time-ft_hf_classified.csv"

# Save the classified tweets as a semicolon-separated UTF-8 CSV, leaving out
# the DataFrame index. Comment the line below out to skip writing the file.
df.to_csv(f"{output_path}{output_file}", sep=";", encoding="utf-8", index=False)