In [1]:
import pandas as pd
from transformers import pipeline, AutoTokenizer, BertForSequenceClassification
import numpy as np

In [2]:
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,sentence,year,link,source
0,Veja o exemplo das urnas eletrônicas,2023,https://www1.folha.uol.com.br/mercado/2023/04/...,Folha de São Paulo
1,"Pela segunda vez na história, os acadêmicos v...",2023,https://www1.folha.uol.com.br/ilustrada/2023/0...,Folha de São Paulo
2,Conforme antecipado por sua defesa nesta quar...,2023,https://www1.folha.uol.com.br/poder/2023/04/bo...,Folha de São Paulo
3,"Bolsonaro disse ter postado por engano, sob e...",2023,https://www1.folha.uol.com.br/opiniao/2023/04/...,Folha de São Paulo
4,"Não apagará o fato, no entanto, de que o punh...",2023,https://www1.folha.uol.com.br/opiniao/2023/04/...,Folha de São Paulo


In [3]:
model_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
sentiment_task = pipeline("sentiment-analysis", model=model_path, tokenizer=model_path)

def sentiment_cardiff(sentence):
    try:
        sentiment = sentiment_task(sentence)[0]['label']
    except RuntimeError:
        sentiment = None
    return sentiment

    PyTorch 2.0.1+cu118 with CUDA 1108 (you have 2.0.1+cpu)
    Python  3.10.11 (you have 3.10.9)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


In [4]:
distilled_student_sentiment_classifier = pipeline(
    model="lxyuan/distilbert-base-multilingual-cased-sentiments-student", 
    top_k=None
)

def sentiment_distilbert(sentence):
    try:
        sentiment = distilled_student_sentiment_classifier(sentence)[0][0]['label']
    except RuntimeError:
        sentiment = None
    return sentiment

In [5]:
pred_mapper = {
    0: "positive",
    1: "negative",
    2: "neutral"
  }

tokenizer = AutoTokenizer.from_pretrained("lucas-leme/FinBERT-PT-BR")
finbertptbr = BertForSequenceClassification.from_pretrained("lucas-leme/FinBERT-PT-BR")

def sentiment_finbert(sentence):
    try:
        tokens = tokenizer(sentence, return_tensors="pt",
                    padding=True, truncation=True, max_length=512)
        finbertptbr_outputs = finbertptbr(**tokens)
        preds = [pred_mapper[np.argmax(pred)] for pred in finbertptbr_outputs.logits.cpu().detach().numpy()]
        sentiment = preds[0]
    except RuntimeError:
        sentiment = None
    return sentiment

In [6]:
model_path = "citizenlab/twitter-xlm-roberta-base-sentiment-finetunned"

sentiment_classifier = pipeline("text-classification", model=model_path, tokenizer=model_path)

def sentiment_citizenlab(sentence):
    try:
        sentiment = sentiment_classifier(sentence)[0]['label']
    except RuntimeError:
        sentiment = None
    return sentiment

In [7]:
df['cardiff'] = df['sentence'].apply(sentiment_cardiff)
df['distilbert'] = df['sentence'].apply(sentiment_distilbert)
df['finbert'] = df['sentence'].apply(sentiment_finbert)
df['citizenlab'] = df['sentence'].apply(sentiment_citizenlab)

Token indices sequence length is longer than the specified maximum sequence length for this model (4554 > 512). Running this sequence through the model will result in indexing errors


In [8]:
df.to_csv('sentiments.csv', index = False)