In [1]:
!pip install --upgrade pip
!pip install transformers
!pip install torch


Collecting pip
  Downloading pip-25.2-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.2-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.2


In [None]:
!pip install spacy torch transformers
#!python -m spacy download en_core_web_sm




In [11]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import re
import json

In [3]:
df = pd.read_csv(r'/content/df_all_sentiment_tensor.csv')

In [13]:
MODEL_NAME = 'j-hartmann/emotion-english-distilroberta-base'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

emotion_recognition = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=True)


Device set to use cuda:0


In [15]:
# List of coordinating and subordinating conjunctions
conjunctions = [
    "and", "but", "or", "so", "yet", "for", "nor",  # Coordinating
    "because", "although", "since", "if", "when", "while", "unless"  # Subordinating
]

# Regex to split by punctuation and conjunctions
def split_by_punctuation_and_conjunctions(text):
    # Add a space before and after conjunctions to make splitting easier
    for conj in conjunctions:
        text = re.sub(fr"\b{conj}\b", f" {conj} ", text)
    # Split by punctuation or conjunctions without capturing groups
    subsentences = re.split(r'[.,!?;]|\s+(?:and|but|or|so|yet|for|nor|because|although|since|if|when|while|unless)\s+', text)
    # Remove extra spaces and empty items
    return [s.strip() for s in subsentences if s and s.strip()]

# Function to recognize emotions in subsentences
def recognize_emotions(subsentences):
    emotions = []
    for subsentence in subsentences:
        emotion_scores = emotion_recognition(subsentence)
        # Get the emotion with the highest score
        dominant_emotion = max(emotion_scores[0], key=lambda x: x["score"])
        emotions.append({"subsentence": subsentence, "emotion": dominant_emotion["label"], "score": dominant_emotion["score"]})
    return emotions

# Main function
def process_tweet(tweet):
    # Step 1: Chunking by punctuation and conjunctions
    subsentences = split_by_punctuation_and_conjunctions(tweet)

    # Step 2: Emotion recognition
    emotions = recognize_emotions(subsentences)

    # Step 3: Structure the results
    return emotions

# Example tweet
tweet = "I love programming, but sometimes it gets frustrating. However, I always learn something new!"

# Process the tweet
results = process_tweet(tweet)

# Display results
print(json.dumps(results, indent=2))


[
  {
    "subsentence": "I love programming",
    "emotion": "joy",
    "score": 0.9864062666893005
  },
  {
    "subsentence": "sometimes it gets frustrating",
    "emotion": "sadness",
    "score": 0.5059757828712463
  },
  {
    "subsentence": "However",
    "emotion": "neutral",
    "score": 0.7755581736564636
  },
  {
    "subsentence": "I always learn something new",
    "emotion": "neutral",
    "score": 0.7816808223724365
  }
]


In [16]:
tweet = "It has been about two months since Hurricane Helene absolutely devastated vast stretches of North Carolina, Tennessee and other states, and many victims have still not gotten any help from the government at all."
results = process_tweet(tweet)
print(json.dumps(results, indent=2))

[
  {
    "subsentence": "It has been about two months",
    "emotion": "neutral",
    "score": 0.8602077960968018
  },
  {
    "subsentence": "Hurricane Helene absolutely devastated vast stretches of North Carolina",
    "emotion": "sadness",
    "score": 0.4616415202617645
  },
  {
    "subsentence": "Tennessee",
    "emotion": "neutral",
    "score": 0.8273680806159973
  },
  {
    "subsentence": "other states",
    "emotion": "neutral",
    "score": 0.9064851403236389
  },
  {
    "subsentence": "many victims have still not gotten any help from the government at all",
    "emotion": "sadness",
    "score": 0.7088685631752014
  }
]


In [18]:
# Keep only the rows where 'text_clean' is a string
df = df[df['text_clean'].apply(lambda x: isinstance(x, str))].reset_index(drop=True)


In [24]:
df_filtered = df.copy()

In [25]:
from transformers import AutoTokenizer

# Carrega o tokenizer do modelo
tokenizer = AutoTokenizer.from_pretrained("j-hartmann/emotion-english-distilroberta-base")

# Remove as linhas com valores nulos em 'text_clean'
df_filtered = df_filtered.dropna(subset=["text_clean"]).reset_index(drop=True)

# Conta o número de tokens de cada texto
df_filtered["num_tokens"] = df_filtered["text_clean"].apply(lambda x: len(tokenizer.tokenize(x)))

# Filtra o DataFrame para manter apenas os textos com até 512 tokens
df_filtered = df_filtered[df_filtered["num_tokens"] <= 512].reset_index(drop=True)

# Remove a coluna auxiliar
df_filtered.drop(columns=["num_tokens"], inplace=True)

# Mostra quantas linhas restaram
print(f"Número de linhas após filtragem: {df_filtered.shape[0]}")


Número de linhas após filtragem: 94596


In [26]:
df_filtered['classification'].value_counts()

Unnamed: 0_level_0,count
classification,Unnamed: 1_level_1
reliable news,49536
fake news,45060


In [None]:
# Removendo números da coluna 'texto'
df_filtered['text_clean'] = df_filtered['text_clean'].str.replace(r'\d+', '', regex=True)

In [27]:
df["emotion_recognition"] = df["text_clean"].apply(process_tweet)

In [28]:
from google.colab import files

# Salvar o DataFrame como um arquivo CSV
df_filtered.to_csv('df_all_emoticon_seq.csv', index=False)

# Baixar o arquivo
files.download('df_all_emoticon_seq.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>