## Imports

In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import transformers
from transformers import pipeline

In [36]:
import nltk
from nltk.corpus import stopwords
# Baixar as stop words do nltk
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vieir\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [37]:
import demoji
import emoji
demoji.download_codes()

  demoji.download_codes()


In [38]:
pd.set_option('display.max_columns', None)  # Para mostrar todas as colunas
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)  # Para permitir que as colunas sejam exibidas sem corte

In [39]:
df_fake = pd.read_csv(r'C:\Users\vieir\Mestrado\TESE\tweets\analises\df_fake_news_2.csv')


In [40]:
df_fake.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50975 entries, 0 to 50974
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   tweet_id           50975 non-null  object 
 1   text               50975 non-null  object 
 2   language           50966 non-null  object 
 3   type               50975 non-null  object 
 4   bookmark_count     50966 non-null  float64
 5   favorite_count     50966 non-null  float64
 6   retweet_count      50966 non-null  float64
 7   reply_count        50966 non-null  float64
 8   view_count         45577 non-null  float64
 9   created_at         50975 non-null  object 
 10  client             50966 non-null  object 
 11  hashtags           6081 non-null   object 
 12  urls               22380 non-null  object 
 13  media_type         21738 non-null  object 
 14  media_urls         21738 non-null  object 
 15  user_id            50975 non-null  object 
 16  name               509

## Functions

In [41]:
def emoji_count(tweet):
    tweet = emoji.demojize(tweet, delimiters=('__','__'))
    pattern = r'_+[a-z_&]+_+'
    return len(re.findall(pattern, tweet))

In [42]:
def clean_tweet(tweet):
    '''
    Utility function to clean tweet text by removing links and special characters
    (except punctuation, apostrophes, and monetary symbols) using simple regex statements.
    '''
    return ' '.join(re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t\U0001F600-\U0001F64F.,!?':;’$€£])|(\w+:\/\/\S+)", " ", tweet).split())


In [43]:
def clean_text(text):
    # Remover stop words
    text = ' '.join([word for word in text.split() if word.lower() not in stop_words])
    return text

In [44]:
def classify_sentiment_textblob(text):
    blob = TextBlob(text)
    polarity = blob.sentiment.polarity
    if polarity > 0:
        return 'positive'
    elif polarity < 0:
        return 'negative'
    else:
        return 'neutral'

In [45]:
def classify_sentiment_vader(polarity_scores):
    '''
    Classify sentiment based on the compound score from vaderSentiment polarity scores.
    '''
    compound = polarity_scores['compound']
    if compound >= 0.05:
        return 'positive'
    elif compound <= -0.05:
        return 'negative'
    else:
        return 'neutral'

## Pipeline test

### Text Preprocessing

In [46]:
df_fake['emoji_count'] = df_fake['text'].apply(emoji_count)

In [47]:
df_fake[df_fake['emoji_count']!=0].head(2)

Unnamed: 0,tweet_id,text,language,type,bookmark_count,favorite_count,retweet_count,reply_count,view_count,created_at,client,hashtags,urls,media_type,media_urls,user_id,name,n tweets,n followers,category,Factual Reporting,user,emoji_count
168,'1852736426816794736',"🔥 Live Now on CIVL: Art of Liberty Foundation’s FREE Livestream 🔥\nGet ready to experience a seismic shift in how we view society, governance, and personal freedom. Thanks to a powerful sponsorship, this revolutionary livestream is free!\nTune in: https://t.co/iJCscu5zJT https://t.co/atdeGa9Oer",en,Tweet,0.0,0.0,0.0,0.0,215.0,2024-11-02 15:36:00,"<a href=""https://mobile.twitter.com"" rel=""nofollow"">Twitter Web App</a>",,https://watch.civl.com/programs/liberty-on-the-rocks-2024-digital-pass,photo,https://pbs.twimg.com/media/GbY90Rsa8AArRjk.jpg,ActivistPost,Activist Post,1000.0,41070.0,Pseudoscience,low,ActivistPost,2
177,'1852114998023057555',"👀 Ready to see what true liberty looks like? The Art of Liberty Foundation’s livestream is live today, free on CiVL! Experience game-changing ideas on society, governance, and freedom—no charge, no filters. \n\n🌐 Tap in: \n\nhttps://t.co/iJCscu5zJT https://t.co/PYN81JyTBW",en,Tweet,0.0,1.0,1.0,0.0,201.0,2024-10-31 22:26:40,"<a href=""https://mobile.twitter.com"" rel=""nofollow"">Twitter Web App</a>",,https://watch.civl.com/programs/liberty-on-the-rocks-2024-digital-pass,photo,https://pbs.twimg.com/media/GbQIxEvXgAAavLl.jpg,ActivistPost,Activist Post,1000.0,41070.0,Pseudoscience,low,ActivistPost,2


In [48]:
count_zero = (df_fake['emoji_count'] == 0).sum()
count_greater_than_zero = (df_fake['emoji_count'] > 0).sum()

summary = pd.DataFrame({
    'Condition': ['emoji_count == 0', 'emoji_count > 0'],
    'Count': [count_zero, count_greater_than_zero]
})

summary

Unnamed: 0,Condition,Count
0,emoji_count == 0,44247
1,emoji_count > 0,6728


In [49]:
df_clean = df_fake.copy()
df_clean['text'] = df_clean['text'].apply(clean_tweet)
df_clean.head(2)

Unnamed: 0,tweet_id,text,language,type,bookmark_count,favorite_count,retweet_count,reply_count,view_count,created_at,client,hashtags,urls,media_type,media_urls,user_id,name,n tweets,n followers,category,Factual Reporting,user,emoji_count
0,'1861174452215529687',How do you express gratitude for one’s safety when the perils posed by the American police state grow more treacherous by the day?,en,Tweet,1.0,3.0,2.0,0.0,123.0,2024-11-25 22:25:43,"<a href=""https://mobile.twitter.com"" rel=""nofollow"">Twitter Web App</a>",,https://activistpost.com/2024/11/divide-and-conquer-political-riptides-threaten-to-overwhelm-the-nation.html,,,ActivistPost,Activist Post,1000.0,41070.0,Pseudoscience,low,ActivistPost,0
1,'1861172568218071498',We generally think of gold as a safe haven asset. But what about silver?,en,Tweet,1.0,0.0,0.0,0.0,125.0,2024-11-25 22:18:13,"<a href=""https://mobile.twitter.com"" rel=""nofollow"">Twitter Web App</a>",,https://activistpost.com/2024/11/is-silver-a-safe-haven-asset.html,,,ActivistPost,Activist Post,1000.0,41070.0,Pseudoscience,low,ActivistPost,0


In [50]:
df_clean['text_clean'] = df_clean['text'].apply(clean_text)

In [51]:
df_clean['emoji_count'] = df_clean['text'].apply(emoji_count)

### Sentiment Classification

#### using textblob

In [52]:
df_clean['sentiment_textblob'] = df_clean['text'].apply(lambda x: classify_sentiment_textblob(x))

#### using vader

In [53]:
sentiment = SentimentIntensityAnalyzer()


In [54]:
df_clean['sentiment_vader'] = df_clean['text'].apply(lambda x: classify_sentiment_vader(sentiment.polarity_scores(x)))


In [55]:
df_clean.head(2)

Unnamed: 0,tweet_id,text,language,type,bookmark_count,favorite_count,retweet_count,reply_count,view_count,created_at,client,hashtags,urls,media_type,media_urls,user_id,name,n tweets,n followers,category,Factual Reporting,user,emoji_count,text_clean,sentiment_textblob,sentiment_vader
0,'1861174452215529687',How do you express gratitude for one’s safety when the perils posed by the American police state grow more treacherous by the day?,en,Tweet,1.0,3.0,2.0,0.0,123.0,2024-11-25 22:25:43,"<a href=""https://mobile.twitter.com"" rel=""nofollow"">Twitter Web App</a>",,https://activistpost.com/2024/11/divide-and-conquer-political-riptides-threaten-to-overwhelm-the-nation.html,,,ActivistPost,Activist Post,1000.0,41070.0,Pseudoscience,low,ActivistPost,0,express gratitude one’s safety perils posed American police state grow treacherous day?,positive,positive
1,'1861172568218071498',We generally think of gold as a safe haven asset. But what about silver?,en,Tweet,1.0,0.0,0.0,0.0,125.0,2024-11-25 22:18:13,"<a href=""https://mobile.twitter.com"" rel=""nofollow"">Twitter Web App</a>",,https://activistpost.com/2024/11/is-silver-a-safe-haven-asset.html,,,ActivistPost,Activist Post,1000.0,41070.0,Pseudoscience,low,ActivistPost,0,generally think gold safe asset. silver?,positive,positive


In [24]:
df_clean.to_csv(r'C:\Users\vieir\Mestrado\TESE\tweets\analises\df_fake_news_sentiment.csv', index=False)

#### using Transformer based models

 go to the other notebook :)