In [28]:
import pandas as pd
import numpy as np
import plotly.express as px

In [None]:
df = pd.read_excel('data.xlsx')
df.head(5)

In [None]:
import pysentimiento

In [None]:
classifier = pysentimiento.create_analyzer(task='sentiment', lang='pt')

In [None]:
df['sentiment'] = df['comment'].apply(lambda x: classifier.predict(x).output)
df.head(5)

In [None]:
df.head(5).to_clipboard(index=False)

In [None]:
df.to_excel('sentiment.xlsx', index=False)

In [2]:
df = pd.read_excel('sentiment.xlsx')

In [23]:
distinct_posts = df.groupby('candidate')['post_url'].nunique()
distinct_comments = df.groupby(['candidate','sentiment'])['comment'].nunique()


sum_likes = df.groupby(['candidate', 'sentiment'])['likes'].sum().reset_index()
sum_likes

Unnamed: 0,candidate,sentiment,likes
0,Guilherme Boulos,NEG,20341
1,Guilherme Boulos,NEU,56406
2,Guilherme Boulos,POS,57816
3,Kim Kataguiri,NEG,71500
4,Kim Kataguiri,NEU,73787
5,Kim Kataguiri,POS,23616
6,Marina Helena,NEG,12479
7,Marina Helena,NEU,6720
8,Marina Helena,POS,9140
9,Ricardo Nunes,NEG,4438


In [24]:
sum_likes = sum_likes.merge(distinct_comments, on=['candidate', 'sentiment'])
sum_likes

Unnamed: 0,candidate,sentiment,likes,comment
0,Guilherme Boulos,NEG,20341,695
1,Guilherme Boulos,NEU,56406,1200
2,Guilherme Boulos,POS,57816,2565
3,Kim Kataguiri,NEG,71500,1525
4,Kim Kataguiri,NEU,73787,1130
5,Kim Kataguiri,POS,23616,592
6,Marina Helena,NEG,12479,1473
7,Marina Helena,NEU,6720,1060
8,Marina Helena,POS,9140,1327
9,Ricardo Nunes,NEG,4438,561


In [32]:
sum_likes['engagement_score'] = (sum_likes['likes'] * 0.3) * (sum_likes['comment'] * 0.7)
sum_likes


Unnamed: 0,candidate,sentiment,likes,comment,engagement_score,engagement_score_normalized
0,Guilherme Boulos,NEG,20341,695,2968768.95,0.079881
1,Guilherme Boulos,NEU,56406,1200,14214312.0,0.447145
2,Guilherme Boulos,POS,57816,2565,31142588.4,1.0
3,Kim Kataguiri,NEG,71500,1525,22897875.0,0.730739
4,Kim Kataguiri,NEU,73787,1130,17509655.1,0.554767
5,Kim Kataguiri,POS,23616,592,2935941.12,0.078809
6,Marina Helena,NEG,12479,1473,3860129.07,0.108991
7,Marina Helena,NEU,6720,1060,1495872.0,0.031778
8,Marina Helena,POS,9140,1327,2547043.8,0.066108
9,Ricardo Nunes,NEG,4438,561,522840.78,0.0


In [34]:
engagement_score_sum = sum_likes.engagement_score.sum()
engagement_score_sum


127660235.84999996

In [35]:
sum_likes['engagement_score_normalized'] = sum_likes['engagement_score'] / engagement_score_sum
sum_likes

Unnamed: 0,candidate,sentiment,likes,comment,engagement_score,engagement_score_normalized
0,Guilherme Boulos,NEG,20341,695,2968768.95,0.023255
1,Guilherme Boulos,NEU,56406,1200,14214312.0,0.111345
2,Guilherme Boulos,POS,57816,2565,31142588.4,0.243949
3,Kim Kataguiri,NEG,71500,1525,22897875.0,0.179366
4,Kim Kataguiri,NEU,73787,1130,17509655.1,0.137158
5,Kim Kataguiri,POS,23616,592,2935941.12,0.022998
6,Marina Helena,NEG,12479,1473,3860129.07,0.030238
7,Marina Helena,NEU,6720,1060,1495872.0,0.011718
8,Marina Helena,POS,9140,1327,2547043.8,0.019952
9,Ricardo Nunes,NEG,4438,561,522840.78,0.004096


In [89]:
fig = px.bar(sum_likes, 
             x='comment', 
             y='candidate', 
             orientation='h',
             color='sentiment',
             color_discrete_map={'POS': 'green', 'NEG': 'red', 'NEU': 'blue'},
             labels={'comment': 'Number of Comments', 'candidate': 'Candidate'},
             title='Comment Sentiment per Candidate')

fig.show()

In [90]:
fig = px.bar(sum_likes, 
             x='likes', 
             y='candidate', 
             orientation='h',
             color='sentiment',
             color_discrete_map={'POS': 'green', 'NEG': 'red', 'NEU': 'blue'},
             labels={'comment': 'Number of Comments', 'candidate': 'Candidate'},
             title='Like Sentiment per Candidate')

fig.show()

In [74]:
sum_likes_filtered = sum_likes[sum_likes['sentiment'] != 'NEU']

total_engagement = sum_likes_filtered.groupby('candidate')['engagement_score'].sum()

total_engagement_percentage = total_engagement / total_engagement.sum() * 100


In [59]:
sum_likes_filtered['engagement_score_normalized'] = sum_likes_filtered['engagement_score'] / sum_likes_filtered['engagement_score'].sum() * 100
sum_likes_filtered



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,candidate,sentiment,likes,comment,engagement_score,engagement_score_normalized
0,Guilherme Boulos,NEG,20341,695,2968768.95,3.363066
2,Guilherme Boulos,POS,57816,2565,31142588.4,35.278789
3,Kim Kataguiri,NEG,71500,1525,22897875.0,25.939054
5,Kim Kataguiri,POS,23616,592,2935941.12,3.325878
6,Marina Helena,NEG,12479,1473,3860129.07,4.372812
8,Marina Helena,POS,9140,1327,2547043.8,2.885329
9,Ricardo Nunes,NEG,4438,561,522840.78,0.592282
11,Ricardo Nunes,POS,6213,2467,3218768.91,3.646269
12,Tabata Amaral,NEG,23144,1036,5035208.64,5.703959
14,Tabata Amaral,POS,31506,1987,13146508.62,14.892561


In [91]:
fig = px.bar(sum_likes_filtered, 
             x='candidate', 
             y='engagement_score_normalized', 
             color='sentiment',
             color_discrete_map={'POS': 'green', 'NEG': 'red'},
             
             labels={'engagement_score_normalized': 'Total % of Engagement', 'candidate': 'Candidate'},
             title='Engagement Percentage per Candidate wit Sentiment Breakdown')

fig.show()

In [37]:
total_engagement_percentage_df = total_engagement_percentage.reset_index()
total_engagement_percentage_df.columns = ['candidate', 'engagement_percentage']

# Plotting with plotly
fig = px.bar(total_engagement_percentage_df, 
             x='engagement_percentage', 
             y='candidate', 
             orientation='h',
             labels={'engagement_percentage': 'Total % of Engagement', 'candidate': 'Candidate'},
             title='Engagement Percentage per Candidate')

fig.show()

In [55]:
def multiply_score(row):
    if row['sentiment'] == 'NEG':
        return row['engagement_score_normalized'] * -1
    else:
        return row['engagement_score_normalized']

In [75]:
sum_likes_filtered['engagement_score_normalized'] = sum_likes_filtered.apply(lambda x: round(x['engagement_score_normalized'] * -1 * 100, 2) if x['sentiment'] == 'NEG' else round(x['engagement_score_normalized'] * 100, 2), axis=1)

sum_likes_filtered



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,candidate,sentiment,likes,comment,engagement_score,engagement_score_normalized
0,Guilherme Boulos,NEG,20341,695,2968768.95,-2.33
2,Guilherme Boulos,POS,57816,2565,31142588.4,24.39
3,Kim Kataguiri,NEG,71500,1525,22897875.0,-17.94
5,Kim Kataguiri,POS,23616,592,2935941.12,2.3
6,Marina Helena,NEG,12479,1473,3860129.07,-3.02
8,Marina Helena,POS,9140,1327,2547043.8,2.0
9,Ricardo Nunes,NEG,4438,561,522840.78,-0.41
11,Ricardo Nunes,POS,6213,2467,3218768.91,2.52
12,Tabata Amaral,NEG,23144,1036,5035208.64,-3.94
14,Tabata Amaral,POS,31506,1987,13146508.62,10.3


In [77]:


# Define color mapping
color_map = {'NEG': 'red', 'POS': 'green'}

# Plotting with Plotly
fig = px.bar(sum_likes_filtered, 
             x='candidate', 
             y='engagement_score_normalized', 
             color='sentiment',
             color_discrete_map=color_map,  # Apply custom color mapping
             labels={'engagement_score_normalized': 'Score (%)', 'candidate': 'Candidates'},  # Rename x and y axis labels
             title='Positive/Negative Scores by Candidate')

#fig.update_yaxes(range=[-1.1, 1.1])  # Ensure y-axis range includes -1 and 1 for clarity

fig.show()

In [79]:
sum_likes_filtered.to_excel('sentiment_score.xlsx', index=False)

In [80]:
sum_likes_filtered = pd.read_excel('sentiment_score.xlsx')

In [83]:
# Group by candidate and calculate descriptive statistics
stats_by_candidate = sum_likes_filtered.groupby('candidate')['engagement_score_normalized'].describe()

# Calculate variance by candidate separately
variance_by_candidate = sum_likes_filtered.groupby('candidate')['engagement_score_normalized'].var()

print("Descriptive Statistics for engagement_score_normalized by Candidate:")
print(stats_by_candidate)
print("\nVariance by Candidate:")
print(variance_by_candidate)

Descriptive Statistics for engagement_score_normalized by Candidate:
                  count    mean        std    min      25%     50%      75%  \
candidate                                                                     
Guilherme Boulos    2.0  11.030  18.893893  -2.33   4.3500  11.030  17.7100   
Kim Kataguiri       2.0  -7.820  14.311841 -17.94 -12.8800  -7.820  -2.7600   
Marina Helena       2.0  -0.510   3.549676  -3.02  -1.7650  -0.510   0.7450   
Ricardo Nunes       2.0   1.055   2.071823  -0.41   0.3225   1.055   1.7875   
Tabata Amaral       2.0   3.180  10.069201  -3.94  -0.3800   3.180   6.7400   

                    max  
candidate                
Guilherme Boulos  24.39  
Kim Kataguiri      2.30  
Marina Helena      2.00  
Ricardo Nunes      2.52  
Tabata Amaral     10.30  

Variance by Candidate:
candidate
Guilherme Boulos    356.97920
Kim Kataguiri       204.82880
Marina Helena        12.60020
Ricardo Nunes         4.29245
Tabata Amaral       101.38880
Name: engag

In [95]:
datafolha = pd.DataFrame({
    'candidates': ['Guilherme Boulos',
                   'Ricardo Nunes',
                   'Tabata Amaral',
                   'Marina Helena',
                   'Kim Kataguiri',
                   'Outros'
                   ],
    'datafolha_intention': [31, 19, 6, 1, 8, 35]
    
})

In [111]:
df_comp = pd.merge(datafolha, sum_likes_filtered, left_on='candidates', right_on='candidate')


In [113]:
df_comp[df_comp['sentiment'] == 'POS']

Unnamed: 0,candidates,datafolha_intention,candidate,sentiment,likes,comment,engagement_score,engagement_score_normalized
1,Guilherme Boulos,31,Guilherme Boulos,POS,57816,2565,31142588.4,24.39
3,Ricardo Nunes,19,Ricardo Nunes,POS,6213,2467,3218768.91,2.52
5,Tabata Amaral,6,Tabata Amaral,POS,31506,1987,13146508.62,10.3
7,Marina Helena,1,Marina Helena,POS,9140,1327,2547043.8,2.0
9,Kim Kataguiri,8,Kim Kataguiri,POS,23616,592,2935941.12,2.3


In [115]:
df_comp['engagement_score_normalized'] = df_comp['engagement_score_normalized'].astype(int)

In [116]:
df_comp.rename(columns={'engagement_score_normalized': 'Positive Engagement Social Media', 'datafolha_intention' : 'Datafolha Source'}, inplace=True)

In [119]:
df_comp = df_comp[df_comp['sentiment'] == 'POS']

In [125]:
df_comp

Unnamed: 0,candidates,Datafolha Source,candidate,sentiment,likes,comment,engagement_score,Positive Engagement Social Media
1,Guilherme Boulos,31,Guilherme Boulos,POS,57816,2565,31142588.4,24
3,Ricardo Nunes,19,Ricardo Nunes,POS,6213,2467,3218768.91,2
5,Tabata Amaral,6,Tabata Amaral,POS,31506,1987,13146508.62,10
7,Marina Helena,1,Marina Helena,POS,9140,1327,2547043.8,2
9,Kim Kataguiri,8,Kim Kataguiri,POS,23616,592,2935941.12,2


In [138]:
fig = px.bar(df_comp,
             x='candidates',
             y=['Datafolha Source', 'Positive Engagement Social Media'],
             barmode='group',  # Grouped bar chart
             labels={'value': 'Value', 'variable': 'Metric', 'candidates': 'Candidates'},  # Rename labels
             title='Datafolha Source vs Positive Engagement Score by Candidate')

fig.update_layout(yaxis_title='Percentage (%)')  # Customizing y-axis title

fig.show()