# EDA1 - Univariate Distribution and Relationship between Subreddit.name and Sentiment

## 1 Univariate distribution

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import kaleido
from datetime import datetime
import pytz

In [51]:
# Load the dataset from a CSV path relative to the current working directory.
df = pd.read_csv('./dataset/convincing_data.csv')

In [3]:
# Show the column labels of the loaded frame.
df.columns

Index(['type', 'id', 'subreddit.id', 'subreddit.name', 'subreddit.nsfw',
       'created_utc', 'permalink', 'sentiment', 'score', 'body_cleaned',
       'climate_count', 'change_count', 'body_length', 'climate_proportion',
       'change_proportion'],
      dtype='object')

In [4]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,type,id,subreddit.id,subreddit.name,subreddit.nsfw,created_utc,permalink,sentiment,score,body_cleaned,climate_count,change_count,body_length,climate_proportion,change_proportion
0,comment,imlcpab,2qh1i,askreddit,False,1661990065,https://old.reddit.com/r/AskReddit/comments/x2...,0.469,2,"['need', 'chang', 'law', 'worth', 'sell', 'agr...",2,3,403,0.004963,0.007444
1,comment,imlca5e,4tly53,collapseprep,False,1661989877,https://old.reddit.com/r/CollapsePrep/comments...,-0.8048,9,"['california', 'veri', 'near', 'coast', 'suppo...",2,2,1277,0.001566,0.001566
2,comment,imlc8w9,2tja6,terriblefacebookmemes,False,1661989862,https://old.reddit.com/r/terriblefacebookmemes...,-0.8238,1,"['matter', 'defin', 'real', 'scienc', 'scienc'...",2,2,636,0.003145,0.003145
3,comment,imlc2o0,2qh1i,askreddit,False,1661989785,https://old.reddit.com/r/AskReddit/comments/x2...,0.0,9,"['gt', 'climat', 'chang', 'mind', 'hi', 'degre...",4,2,263,0.015209,0.007605
4,comment,imlc1nx,2v2e5,effectivealtruism,False,1661989773,https://old.reddit.com/r/EffectiveAltruism/com...,-0.8439,4,"['bad', 'scenario', 'climat', 'chang', 'exact'...",3,4,516,0.005814,0.007752


## 1.1 Deal with utc

In [52]:
# Keep only the columns analysed below. The explicit .copy() makes the subset
# an independent frame, so adding columns to it in later cells does not
# trigger pandas' SettingWithCopyWarning.
df = df[['created_utc', 'sentiment', 'score', 'subreddit.name', 'body_length']].copy()

In [6]:
def utc2time(utcint):
    """Convert a Unix epoch timestamp to an aware Asia/Shanghai datetime.

    Parameters
    ----------
    utcint : int or float
        Seconds since the Unix epoch (UTC).

    Returns
    -------
    datetime.datetime
        Timezone-aware datetime for the same instant, expressed in the
        'Asia/Shanghai' timezone.
    """
    # Build the aware UTC datetime in one step. datetime.utcfromtimestamp()
    # returns a naive value and is deprecated since Python 3.12.
    utc_time = datetime.fromtimestamp(utcint, tz=pytz.utc)

    # astimezone() converts the instant to the target zone's wall-clock time.
    return utc_time.astimezone(pytz.timezone('Asia/Shanghai'))

# Vectorized equivalent of applying utc2time row by row: interpret the epoch
# seconds as UTC instants, then express them in Asia/Shanghai local time.
df['time'] = pd.to_datetime(df['created_utc'], unit='s', utc=True).dt.tz_convert('Asia/Shanghai')

In [7]:
# Fractional year: calendar year plus day-of-year divided by the length of
# that year (365, or 366 in leap years). Computed with vectorized .dt
# accessors instead of a per-row Python lambda; the values are unchanged.
# Note: under this convention 1 January maps to year + 1/N and 31 December
# maps to year + 1.0.
df['time_value'] = df['time'].dt.year + df['time'].dt.dayofyear / (365 + df['time'].dt.is_leap_year)

In [39]:
# NOTE(review): this histogram is disabled -- the whole plotting call is wrapped
# in a triple-quoted string, so the cell only evaluates to (and echoes) that text.
'''fig = px.histogram(data_frame = df,
                   x="time_value",
                   labels = {'time_value':'year'},
                   color_discrete_sequence =  ['#58D68D','#DE3163'],
                   marginal="box",
                   nbins= 100,
                   template="plotly_white"
                )
fig.update_layout(title = "Distribution of time" , title_x = 0.5)
fig.show()'''

'fig = px.histogram(data_frame = df,\n                   x="time_value",\n                   labels = {\'time_value\':\'year\'},\n                   color_discrete_sequence =  [\'#58D68D\',\'#DE3163\'],\n                   marginal="box",\n                   nbins= 100,\n                   template="plotly_white"\n                )\nfig.update_layout(title = "Distribution of time" , title_x = 0.5)\nfig.show()'

In [9]:
# Green sequential colormap used to shade the summary tables.
cm = sns.light_palette("green", as_cmap=True)

# Summary statistics of the fractional-year column, transposed so the
# statistics run across the row, printed with six decimals and shaded
# along that row.
(
    df[['time_value']]
    .describe()
    .T
    .style
    .format({'count': "{:.0f}",
             **dict.fromkeys(['mean', 'std', 'min', '25%', '50%', '75%', 'max'], "{:.6f}")})
    .background_gradient(cmap=cm, axis=1)
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
time_value,709237,2019.119175,2.482934,2010.00274,2017.454795,2019.589041,2021.043836,2022.668493


## 1.2 Deal with sentiment

In [27]:
# NOTE(review): this histogram is disabled -- the whole plotting call is wrapped
# in a triple-quoted string, so the cell only evaluates to (and echoes) that text.
'''fig = px.histogram(data_frame = df,
                   x="sentiment",
                   color_discrete_sequence =  ['#58D68D','#DE3163'],
                   marginal="box",
                   nbins= 100,
                   template="plotly_white"
                )
fig.update_layout(title = "Distribution of sentiment" , title_x = 0.5)
fig.show()'''

'fig = px.histogram(data_frame = df,\n                   x="sentiment",\n                   color_discrete_sequence =  [\'#58D68D\',\'#DE3163\'],\n                   marginal="box",\n                   nbins= 100,\n                   template="plotly_white"\n                )\nfig.update_layout(title = "Distribution of sentiment" , title_x = 0.5)\nfig.show()'

In [11]:
# Summary statistics of the sentiment column, transposed into a single row,
# printed with six decimals and shaded along that row with the `cm` colormap.
(
    df[['sentiment']]
    .describe()
    .T
    .style
    .format({'count': "{:.0f}",
             **dict.fromkeys(['mean', 'std', 'min', '25%', '50%', '75%', 'max'], "{:.6f}")})
    .background_gradient(cmap=cm, axis=1)
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sentiment,709237,-0.004377,0.736879,-0.9999,-0.7748,0.0,0.7715,1.0


## 1.3 Deal with score

In [28]:
# NOTE(review): this histogram is disabled -- the whole plotting call is wrapped
# in a triple-quoted string, so the cell only evaluates to (and echoes) that text.
'''fig = px.histogram(data_frame = df,
                   x="score",
                   log_y = True,
                   color_discrete_sequence =  ['#58D68D','#DE3163'],
                   marginal="box",
                   nbins= 100,
                   template="plotly_white"
                )
fig.update_layout(title = "Distribution of score" , title_x = 0.5)
fig.show()'''

'fig = px.histogram(data_frame = df,\n                   x="score",\n                   log_y = True,\n                   color_discrete_sequence =  [\'#58D68D\',\'#DE3163\'],\n                   marginal="box",\n                   nbins= 100,\n                   template="plotly_white"\n                )\nfig.update_layout(title = "Distribution of score" , title_x = 0.5)\nfig.show()'

In [13]:
# Summary statistics of the score column, transposed into a single row.
# Mean and std get one decimal; the remaining statistics are shown as integers.
(
    df[['score']]
    .describe()
    .T
    .style
    .format({**dict.fromkeys(['count', 'min', '25%', '50%', '75%', 'max'], "{:.0f}"),
             **dict.fromkeys(['mean', 'std'], "{:.1f}")})
    .background_gradient(cmap=cm, axis=1)
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
score,709237,8.9,124.0,-518,1,2,5,30304


## 1.4 Deal with subreddit

In [30]:
# Frequency table with one row per subreddit: its name and how many rows of
# `df` belong to it, most frequent first. value_counts() is evaluated once
# and reshaped, instead of being computed separately for the index and the
# values.
subr_freq = (
    df['subreddit.name']
    .value_counts()
    .rename_axis('subreddit')
    .reset_index(name='occurrence')
)

In [37]:
# NOTE(review): this histogram is disabled -- the whole plotting call is wrapped
# in a triple-quoted string, so the cell only evaluates to (and echoes) that text.
'''fig = px.histogram(data_frame = subr_freq,
                   x="occurrence",
                   log_y = True,
                   color_discrete_sequence =  ['#58D68D','#DE3163'],
                   marginal="box",
                   nbins= 100,
                   template="plotly_white"
                )
fig.update_layout(title = "Distribution of subreddit occurrence" , title_x = 0.5)
fig.show()'''

'fig = px.histogram(data_frame = subr_freq,\n                   x="occurrence",\n                   log_y = True,\n                   color_discrete_sequence =  [\'#58D68D\',\'#DE3163\'],\n                   marginal="box",\n                   nbins= 100,\n                   template="plotly_white"\n                )\nfig.update_layout(title = "Distribution of subreddit occurrence" , title_x = 0.5)\nfig.show()'

In [32]:
# Summary statistics of the per-subreddit occurrence counts, transposed into
# a single row. Mean and std get one decimal; the rest are shown as integers.
(
    subr_freq[['occurrence']]
    .describe()
    .T
    .style
    .format({**dict.fromkeys(['count', 'min', '25%', '50%', '75%', 'max'], "{:.0f}"),
             **dict.fromkeys(['mean', 'std'], "{:.1f}")})
    .background_gradient(cmap=cm, axis=1)
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
occurrence,10612,66.8,982.0,1,1,2,8,62713


## 1.5 Deal with body length

In [56]:
# NOTE(review): this histogram is disabled -- the whole plotting call is wrapped
# in a triple-quoted string, so the cell only evaluates to (and echoes) that text.
'''fig = px.histogram(data_frame = df,
                   x="body_length",
                   log_y = True,
                   color_discrete_sequence =  ['#58D68D','#DE3163'],
                   marginal="box",
                   nbins= 100,
                   template="plotly_white"
                )
fig.update_layout(title = "Distribution of body length" , title_x = 0.5)
fig.show()'''

'fig = px.histogram(data_frame = df,\n                   x="body_length",\n                   log_y = True,\n                   color_discrete_sequence =  [\'#58D68D\',\'#DE3163\'],\n                   marginal="box",\n                   nbins= 100,\n                   template="plotly_white"\n                )\nfig.update_layout(title = "Distribution of body length" , title_x = 0.5)\nfig.show()'

In [54]:
# Summary statistics of the body_length column, transposed into a single row.
# Mean and std get one decimal; the remaining statistics are shown as integers.
(
    df[['body_length']]
    .describe()
    .T
    .style
    .format({**dict.fromkeys(['count', 'min', '25%', '50%', '75%', 'max'], "{:.0f}"),
             **dict.fromkeys(['mean', 'std'], "{:.1f}")})
    .background_gradient(cmap=cm, axis=1)
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
body_length,709237,1000.1,1124.9,38,346,626,1169,16015


## 2 Bivariate distribution

In [41]:
# Number of subreddits whose occurrence count exceeds 10000.
len(subr_freq.loc[subr_freq['occurrence'] > 10000])

11

In [43]:
# Names of the subreddits whose occurrence count exceeds 10000, as a set
# for membership tests.
target_subreddit = set(subr_freq.loc[subr_freq['occurrence'] > 10000, 'subreddit'])

In [44]:
# Keep only the rows that belong to the selected subreddits and renumber the
# index from 0, discarding the old index labels.
df_new = df.loc[df['subreddit.name'].isin(target_subreddit)].reset_index(drop=True)

In [45]:
# Display the filtered frame (the rendered output elides the middle rows).
df_new

Unnamed: 0,created_utc,sentiment,score,subreddit.name,time,time_value
0,1661990065,0.4690,2,askreddit,2022-09-01 07:54:25+08:00,2022.668493
1,1661989785,0.0000,9,askreddit,2022-09-01 07:49:45+08:00,2022.668493
2,1661987214,0.5725,-1,environment,2022-09-01 07:06:54+08:00,2022.668493
3,1661986025,0.8947,15,politics,2022-09-01 06:47:05+08:00,2022.668493
4,1661983557,0.8658,1,futurology,2022-09-01 06:05:57+08:00,2022.668493
...,...,...,...,...,...,...
258552,1262450427,0.5541,-5,science,2010-01-03 00:40:27+08:00,2010.008219
258553,1262398553,-0.8150,-4,askreddit,2010-01-02 10:15:53+08:00,2010.005479
258554,1262383373,0.9366,2,environment,2010-01-02 06:02:53+08:00,2010.005479
258555,1262314480,0.0242,1,environment,2010-01-01 10:54:40+08:00,2010.002740


In [46]:
# Number of rows per selected subreddit, most frequent first.
df_new['subreddit.name'].value_counts()

worldnews          62713
politics           52817
askreddit          28185
futurology         16696
collapse           16127
science            16123
environment        14651
news               13896
climateskeptics    13176
changemyview       12757
canada             11416
Name: subreddit.name, dtype: int64

In [47]:
# Mean and standard deviation of sentiment for each selected subreddit.
# Built as one chain with no in-place mutation. Only the two aggregate
# columns are renamed; the group key is the index, so it is renamed with
# rename_axis (a 'subreddit.name' entry in the columns mapping would match
# nothing).
result = (
    df_new.groupby('subreddit.name')['sentiment']
    .agg(['mean', 'std'])
    .rename(columns={'mean': 'sentiment mean', 'std': 'sentiment std'})
    .rename_axis('subreddit')
)

result

Unnamed: 0_level_0,sentiment mean,sentiment std
subreddit,Unnamed: 1_level_1,Unnamed: 2_level_1
askreddit,-0.049775,0.708427
canada,0.00904,0.714511
changemyview,0.007457,0.784153
climateskeptics,0.026676,0.730553
collapse,-0.170942,0.74723
environment,0.012764,0.735836
futurology,-0.0125,0.73319
news,-0.090419,0.70563
politics,-0.004705,0.725536
science,0.084713,0.696251


In [48]:
# Order the subreddits from the most negative to the most positive mean
# sentiment (ascending is the default sort order), then display the table.
result_sort = result.sort_values('sentiment mean')
result_sort

Unnamed: 0_level_0,sentiment mean,sentiment std
subreddit,Unnamed: 1_level_1,Unnamed: 2_level_1
collapse,-0.170942,0.74723
news,-0.090419,0.70563
askreddit,-0.049775,0.708427
worldnews,-0.041597,0.725188
futurology,-0.0125,0.73319
politics,-0.004705,0.725536
changemyview,0.007457,0.784153
canada,0.00904,0.714511
environment,0.012764,0.735836
climateskeptics,0.026676,0.730553
