In [1]:
import pandas as pd
from itertools import chain
from collections import Counter

from urllib.parse import urlparse
import re
import numpy as np
import requests
import matplotlib.pyplot as plt
from matplotlib import pyplot
from matplotlib.pyplot import figure
import math

import seaborn as sns

from scipy.stats import shapiro 
from scipy.stats import lognorm
from scipy.stats import mannwhitneyu

import scipy.stats as stats

import pymannkendall
from statsmodels.tsa.stattools import grangercausalitytests

from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import kpss

import statsmodels.api as sm

import statsmodels.formula.api as smf

In [3]:
#Load the data
##############################################


fb=pd.read_json("fb.json")

#upload the network and edgelist data depending on the time interval chosen (14 / 27 / 54 min)
g = pd.read_csv("conversations_graph_27min.csv", names = ["convid", "n_nodes", "n_edges", "ideg", "odeg", "deg", "avg_clust", "recip", "dens"])
g.convid = g.convid.astype("str")

e = pd.read_csv ("conversations_edges_27min.csv", names = ["convid", "sender", "recepient", "timestamp"])
e

el = pd.read_csv ("conversation_edgelist27min.csv")


In [4]:
#Fix conversation ids and dates
################################################

fb['delta'] = (fb['date']-fb['date'].shift()).fillna(pd.Timedelta('0 days'))
fb['delta'] = fb['delta'].dt.total_seconds() / 60
fb.delta = fb.delta.astype(int)

fb["convid"] = fb["convid"].astype(str)
fb.convid = fb.convid.apply(lambda x: re.sub("\\.0", "", x))
convids = fb.convid.unique().tolist()
convids.remove("nan")

fb = fb.assign (date24h = fb["date"].dt.floor("24h")).reset_index (drop = True)

e.timestamp = pd.to_datetime(e.timestamp, format="%Y-%m-%d %H:%M")
e.timestamp

el.time_sender = pd.to_datetime(el.time_sender, format="%Y-%m-%d %H:%M")
el.time_recepient = pd.to_datetime(el.time_recepient, format="%Y-%m-%d %H:%M")
el  = el.merge(e[["convid", "sender", "recepient"]], left_on = ["sender", "recepient"], right_on = ["sender", "recepient"])

# SUMMARY STATS

In [5]:
#Create CONVERSATION statistics (summary df)
###############################################
fb_conv = fb[fb.convid != "nan"].groupby("convid").agg({"num_am": 'sum', "num_mm": 'sum', "num_all": 'sum', 
                                                            "vader_score": 'mean', 'stance_label':
                                                            'count'}).rename (columns ={"stance_label": "num_posts"})

#links
fb_conv['am_density'] = fb_conv.num_am.div(fb_conv ["num_posts"])
fb_conv['mm_density'] = fb_conv.num_mm.div(fb_conv["num_posts"])
fb_conv['all_density'] = fb_conv.num_all.div(fb_conv ["num_posts"])

fb_conv['am_to_all'] = fb_conv.num_am.div(fb_conv ["num_all"])
fb_conv['mm_to_all'] = fb_conv.num_mm.div(fb_conv ["num_all"])

#stances

threads_neg = fb[(fb.convid != "nan")&(fb.stance_label == "neg")].groupby ("convid").agg({'stance_label' : 'count',
                                                                                              'stance_prob' : 'mean'})
threads_neg = threads_neg.rename (columns={"stance_label": "stance_label_neg", "stance_prob": "mean_stance_prob_neg"})


#POSITIVE STANCES

threads_nonneg = fb[(fb.convid != "nan")&(fb.stance_label == "nonneg")].groupby ("convid").agg({'stance_label' : 
                                                                                                    'count', 
                                                                                                    'stance_prob' : 
                                                                                                    'mean'})
threads_nonneg = threads_nonneg.rename (columns={"stance_label": "stance_label_nonneg", "stance_prob": 
                                                 "mean_stance_prob_nonneg"})

#
fb_conv = pd.concat([fb_conv, threads_neg, threads_nonneg], axis = 1)

fb_conv['neg_label_ratio'] = fb_conv.stance_label_neg.div(fb_conv ["num_posts"])
fb_conv['nonneg_label_ratio'] = fb_conv.stance_label_nonneg.div(fb_conv ["num_posts"])

fb_conv = fb_conv.assign (convid = fb_conv.index).reset_index (drop = True)


#conversation durations
diffdf = (fb[fb.convid != "nan"].groupby(['convid'])['date']
         .agg(lambda x: x.iat[-1] - x.iat[0])
         .reset_index(name='diff'))

el_sumdiff = el[["convid", "time_diff"]].groupby ("convid", as_index=False).median("time_diff")
el_sumdiff.convid = el_sumdiff.convid.astype(str)
el_sumdiff.convid = el_sumdiff.convid.apply(lambda x: re.sub("\\.0", "", x))


#fb_conv = pd.concat([fb_conv, diffdf], axis = 1)
fb_conv = fb_conv.merge(diffdf, right_on = "convid", left_on = "convid")


fb_conv["diff_hours"] = fb_conv["diff"]/ np.timedelta64(1, 'h')

fb_conv = fb_conv.merge(el_sumdiff, left_on = "convid", right_on = "convid")

#add network metrics

fb_conv = fb_conv.merge(g, left_on = "convid", right_on = "convid")

print(fb_conv)

      num_am  num_mm  num_all  vader_score  num_posts  am_density  mm_density  \
0          2      10       25    -0.184269        146    0.013699    0.068493   
1          0       0        1     0.107300          3    0.000000    0.000000   
2          0       0        3    -0.557833          3    0.000000    0.000000   
3          0       0        0    -0.299150          2    0.000000    0.000000   
4          0       1        1     0.111300          4    0.000000    0.250000   
...      ...     ...      ...          ...        ...         ...         ...   
9299       0       0        1    -0.957050          2    0.000000    0.000000   
9300       0       2        2     0.425950          2    0.000000    1.000000   
9301       0       0        0    -0.577550          2    0.000000    0.000000   
9302       0       0        6     0.338025          4    0.000000    0.000000   
9303       0       1        1     0.013333          3    0.000000    0.333333   

      all_density  am_to_al

In [9]:
print(np.mean(fb_conv.num_posts))
print(np.median(fb_conv.num_posts))


28.48194325021496
5.0


In [8]:
fb_conv.to_excel("fb_conv_27.xlsx")

# Mann-Whitney U test - LINKS

In [10]:
# AM VERSUS MM CONVERSATIONS
######################################################

am_conv = fb_conv[(fb_conv.am_density>0.2) & (fb_conv.mm_density == 0)] #other values tested: 0.2, 0.4, 0.6
print(len(am_conv))
mm_conv = fb_conv[(fb_conv.mm_density>0.2) & (fb_conv.am_density == 0)] #other values tested: 0.2, 0.4, 0.6
print(len(mm_conv))

#conversations with am links are longer?
mw_hours = mannwhitneyu(x=am_conv.diff_hours, y=mm_conv.diff_hours, alternative='greater') #‘greater’: the distribution underlying x is stochastically greater than the distribution underlying y, i.e. F(u) < G(u) for all u.
print(mw_hours)

#conversations with am links have higher sentiments? +++
mw_vader = mannwhitneyu(x=am_conv.vader_score, y=mm_conv.vader_score, alternative='greater') 
print(mw_vader)

#conversations with am links have higher negative stance probability? 
mw_negprob = mannwhitneyu(x=am_conv.mean_stance_prob_neg.dropna(), y=mm_conv.mean_stance_prob_neg.dropna(), alternative='greater') 
print(mw_negprob)

#conversations with am links have lower non-negative stance probability? 
mw_nonnegprob = mannwhitneyu(x=am_conv.mean_stance_prob_nonneg.dropna(), y=mm_conv.mean_stance_prob_nonneg.dropna(), alternative='less') 
print(mw_nonnegprob)

#conversations with am links have higher negative stance ratio? ++++
mw_negprob = mannwhitneyu(x=am_conv.neg_label_ratio.dropna(), y=mm_conv.neg_label_ratio.dropna(), alternative='greater') 
print(mw_negprob)

#conversations with am links have lower non-negative stance ratio?
mw_nonnegprob = mannwhitneyu(x=am_conv.nonneg_label_ratio.dropna(), y=mm_conv.nonneg_label_ratio.dropna(), alternative='less') 
print(mw_nonnegprob)

#conversations with am links have lower clustering?
mw_clust = mannwhitneyu(x=am_conv.avg_clust, y=mm_conv.avg_clust, alternative='less') 
print(mw_clust)

#conversations with am links have lower num nodes?
mw_nodes = mannwhitneyu(x=am_conv.n_nodes, y=mm_conv.n_nodes, alternative='less') 
print(mw_nodes)

#conversations with am links have lower num edges?
mw_edges = mannwhitneyu(x=am_conv.n_edges, y=mm_conv.n_edges, alternative='less') 
print(mw_edges)

#conversations with am links have higher density?
mw_dens = mannwhitneyu(x=am_conv.dens, y=mm_conv.dens, alternative='greater') 
print(mw_dens)


161
975
MannwhitneyuResult(statistic=75755.5, pvalue=0.7606930478830192)
MannwhitneyuResult(statistic=79206.0, pvalue=0.4261540148875254)
MannwhitneyuResult(statistic=63238.0, pvalue=0.5506410874185431)
MannwhitneyuResult(statistic=30710.0, pvalue=0.024500498283528727)
MannwhitneyuResult(statistic=75199.0, pvalue=0.00020339711072298082)
MannwhitneyuResult(statistic=31310.0, pvalue=0.043901323980415824)
MannwhitneyuResult(statistic=74782.5, pvalue=0.15541855143956934)
MannwhitneyuResult(statistic=68875.5, pvalue=0.00582999734584923)
MannwhitneyuResult(statistic=69120.5, pvalue=0.007070565817020889)
MannwhitneyuResult(statistic=86744.5, pvalue=0.014242819167947167)


In [11]:
# CONVERSATIONS WITHOUT AND WITH ANY TYPE OF LINKS
#######################################################

all_conv = fb_conv[fb_conv.all_density>0.6] #other values tested: 0.2, 0.4, 0.6

none_conv = fb_conv[fb_conv.all_density==0]
#none_conv = none_conv[0:300]    

print(len(all_conv))
print(len(none_conv))

#conversations with any links are longer? ++++++
mw_hours = mannwhitneyu(x=all_conv.diff_hours, y=none_conv.diff_hours, alternative='greater') #‘greater’: the distribution underlying x is stochastically greater than the distribution underlying y, i.e. F(u) < G(u) for all u.
print(mw_hours)

#conversations with any links have lower sentiments? 
mw_vader = mannwhitneyu(x=all_conv.vader_score, y=none_conv.vader_score, alternative='less') 
print(mw_vader)

#conversations with any links have higher negative stance probability?  +++++
mw_negprob = mannwhitneyu(x=all_conv.mean_stance_prob_neg.dropna(), y=none_conv.mean_stance_prob_neg.dropna(), alternative='less') 
print(mw_negprob)

#conversations with any links have lower non-negative stance probability? +++++ 
mw_nonnegprob = mannwhitneyu(x=all_conv.mean_stance_prob_nonneg.dropna(), y=none_conv.mean_stance_prob_nonneg.dropna(), alternative='greater') 
print(mw_nonnegprob)

#conversations with any links have lower negative negative stance ratio? +++++
mw_negprob = mannwhitneyu(x=all_conv.neg_label_ratio.dropna(), y=none_conv.neg_label_ratio.dropna(), alternative='less') 
print(mw_negprob)

#conversations with any links have higher non-negative stance ratio? +++++
mw_nonnegprob = mannwhitneyu(x=all_conv.nonneg_label_ratio.dropna(), y=none_conv.nonneg_label_ratio.dropna(), alternative='greater') 
print(mw_nonnegprob)

#conversations with any links have higher clustering? 
mw_clust = mannwhitneyu(x=all_conv.avg_clust, y=none_conv.avg_clust, alternative='greater') 
print(mw_clust)

#conversations with any links have lhigher num nodes? +++++++
mw_nodes = mannwhitneyu(x=all_conv.n_nodes, y=none_conv.n_nodes, alternative='greater') 
print(mw_nodes)

#conversations with any links have higher num edges? +++++++++
mw_edges = mannwhitneyu(x=all_conv.n_edges, y=none_conv.n_edges, alternative='greater') 
print(mw_edges) 

#conversations with am links have higher density? ++++++
mw_dens = mannwhitneyu(x=all_conv.dens, y=none_conv.dens, alternative='less') 
print(mw_dens)


849
4602
MannwhitneyuResult(statistic=2357484.0, pvalue=4.505820100049835e-22)
MannwhitneyuResult(statistic=1933359.0, pvalue=0.3158998439276627)
MannwhitneyuResult(statistic=1393807.0, pvalue=9.780850870734706e-05)
MannwhitneyuResult(statistic=602940.0, pvalue=0.002980911955877018)
MannwhitneyuResult(statistic=1240670.0, pvalue=1.0025508822275686e-16)
MannwhitneyuResult(statistic=656181.0, pvalue=2.1578556170069625e-10)
MannwhitneyuResult(statistic=1974236.5, pvalue=0.28221874418687287)
MannwhitneyuResult(statistic=2101005.0, pvalue=0.00012166453842038357)
MannwhitneyuResult(statistic=2086292.0, pvalue=0.0004908582587693855)
MannwhitneyuResult(statistic=1777798.5, pvalue=3.2681650095619783e-06)


# ALT RIGHT LINKS

In [12]:
#how many conversations have any link?
print(len(fb.convid[fb.num_all>0].unique())/len(fb.convid.unique()))

#how many conversations have an am link?
print(len(fb.convid[fb.num_am>0].unique())/len(fb.convid.unique()))

#how many conversations have a mm link?
print(len(fb.convid[fb.num_mm>0].unique())/len(fb.convid.unique()))


#how many unique users share am content?

print(len(fb.userid.unique()))
print(len(fb.userid[fb.num_am>0].unique()))
print(len(fb.userid[fb.num_am>0].unique())/len(fb.userid.unique()))
print(len(fb.userid.unique()))

0.5054271896829662
0.09274583557227298
0.28694250403009136
14366
1147
0.07984129193930113
14366
