In [None]:
import pandas as pd
import numpy as np
import math


In [None]:
# Time window, in minutes, used to create edges between users' messages in the threads:
# a pair of messages only becomes an edge when the gap between them fits this window.

# Values used in the paper: 14, 27 and 54
interval = 27

In [None]:
# Import the raw data: read the JSON file into the DataFrame `fb`

path_to_json_file  = "fb.json"
fb=pd.read_json(path_to_json_file)
# Display the loaded frame as the cell output
fb

In [None]:
# Split the dataset into a list of threads and prepare it for subsequent manipulation.
#
# Result: `disc_list`, one DataFrame per unique `header` in `fb` (in order of first
# appearance), restricted to the columns used downstream, sorted by `date`,
# re-indexed 0..n-1 and carrying a constant `reply_type` column set to "post".

thread_columns = ["postid", "date", "calc_quoteid", "header",
                  "num_am", "num_mm", "stance_label", "stance_prob"]

disc_list = []

for header in fb.header.unique():
    disc = fb.loc[fb.header == header, thread_columns]
    # sort_values returns a new frame, so the result has to be re-assigned;
    # otherwise the thread silently keeps its original row order.
    # kind="stable" keeps the file order for posts sharing the same date.
    disc = disc.sort_values(by="date", kind="stable")
    # Positional labels 0..n-1: later cells address rows by position.
    disc = disc.reset_index(drop=True)
    disc = disc.assign(reply_type="post")
    disc_list.append(disc)
    
    

In [None]:
# All threads - build the time-stamped edge lists of quotes.
# For every thread, each quoting post (sender) is paired with the post it quotes
# (recipient, looked up by `postid` inside the same thread) and the recipient's
# timestamp and metadata are attached to the edge.
#
# Result: `disc_el_list`, one DataFrame per thread in `disc_list`, with
# reply_type == "quote". Rows with any missing value - including quotes whose
# target post is not found in the thread - are removed by the dropna below.


def _flatten_quote_ids(quote_ids):
    """Unwrap a nested ``[[id, ...]]`` quote list into ``[id, ...]``, dropping empty ids.

    Any other value is returned unchanged.
    """
    # `and` (not `&`) so that quote_ids[0] is never evaluated on an empty list.
    if len(quote_ids) > 0 and isinstance(quote_ids[0], list):
        return list(filter(None, quote_ids[0]))
    return quote_ids


# thread column -> name of the edge-list column holding the recipient's value
recipient_fields = {
    "date": "time_recepient",
    "num_am": "num_am_recepient",
    "num_mm": "num_mm_recepient",
    "stance_label": "stance_label_recepient",
    "stance_prob": "stance_prob_recepient",
}

disc_el_list = []

for j in range(0, len(disc_list)):

    thread = disc_list[j]
    # Start from a copy of the thread so the cell neither depends on an `el` left
    # over in the kernel nor mutates disc_list[j].
    # NOTE(review): the original cell used `el` without defining it; initialising
    # it from disc_list[j] matches the columns renamed below - confirm.
    el = thread.copy()

    el["calc_quoteid"] = el["calc_quoteid"].apply(_flatten_quote_ids)
    # Posts whose quote list is just [""] quote nothing
    el = el[el["calc_quoteid"].apply(lambda x: x != [""])]
    el = el.reset_index(drop=True)

    el = el.assign(reply_type="quote")
    # One row per quoted post id
    el = el.explode("calc_quoteid")
    el = el.reset_index(drop=True)

    looked_up = {edge_col: [] for edge_col in recipient_fields.values()}

    for quoted_id in el["calc_quoteid"]:

        is_quoted_post = thread["postid"] == quoted_id

        if is_quoted_post.any():
            # First matching post in the thread provides the recipient's values
            for thread_col, edge_col in recipient_fields.items():
                looked_up[edge_col].append(thread.loc[is_quoted_post, thread_col].values[0])
        else:
            # Quoted post is not part of this thread
            for edge_col in looked_up:
                looked_up[edge_col].append(np.nan)

    el = el.assign(**{edge_col: pd.Series(values) for edge_col, values in looked_up.items()})

    el = el.drop_duplicates()
    el = el.dropna()
    el = el.reset_index(drop=True)

    el = el.rename(columns={"postid": "sender",
                            "calc_quoteid": "recepient",
                            "date": "time_sender",
                            "num_am": "num_am_sender",
                            "num_mm": "num_mm_sender",
                            "stance_label": "stance_label_sender",
                            "stance_prob": "stance_prob_sender"})

    disc_el_list.append(el)
 

### Build conversations 

In [None]:
# Build conversations.
# For every thread, compute the time difference (in minutes) between each post and
# every post listed before it in the thread, then discard the pairs that do not fit
# the chosen interval: only pairs created within the interval of 14/27/54 minutes
# are kept.
#
# Result: `mats`, one square matrix per thread in `disc_list`. mats[k][j, i] holds
# date[j] - date[i] in minutes for j > i when that value is below interval + 1,
# and NaN everywhere else.
# NOTE(review): this assumes the rows of each thread are in chronological order so
# that the differences are non-negative - confirm against the cell building disc_list.

mats = []
for k in range(0, len(disc_list)):

    thread = disc_list[k]
    # Stored back on the thread: the timestamps are subtracted as datetimes later on.
    thread["date"] = pd.to_datetime(thread["date"], format="%Y-%m-%d %H:%M")

    # Seconds elapsed since the earliest post of the thread
    seconds = (thread["date"] - thread["date"].min()).dt.total_seconds().to_numpy()
    # minutes[j, i] = date[j] - date[i], expressed in minutes
    minutes = (seconds[:, None] - seconds[None, :]) / 60

    # Only pairs strictly below the diagonal (j > i) are candidates
    below_diagonal = np.tri(len(thread), k=-1, dtype=bool)
    # A missing date yields NaN, which compares False and is therefore not kept
    within_interval = minutes < interval + 1

    mat = np.where(below_diagonal & within_interval, minutes, np.nan)

    mats.append(mat)

In [None]:
# Add metadata to the edge lists by looking it up in the thread data.
# Metadata: senders' and recipients' stances, number and type of links shared, as
# well as the edge type (`reply_type`).
# The time-based edges of each thread are then merged with the quote edges of the
# same thread (disc_el_list[k]).
#
# Result: `els`, one edge list per thread, de-duplicated and sorted by `time_sender`.

els = []

for k in range(0, len(mats)):

    print(k)  # progress indicator

    mat = mats[k]
    disc = disc_list[k]

    # Non-NaN cells are the kept pairs: row = sender position, column = recipient
    # position. np.nonzero scans row by row.
    sender_pos, recipient_pos = np.nonzero(~np.isnan(mat))
    sender_rows = disc.iloc[sender_pos].reset_index(drop=True)
    recipient_rows = disc.iloc[recipient_pos].reset_index(drop=True)

    el = pd.DataFrame({
        "sender": sender_rows["postid"],
        "recepient": recipient_rows["postid"],
        "time_sender": sender_rows["date"],
        "time_recepient": recipient_rows["date"],
        "reply_type": sender_rows["reply_type"],
        "num_am_recepient": recipient_rows["num_am"],
        "num_am_sender": sender_rows["num_am"],
        "num_mm_recepient": recipient_rows["num_mm"],
        "num_mm_sender": sender_rows["num_mm"],
        "stance_label_sender": sender_rows["stance_label"],
        "stance_prob_sender": sender_rows["stance_prob"],
        "stance_label_recepient": recipient_rows["stance_label"],
        "stance_prob_recepient": recipient_rows["stance_prob"],
        "header": sender_rows["header"],
    })

    # DataFrame.append was removed in pandas 2.0; pd.concat aligns the columns by
    # name in the same way.
    el = pd.concat([el, disc_el_list[k]])
    el = el.drop_duplicates()

    el = el.sort_values('time_sender')

    el = el.reset_index(drop=True)

    els.append(el)

In [None]:
# Stack the conversation edges of all threads into one DataFrame with a fresh
# 0..n-1 index, and display it
el_df = pd.concat(els, axis=0, ignore_index=True)
el_df


In [None]:
# Time elapsed between the recipient's post and the sender's post, as a float
# number of whole minutes (rounded down; NaN where a timestamp is missing).
# An explicit division is used because astype('timedelta64[m]') raises on
# pandas >= 2.0, where only the 's', 'ms', 'us' and 'ns' resolutions are supported.
time_diff = el_df.time_sender - el_df.time_recepient
time_diff = np.floor(time_diff / pd.Timedelta(minutes=1))

In [None]:
# Attach the per-edge time difference as the `time_diff` column and display the result
el_df = el_df.assign(time_diff=time_diff)
el_df

In [None]:
# Write the edge list to disk; the chosen interval is part of the file name
output_name = f"conversation_edgelist{interval}min.csv"
el_df.to_csv(output_name)