# User-level analysis of political subreddits

In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import statsmodels.api as sm
import sqlite3
import os
import re
import seaborn as sns
import matplotlib.pyplot as plt
from networkx.algorithms import bipartite
from itertools import combinations

import warnings
warnings.filterwarnings('ignore')

from utils import ROOTPATH, build_df_year, generate_index
from backbone import disparity_filter

## load data

In [2]:
# Year of data to load; also interpolated into the output file names further down.
year = 2012
df = build_df_year(year, with_text=True, with_time=False, filter_null=True) # start with yearly analysis
generate_index()

# NOTE(review): this import sits after generate_index() on purpose -- presumably that
# call creates/populates these lookup objects inside `utils`. Confirm before moving it
# up to the import cell.
from utils import USERS, SUBREDDITS, user2index, index2user, subr2index, index2subr

building base dataframe for year 2012...
- fetching submissions for year 2012
- fetching comments for year 2012
- number of entries: 5994991


In [4]:
# Keep only the rows written by authors on the pre-computed active-user list.
active_users_csv = ROOTPATH + "indx/ACTIVEUSERS_ALLYEARS5_100.csv"  # smaller one
# active_user_ls = pd.read_csv(ROOTPATH + "indx/ACTIVEUSERS_ALLYEARS_2_2_50.csv").author.tolist()
active_user_ls = pd.read_csv(active_users_csv)["author"].tolist()
is_active_author = df["author"].isin(active_user_ls)
df = df[is_active_author]

## build network

In [5]:
def build_user_graph(df):
    """Build the weighted user co-participation graph.

    Two users are linked when both occur as ``author`` under the same
    ``submission_id``; the edge weight counts how many submissions they share.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``submission_id`` and ``author``. Authors are
        translated to integer ids through the module-level ``user2index``.

    Returns
    -------
    user_edges : dict
        ``{(u1, u2): weight}`` keyed by user indices with ``u1 <= u2``.
    userg : networkx.Graph
        Undirected graph with the same edges; the count is stored in the
        ``weight`` edge attribute.

    Raises
    ------
    KeyError
        If an author on a multi-user submission is missing from ``user2index``.
    """
    # One set of distinct authors per submission. A submission with a single
    # author cannot contribute an edge, so those are dropped up front.
    authors_by_subm = df.groupby("submission_id").agg({"author": lambda col: set(col)})["author"]
    author_sets = [authors for authors in authors_by_subm if len(authors) > 1]
    n_subm = len(author_sets)

    user_edges = {}
    print("recording edges...")
    # Count into a plain dict first: writing directly to nx.Graph() kills the kernel.
    for pos, authors in enumerate(author_sets):
        if pos % 10000 == 0:
            print(" - progress:", pos / n_subm)
        # Translate each author once per submission rather than once per pair.
        indices = [user2index[author] for author in authors]
        # Iterate the pairs lazily; large threads would otherwise build a huge list.
        for a, b in combinations(indices, 2):
            # Canonical (low, high) key so (a, b) and (b, a) hit the same counter.
            pair = (a, b) if a <= b else (b, a)
            user_edges[pair] = user_edges.get(pair, 0) + 1

    print("building graph...")
    userg = nx.Graph()
    userg.add_weighted_edges_from((u1, u2, w) for (u1, u2), w in user_edges.items())

    print("graph descriptives:")
    print(" - # of nodes:", userg.number_of_nodes())
    print(" - # of edges:", userg.number_of_edges())
    return user_edges, userg

In [None]:
# user_edges: {(u1, u2): weight} dict of co-participation counts; userg: the nx.Graph built from it.
user_edges, userg = build_user_graph(df)  # should take a while

In [9]:
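# Optional: persist the graph so the slow build above can be skipped on later runs.
# NOTE: nx.write_gpickle was removed in NetworkX 3.0; on newer versions use pickle.dump(userg, f) instead.
# nx.write_gpickle(userg, ROOTPATH + f"output/user_graph_{year}.gpickle")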

## degree and strength distribution

In [11]:
# Degree (number of neighbours) of every node in the user graph.
# `.values` has to be called: without the parentheses `degs` would be a bound
# method object instead of the degree values.
degs = list(dict(userg.degree()).values())

In [None]:
# Strength = weighted degree: the sum of the `weight` attribute over a node's edges.
# `.values` has to be called, otherwise a bound method is stored instead of the data.
# The frequency cell below reads this under the name `strths`, so that name is bound
# here; `strs` is kept as an alias for anything relying on the original name.
strs = strths = list(dict(userg.degree(weight="weight")).values())

In [None]:
# NOTE(review): get_freq_seq is neither defined nor imported in the visible cells --
# confirm where it comes from. Judging by the plot that follows (x = range(len(seq))),
# it presumably returns a sequence whose k-th entry is the number of nodes with value k.
deg_seq = get_freq_seq(degs)
str_seq = get_freq_seq(strths)

In [None]:
# Degree and strength frequency sequences on shared log-log axes, saved as a PNG.
dist_fig, dist_ax = plt.subplots(figsize=(8, 6), dpi=300)
dist_ax.loglog(range(len(deg_seq)), deg_seq, color="#ff4500", marker="^", label="degree")
dist_ax.loglog(range(len(str_seq)), str_seq, "--", color="#7A9299", label="strength")
dist_ax.legend(fontsize=14)
dist_ax.set_xlabel("degree", fontsize=14)
dist_ax.set_ylabel("frequency", fontsize=14)
dist_fig.savefig(ROOTPATH + "output/conf_degree_distribution.png", dpi=300)

In [None]:
# import pickle
# with open(ROOTPATH + f"output/user_edges_{year}.pkl", "wb") as f:
#     pickle.dump(user_edges, f, protocol=pickle.HIGHEST_PROTOCOL)
    
# read edges: 
# with open(ROOTPATH + f"output/user_edges_{year}.pkl", "rb") as f:
#     user_edges = pickle.load(f)