# Create Daily Networks

This notebook creates a Reddit user reply network for each day from Sep-Nov 2012.

**You must download and unzip the September, October, and November 2012 comment files
from https://files.pushshift.io/reddit/comments/ before running this notebook.
You must name them and place them as follows:**
- `./data/RC_2012-09`
- `./data/RC_2012-10`
- `./data/RC_2012-11`

**You must manually create the following directories before running this notebook:**
- `./data/RC_2012-09_daily/`
- `./data/RC_2012-10_daily/`
- `./data/RC_2012-11_daily/`
- `./data/RC_2012-09_daily_graphs/`
- `./data/RC_2012-10_daily_graphs/`
- `./data/RC_2012-11_daily_graphs/`

## Imports

In [6]:
import datetime as dt
import networkx as nx
import numpy as np
import pandas as pd

import csv
import itertools
import json
import psaw
import time
from tqdm import tqdm

import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from timeit import default_timer as timer
from pathlib import Path
from calendar import Calendar
import calendar

## Functions

In [33]:
# Fast group by subreddit
# https://stackoverflow.com/questions/22219004/how-to-group-dataframe-rows-into-list-in-pandas-groupby
#
# expects df has two columns, first 'author,' then subreddit
def group_subreddits_by_author(df):
    """Collect, for every author, the set of values found in the second column.

    Args:
        df: DataFrame with exactly two columns, ``'author'`` first and the
            value to group (the subreddit in this notebook) second.

    Returns:
        DataFrame with one row per distinct author, ordered by author, and
        two columns: ``'author'`` and ``'subreddits'`` (a ``set`` per row).
        An empty input yields an empty frame with those two columns.
    """
    # np.split() always returns at least one (empty) chunk, which would not
    # line up with zero unique authors, so an empty frame is handled here.
    if len(df) == 0:
        return pd.DataFrame({'author': [], 'subreddits': []})

    keys, values = df.sort_values('author').values.T
    # first_rows[i] is the position, in the sorted data, of the first row
    # belonging to the i-th unique author; splitting there yields one chunk
    # of second-column values per author.
    ukeys, first_rows = np.unique(keys, return_index=True)
    arrays = np.split(values, first_rows[1:])  # subreddit must be 2nd col
    return pd.DataFrame({
        'author': ukeys,
        'subreddits': [set(a) for a in arrays]
    })


# expects df has two columns, first 'author,' then subreddit
def build_subreddit_shared_author_graph(df):
    """Build an undirected subreddit graph weighted by shared authors.

    Args:
        df: Two-column DataFrame, ``'author'`` first and subreddit second.

    Returns:
        ``nx.Graph`` in which two subreddits are joined when at least one
        author appears in both; the edge ``weight`` counts such authors.
    """
    graph = nx.Graph()
    per_author = group_subreddits_by_author(df)
    for author_subs in per_author['subreddits']:
        # Every unordered pair of subreddits used by this author gets +1.
        for left, right in itertools.combinations(author_subs, 2):
            if not graph.has_edge(left, right):
                graph.add_edge(left, right, weight=1)
                continue
            graph[left][right]['weight'] += 1
    return graph


def export_to_gephi_file(G, file_path):
    """Write graph ``G`` to ``file_path`` as GEXF text, one generated line per row.

    Args:
        G: networkx graph to export.
        file_path: Destination path; an existing file is overwritten.
    """
    # The same encoding is passed to the generator and to open() so the
    # bytes on disk do not depend on the platform's locale encoding.
    encoding = 'utf-8'
    with open(file_path, 'w', encoding=encoding) as f:
        for line in nx.generate_gexf(G, encoding=encoding):
            f.write(line + '\n')
            

def label_users(df, pol_subs):
    """Label each author who posted in exactly one of the political subreddits.

    Args:
        df: Comment DataFrame with at least ``'author'`` and ``'subreddit'``
            columns.
        pol_subs: Iterable of political subreddit names.

    Returns:
        DataFrame with one row per author and the columns ``'author'``,
        ``'subreddits'`` (set of subreddits the author appears in) and
        ``'political_label'``. The label is the single political subreddit
        the author appears in, or NaN when the author appears in none or in
        more than one of them. The label column is always present, even
        when no author qualifies.
    """
    pol_subs = set(pol_subs)
    users = group_subreddits_by_author(df[['author', 'subreddit']])

    labels = []
    for author_subs in users['subreddits']:
        # "In `sub` and in no other political sub" is the same as the
        # overlap with pol_subs being exactly {sub}.
        overlap = author_subs & pol_subs
        labels.append(next(iter(overlap)) if len(overlap) == 1 else np.nan)

    users['political_label'] = pd.Series(labels, index=users.index, dtype=object)
    return users


def load_comments_from_files(file_paths):
    """Load each path with ``load_comments_from_file`` and concatenate the frames.

    Args:
        file_paths: Iterable of comment-file paths.

    Returns:
        A single DataFrame holding the rows of every file, in input order.
    """
    frames = [load_comments_from_file(path) for path in file_paths]
    return pd.concat(frames)


def load_comments_from_file(file_path, limit=None):
    """Load comments from a file holding one JSON object per line.

    Args:
        file_path: Path of the comment file.
        limit: Maximum number of comments to load. ``None`` (or 0) loads
            every comment in the file.

    Returns:
        DataFrame with the columns listed in ``keys_to_keep``. ``score``,
        ``controversiality`` and ``created_utc`` are converted with
        ``pd.to_numeric(downcast="float")``. When no comment could be read,
        an empty frame with those columns is returned.

    Raises:
        KeyError: If a parsed record lacks one of the kept keys.
    """
    keys_to_keep = ['author', 'subreddit', 'score', 'controversiality', 'created_utc', 'id', 'parent_id', 'body']
    numeric_cols = ['score', 'controversiality', 'created_utc']
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in tqdm(f):
            try:
                j = json.loads(line)
            except json.JSONDecodeError:
                # Skip an unparsable line instead of abandoning the rest of
                # the file (same policy as save_comments_by_day).
                continue
            data.append({k: j[k] for k in keys_to_keep})
            # Stop at exactly `limit` rows.
            if limit and len(data) >= limit:
                break

    if not data:
        # json_normalize([]) has no columns, so the numeric conversion below
        # would fail on the column selection.
        return pd.DataFrame(columns=keys_to_keep)

    df = pd.json_normalize(data)
    # NOTE(review): downcast="float" may store created_utc as float32, which
    # cannot hold second-resolution epoch timestamps exactly -- confirm this
    # is acceptable for downstream use.
    df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, downcast="float")
    return df


# function to remove punctuation from text (input is a string)
def clean_text(s):
    """Return ``str(s)`` with every ``string.punctuation`` character removed."""
    strip_punctuation = str.maketrans('', '', string.punctuation)
    return str(s).translate(strip_punctuation)
    
    
def get_sub_comments(pol_users, comments, subs):
    """Find, per subreddit, the comments written there by users labelled with it.

    Args:
        pol_users: DataFrame indexed by author with a ``'political_label'``
            column.
        comments: Comment DataFrame with ``'author'`` and ``'subreddit'``
            columns; its index values are what gets collected.
        subs: Subreddit names to collect comments for.

    Returns:
        Dict mapping each subreddit to the list of index values of comments
        that were posted in that subreddit by an author whose label equals
        that subreddit.
    """
    labels = pol_users['political_label']
    members = {sub: pol_users[labels == sub].index for sub in subs}
    matched = {sub: [] for sub in subs}
    for row in tqdm(comments.itertuples()):
        for sub in subs:
            if row.author in members[sub] and row.subreddit == sub:
                matched[sub].append(row.Index)
    return matched


def save_comments_by_day(src, dest, month, year=2012):
    """Split one month of line-delimited JSON comments into one CSV per UTC day.

    For each day ``d`` of the month a file ``f'{dest}{d}.csv'`` is created
    (so ``dest`` is used as a plain string prefix and normally ends with a
    path separator). Each file starts with a header row of the kept keys.

    Args:
        src: Path of the input file, one JSON object per line.
        dest: Prefix for the output CSV paths; the directory must exist.
        month: Month number (1-12).
        year: Calendar year of the month being split.

    Notes:
        Lines that are not valid JSON are skipped. Records whose
        ``created_utc`` falls outside the requested month are skipped as
        well, rather than being filed under the wrong day. ``created_utc``
        is written as an integer.

    Raises:
        KeyError: If a parsed record lacks one of the kept keys.
    """
    seconds_per_day = 24 * 60 * 60
    num_days = calendar.monthrange(year, month)[1]
    month_start = int(dt.datetime(year, month, 1, tzinfo=dt.timezone.utc).timestamp())
    keys_to_keep = ['author', 'subreddit', 'score', 'controversiality', 'created_utc', 'id', 'parent_id', 'body']

    day_files = []
    try:
        day_writers = []
        for day in range(1, num_days + 1):
            # UTF-8 is set explicitly so the CSVs can be read back with
            # pandas' default encoding on any platform.
            day_file = open(f'{dest}{day}.csv', 'w', newline='', encoding='utf-8')
            day_files.append(day_file)
            day_writer = csv.writer(day_file, delimiter=',', quotechar='"')
            day_writer.writerow(keys_to_keep)
            day_writers.append(day_writer)

        with open(src, 'r', encoding='utf-8') as f:
            for line in tqdm(f):
                try:
                    j = json.loads(line)
                except json.JSONDecodeError:
                    continue
                j['created_utc'] = int(j['created_utc'])
                # Days start at UTC midnight, i.e. at fixed 86400-second
                # steps from the start of the month.
                day_index = (j['created_utc'] - month_start) // seconds_per_day
                if not 0 <= day_index < num_days:
                    continue
                day_writers[day_index].writerow([j[k] for k in keys_to_keep])
    finally:
        # Close every file that was opened, even if an error occurred.
        for day_file in day_files:
            day_file.close()
        
        
def get_bow_models(comments, sub_comments):
    """Fit one ``TfidfVectorizer`` per subreddit on that subreddit's comments.

    Args:
        comments: Comment DataFrame with a ``'body'`` column, indexed by the
            ids used in ``sub_comments``.
        sub_comments: Dict mapping subreddit name to a list of comment ids.

    Returns:
        Dict mapping each subreddit name to a vectorizer fitted on the
        punctuation-stripped bodies of the listed comments.
    """
    def fit_vectorizer(comment_ids):
        texts = comments.loc[comment_ids, 'body'].apply(clean_text)
        vectorizer = TfidfVectorizer()
        vectorizer.fit(texts)
        return vectorizer

    return {sub: fit_vectorizer(cids) for sub, cids in sub_comments.items()}


def load_comments(month):
    """Load all daily comment CSVs of one 2012 month into a single DataFrame.

    Args:
        month: Month as it appears in the directory name
            ``./data/RC_2012-{month}_daily/`` (e.g. ``'09'``).

    Returns:
        DataFrame with the rows of every daily file found, indexed by the
        ``'id'`` column.

    Raises:
        FileNotFoundError: If none of the daily files ``1.csv`` .. ``31.csv``
            exists for the month.
    """
    daily_frames = []
    for day in tqdm(range(1, 32)):
        try:
            daily_frames.append(pd.read_csv(f'./data/RC_2012-{month}_daily/{day}.csv'))
        except FileNotFoundError:
            # Not every month has a file for each day number up to 31.
            continue

    if not daily_frames:
        raise FileNotFoundError(f'No daily comment files found in ./data/RC_2012-{month}_daily/')

    # A single concat replaces repeated DataFrame.append, which newer pandas
    # no longer provides and which re-copied all rows on every call.
    comments = pd.concat(daily_frames)
    comments.set_index('id', inplace=True)
    return comments


def get_pu_label(user, users=None):
    """Return the political label of ``user``, or ``None`` if the user is unknown.

    Args:
        user: Author name to look up.
        users: DataFrame indexed by author with a ``'political_label'``
            column. When omitted, the notebook-global ``pol_users`` is used;
            pass the table explicitly to avoid depending on whatever that
            global currently holds.

    Returns:
        The value stored in ``'political_label'`` for ``user``, or ``None``
        when ``user`` is not in the table's index.
    """
    if users is None:
        users = pol_users  # notebook-global fallback kept for existing callers
    try:
        return users.at[user, 'political_label']
    except KeyError:
        return None
    

def analyze_users(month, comments=None):
    """Label a month's users politically and fit per-group text models.

    Side effects: reads ``./subreddits_of_interest.csv`` and
    ``power_users.csv``; writes ``users_{month}.csv`` and
    ``power_users_{month}.csv`` to the working directory.

    Args:
        month: Month identifier used in file names (e.g. ``'09'``).
        comments: Optional comment DataFrame; when ``None`` the month's
            comments are loaded with ``load_comments(month)``.

    Returns:
        Tuple ``(pol_users, bow_models, subs_of_interest)``:
        ``pol_users`` is indexed by author and has a ``'political_label'``
        column in which missing labels are replaced by ``'unaffiliated'``;
        ``bow_models`` maps each political subreddit to its fitted model;
        ``subs_of_interest`` is the set of the ``sub_limit`` subreddits with
        the highest ``submission_amount`` plus the political subreddits.
    """
    if comments is None:
        comments = load_comments(month)

    pol_subs = ['Conservative', 'Liberal']
    pol_users = label_users(comments, set(pol_subs))
    pol_users.set_index('author', inplace=True)

    sub_comments = get_sub_comments(pol_users, comments, pol_subs)
    bow_models = get_bow_models(comments, sub_comments)

    sub_limit = 5
    subs_of_interest = pd.read_csv('./subreddits_of_interest.csv')
    top_subs = subs_of_interest.sort_values('submission_amount', ascending=False)['subreddit'].head(sub_limit)
    subs_of_interest = set(top_subs) | set(pol_subs)

    pol_users['political_label'] = pol_users['political_label'].fillna('unaffiliated')
    pol_users.to_csv(f'users_{month}.csv')

    power_users = pd.read_csv('power_users.csv')
    # Look the labels up in THIS month's table. Going through a helper that
    # reads a notebook-global `pol_users` would use the previous month's
    # table (or fail if the global does not exist yet). Users missing from
    # the table map to NaN and are dropped on the next line.
    power_users['label'] = power_users['user'].map(pol_users['political_label'])
    power_users = power_users[power_users['label'].notnull()]
    power_users.to_csv(f'power_users_{month}.csv')

    return (pol_users, bow_models, subs_of_interest)


def build_political_user_reply_graph(comments, users, bow_models, subs_of_interest):
    """Build a directed reply graph between users with per-edge language scores.

    An edge ``replier -> parent author`` is added for every comment that
    (a) is in one of ``subs_of_interest``, (b) replies to another comment
    (``parent_id`` starts with ``'t1_'``) and (c) whose parent is present
    in ``comments``.

    Args:
        comments: Comment DataFrame indexed by comment id, with ``author``,
            ``subreddit``, ``parent_id`` and ``body`` columns.
        users: DataFrame indexed by author with a ``'political_label'``
            column; must contain every author that ends up in the graph.
        bow_models: Dict mapping a group name to a fitted vectorizer whose
            ``transform`` returns a sparse matrix.
        subs_of_interest: Collection of subreddit names to include.

    Returns:
        ``nx.DiGraph`` whose nodes carry ``political_label`` and whose edges
        carry:
        ``weight`` (number of replies), ``subreddits`` (comma-joined, sorted
        names), and for each model ``f'{sub}_sim'`` / ``f'{sub}_pol'`` (one
        value per reply) plus ``f'avg_{sub}_sim'`` / ``f'avg_{sub}_pol'``
        (their averages).
    """
    G = nx.DiGraph()
    for comment in comments.itertuples():
        csub = comment.subreddit
        if csub not in subs_of_interest:
            continue
        typed_parent_id = comment.parent_id
        if not typed_parent_id.startswith('t1_'):  # only replies to comments
            continue
        parent_id = typed_parent_id[3:]
        if parent_id not in comments.index:
            continue
        parent = comments.loc[parent_id]
        user1 = comment.author
        user2 = parent['author']

        if not G.has_edge(user1, user2):
            G.add_edge(user1, user2, weight=0, subreddits=set())
        edge = G[user1][user2]
        edge['weight'] += 1
        edge['subreddits'].add(csub)

        # The cleaned texts do not depend on the model, so they are computed
        # once per reply rather than once per model.
        texts = [clean_text(comment.body), clean_text(parent['body'])]
        for sub, model in bow_models.items():
            # Rows: 0 = reply, 1 = parent. Kept sparse; the vocabulary can be
            # large and only a sum and one similarity are needed.
            bows = model.transform(texts)
            similarity = cosine_similarity(bows)[1][0]
            # Mean weight over the model's whole vocabulary.
            pol_score = bows[0].sum() / bows.shape[1]
            edge.setdefault(f'{sub}_sim', []).append(similarity)
            edge.setdefault(f'{sub}_pol', []).append(pol_score)

    for node in G.nodes:
        G.nodes[node]['political_label'] = users.at[node, 'political_label']
    for a, b in G.edges:
        edge = G[a][b]
        # Sorted so the joined string is the same on every run.
        edge['subreddits'] = ",".join(sorted(edge['subreddits']))
        for sub in bow_models:
            edge[f'avg_{sub}_sim'] = np.average(edge[f'{sub}_sim'])
            edge[f'avg_{sub}_pol'] = np.average(edge[f'{sub}_pol'])
    return G

## Split Data

This cell splits the Pushshift comment data files for Sep, Oct, and Nov into daily comment data files so we can save memory by loading one day at a time when creating the networks.

In [None]:
# The three "_daily" directories must already exist when this cell runs.
# Each monthly dump ./data/RC_2012-MM is split into ./data/RC_2012-MM_daily/<day>.csv
for month in (9, 10, 11):
    comment_file_name = f'./data/RC_2012-{month:02d}'
    dest = f'{comment_file_name}_daily/'
    save_comments_by_day(comment_file_name, dest, month)

## Create Networks

For each month, this cell analyzes the users and assigns them political labels, then creates a bag-of-words model for each labeled group. With these models, it creates user reply networks for each day which include data about the political language used in the replies, as measured by the bag-of-words models.

In [30]:
# Must manually create the three "_daily_graphs" directories before running this cell
for month in ['09', '10', '11']:
    pol_users, bow_models, subs_of_interest = analyze_users(month)
    for i in tqdm(range(1, 32)):
        # Only a missing daily input file is skipped (not every month has 31
        # days). The try block is limited to the read so that a
        # FileNotFoundError raised while writing the graph (e.g. a missing
        # output directory) is reported instead of silently dropping the day.
        try:
            comments = pd.read_csv(f'./data/RC_2012-{month}_daily/{i}.csv')
        except FileNotFoundError:
            continue
        comments.set_index('id', inplace=True)
        G = build_political_user_reply_graph(comments, pol_users, bow_models, subs_of_interest)
        # Remove the per-reply f'{sub}_sim' / f'{sub}_pol' edge attributes
        # before export; the f'avg_{sub}_*' attributes stay on the edges.
        for a, b in G.edges:
            for sub in bow_models.keys():
                del G[a][b][f'{sub}_sim']
                del G[a][b][f'{sub}_pol']
        nx.readwrite.gexf.write_gexf(G, f'./data/RC_2012-{month}_daily_graphs/{i}.gexf')