In [1]:
import datetime as dt
import networkx as nx
import numpy as np
import pandas as pd

import csv
import itertools
import json
import psaw
import time

from timeit import default_timer as timer

In [22]:
def epoch(year, month, day, **kwargs):
    date_time = dt.datetime(year, month, day, **kwargs)
    return int(date_time.timestamp())


def dataframe(psaw_result_generator):
    return pd.DataFrame([item.d_ for item in psaw_result_generator])


# Wrapper subclass to return results in Pandas DataFrames.
class DataframePushshiftAPI(psaw.PushshiftAPI):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        
    def search_comments(self, **kwargs):
        result_gen = super().search_comments(**kwargs)
        return dataframe(result_gen)
    
    def search_submissions(self, **kwargs):
        result_gen = super().search_submissions(**kwargs)
        return dataframe(result_gen)
    
    # Subreddit endpoint is not working (https://github.com/pushshift/api/issues/40).
    # def search_subreddits(self, **kwargs):
    #     result_gen = self._search_func(kind='subreddit', **kwargs)
    #     return dataframe(result_gen)
    
    def redditor_subreddit_activity(self, author, **kwargs):
        result_gen = super().redditor_subreddit_activity(author, **kwargs)
        return dataframe(result_gen)

    
# Fast group by subreddit
# https://stackoverflow.com/questions/22219004/how-to-group-dataframe-rows-into-list-in-pandas-groupby
#
# expects df has two columns, first 'author,' then subreddit
def group_subreddits_by_author(df):
    keys, values = df.sort_values('author').values.T
    ukeys, index = np.unique(keys, True)
    arrays = np.split(values, index[1:]) # subreddit must be 2nd col
    return pd.DataFrame({
        'author': ukeys,
        'subreddits': [set(a) for a in arrays]
    })


# expects df has two columns, first 'author,' then subreddit
def build_subreddit_shared_author_graph(df):
    grouped_by_sub = group_subreddits_by_author(df)
    G = nx.Graph()
    for shared_subs in grouped_by_sub['subreddits']:
        for sub1, sub2 in itertools.combinations(shared_subs, 2):
            if G.has_edge(sub1, sub2):
                G[sub1][sub2]['weight'] += 1
            else:
                G.add_edge(sub1, sub2, weight=1)
    return G


def export_to_gephi_file(G, file_path):
    with open(file_path, 'w') as f:
        for line in nx.generate_gexf(G):
            f.write(line + '\n')

In [3]:
pushshift = DataframePushshiftAPI()

In [6]:
print('Downloading data...')
start = timer()
df = pushshift.search_comments(after=epoch(2017, 2, 1),
                               before=epoch(2017, 2, 2),
                               sort='asc',
                               sort_type='created_utc',
                               filter=['url','author', 'title', 'subreddit'],
                               limit=200000)
end = timer()
print('Finished!')
print(f'Time elapsed: {end - start}s')

Downloading data...
Finished!
Time elapsed: 400.0794327s


In [7]:
df_original = df
df

Unnamed: 0,author,created_utc,subreddit,created,url
0,Get_a_grip_pls,1485928801,MMA,1.485947e+09,
1,osrs_butt_plug,1485928801,2007scape,1.485947e+09,
2,Genuine_CoxComb,1485928801,MensRights,1.485947e+09,
3,ZapActions-dower,1485928801,homestuck,1.485947e+09,
4,Ottzoa,1485928801,relationships,1.485947e+09,
...,...,...,...,...,...
199942,Eddzi,1485939355,Overwatch,1.485957e+09,
199943,AutoModerator,1485939355,Drugs,1.485957e+09,
199944,ninja_throwawai,1485939355,AskReddit,1.485957e+09,
199945,WWWallace71,1485939355,combinedarms,1.485957e+09,


In [8]:
df = df[['author', 'subreddit']]
grouped_by_sub = group_subreddits_by_author(df)
grouped_by_sub

Unnamed: 0,author,subreddits
0,-----iMartijn-----,{AskThe_Donald}
1,-----username-----,{freedommobile}
2,----MXE----,{researchchemicals}
3,---E,{pcmasterrace}
4,---Earth---,"{spelunky, TownofSalemgame, pcmasterrace}"
...,...,...
96860,zzziiinnnggg,{daria}
96861,zzzjordy,{gaming}
96862,zzzpotatozzz,{fairytail}
96863,zzzzz94,"{AskEconomics, gifs, badeconomics}"


In [23]:
G = build_subreddit_shared_author_graph(df)
print(f'Built graph for {len(G.nodes())} subreddits.')
file_name = 'subreddit_network.gexf'
export_to_gephi_file(G, file_name)
print(f'Exported graph to: {file_name}')

Built graph for 6337 subreddits.
Exported graph to: subreddit_network.gexf
