In [1]:
import datetime as dt
import networkx as nx
import numpy as np
import pandas as pd

import csv
import itertools
import json
import psaw
import time
from tqdm import tqdm

from timeit import default_timer as timer

In [24]:
def epoch(year, month, day, **kwargs):
    """Return the Unix timestamp, in whole seconds, for a calendar date.

    Args:
        year, month, day: Date components passed to ``datetime.datetime``.
        **kwargs: Extra ``datetime.datetime`` keyword arguments
            (e.g. ``hour``, ``minute``, ``tzinfo``).

    Returns:
        int: ``datetime.timestamp()`` truncated to an integer.

    Note:
        Without a ``tzinfo`` the datetime is naive, and ``timestamp()``
        interprets naive datetimes in the machine's local timezone.
    """
    moment = dt.datetime(year=year, month=month, day=day, **kwargs)
    seconds = moment.timestamp()
    return int(seconds)


def dataframe(psaw_result_generator):
    """Build a DataFrame from an iterable of result objects.

    Each item must expose a ``d_`` attribute; those values become the rows.

    Args:
        psaw_result_generator: Iterable of objects with a ``d_`` attribute.

    Returns:
        pandas.DataFrame with one row per item, in iteration order.
    """
    records = []
    for result in psaw_result_generator:
        records.append(result.d_)
    return pd.DataFrame(records)


class DataframePushshiftAPI(psaw.PushshiftAPI):
    """Wrapper subclass that returns results as pandas DataFrames.

    Every overridden method forwards its arguments to the parent
    implementation and passes whatever comes back through ``dataframe``.
    """

    def __init__(self, *args, **kwargs):
        # Pure pass-through to the parent constructor.
        super().__init__(*args, **kwargs)

    def search_comments(self, **kwargs):
        """Run the parent comment search and return the results as a DataFrame."""
        return dataframe(super().search_comments(**kwargs))

    def search_submissions(self, **kwargs):
        """Run the parent submission search and return the results as a DataFrame."""
        return dataframe(super().search_submissions(**kwargs))

    # Subreddit endpoint is not working (https://github.com/pushshift/api/issues/40).
    # def search_subreddits(self, **kwargs):
    #     result_gen = self._search_func(kind='subreddit', **kwargs)
    #     return dataframe(result_gen)

    def redditor_subreddit_activity(self, author, **kwargs):
        """Forward to the parent activity lookup and wrap the result in a DataFrame."""
        # NOTE(review): dataframe() reads ``.d_`` from every item it iterates;
        # confirm the parent method really yields such objects.
        return dataframe(super().redditor_subreddit_activity(author, **kwargs))

    
# Fast group by subreddit
# https://stackoverflow.com/questions/22219004/how-to-group-dataframe-rows-into-list-in-pandas-groupby
def group_subreddits_by_author(df):
    """Collect, for every author, the set of subreddits they appear in.

    Args:
        df: DataFrame whose first column is named 'author' and whose second
            column holds subreddit names. Any further columns are ignored.

    Returns:
        pandas.DataFrame with one row per distinct author and two columns:
        'author' (sorted unique authors) and 'subreddits' (a ``set`` of the
        second-column values seen for that author). An empty input yields an
        empty frame with the same two columns.

    Raises:
        KeyError: if a non-empty ``df`` has no 'author' column.
    """
    if df.empty:
        # Without this guard np.split returns one (empty) chunk while
        # np.unique returns zero keys, and the DataFrame constructor fails
        # on the length mismatch.
        return pd.DataFrame({
            'author': pd.Series(dtype=object),
            'subreddits': pd.Series(dtype=object),
        })

    ordered = df.sort_values('author')
    keys = ordered.iloc[:, 0].to_numpy()
    values = ordered.iloc[:, 1].to_numpy()  # subreddit must be 2nd col

    # Rows are sorted by author, so the first-occurrence index of each unique
    # key is exactly where that author's run of rows begins.
    ukeys, run_starts = np.unique(keys, return_index=True)
    arrays = np.split(values, run_starts[1:])
    return pd.DataFrame({
        'author': ukeys,
        'subreddits': [set(a) for a in arrays]
    })


# expects df has two columns, first 'author,' then subreddit
def build_subreddit_shared_author_graph(df):
    """Build an undirected graph linking subreddits that share authors.

    Every unordered pair of subreddits found together in one author's set
    gets an edge; the edge's 'weight' counts how many authors produced that
    pair. Authors with a single subreddit produce no pairs and therefore add
    nothing to the graph.

    Args:
        df: Frame accepted by ``group_subreddits_by_author``.

    Returns:
        networkx.Graph with subreddit names as nodes.
    """
    graph = nx.Graph()
    per_author = group_subreddits_by_author(df)
    for author_subs in per_author['subreddits']:
        for left, right in itertools.combinations(author_subs, 2):
            if not graph.has_edge(left, right):
                # First author seen for this pair.
                graph.add_edge(left, right, weight=1)
                continue
            graph[left][right]['weight'] += 1
    return graph


def export_to_gephi_file(G, file_path):
    """Write graph ``G`` to ``file_path`` in GEXF format, one generated line per row.

    Args:
        G: Graph passed to ``nx.generate_gexf``.
        file_path: Destination path; an existing file is overwritten.
    """
    # The same encoding is handed to the GEXF generator and to open(), so the
    # bytes on disk match what the document was generated for instead of
    # depending on the platform's default text encoding.
    encoding = 'utf-8'
    with open(file_path, 'w', encoding=encoding) as f:
        f.writelines(line + '\n' for line in nx.generate_gexf(G, encoding=encoding))

In [25]:
# Shared API client; download_comments() reads this module-level name.
pushshift = DataframePushshiftAPI()

In [26]:
def download_comments(after, before, limit, **kwargs):
    """Fetch comments through the module-level ``pushshift`` client and time the call.

    The search is always sorted ascending by 'created_utc' and restricted to
    the 'author' and 'subreddit' fields. Progress and elapsed time are printed.

    Args:
        after: Forwarded as the ``after`` search parameter.
        before: Forwarded as the ``before`` search parameter.
        limit: Forwarded as the ``limit`` search parameter.
        **kwargs: Additional search parameters (e.g. ``subreddit``, ``author``).

    Returns:
        Whatever ``pushshift.search_comments`` returns.
    """
    print(f'Downloading data ({kwargs})...')
    start = timer()
    comments_found = pushshift.search_comments(
        after=after,
        before=before,
        **kwargs,
        sort='asc',
        sort_type='created_utc',
        filter=['author', 'subreddit'],
        limit=limit,
    )
    end = timer()
    print('Finished!')
    print(f'Time elapsed: {end - start}s')
    return comments_found


def download_subreddit_users(after, before, limit, subreddit_name):
    """Download comments for one subreddit via ``download_comments``.

    Args:
        after, before, limit: Forwarded unchanged to ``download_comments``.
        subreddit_name: Passed on as the ``subreddit`` search parameter.

    Returns:
        The result of ``download_comments``.
    """
    return download_comments(
        after=after,
        before=before,
        limit=limit,
        subreddit=subreddit_name,
    )


def download_user_comments(after, before, limit, author):
    """Download comments filtered by author via ``download_comments``.

    Args:
        after, before, limit: Forwarded unchanged to ``download_comments``.
        author: Passed on as the ``author`` search parameter (the notebook
            calls this with a list of usernames).

    Returns:
        The result of ``download_comments``.
    """
    return download_comments(
        after=after,
        before=before,
        limit=limit,
        author=author,
    )

In [14]:
def load_comments_from_file(file_path, limit=None):
    """Load newline-delimited JSON comments into a DataFrame.

    Only a fixed subset of each record's keys is kept. Reading stops at the
    first line that is not valid JSON (a warning reports the line number);
    blank lines are skipped.

    Args:
        file_path: Path to a text file with one JSON object per line.
        limit: Maximum number of records to load. ``None`` (or 0) means no
            limit.

    Returns:
        pandas.DataFrame with one row per loaded record.

    Raises:
        KeyError: if a record lacks one of the kept keys.
    """
    import warnings  # local import keeps this block self-contained

    keys_to_keep = ['author', 'subreddit', 'subreddit_type', 'comment_type', 'score', 'controversiality', 'created_utc', 'id', 'parent_id', 'body']
    data = []
    # Explicit utf-8: otherwise decoding depends on the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(tqdm(f), start=1):
            if not line.strip():
                continue
            try:
                j = json.loads(line)
            except json.JSONDecodeError as err:
                # Still stop at the first malformed line, but say so instead
                # of silently returning a truncated frame.
                warnings.warn(
                    f'Stopped reading {file_path} at line {line_number}: {err}'
                )
                break
            data.append({k: j[k] for k in keys_to_keep})
            # '>=' so exactly `limit` records are kept ('>' kept limit + 1).
            if limit and len(data) >= limit:
                break
    return pd.json_normalize(data)

In [21]:
# Raw string: '\c' is not a valid escape sequence, so the plain literal only
# worked by accident and triggers an invalid-escape warning on newer Pythons.
# The resulting path is unchanged.
comments = load_comments_from_file(r'D:\comments.txt', limit=3_000_000)
comments

3000000it [02:34, 19412.15it/s]


Unnamed: 0,author,subreddit,subreddit_type,comment_type,score,controversiality,created_utc,id,parent_id,body
0,xjt22,SquaredCircle,public,,1,0,1612137600,gljcihc,t1_gljcgdm,33
1,RiBread,HaircareScience,public,,6,0,1612137600,gljcihd,t1_gli1kmj,"Hey, would you mind sharing more information a..."
2,YhormTheWhite,AskReddit,public,,3,0,1612137600,gljcihe,t1_gljaznd,I wouldn't really say this is a question that ...
3,Eddiep88,AppleWatch,public,,1,0,1612137600,gljcihf,t3_l9lk45,How do you check this total activity. Sorry I’...
4,yummymario64,AceAttorney,public,,48,0,1612137600,gljcihh,t3_l9jfph,"I'm confused because like a third of ""If I saw..."
...,...,...,...,...,...,...,...,...,...,...
2999996,[deleted],vancouver,public,,1,0,1612178760,gll5vwk,t1_gll0qqf,[deleted]
2999997,[deleted],selfie,public,,2,0,1612178760,gll5vwl,t3_la00vk,Absolutely stunning
2999998,Brotherly-Moment,ParadoxExtra,public,,4,0,1612178760,gll5vwm,t1_gll5d4z,Life is a code when you really think about it.
2999999,-TH3MS-,WallStreetbetsELITE,public,,1,0,1612178760,gll5vwn,t1_gll5qhq,"**Current Short Volume:** 44,670,000 shares, ..."


In [30]:
# Read the tab-separated subreddit list, then take the names ordered by
# ascending 'subscriber_rank'.
subreddits = pd.read_csv('political_subreddits.csv', sep='\t')
top_political_subreddits = (
    subreddits
    .sort_values(by='subscriber_rank')
    ['name']
)
top_political_subreddits

0                  politics
1                conspiracy
2            PoliticalHumor
3              Conservative
4       LateStageCapitalism
5     PoliticalCompassMemes
6               Libertarian
7                ukpolitics
8                 socialism
9               geopolitics
10         moderatepolitics
11                 Feminism
12          Fuckthealtright
13           CanadaPolitics
14          ShitLiberalsSay
15         liberalgunowners
16          COMPLETEANARCHY
17             communism101
18          libertarianmeme
19                   Israel
20       AustralianPolitics
21               neoliberal
22                Palestine
23    SocialJusticeInAction
24                stupidpol
25                     Sino
26        ConservativeMemes
27                 LabourUK
28                GenZedong
Name: name, dtype: object

In [35]:
# Keep only comments made in the ranked political subreddits.
df = comments.loc[comments['subreddit'].isin(top_political_subreddits)]
# Distinct authors in order of first appearance, dropping '[deleted]',
# a placeholder for removed accounts.
pol_users = df.loc[df['author'] != '[deleted]', 'author'].unique()
num_users = len(pol_users)
pol_users

array(['PoliticsModeratorBot', 'HIGHestKARATE', 'lborsato', ...,
       'Havonasun', 'M1CAE1', 'ThermiteBurns'], dtype=object)

In [16]:
# Download one day of comments from each of the top-ranked political subreddits.
subreddits = pd.read_csv('political_subreddits.csv', sep='\t')
top_political_subreddits = subreddits.sort_values('subscriber_rank')['name']
num_subreddits = 10 # TODO: analyze more?
# Integer division: the limit is a record count, not a float.
limit_per_sub = 200000 // num_subreddits
# Collect the per-subreddit frames and concatenate once; DataFrame.append was
# removed in pandas 2.0 and re-copied the whole frame on every iteration.
frames = []
# .iloc selects by rank position; plain [i] looks up index *labels*, which only
# coincide with positions when the CSV is already sorted by rank.
for subreddit_name in top_political_subreddits.iloc[:num_subreddits]:
    data = download_subreddit_users(epoch(2021, 2, 1), epoch(2021, 2, 2), limit_per_sub, subreddit_name)
    frames.append(data)
df = pd.concat(frames)

Downloading data ({'subreddit': 'politics'})...
Finished!
Time elapsed: 267.98287230000005s
Downloading data ({'subreddit': 'conspiracy'})...




Finished!
Time elapsed: 139.67169930000023s
Downloading data ({'subreddit': 'PoliticalHumor'})...
Finished!
Time elapsed: 96.99201860000039s
Downloading data ({'subreddit': 'Conservative'})...
Finished!
Time elapsed: 155.39180439999973s
Downloading data ({'subreddit': 'LateStageCapitalism'})...
Finished!
Time elapsed: 12.631652400000348s
Downloading data ({'subreddit': 'PoliticalCompassMemes'})...
Finished!
Time elapsed: 256.62732849999975s
Downloading data ({'subreddit': 'Libertarian'})...
Finished!
Time elapsed: 28.266946899999766s
Downloading data ({'subreddit': 'ukpolitics'})...
Finished!
Time elapsed: 86.28062290000025s
Downloading data ({'subreddit': 'socialism'})...
Finished!
Time elapsed: 3.690469900000153s
Downloading data ({'subreddit': 'geopolitics'})...
Finished!
Time elapsed: 6.377479499999936s


In [None]:
# Download one day of comments from each of the top-ranked political subreddits.
subreddits = pd.read_csv('political_subreddits.csv', sep='\t')
top_political_subreddits = subreddits.sort_values('subscriber_rank')['name']
num_subreddits = 10 # TODO: analyze more?
# Integer division: the limit is a record count, not a float.
limit_per_sub = 200000 // num_subreddits
# Collect the per-subreddit frames and concatenate once; DataFrame.append was
# removed in pandas 2.0 and re-copied the whole frame on every iteration.
frames = []
# .iloc selects by rank position; plain [i] looks up index *labels*, which only
# coincide with positions when the CSV is already sorted by rank.
for subreddit_name in top_political_subreddits.iloc[:num_subreddits]:
    data = download_subreddit_users(epoch(2021, 2, 1), epoch(2021, 2, 2), limit_per_sub, subreddit_name)
    frames.append(data)
df = pd.concat(frames)

In [27]:
# Rebind `df` to the `comments` frame (same object, no copy is made).
df = comments

In [17]:
# Keep a second name for the current frame (an alias of the same object,
# not a copy), then display it.
df_original = df
df

Unnamed: 0,author,created_utc,subreddit,created
0,PoppyBongos,1612159201,politics,1.612177e+09
1,cat_is_cat,1612159203,politics,1.612177e+09
2,muraenae,1612159207,politics,1.612177e+09
3,taki1002,1612159208,politics,1.612177e+09
4,NOOO_GOD_NOOO,1612159209,politics,1.612177e+09
...,...,...,...,...
300,Berkyjay,1612244109,geopolitics,1.612262e+09
301,Acceptable-Window442,1612244142,geopolitics,1.612262e+09
302,Rimainder,1612244178,geopolitics,1.612262e+09
303,zninjamonkey,1612244783,geopolitics,1.612263e+09


In [28]:
# Distinct authors in order of first appearance, dropping '[deleted]',
# a placeholder for removed accounts.
users = df.loc[df['author'] != '[deleted]', 'author'].unique()
num_users = len(users)
num_users

884640

In [47]:
def grouper_it(n, iterable):
    """Lazily split ``iterable`` into consecutive chunks of up to ``n`` items.

    Each yielded chunk is itself an iterator drawing from the shared source,
    so a chunk should be consumed before the next one is requested.

    Args:
        n: Maximum number of items per chunk.
        iterable: Any iterable.

    Yields:
        Iterators over successive chunks; the last one may be shorter.
    """
    source = iter(iterable)
    exhausted = object()  # sentinel that cannot collide with real items
    while True:
        window = itertools.islice(source, n)
        head = next(window, exhausted)
        if head is exhausted:
            return
        # Re-attach the item pulled off to probe for emptiness.
        yield itertools.chain((head,), window)

In [58]:
# Download the comments of every collected user, `group_size` authors per request.
group_size = 200
limit_per_group = int(400000 / num_users * group_size)
# Ceiling division, so the progress line shows a whole batch count
# rather than a fraction such as 151.19.
num_batches = -(-num_users // group_size)
# Collect batches in a list and concatenate once; DataFrame.append was removed
# in pandas 2.0. If the loop is interrupted, the batches fetched so far remain
# available in `batch_frames`.
batch_frames = []
for i, user_batch in enumerate(grouper_it(group_size, users)):
    print(f'({i + 1}/{num_batches})')
    data = download_user_comments(epoch(2021, 2, 1), epoch(2021, 2, 2), limit_per_group, list(user_batch))
    batch_frames.append(data)
pol_user_comments = pd.concat(batch_frames)

(1/151.19)
Downloading data ({'author': ['PoppyBongos', 'cat_is_cat', 'muraenae', 'taki1002', 'NOOO_GOD_NOOO', 'pass_nthru', 'chaogomu', 'Aggromemnon', 'dak4ttack', 'Kryzilla', 'Superman_Ultraprime', 'olmoscd', 'Jordanbryant623', 'gradientz', 'prime2424', 'fillinthe___', 'AutoModerator', 'What_Who_Where', 'hismaj45', 'Regular_Piccolo7980', 'sweetlew07', 'VadPuma', 'The_Fruit_Bat', 'wayoverpaid', 'No_Credibility', 'basicwithflair', 'tickitytalk', 'MoonaSky89', 'Jtizzle1231', 'mindfu', 'DemWitty', 'gulliver-swift', 'Soft-Professional527', 'TakeshiKovacsSleeve3', 'DepopulationXplosion', 'FingersAtLarge22', 'Easer123456', 'Vakieh', 'DaisyHotCakes', 'vohi', 'slateuse', 'Serenity101', 'FortySixAndYou', 'Dyerssorrow', 'Curve_of_Spee', 'SamGropler', 'DamienChazellesPiano', 'sadistic_tendencies', 'notonyanellymate', 'DiamondGunner520', 'thatnameagain', 'DjImagin', 'kreton1', 'nunboi', 'Luke-E-Fur', 'raggadus', 'Gallijl3', 'pottman', 'HertzDonut1001', 'Bibi77410X', 'dominarhexx', 'theseeyesarefl



Finished!
Time elapsed: 23.356774899999436s
(3/151.19)
Downloading data ({'author': ['Mutexception', 'Joverol', 'rudecanuck', 'Dtknightt', 'Tarifaer', 'topherus_maximus', 'rabidwombat', 'kimchi_Queen', 'lazarusmobile', 'GabuEx', 'shadowninja2_0', 'mtdewelf', 'TeePeeBee3', 'jeffinRTP', 'JCB2K', 'FashionBusking', 'Crayon_Connoisseur', 'TheRealPapaDan', 'neuroverdant', 'NuQ', 'Dankoon221', 'MrP1anet', 'Smokester_', 'Micheal_Hancho', 'Muffhounds', 'Viscumin', 'Slapbox', 'tyranicalteabagger', 'x_cLOUDDEAD_x', 'rundesirerun', 'IDoThingsOnWhims', 'el_supreme_duderino', 'Low-Advance8570', 'ramblinggambling', 'Unchosen_Heroes', 'CommanderEager', 'IzzyIzumi', 'smick', 'nucflashevent', 'System-Tough', 'GroceryRobot', 'LSF604', 'Mays2020plus', 'gjp11', '_bleeding_Hemorrhoid', 'xerxesanonymous', 'Deranged_Qultist', 'iamfuckingmoron', 'mtndewgood', 'dingox01', 'MalapropRhetoric', 'cl3arlycanadian', 'fullforce098', 'DivestInWallStreet', 'TummyDrums', 'Tylariel', 'ascii122', 'jaxsonnz', 'GoodAtWreckin

In [59]:
# Keep a second name for the downloaded frame (an alias of the same object,
# not a copy), then display it.
puc_original = pol_user_comments
pol_user_comments

Unnamed: 0,author,created_utc,subreddit,created
0,PoppyBongos,1612159201,politics,1.612177e+09
1,AutoModerator,1612159202,losangelespersonals,1.612177e+09
2,AutoModerator,1612159202,happy,1.612177e+09
3,AutoModerator,1612159202,AskWomen,1.612177e+09
4,AutoModerator,1612159203,timestop,1.612177e+09
...,...,...,...,...
183,Acceptable-Window442,1612244142,geopolitics,1.612262e+09
184,Plus-Feature,1612244173,rust,1.612262e+09
185,Rimainder,1612244178,geopolitics,1.612262e+09
186,Rimainder,1612244606,AsiaReport,1.612263e+09


In [60]:
# Reduce to the two columns the grouping helper expects, in the order it
# expects them, then build each author's set of subreddits.
pol_user_comments = pol_user_comments.loc[:, ['author', 'subreddit']]
grouped_by_sub = group_subreddits_by_author(pol_user_comments)
grouped_by_sub

Unnamed: 0,author,subreddits
0,---------_----_---_,"{technology, news, ukpolitics, politics}"
1,------sb,{Conservative}
2,----GaLaXy----,"{WorldOfTanksBlitz, PoliticalCompassMemes}"
3,---midnight_rain---,{conspiracy}
4,--MrPresident,"{Conservative, CoronavirusCirclejerk}"
...,...,...
30173,zzwugz,"{menwritingwomen, Showerthoughts, quityourbull..."
30174,zzxvvm,"{iamverybadass, niceguys, politics, stocks, Po..."
30175,zzzamzamm,"{TwoXChromosomes, unitedkingdom, Libertarian, ..."
30176,zzzatan,"{cock, conspiracy}"


In [61]:
# Build the shared-author graph and write it out as a GEXF file.
file_name = 'political_subreddit_network.gexf'
G = build_subreddit_shared_author_graph(pol_user_comments)
print(f'Built graph for {len(G.nodes())} subreddits.')
export_to_gephi_file(G, file_name)
print(f'Exported graph to: {file_name}')

Built graph for 12475 subreddits.
Exported graph to: political_subreddit_network.gexf
