In [1]:
import datetime as dt
import networkx as nx
import numpy as np
import pandas as pd

import csv
import itertools
import json
import psaw
import time
from tqdm import tqdm

from timeit import default_timer as timer

In [2]:
def epoch(year, month, day, **kwargs):
    date_time = dt.datetime(year, month, day, **kwargs)
    return int(date_time.timestamp())


def dataframe(psaw_result_generator):
    return pd.DataFrame([item.d_ for item in psaw_result_generator])


# Wrapper subclass to return results in Pandas DataFrames.
class DataframePushshiftAPI(psaw.PushshiftAPI):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        
    def search_comments(self, **kwargs):
        result_gen = super().search_comments(**kwargs)
        return dataframe(result_gen)
    
    def search_submissions(self, **kwargs):
        result_gen = super().search_submissions(**kwargs)
        return dataframe(result_gen)
    
    # Subreddit endpoint is not working (https://github.com/pushshift/api/issues/40).
    # def search_subreddits(self, **kwargs):
    #     result_gen = self._search_func(kind='subreddit', **kwargs)
    #     return dataframe(result_gen)
    
    def redditor_subreddit_activity(self, author, **kwargs):
        result_gen = super().redditor_subreddit_activity(author, **kwargs)
        return dataframe(result_gen)

    
# Fast group by subreddit
# https://stackoverflow.com/questions/22219004/how-to-group-dataframe-rows-into-list-in-pandas-groupby
#
# expects df has two columns, first 'author,' then subreddit
def group_subreddits_by_author(df):
    keys, values = df.sort_values('author').values.T
    ukeys, index = np.unique(keys, True)
    arrays = np.split(values, index[1:]) # subreddit must be 2nd col
    return pd.DataFrame({
        'author': ukeys,
        'subreddits': [set(a) for a in arrays]
    })


# expects df has two columns, first 'author,' then subreddit
def build_subreddit_shared_author_graph(df):
    grouped_by_sub = group_subreddits_by_author(df)
    G = nx.Graph()
    for shared_subs in grouped_by_sub['subreddits']:
        for sub1, sub2 in itertools.combinations(shared_subs, 2):
            if G.has_edge(sub1, sub2):
                G[sub1][sub2]['weight'] += 1
            else:
                G.add_edge(sub1, sub2, weight=1)
    return G


def export_to_gephi_file(G, file_path):
    with open(file_path, 'w') as f:
        for line in nx.generate_gexf(G):
            f.write(line + '\n')

In [3]:
pushshift = DataframePushshiftAPI()

In [4]:
def download_comments(after, before, limit, **kwargs):
    print(f'Downloading data ({kwargs})...')
    start = timer()
    df = pushshift.search_comments(after=after,
                                   before=before,
                                   **kwargs,
                                   sort='asc',
                                   sort_type='created_utc',
                                   filter=['author', 'subreddit'],
                                   limit=limit)
    end = timer()
    print('Finished!')
    print(f'Time elapsed: {end - start}s')
    return df


def download_subreddit_users(after, before, limit, subreddit_name):
    return download_comments(after, before, limit, subreddit=subreddit_name)


def download_user_comments(after, before, limit, author):
    return download_comments(after, before, limit, author=author)


def load_comments_from_file(file_path, limit=None):
    keys_to_keep = ['author', 'subreddit', 'subreddit_type', 'comment_type', 'score', 'controversiality', 'created_utc', 'id', 'parent_id', 'body']
    data = []
    with open(file_path, 'r') as f:
        count = 0
        for line in tqdm(f):
            try:
                j = json.loads(line)
                record = { k: j[k] for k in keys_to_keep }
                data.append(record)
                count += 1
                if limit and count > limit:
                    break
            except json.JSONDecodeError:
                break
    return pd.json_normalize(data)

In [5]:
comments = load_comments_from_file('D:\comments.txt', limit=6_000_000)
comments

3296784it [03:18, 16615.84it/s]


Unnamed: 0,author,subreddit,subreddit_type,comment_type,score,controversiality,created_utc,id,parent_id,body
0,xjt22,SquaredCircle,public,,1,0,1612137600,gljcihc,t1_gljcgdm,33
1,RiBread,HaircareScience,public,,6,0,1612137600,gljcihd,t1_gli1kmj,"Hey, would you mind sharing more information a..."
2,YhormTheWhite,AskReddit,public,,3,0,1612137600,gljcihe,t1_gljaznd,I wouldn't really say this is a question that ...
3,Eddiep88,AppleWatch,public,,1,0,1612137600,gljcihf,t3_l9lk45,How do you check this total activity. Sorry I’...
4,yummymario64,AceAttorney,public,,48,0,1612137600,gljcihh,t3_l9jfph,"I'm confused because like a third of ""If I saw..."
...,...,...,...,...,...,...,...,...,...,...
3296779,bdbxwz,dragonage,public,,0,0,1612183536,gllcc7t,t1_glkqx2f,Personally? Because I think dwarves are ugly.\...
3296780,[deleted],wallstreetbets,public,,1,0,1612183536,gllcc7u,t3_la21yr,[removed]
3296781,vertukv,2b2t,public,,104,0,1612183536,gllcc7v,t3_la0zxm,Popbob secret base
3296782,patosdon,LogitechG,public,,1,0,1612183536,gllcc7w,t3_la0gso,"Have you tried turning off the mouse, unpluggi..."


In [6]:
subreddits = pd.read_csv('political_subreddits.csv', sep='\t')
top_political_subreddits = subreddits.sort_values('subscriber_rank')['name']
top_political_subreddits

0                  politics
1                conspiracy
2            PoliticalHumor
3              Conservative
4       LateStageCapitalism
5     PoliticalCompassMemes
6               Libertarian
7                ukpolitics
8                 socialism
9               geopolitics
10         moderatepolitics
11                 Feminism
12          Fuckthealtright
13           CanadaPolitics
14          ShitLiberalsSay
15         liberalgunowners
16          COMPLETEANARCHY
17             communism101
18          libertarianmeme
19                   Israel
20       AustralianPolitics
21               neoliberal
22                Palestine
23    SocialJusticeInAction
24                stupidpol
25                     Sino
26        ConservativeMemes
27                 LabourUK
28                GenZedong
Name: name, dtype: object

In [7]:
def label_users(df, pol_subs):
    gp_by_sub = group_subreddits_by_author(df[['author', 'subreddit']])
    for sub in pol_subs:
        other_subs = {s for s in pol_subs if s != sub}
        for i, row in gp_by_sub.iterrows():
            ss = row['subreddits']
            if sub in ss and ss.isdisjoint(other_subs):
                gp_by_sub.at[i, 'political_label'] = sub
    return gp_by_sub

In [8]:
pol_users = label_users(comments, set(['Conservative', 'neoliberal']))

In [9]:
pol_users.set_index('author', inplace=True)
pol_users

Unnamed: 0_level_0,subreddits,political_label
author,Unnamed: 1_level_1,Unnamed: 2_level_1
--------------Emkay,"{apexlegends, teenagers}",
-----------___,{me_irl},
---------_----_---_,"{news, politics}",
--------V--------,"{nba, leagueoflegends}",
-------2-------,"{CrusaderKings, Drugs}",
...,...,...
zzzzzzysbwbwb,{AskReddit},
zzzzzzziimmm,{CovIdiots},
zzzzzzzzzz55,"{prettyaltgirls, panties}",
zzzzzzzzzzzzvzzzzvzz,{TikTokCringe},


In [10]:
comments.set_index('id', inplace=True)
comments

Unnamed: 0_level_0,author,subreddit,subreddit_type,comment_type,score,controversiality,created_utc,parent_id,body
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
gljcihc,xjt22,SquaredCircle,public,,1,0,1612137600,t1_gljcgdm,33
gljcihd,RiBread,HaircareScience,public,,6,0,1612137600,t1_gli1kmj,"Hey, would you mind sharing more information a..."
gljcihe,YhormTheWhite,AskReddit,public,,3,0,1612137600,t1_gljaznd,I wouldn't really say this is a question that ...
gljcihf,Eddiep88,AppleWatch,public,,1,0,1612137600,t3_l9lk45,How do you check this total activity. Sorry I’...
gljcihh,yummymario64,AceAttorney,public,,48,0,1612137600,t3_l9jfph,"I'm confused because like a third of ""If I saw..."
...,...,...,...,...,...,...,...,...,...
gllcc7t,bdbxwz,dragonage,public,,0,0,1612183536,t1_glkqx2f,Personally? Because I think dwarves are ugly.\...
gllcc7u,[deleted],wallstreetbets,public,,1,0,1612183536,t3_la21yr,[removed]
gllcc7v,vertukv,2b2t,public,,104,0,1612183536,t3_la0zxm,Popbob secret base
gllcc7w,patosdon,LogitechG,public,,1,0,1612183536,t3_la0gso,"Have you tried turning off the mouse, unpluggi..."


In [11]:
def build_political_user_reply_graph(comments, users):
    pol_users = users[users['political_label'].notna()].index
    G = nx.DiGraph()
    for i, comment in tqdm(comments.iterrows()):
        user1 = comment['author']
        if user1 not in pol_users:
            continue
        typed_parent_id = comment['parent_id']
        if not typed_parent_id.startswith('t1_'): # Comment
            continue
        parent_id = typed_parent_id[3:]
        if parent_id not in comments.index:
            continue
        user2 = comments.loc[parent_id]['author']
        sub = comment['subreddit']
        if G.has_edge(user1, user2):
            G[user1][user2]['weight'] += 1
            G[user1][user2]['subreddits'].add(sub)
        else:
            G.add_edge(user1, user2, weight=1, subreddits=set([sub]))
    for node in G.nodes:
        G.nodes[node]['political_label'] = users.loc[node]['political_label']
    for a, b in G.edges:
        G[a][b]['subreddits'] = ",".join(G[a][b]['subreddits'])
    return G

In [12]:
G = build_political_user_reply_graph(comments, pol_users)

3296784it [02:29, 22009.52it/s]


In [13]:
export_to_gephi_file(G, './political_reply_network.gexf')

array([nan, 'Conservative', 'neoliberal'], dtype=object)

In [48]:
groups = pol_users['political_label'].unique()
group_mtx = pd.DataFrame(index=groups, columns=groups)
for group in groups:
    for group2 in groups:
        group_mtx.at[group, group2] = 0
group_mtx

Unnamed: 0,NaN,Conservative,neoliberal
,0,0,0
Conservative,0,0,0
neoliberal,0,0,0


In [49]:
for a, b in G.edges:
    i = G.nodes[a]['political_label']
    col = G.nodes[b]['political_label']
    group_mtx.at[i, col] += 1

In [50]:
group_mtx['total'] = group_mtx.sum(axis=1)
group_mtx = group_mtx[group_mtx.index.notnull()]
group_mtx

Unnamed: 0,NaN,Conservative,neoliberal,total
Conservative,1860,1448,4,3312.0
neoliberal,1404,2,2493,3899.0


In [51]:
for group in groups:
    group_mtx[f'{group}_percent'] = group_mtx[group].divide(group_mtx['total'])
group_mtx

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group_mtx[f'{group}_percent'] = group_mtx[group].divide(group_mtx['total'])


Unnamed: 0,NaN,Conservative,neoliberal,total,nan_percent,Conservative_percent,neoliberal_percent
Conservative,1860,1448,4,3312.0,0.561594,0.437198,0.001208
neoliberal,1404,2,2493,3899.0,0.360092,0.000513,0.639395
