In [1]:
import datetime as dt
import networkx as nx
import numpy as np
import pandas as pd

import csv
import itertools
import json
import psaw
import time
from tqdm import tqdm

from timeit import default_timer as timer

In [2]:
def epoch(year, month, day, **kwargs):
    date_time = dt.datetime(year, month, day, **kwargs)
    return int(date_time.timestamp())


def dataframe(psaw_result_generator):
    return pd.DataFrame([item.d_ for item in psaw_result_generator])


# Wrapper subclass to return results in Pandas DataFrames.
class DataframePushshiftAPI(psaw.PushshiftAPI):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        
    def search_comments(self, **kwargs):
        result_gen = super().search_comments(**kwargs)
        return dataframe(result_gen)
    
    def search_submissions(self, **kwargs):
        result_gen = super().search_submissions(**kwargs)
        return dataframe(result_gen)
    
    # Subreddit endpoint is not working (https://github.com/pushshift/api/issues/40).
    # def search_subreddits(self, **kwargs):
    #     result_gen = self._search_func(kind='subreddit', **kwargs)
    #     return dataframe(result_gen)
    
    def redditor_subreddit_activity(self, author, **kwargs):
        result_gen = super().redditor_subreddit_activity(author, **kwargs)
        return dataframe(result_gen)

    
# Fast group by subreddit
# https://stackoverflow.com/questions/22219004/how-to-group-dataframe-rows-into-list-in-pandas-groupby
#
# expects df has two columns, first 'author,' then subreddit
def group_subreddits_by_author(df):
    keys, values = df.sort_values('author').values.T
    ukeys, index = np.unique(keys, True)
    arrays = np.split(values, index[1:]) # subreddit must be 2nd col
    return pd.DataFrame({
        'author': ukeys,
        'subreddits': [set(a) for a in arrays]
    })


# expects df has two columns, first 'author,' then subreddit
def build_subreddit_shared_author_graph(df):
    grouped_by_sub = group_subreddits_by_author(df)
    G = nx.Graph()
    for shared_subs in grouped_by_sub['subreddits']:
        for sub1, sub2 in itertools.combinations(shared_subs, 2):
            if G.has_edge(sub1, sub2):
                G[sub1][sub2]['weight'] += 1
            else:
                G.add_edge(sub1, sub2, weight=1)
    return G


def export_to_gephi_file(G, file_path):
    with open(file_path, 'w') as f:
        for line in nx.generate_gexf(G):
            f.write(line + '\n')

In [3]:
pushshift = DataframePushshiftAPI()

In [4]:
def download_comments(after, before, limit, **kwargs):
    print(f'Downloading data ({kwargs})...')
    start = timer()
    df = pushshift.search_comments(after=after,
                                   before=before,
                                   **kwargs,
                                   sort='asc',
                                   sort_type='created_utc',
                                   filter=['author', 'subreddit'],
                                   limit=limit)
    end = timer()
    print('Finished!')
    print(f'Time elapsed: {end - start}s')
    return df


def download_subreddit_users(after, before, limit, subreddit_name):
    return download_comments(after, before, limit, subreddit=subreddit_name)


def download_user_comments(after, before, limit, author):
    return download_comments(after, before, limit, author=author)


def load_comments_from_files(file_paths):
    dfs = []
    for file_path in file_paths:
        dfs.append(load_comments_from_file(file_path))
    return pd.concat(dfs)


def load_comments_from_file(file_path, limit=None):
    keys_to_keep = ['author', 'subreddit', 'score', 'controversiality', 'created_utc', 'id', 'parent_id', 'body']
    data = []
    with open(file_path, 'r') as f:
        count = 0
        for line in tqdm(f):
            try:
                j = json.loads(line)
                record = { k: j[k] for k in keys_to_keep }
                data.append(record)
                count += 1
                if limit and count > limit:
                    break
            except json.JSONDecodeError:
                break
    df = pd.json_normalize(data)
    df[['score', 'controversiality', 'created_utc']] = df[['score', 'controversiality', 'created_utc']].apply(pd.to_numeric, downcast="float")
    return df

In [5]:
comments = load_comments_from_file('./data/RC_2012-09', limit=10_000_000)
comments

10000000it [01:29, 112332.69it/s]


Unnamed: 0,author,subreddit,score,controversiality,created_utc,id,parent_id,body
0,[deleted],AskReddit,1.0,0.0,1.346458e+09,c61pckd,t3_z5u9q,[deleted]
1,tomcat0071,gifs,1.0,0.0,1.346458e+09,c61pcke,t3_z4zfj,Where does a 500lb (227 kg) cat sit?\n\nWhere ...
2,ronearc,AskReddit,1.0,0.0,1.346458e+09,c61pckf,t3_z4rqt,"Hmm, if I could go back to April of '85 instea..."
3,PzGren,dayz,1.0,0.0,1.346458e+09,c61pckg,t1_c61p0h5,"nooo, reddit wont let me post!"
4,beercan_dan,tattoos,1.0,0.0,1.346458e+09,c61pcki,t3_z05s0,who was the artist?
...,...,...,...,...,...,...,...,...
9999996,mollaby38,todayilearned,16.0,0.0,1.347560e+09,c67p37n,t1_c67ook6,It's true that most planets in the universe lo...
9999997,Hakkz,Diablo,-4.0,0.0,1.347560e+09,c67p37o,t1_c67neuy,My demon hunter can farm act 3 and I only spen...
9999998,reallifeminifig,AskReddit,3.0,0.0,1.347560e+09,c67p37p,t1_c67ljtd,I've always found if you're in a rut in life a...
9999999,Black-Epiphany,movies,2.0,0.0,1.347560e+09,c67p37q,t1_c67nvv3,This is the best horror movie in my opinion. S...


In [6]:
subreddits = pd.read_csv('political_subreddits.csv', sep='\t')
top_political_subreddits = subreddits.sort_values('subscriber_rank')['name']
top_political_subreddits

0                  politics
1                conspiracy
2            PoliticalHumor
3              Conservative
4       LateStageCapitalism
5     PoliticalCompassMemes
6               Libertarian
7                ukpolitics
8                 socialism
9               geopolitics
10         moderatepolitics
11                 Feminism
12          Fuckthealtright
13           CanadaPolitics
14          ShitLiberalsSay
15         liberalgunowners
16          COMPLETEANARCHY
17             communism101
18          libertarianmeme
19                   Israel
20       AustralianPolitics
21               neoliberal
22                Palestine
23    SocialJusticeInAction
24                stupidpol
25                     Sino
26        ConservativeMemes
27                 LabourUK
28                GenZedong
Name: name, dtype: object

In [7]:
def label_users(df, pol_subs):
    gp_by_sub = group_subreddits_by_author(df[['author', 'subreddit']])
    for sub in pol_subs:
        other_subs = {s for s in pol_subs if s != sub}
        for i, row in gp_by_sub.iterrows():
            ss = row['subreddits']
            if sub in ss and ss.isdisjoint(other_subs):
                gp_by_sub.at[i, 'political_label'] = sub
    return gp_by_sub

In [8]:
pol_users = label_users(comments, set(['Conservative', 'Liberal']))

In [9]:
pol_users.set_index('author', inplace=True)
pol_users

Unnamed: 0_level_0,subreddits,political_label
author,Unnamed: 1_level_1,Unnamed: 2_level_1
---,"{MensRights, 3ch, offbeat, Nexus7, gonewild, t...",
----_----,"{ECE, science, skeptic, conspiratard}",
---1---,{AndroidQuestions},
---blade---,"{dayz, circlejerk}",
---ooo---,{funny},
...,...,...
zzzz0101,"{VolleyballGirls, videos, photoshopbattles}",
zzzzop,"{ChannelAwesome, funny, Minecraft, GlobalOffen...",
zzzzwhat,{AskReddit},
zzzzygote,"{AskReddit, BackYardChickens}",


In [10]:
comments.set_index('id', inplace=True)
comments

Unnamed: 0_level_0,author,subreddit,score,controversiality,created_utc,parent_id,body
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
c61pckd,[deleted],AskReddit,1.0,0.0,1.346458e+09,t3_z5u9q,[deleted]
c61pcke,tomcat0071,gifs,1.0,0.0,1.346458e+09,t3_z4zfj,Where does a 500lb (227 kg) cat sit?\n\nWhere ...
c61pckf,ronearc,AskReddit,1.0,0.0,1.346458e+09,t3_z4rqt,"Hmm, if I could go back to April of '85 instea..."
c61pckg,PzGren,dayz,1.0,0.0,1.346458e+09,t1_c61p0h5,"nooo, reddit wont let me post!"
c61pcki,beercan_dan,tattoos,1.0,0.0,1.346458e+09,t3_z05s0,who was the artist?
...,...,...,...,...,...,...,...
c67p37n,mollaby38,todayilearned,16.0,0.0,1.347560e+09,t1_c67ook6,It's true that most planets in the universe lo...
c67p37o,Hakkz,Diablo,-4.0,0.0,1.347560e+09,t1_c67neuy,My demon hunter can farm act 3 and I only spen...
c67p37p,reallifeminifig,AskReddit,3.0,0.0,1.347560e+09,t1_c67ljtd,I've always found if you're in a rut in life a...
c67p37q,Black-Epiphany,movies,2.0,0.0,1.347560e+09,t1_c67nvv3,This is the best horror movie in my opinion. S...


In [90]:
import pandas as pd
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [26]:
# function to remove punctuation from text (input is a string)
def clean_text(s):
	s = "".join(l for l in s if l not in string.punctuation)
    




In [70]:
def get_sub_comments(pol_users, comments, subs):
    sub_users = {sub: pol_users[pol_users['political_label'] == sub].index for sub in subs}
    pcids = {sub: [] for sub in subs}
    for comment in tqdm(comments.itertuples()):
        for sub in subs:
            if getattr(comment, 'author') in sub_users[sub] and getattr(comment, 'subreddit') == sub:
                pcids[sub].append(getattr(comment, 'Index'))
    return pcids

In [71]:
sub_comments = get_sub_comments(pol_users, comments, ['Conservative', 'Liberal'])
sub_comments

10000001it [00:37, 264747.70it/s]


{'Conservative': ['c61pd0g',
  'c61pddi',
  'c61pfvt',
  'c61phzi',
  'c61pi1j',
  'c61piuv',
  'c61pj53',
  'c61pjg0',
  'c61pjok',
  'c61pk7m',
  'c61pkqt',
  'c61plal',
  'c61pltt',
  'c61pmo4',
  'c61pnxy',
  'c61pqhh',
  'c61pqqc',
  'c61prbz',
  'c61prf5',
  'c61psoz',
  'c61pvk6',
  'c61pvmo',
  'c61pwbj',
  'c61pww4',
  'c61pxqk',
  'c61py67',
  'c61py7a',
  'c61pyou',
  'c61pyxg',
  'c61pz4t',
  'c61q058',
  'c61q1wy',
  'c61q2q2',
  'c61q3h6',
  'c61q3rc',
  'c61q3sm',
  'c61q4bx',
  'c61q4cu',
  'c61q60d',
  'c61q60p',
  'c61q6zy',
  'c61q83s',
  'c61q890',
  'c61qasc',
  'c61qbjs',
  'c61qbzr',
  'c61qd91',
  'c61qdon',
  'c61qf7b',
  'c61qgf7',
  'c61qh2q',
  'c61qh32',
  'c61qj1n',
  'c61qjlm',
  'c61qly4',
  'c61qou2',
  'c61qpke',
  'c61qtgp',
  'c61qtkb',
  'c61qwv8',
  'c61qy1d',
  'c61qzia',
  'c61r5ci',
  'c61r6ju',
  'c61r6yp',
  'c61r8mw',
  'c61r97x',
  'c61r9x7',
  'c61ramo',
  'c61rbh8',
  'c61rcnx',
  'c61rdaw',
  'c61rdsb',
  'c61reno',
  'c61rg1n',
  'c61rg7

In [74]:
len(sub_comments['Conservative'])

10507

In [99]:
def get_bow_models(sub_comments):
    models = dict()
    for sub, cids in sub_comments.items():
        corpus = comments.loc[cids]['body'].apply(clean_text)
        model = TfidfVectorizer()
        model.fit(corpus)
        models[sub] = model
    return models

In [100]:
bow_models = get_bow_models(sub_comments)

In [101]:
bow_models

{'Conservative': TfidfVectorizer(), 'Liberal': TfidfVectorizer()}

In [None]:
# Train count vectorizer on just political keywords, do cosine similarity to get "politicalness" score?
# train vectorizer on just pairs and then do cosine similarity?

In [152]:
def clean_text(s):
	return "".join(l for l in s if l not in string.punctuation)

def build_political_user_reply_graph(comments, users, bow_models):
    pol_users = dict()
    for i, user in users.iterrows():
        if not pd.isnull(user['political_label']):
            pol_users[i] = user['political_label']
    G = nx.DiGraph()
    for comment in tqdm(comments.itertuples()):
        i = getattr(comment, 'Index')
        user1 = getattr(comment, 'author')
        if user1 not in pol_users:
            continue
        typed_parent_id = getattr(comment, 'parent_id')
        if not typed_parent_id.startswith('t1_'): # Comment
            continue
        parent_id = typed_parent_id[3:]
        if parent_id not in comments.index:
            continue
        parent = comments.loc[parent_id]
        user2 = parent['author']
        sub = getattr(comment, 'subreddit')
        
        label = pol_users[user1]
        model = bow_models[label]
        
        body = getattr(comment, 'body')
        clean = clean_text(body)
        
        parent_body = parent['body']
        parent_clean = clean_text(parent_body)
        
        bows = model.transform([clean, parent_clean]).toarray() # TODO: operate on sparse version (no toarray)?
        bow = bows[0]
        parent_bow = bows[1]
        
        similarity = cosine_similarity(bows)[1][0]
        pol_score = sum(bow) / len(bow)
        
        if G.has_edge(user1, user2):
            G[user1][user2]['weight'] += 1
            G[user1][user2]['subreddits'].add(sub)
            G[user1][user2]['similarities'].append(similarity)
            G[user1][user2]['politicalities'].append(pol_score)
        else:
            G.add_edge(user1, user2, weight=1, subreddits=set([sub]), similarities=[similarity], politicalities=[pol_score])
    for node in tqdm(G.nodes):
        G.nodes[node]['political_label'] = users.loc[node]['political_label']
    for a, b in tqdm(G.edges):
        G[a][b]['subreddits'] = ",".join(G[a][b]['subreddits'])
        G[a][b]['avg_similarity'] = np.average(G[a][b]['similarities'])
        G[a][b]['avg_politicality'] = np.average(G[a][b]['politicalities'])
    return G

In [153]:
G = build_political_user_reply_graph(comments, pol_users, bow_models)

10000001it [06:29, 25650.72it/s]
100%|█████████████████████████████████████████████████████████████████████████| 30084/30084 [00:02<00:00, 14258.60it/s]
100%|█████████████████████████████████████████████████████████████████████████| 43300/43300 [00:01<00:00, 38206.18it/s]


In [154]:
for a, b in tqdm(G.edges):
    del G[a][b]['similarities']
    del G[a][b]['politicalities']
nx.readwrite.gexf.write_gexf(G, './bow_reply_network_2012.gexf')

100%|████████████████████████████████████████████████████████████████████████| 43300/43300 [00:00<00:00, 283719.46it/s]


In [155]:
groups = pol_users['political_label'].unique()
group_mtx = pd.DataFrame(index=groups, columns=groups)
for group in groups:
    for group2 in groups:
        group_mtx.at[group, group2] = 0
group_mtx

Unnamed: 0,NaN,Conservative,Liberal
,0,0,0
Conservative,0,0,0
Liberal,0,0,0


In [156]:
for a, b in G.edges:
    i = G.nodes[a]['political_label']
    col = G.nodes[b]['political_label']
    group_mtx.at[i, col] += 1

In [157]:
group_mtx['total'] = group_mtx.sum(axis=1)
group_mtx = group_mtx[group_mtx.index.notnull()]
group_mtx

Unnamed: 0,NaN,Conservative,Liberal,total
Conservative,30285,4647,156,35088.0
Liberal,7811,167,234,8212.0


In [158]:
for group in groups:
    group_mtx[f'{group}_percent'] = group_mtx[group].divide(group_mtx['total'])
group_mtx

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group_mtx[f'{group}_percent'] = group_mtx[group].divide(group_mtx['total'])


Unnamed: 0,NaN,Conservative,Liberal,total,nan_percent,Conservative_percent,Liberal_percent
Conservative,30285,4647,156,35088.0,0.863116,0.132438,0.004446
Liberal,7811,167,234,8212.0,0.951169,0.020336,0.028495


In [162]:
sims = []
pols = []
for a, b, data in G.edges(data=True):
    sims.append(data['avg_similarity'])
    pols.append(data['avg_politicality'])
sims.sort()
pols.sort()
first_quartile_sim = sims[len(sims) // 20]
last_quartile_pol = pols[19 * (len(pols) // 20)]
sus_comments = []
for a, b, data in G.edges(data=True):
    if data['avg_similarity'] <= first_quartile_sim and data['avg_politicality'] >= last_quartile_pol:
        sus_comments.append((a, b, data))
sus_subs = set()
s_list = []
for a, b, data in sus_comments:
    sus_subs = sus_subs | set(data['subreddits'].split(','))
    s_list.append((data['subreddits'], G.nodes[a]['political_label']))

In [195]:
sus_subs

{'AlisonBrie',
 'Android',
 'AskReddit',
 'Assistance',
 'BSG',
 'Bitcoin',
 'CFB',
 'Colombia',
 'Cooking',
 'Dirtbikes',
 'Eve',
 'Foodforthought',
 'GaryJohnson',
 'IAmA',
 'Liberal',
 'Planetside',
 'PoliticalDiscussion',
 'TrueAtheism',
 'TrueReddit',
 'TwoXChromosomes',
 'WTF',
 'WeAreTheMusicMakers',
 'askscience',
 'atheism',
 'atheismplus',
 'breakingbad',
 'community',
 'conspiracy',
 'gaming',
 'houston',
 'news',
 'pics',
 'politics',
 'programming',
 'rstats',
 'skeptic',
 'stopsmoking',
 'technology',
 'todayilearned',
 'videos',
 'worldnews'}

In [196]:
len(sus_subs)

41

In [197]:
s_list

[('politics', 'Liberal'),
 ('technology', 'Liberal'),
 ('politics', 'Liberal'),
 ('politics', 'Liberal'),
 ('technology', 'Liberal'),
 ('WTF', 'Liberal'),
 ('politics', 'Liberal'),
 ('Foodforthought', 'Liberal'),
 ('WeAreTheMusicMakers', 'Liberal'),
 ('politics', 'Liberal'),
 ('gaming', 'Liberal'),
 ('politics', 'Liberal'),
 ('gaming', 'Liberal'),
 ('politics', 'Liberal'),
 ('programming', 'Liberal'),
 ('worldnews', 'Liberal'),
 ('AskReddit', 'Liberal'),
 ('todayilearned', 'Liberal'),
 ('conspiracy', 'Liberal'),
 ('Liberal', 'Liberal'),
 ('Bitcoin', 'Liberal'),
 ('politics', 'Liberal'),
 ('community', 'Liberal'),
 ('community', 'Liberal'),
 ('politics', 'Liberal'),
 ('politics', 'Liberal'),
 ('Eve', 'Liberal'),
 ('CFB', 'Liberal'),
 ('CFB', 'Liberal'),
 ('CFB', 'Liberal'),
 ('CFB', 'Liberal'),
 ('Android', 'Liberal'),
 ('BSG', 'Liberal'),
 ('WTF', 'Liberal'),
 ('WTF', 'Liberal'),
 ('Liberal', 'Liberal'),
 ('GaryJohnson', 'Liberal'),
 ('Liberal', 'Liberal'),
 ('conspiracy', 'Liberal'),


In [202]:
sims = []
pols = []
for a, b, data in G.edges(data=True):
    if G.nodes[a]['political_label'] == 'Conservative':
        sims.append(data['avg_similarity'])
        pols.append(data['avg_politicality'])
sims.sort()
pols.sort()
first_quartile_sim = sims[len(sims) // 20]
last_quartile_pol = pols[19 * (len(pols) // 20)]
sus_comments = []
for a, b, data in G.edges(data=True):
    if G.nodes[a]['political_label'] == 'Conservative' and data['avg_similarity'] <= first_quartile_sim and data['avg_politicality'] >= last_quartile_pol:
        sus_comments.append((a, b, data))
sus_subs = set()
s_list = []
for a, b, data in sus_comments:
    sus_subs = sus_subs | set(data['subreddits'].split(','))
    s_list.append((data['subreddits'], G.nodes[a]['political_label']))

In [203]:
sus_subs

{'Conservative',
 'DepthHub',
 'Fitness',
 'PoliticalDiscussion',
 'TwoXChromosomes',
 'VideoEditing',
 'WeAreTheMusicMakers',
 'news',
 'politics',
 'psychology',
 'self',
 'worldnews'}

In [204]:
len(sus_subs)

12

In [205]:
s_list

[('Fitness', 'Conservative'),
 ('VideoEditing', 'Conservative'),
 ('Conservative', 'Conservative'),
 ('news', 'Conservative'),
 ('psychology', 'Conservative'),
 ('Conservative', 'Conservative'),
 ('TwoXChromosomes', 'Conservative'),
 ('self', 'Conservative'),
 ('politics', 'Conservative'),
 ('worldnews', 'Conservative'),
 ('TwoXChromosomes', 'Conservative'),
 ('Conservative', 'Conservative'),
 ('DepthHub', 'Conservative'),
 ('WeAreTheMusicMakers', 'Conservative'),
 ('Conservative', 'Conservative'),
 ('PoliticalDiscussion', 'Conservative'),
 ('Conservative', 'Conservative'),
 ('Conservative', 'Conservative'),
 ('Conservative', 'Conservative')]