In [2]:
import datetime as dt
import networkx as nx
import numpy as np
import pandas as pd

import csv
import itertools
import json
import psaw
import time
from tqdm import tqdm

import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from timeit import default_timer as timer
from pathlib import Path
from calendar import Calendar
import calendar

In [4]:
def epoch(year, month, day, **kwargs):
    date_time = dt.datetime(year, month, day, **kwargs)
    return int(date_time.timestamp())


def dataframe(psaw_result_generator):
    return pd.DataFrame([item.d_ for item in psaw_result_generator])


# Wrapper subclass to return results in Pandas DataFrames.
class DataframePushshiftAPI(psaw.PushshiftAPI):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        
    def search_comments(self, **kwargs):
        result_gen = super().search_comments(**kwargs)
        return dataframe(result_gen)
    
    def search_submissions(self, **kwargs):
        result_gen = super().search_submissions(**kwargs)
        return dataframe(result_gen)
    
    # Subreddit endpoint is not working (https://github.com/pushshift/api/issues/40).
    # def search_subreddits(self, **kwargs):
    #     result_gen = self._search_func(kind='subreddit', **kwargs)
    #     return dataframe(result_gen)
    
    def redditor_subreddit_activity(self, author, **kwargs):
        result_gen = super().redditor_subreddit_activity(author, **kwargs)
        return dataframe(result_gen)

    
# Fast group by subreddit
# https://stackoverflow.com/questions/22219004/how-to-group-dataframe-rows-into-list-in-pandas-groupby
#
# expects df has two columns, first 'author,' then subreddit
def group_subreddits_by_author(df):
    keys, values = df.sort_values('author').values.T
    ukeys, index = np.unique(keys, True)
    arrays = np.split(values, index[1:]) # subreddit must be 2nd col
    return pd.DataFrame({
        'author': ukeys,
        'subreddits': [set(a) for a in arrays]
    })


# expects df has two columns, first 'author,' then subreddit
def build_subreddit_shared_author_graph(df):
    grouped_by_sub = group_subreddits_by_author(df)
    G = nx.Graph()
    for shared_subs in grouped_by_sub['subreddits']:
        for sub1, sub2 in itertools.combinations(shared_subs, 2):
            if G.has_edge(sub1, sub2):
                G[sub1][sub2]['weight'] += 1
            else:
                G.add_edge(sub1, sub2, weight=1)
    return G


def export_to_gephi_file(G, file_path):
    with open(file_path, 'w') as f:
        for line in nx.generate_gexf(G):
            f.write(line + '\n')
            

def label_users(df, pol_subs):
    gp_by_sub = group_subreddits_by_author(df[['author', 'subreddit']])
    for sub in pol_subs:
        other_subs = {s for s in pol_subs if s != sub}
        for i, row in gp_by_sub.iterrows():
            ss = row['subreddits']
            if sub in ss and ss.isdisjoint(other_subs):
                gp_by_sub.at[i, 'political_label'] = sub
    return gp_by_sub

In [5]:
pushshift = DataframePushshiftAPI()

In [56]:
def download_comments(after, before, limit, **kwargs):
    print(f'Downloading data ({kwargs})...')
    start = timer()
    df = pushshift.search_comments(after=after,
                                   before=before,
                                   **kwargs,
                                   sort='asc',
                                   sort_type='created_utc',
                                   filter=['author', 'subreddit'],
                                   limit=limit)
    end = timer()
    print('Finished!')
    print(f'Time elapsed: {end - start}s')
    return df


def download_subreddit_users(after, before, limit, subreddit_name):
    return download_comments(after, before, limit, subreddit=subreddit_name)


def download_user_comments(after, before, limit, author):
    return download_comments(after, before, limit, author=author)


def load_comments_from_files(file_paths):
    dfs = []
    for file_path in file_paths:
        dfs.append(load_comments_from_file(file_path))
    return pd.concat(dfs)


def load_comments_from_file(file_path, limit=None):
    keys_to_keep = ['author', 'subreddit', 'score', 'controversiality', 'created_utc', 'id', 'parent_id', 'body']
    data = []
    with open(file_path, 'r') as f:
        count = 0
        for line in tqdm(f):
            try:
                j = json.loads(line)
                record = { k: j[k] for k in keys_to_keep }
                data.append(record)
                count += 1
                if limit and count > limit:
                    break
            except json.JSONDecodeError:
                break
    df = pd.json_normalize(data)
    df[['score', 'controversiality', 'created_utc']] = df[['score', 'controversiality', 'created_utc']].apply(pd.to_numeric, downcast="float")
    return df

# function to remove punctuation from text (input is a string)
def clean_text(s):
	return "".join(l for l in str(s) if l not in string.punctuation)
    
def get_sub_comments(pol_users, comments, subs):
    sub_users = {sub: pol_users[pol_users['political_label'] == sub].index for sub in subs}
    pcids = {sub: [] for sub in subs}
    for comment in tqdm(comments.itertuples()):
        for sub in subs:
            if getattr(comment, 'author') in sub_users[sub] and getattr(comment, 'subreddit') == sub:
                pcids[sub].append(getattr(comment, 'Index'))
    return pcids

In [20]:
def save_comments_by_day(src, dest, month, year = 2012):
    num_days = calendar.monthrange(year, month)[1]
    day_starts = []
    day_files = []
    day_writers = []
    keys_to_keep = ['author', 'subreddit', 'score', 'controversiality', 'created_utc', 'id', 'parent_id', 'body']
     
    for day in range(1, num_days + 1):
        start_of_day = int(dt.datetime(year, month, day, tzinfo=dt.timezone.utc).timestamp())
        print(start_of_day)
        day_starts.append(start_of_day)
        day_file = open(f'{dest}{day}.csv', 'w', newline='')
        day_files.append(day_file)
        day_writer = csv.writer(day_file, delimiter=',', quotechar='"')
        day_writer.writerow(keys_to_keep)
        day_writers.append(day_writer)
    
    with open(src, 'r') as f:
        for line in tqdm(f):
            try:
                j = json.loads(line)
                j['created_utc'] = int(j['created_utc'])
                i = len(day_starts) - 1
                while (j['created_utc'] < day_starts[i]):
                    i = i - 1
                record = [ j[k] for k in keys_to_keep ]
                day_writers[i].writerow(record)
            except json.JSONDecodeError:
                continue
    
    for day_file in day_files:
        day_file.close()            

In [21]:
save_comments_by_day('./data/RC_2012-09', './data/RC_2012-09_daily/', 9)

1346457600
1346544000
1346630400
1346716800
1346803200
1346889600
1346976000
1347062400
1347148800
1347235200
1347321600
1347408000
1347494400
1347580800
1347667200
1347753600
1347840000
1347926400
1348012800
1348099200
1348185600
1348272000
1348358400
1348444800
1348531200
1348617600
1348704000
1348790400
1348876800
1348963200


23419524it [07:46, 50209.89it/s]


In [22]:
for comment_file_name, dest, month in [
    ('./data/RC_2012-10', './data/RC_2012-10_daily/', 10),
    ('./data/RC_2012-11', './data/RC_2012-11_daily/', 11)
]:
    save_comments_by_day(comment_file_name, dest, month)

1349049600
1349136000
1349222400
1349308800
1349395200
1349481600
1349568000
1349654400
1349740800
1349827200
1349913600
1350000000
1350086400
1350172800
1350259200
1350345600
1350432000
1350518400
1350604800
1350691200
1350777600
1350864000
1350950400
1351036800
1351123200
1351209600
1351296000
1351382400
1351468800
1351555200
1351641600


24788236it [07:32, 54803.91it/s]


1351728000
1351814400
1351900800
1351987200
1352073600
1352160000
1352246400
1352332800
1352419200
1352505600
1352592000
1352678400
1352764800
1352851200
1352937600
1353024000
1353110400
1353196800
1353283200
1353369600
1353456000
1353542400
1353628800
1353715200
1353801600
1353888000
1353974400
1354060800
1354147200
1354233600


24648302it [10:00, 41055.21it/s]


In [41]:
comments = pd.DataFrame()
for i in tqdm(range(1, 32)):
    try:
        comments = comments.append(pd.read_csv(f'./data/RC_2012-09_daily/{i}.csv'))
    except FileNotFoundError:
        pass
comments.set_index('id', inplace=True)
comments

100%|█████████████████████████████████████████████████████████████| 31/31 [03:06<00:00,  6.03s/it]


Unnamed: 0_level_0,author,subreddit,score,controversiality,created_utc,parent_id,body
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
c61pckd,[deleted],AskReddit,1,0,1346457600,t3_z5u9q,[deleted]
c61pcke,tomcat0071,gifs,1,0,1346457600,t3_z4zfj,Where does a 500lb (227 kg) cat sit?\n\nWhere ...
c61pckf,ronearc,AskReddit,1,0,1346457600,t3_z4rqt,"Hmm, if I could go back to April of '85 instea..."
c61pckg,PzGren,dayz,1,0,1346457600,t1_c61p0h5,"nooo, reddit wont let me post!"
c61pcki,beercan_dan,tattoos,1,0,1346457600,t3_z05s0,who was the artist?
...,...,...,...,...,...,...,...
c6fqq4m,dudechris88,battlefield3,1,0,1349049599,t1_c6fqfko,&gt; I do maintain though that on average you ...
c6fqq4n,[deleted],AskReddit,3,0,1349049599,t3_10pkrr,Gel in my hair.\n\nBadly.
c6fqq4o,acampbell28,AskReddit,1,0,1349049599,t3_10pyf0,The Groove Train Apostles
c6fqq4p,enterthebored,learnprogramming,2,0,1349049599,t3_10qg3w,"You're on the right track, but why are you set..."


In [42]:
pol_subs = ['Conservative', 'Liberal']
pol_users = label_users(comments, set(pol_subs))
pol_users.set_index('author', inplace=True)

In [43]:
sub_comments = get_sub_comments(pol_users, comments, pol_subs)
sub_comments

23419524it [01:16, 307506.45it/s]


{'Conservative': ['c61pd0g',
  'c61pddi',
  'c61pfvt',
  'c61phzi',
  'c61pi1j',
  'c61piuv',
  'c61pj53',
  'c61pjg0',
  'c61pjok',
  'c61pk7m',
  'c61pkqt',
  'c61plal',
  'c61pltt',
  'c61pmo4',
  'c61pnxy',
  'c61pqhh',
  'c61pqqc',
  'c61prbz',
  'c61prf5',
  'c61psoz',
  'c61pvk6',
  'c61pvmo',
  'c61pwbj',
  'c61pww4',
  'c61pxqk',
  'c61py67',
  'c61py7a',
  'c61pyou',
  'c61pyxg',
  'c61pz4t',
  'c61q058',
  'c61q1wy',
  'c61q2q2',
  'c61q3h6',
  'c61q3rc',
  'c61q3sm',
  'c61q4bx',
  'c61q4cu',
  'c61q60d',
  'c61q60p',
  'c61q6zy',
  'c61q83s',
  'c61q890',
  'c61qasc',
  'c61qbjs',
  'c61qbzr',
  'c61qd91',
  'c61qdon',
  'c61qf7b',
  'c61qgf7',
  'c61qh2q',
  'c61qh32',
  'c61qj1n',
  'c61qjlm',
  'c61qly4',
  'c61qou2',
  'c61qpke',
  'c61qtgp',
  'c61qtkb',
  'c61qwv8',
  'c61qy1d',
  'c61qzia',
  'c61r5ci',
  'c61r6ju',
  'c61r6yp',
  'c61r8mw',
  'c61r97x',
  'c61r9x7',
  'c61ramo',
  'c61rbh8',
  'c61rcnx',
  'c61rdaw',
  'c61rdsb',
  'c61reno',
  'c61rg1n',
  'c61rg7

In [45]:
def get_bow_models(sub_comments):
    models = dict()
    for sub, cids in sub_comments.items():
        corpus = comments.loc[cids]['body'].apply(clean_text)
        model = TfidfVectorizer()
        model.fit(corpus)
        models[sub] = model
    return models

In [46]:
bow_models = get_bow_models(sub_comments)

In [47]:
bow_models

{'Conservative': TfidfVectorizer(), 'Liberal': TfidfVectorizer()}

In [85]:
sub_limit = 5
subs_of_interest = pd.read_csv('./subreddits_of_interest.csv')
subs_of_interest = set(subs_of_interest.sort_values('submission_amount', ascending=False)['subreddit'].head(sub_limit))
subs_of_interest = subs_of_interest | set(pol_subs)
subs_of_interest

{'AskReddit', 'Conservative', 'Liberal', 'funny', 'gaming', 'pics', 'trees'}

In [89]:
pol_users['political_label'] = pol_users['political_label'].fillna('unaffiliated')
pol_users

Unnamed: 0_level_0,subreddits,political_label
author,Unnamed: 1_level_1,Unnamed: 2_level_1
---,"{gonewild, Frisson, legaladvice, DoesAnybodyEl...",unaffiliated
----------------,"{sandiego, philadelphia}",unaffiliated
----0000----,{BSD},unaffiliated
----ThisIsTheLine---,{funny},unaffiliated
----_----,"{conspiratard, ECE, science, skeptic, conspira...",unaffiliated
...,...,...
zzzzz_,{Amsterdam},unaffiliated
zzzzzamm,{flying},unaffiliated
zzzzzgh,{worldnews},unaffiliated
zzzzzzWithSevenZs,{90s},unaffiliated


In [93]:
def build_political_user_reply_graph(comments, users, bow_models, subs_of_interest):
    pol_users = dict()
    for i, user in users.iterrows():
        if not user['political_label'] == 'unaffiliated':
            pol_users[i] = user['political_label']
    print(pol_users)
    G = nx.DiGraph()
    for comment in tqdm(comments.itertuples()):
        i = getattr(comment, 'Index')
        user1 = getattr(comment, 'author')
        sub = getattr(comment, 'subreddit')
        if sub not in subs_of_interest:
            continue
        typed_parent_id = getattr(comment, 'parent_id')
        if not typed_parent_id.startswith('t1_'): # Comment
            continue
        parent_id = typed_parent_id[3:]
        if parent_id not in comments.index:
            continue
        parent = comments.loc[parent_id]
        user2 = parent['author']
        
        for sub, model in bow_models.items():
            body = getattr(comment, 'body')
            clean = clean_text(body)

            parent_body = parent['body']
            parent_clean = clean_text(parent_body)

            bows = model.transform([clean, parent_clean]).toarray() # TODO: operate on sparse version (no toarray)?
            bow = bows[0]
            parent_bow = bows[1]

            similarity = cosine_similarity(bows)[1][0]
            pol_score = sum(bow) / len(bow)
            
            sim_key = f'{sub}_sim'
            pol_key = f'{sub}_pol'
            if G.has_edge(user1, user2):
                if sim_key in G[user1][user2]:
                    G[user1][user2][sim_key].append(similarity)
                else:
                    G[user1][user2][sim_key] = [similarity]
                if pol_key in G[user1][user2]:                    
                    G[user1][user2][pol_key].append(pol_score)
                else:
                    G[user1][user2][pol_key] = [pol_score]
            else:
                G.add_edge(user1, user2, weight=0, subreddits=[], **{f'{sub}_sim': [similarity], f'{sub}_pol': [pol_score]})
            
        if G.has_edge(user1, user2):
            G[user1][user2]['weight'] += 1
            G[user1][user2]['subreddits'].append(sub)
        else:
            G.add_edge(user1, user2, weight=1, subreddits=set([sub]))
            
    for node in tqdm(G.nodes):
        G.nodes[node]['political_label'] = users.at[node, 'political_label']
    for a, b in tqdm(G.edges):
        G[a][b]['subreddits'] = ",".join(G[a][b]['subreddits'])
        for sub in bow_models.keys():
            G[a][b][f'avg_{sub}_sim'] = np.average(G[a][b][f'{sub}_sim'])
            G[a][b][f'avg_{sub}_pol'] = np.average(G[a][b][f'{sub}_pol'])
    return G

In [90]:
del comments

NameError: name 'comments' is not defined

In [None]:
for i in tqdm(range(1, 32)):
    try:
        comments = pd.read_csv(f'./data/RC_2012-09_daily/{i}.csv')
        comments.set_index('id', inplace=True)
        G = build_political_user_reply_graph(comments, pol_users, bow_models, subs_of_interest)
        for a, b in tqdm(G.edges):
            for sub in bow_models.keys():
                del G[a][b][f'{sub}_sim']
                del G[a][b][f'{sub}_pol']
        nx.readwrite.gexf.write_gexf(G, f'./data/RC_2012-09_daily/{i}.gexf')
    except FileNotFoundError:
        pass

  0%|                                                                      | 0/31 [00:00<?, ?it/s]




0it [00:00, ?it/s][A
13it [00:00, 112.08it/s][A
1245it [00:00, 6776.29it/s][A
2103it [00:00, 7436.20it/s][A
2869it [00:00, 6804.41it/s][A
3570it [00:00, 6124.83it/s][A
4322it [00:00, 6499.12it/s][A
4990it [00:00, 5797.88it/s][A
5592it [00:00, 5107.74it/s][A
6127it [00:01, 5029.04it/s][A
6645it [00:01, 4701.22it/s][A
7251it [00:01, 5037.08it/s][A
7769it [00:01, 4617.70it/s][A
8456it [00:01, 5136.96it/s][A
8986it [00:01, 4980.91it/s][A
9499it [00:01, 4954.42it/s][A
10002it [00:01, 4439.11it/s][A
10459it [00:02, 4404.63it/s][A
10908it [00:02, 4099.12it/s][A
11326it [00:02, 3974.94it/s][A
11729it [00:02, 3844.51it/s][A
12260it [00:02, 4169.33it/s][A
12684it [00:02, 4165.66it/s][A
13104it [00:02, 4106.35it/s][A
13517it [00:02, 3893.53it/s][A
14037it [00:02, 4219.12it/s][A
14463it [00:03, 4026.93it/s][A
14870it [00:03, 3368.35it/s][A
15226it [00:03, 3349.24it/s][A
15628it [00:03, 3495.27it/s][A
15989it [00:03, 3200.37it/s][A
16320it [00:03, 3220.26it/s][A
166