# Load Edge List and Attribute List

In [111]:
import pandas as pd

# Edge list: the `source` and `target` columns hold the reddit names used below.
edges = pd.read_csv(filepath_or_buffer='../data/finalEdgeList.csv')
# Previously merged attribute list.
# NOTE(review): `ex_attr` is not referenced by any cell visible in this notebook.
ex_attr = pd.read_csv(filepath_or_buffer='../data/mergedAttributeList.csv')

# Construct Attribute List

In [113]:
# Collect every reddit name that appears on either end of an edge.
source_reddits = set(edges.source.unique())
target_reddits = set(edges.target.unique())
all_reddits = source_reddits.union(target_reddits)
# Iterating a set of strings yields a different order in every Python process
# (string hashing is randomised), which would make the row order -- and the
# index attached to every derived artefact -- change between kernel restarts.
# Sorting pins the order; key=str keeps the sort safe should a non-string
# value (e.g. NaN) ever appear among the names.
df = pd.DataFrame(sorted(all_reddits, key=str), columns=['reddit_name'])

In [115]:
# Summarise the freshly built frame (row count, column names, dtypes, null counts).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1866 entries, 0 to 1865
Data columns (total 1 columns):
reddit_name    1866 non-null object
dtypes: object(1)
memory usage: 14.7+ KB


# Fetch Moderators and Subscribers

In [92]:
import sys
# Add ../src/ to the module search path so modules stored there can be imported.
sys.path.append('../src/')

# NOTE(review): `reddit` is presumably the project-local module under ../src/
# (not a PyPI package) -- confirm. This notebook uses its `reddit_api_wrapper`.
import reddit
import numpy as np

In [116]:
def fetch_attributes(reddit_name):
    """Fetch descriptive attributes for one subreddit.

    Parameters
    ----------
    reddit_name : str
        Name handed to ``reddit.reddit_api_wrapper.subreddit``.

    Returns
    -------
    tuple
        ``(subs, mods, desc, full_desc, created_utc, error)``.

        On success the first five items are the subscriber count, the list of
        moderator names, the public description, the full description and the
        creation timestamp, and ``error`` is ``np.nan``.

        On failure the first five items are ``np.nan`` and ``error`` is the
        class of the exception that was raised.

    Notes
    -----
    Only ``Exception`` subclasses are converted into an error tuple.
    ``KeyboardInterrupt``, ``SystemExit`` and other ``BaseException``s
    propagate, so a long scrape can still be interrupted or shut down cleanly.
    """
    try:
        subreddit = reddit.reddit_api_wrapper.subreddit(reddit_name)
        subs = subreddit.subscribers
        mods = [mod.name for mod in subreddit.moderator()]
        desc = subreddit.public_description
        full_desc = subreddit.description
        created_utc = subreddit.created_utc

        return (subs, mods, desc, full_desc, created_utc, np.nan)
    except Exception as exc:
        # Best-effort scrape: report the failure and keep going, recording the
        # exception class so failed reddits can be filtered out afterwards.
        print('Error with:', reddit_name)
        e = type(exc)
        print(e)
        return (np.nan, np.nan, np.nan, np.nan, np.nan, e)

In [117]:
# One fetch_attributes call per reddit name; each element of `results` is the
# 6-tuple that function returns. Failures are printed as they happen.
results = df['reddit_name'].map(fetch_attributes)

Error with: reddit
<class 'prawcore.exceptions.NotFound'>
Error with: espaã
<class 'prawcore.exceptions.NotFound'>
Error with: argentinahub
<class 'prawcore.exceptions.Redirect'>
Error with: gendercide
<class 'prawcore.exceptions.NotFound'>
Error with: discerning
<class 'prawcore.exceptions.Forbidden'>
Error with: formato
<class 'prawcore.exceptions.Forbidden'>
Error with: independiente
<class 'prawcore.exceptions.Forbidden'>
Error with: rpg_brasilespaã
<class 'prawcore.exceptions.NotFound'>
Error with: lgbtq_de_argentina
<class 'prawcore.exceptions.Redirect'>
Error with: soprtrecife
<class 'prawcore.exceptions.Redirect'>
Error with: sã
<class 'prawcore.exceptions.NotFound'>
Error with: oxfordunitedfc
<class 'prawcore.exceptions.Forbidden'>
Error with: edurne
<class 'prawcore.exceptions.Forbidden'>
Error with: reportthespammers
<class 'prawcore.exceptions.Forbidden'>
Error with: calcio
<class 'prawcore.exceptions.Forbidden'>
Error with: modtalk
<class 'prawcore.exceptions.Forbidden'>
E

In [139]:
import pickle

# Persist the raw scrape results so later cells can be re-run without
# repeating the fetch.
# NOTE(review): the extension is spelled '.picle'; it is kept as-is because
# other code may read this exact path -- confirm before renaming.
with open('../data/attrScrapeResults.picle', 'wb') as pickle_out:
    pickle.dump(results, pickle_out)

# Add to Attribute File

In [125]:
# Each element of `results` is the 6-tuple produced by fetch_attributes;
# spread its positions into one column apiece, in tuple order.
attribute_columns = ['subs', 'moderators', 'desc', 'full_desc', 'created_utc', 'error']
for position, column in enumerate(attribute_columns):
    # `position` is bound as a default argument so the lambda uses this
    # iteration's index rather than the loop variable's final value.
    df[column] = results.map(lambda row, position=position: row[position], na_action='ignore')

In [127]:
# Number of moderators per reddit; rows whose moderator list is missing
# (failed fetches) are skipped by na_action and stay NaN.
df['mod_count'] = df['moderators'].map(len, na_action='ignore')

# Remove Private and Unavailable Reddits

In [131]:
# A reddit is invalid when its fetch recorded an exception class in `error`.
has_error = df['error'].notna()
valid_reddits = df[~has_error]
invalid_reddits = df[has_error]

In [132]:
# Report (rows, columns) for the kept and the removed reddits, in that order.
for subset in (valid_reddits, invalid_reddits):
    print(subset.shape)

(1812, 9)
(54, 9)


# Make Edge List Match

In [133]:
# Keep an edge only when both of its endpoints are among the valid reddits.
kept_names = valid_reddits['reddit_name']
target_ok = edges['target'].isin(kept_names)
source_ok = edges['source'].isin(kept_names)
valid_edges = edges[target_ok & source_ok]

In [135]:
# Preview the first rows of the filtered edge list.
valid_edges.head()

Unnamed: 0.1,Unnamed: 0,source,source_type,target,target_type,edge_type,context,multi_name
0,0,latinoamerica,r,latinoamerica,r,desc,,
1,1,latinoamerica,r,argentina,r,desc,merica</li>\n<li>Images macros</li>\n<li>Posts...,
2,2,latinoamerica,r,bolivia,r,desc,quot;Yo cuando...&quot;</li>\n</ul>\n\n<hr/>\n...,
3,3,latinoamerica,r,brasil,r,desc,latinoamericanos</strong></p>\n\n<ul>\n<li><a ...,
4,4,latinoamerica,r,chile,r,desc,"dit.com/r/argentina"">r/argentina</a></li>\n<li...",


# Save Final CSVs

In [136]:
# Write the three outputs of this notebook: attributes of the valid reddits,
# the edges between them, and the reddits that were removed together with the
# exception class that caused their removal.
#
# index=False: the row index carries no information here, and writing it is
# what produces stray "Unnamed: 0" columns when a CSV is read back (the edge
# list previewed in this notebook already carries two of them).
# NOTE(review): any downstream reader that relied on the leading index column
# (e.g. read_csv(..., index_col=0)) must be checked.
valid_reddits[['reddit_name', 'desc', 'subs', 'mod_count']].to_csv('../data/finalAttributeList.csv', index=False)
valid_edges[['source', 'target', 'edge_type', 'context', 'multi_name']].to_csv('../data/finalValidEdgeList.csv', index=False)
invalid_reddits[['reddit_name', 'error']].to_csv('../data/removedReddits.csv', index=False)