In [1]:
import datetime as dt
import os
from collections import Counter

import praw
from psaw import PushshiftAPI

In [2]:
# Credentials come from the environment so they never end up in the notebook
# or its version history; os.environ[...] raises KeyError when one is unset.
# NOTE(review): the literal credentials that used to live in this cell must be
# treated as leaked and rotated.
reddit = praw.Reddit(
    client_id=os.environ['REDDIT_CLIENT_ID'],
    client_secret=os.environ['REDDIT_CLIENT_SECRET'],
    user_agent='research',
    username=os.environ['REDDIT_USERNAME'],
    password=os.environ['REDDIT_PASSWORD'],
)
# Pushshift client built on top of the authenticated PRAW instance.
api = PushshiftAPI(reddit)

Grab submissions. This can take a while: roughly 15 seconds per 1,000 submissions.

In [3]:
# Materialise up to 10,000 r/politics submissions from the search generator.
submissions = [*api.search_submissions(subreddit='politics', limit=10_000)]

print("We grabbed %s submissions" % len(submissions))

We grabbed 10000 submissions


Let's figure out which attributes exist on at least one of the submissions and are not private (i.e., whose names do not start with an underscore).

In [4]:
# Every instance-attribute name seen on at least one submission.
attributes = {name for submission in submissions for name in vars(submission)}
# Drop names with a leading underscore and order the rest alphabetically.
public_attributes = sorted(name for name in attributes if name[0] != "_")

How many are there?

In [5]:
# Number of distinct public attribute names found across all submissions.
len(public_attributes)

112

For each attribute that is not present on every submission, let's count how many submissions it appears on.

In [6]:
# For each attribute name, the number of submissions whose instance dict
# contains it. (Counter is imported in the notebook's top import cell.)
counter = Counter(name for submission in submissions for name in vars(submission))
# Public attributes that are missing from at least one submission, as
# (name, count) pairs ordered from rarest to most common.
sorted(
    (
        pair
        for pair in counter.items()
        if pair[0] in public_attributes and pair[1] < len(submissions)
    ),
    key=lambda pair: pair[1],
)

[('event_start', 3),
 ('collections', 3),
 ('event_end', 3),
 ('event_is_live', 3),
 ('author_cakeday', 86),
 ('crosspost_parent_list', 237),
 ('crosspost_parent', 237),
 ('link_flair_template_id', 2181),
 ('post_hint', 4984),
 ('preview', 4984),
 ('author_fullname', 8168),
 ('author_flair_richtext', 8168),
 ('author_flair_type', 8168),
 ('author_patreon_flair', 8168)]

In [7]:
# How many public attributes are missing from at least one submission.
# Computed from `counter` instead of `_` (IPython's "previous output" cache),
# which only holds the right list when the cells run in exactly this order.
sum(counter[name] < len(submissions) for name in public_attributes)

14

Let's see which attributes never appear with any value other than None

In [8]:
def none_only_attribute(attribute, objects=None):
    '''Report whether `attribute` is None on every object that carries it.

    attribute: name looked up in each object's instance dict (vars()).
    objects: iterable of objects to scan; when omitted, the notebook-level
        `submissions` list is used.

    Returns True when no object stores a non-None value under `attribute`
    (this includes the case where no object has the attribute at all),
    and False as soon as one non-None value is found.
    '''
    if objects is None:
        objects = submissions
    # Identity test rather than `!= None`: a value whose class customises
    # __eq__/__ne__ can compare equal to None without being None.
    return not any(
        attribute in vars(obj) and getattr(obj, attribute) is not None
        for obj in objects
    )

# Public attributes that never hold anything other than None, kept in
# public_attributes order; the bare name on the last line displays the list.
none_only_attributes = list(filter(none_only_attribute, public_attributes))
none_only_attributes

['approved_at_utc',
 'approved_by',
 'banned_at_utc',
 'banned_by',
 'category',
 'content_categories',
 'discussion_type',
 'likes',
 'mod_note',
 'mod_reason_by',
 'mod_reason_title',
 'num_reports',
 'removal_reason',
 'report_reasons',
 'view_count']

In [9]:
# Number of public attributes that never hold a non-None value.
len(none_only_attributes)

15

Which attributes are not always present, but when present, always have the value None? (This may be an empty set.)

In [10]:
# None-only attributes that are absent from at least one submission.
# Presence is tested against the instance dict (vars), the same way the other
# cells in this notebook test it, rather than with hasattr().
# NOTE(review): hasattr() goes through PRAW's attribute lookup, which may
# lazily fetch the object over the network for unknown names - confirm.
{a for a in none_only_attributes if not all(a in vars(s) for s in submissions)}

set()

In [11]:
def value_for_attribute(attribute, objects=None):
    '''Try to find a non-None value for the given attribute.

    attribute: name looked up in each object's instance dict (vars()).
    objects: iterable of objects to scan in order; when omitted, the
        notebook-level `submissions` list is used.

    Returns the first value that is not None (falsy values such as 0, '' or
    [] count as found), or None when no object has such a value.
    '''
    if objects is None:
        objects = submissions
    for obj in objects:
        if attribute in vars(obj):
            value = getattr(obj, attribute)
            # Identity test rather than `!= None`, so a value with a custom
            # __eq__/__ne__ cannot be mistaken for None.
            if value is not None:
                return value
    return None

In [12]:
def value_for_attribute_nonfalsy(attribute):
    '''Return the first truthy value stored under the attribute.

    Submissions are scanned in order; when none of them holds a truthy value,
    the result of value_for_attribute(attribute) is returned instead.
    '''
    truthy_values = (
        getattr(submission, attribute)
        for submission in submissions
        if attribute in vars(submission).keys() and getattr(submission, attribute)
    )
    # Returning from the first iteration yields the earliest truthy value.
    for found in truthy_values:
        return found
    return value_for_attribute(attribute)

Which types are taken on by the values of all the public attributes?

In [13]:
# Distinct types of the sampled value of every public attribute.
{type(value_for_attribute(name)) for name in public_attributes}

{NoneType,
 bool,
 dict,
 float,
 int,
 list,
 praw.models.reddit.redditor.Redditor,
 praw.models.reddit.subreddit.Subreddit,
 str}

In [14]:
def attributes_for_type(t):
    '''List the public attributes whose sampled value has exactly type `t`.

    The sample for each attribute is whatever value_for_attribute returns,
    so an attribute without any non-None value is listed under type(None).
    The result keeps the order of public_attributes.
    '''
    # Exact type identity on purpose: isinstance() would count bool values
    # as int, because bool is a subclass of int.
    return [a for a in public_attributes if type(value_for_attribute(a)) is t]

Note the author and subreddit attributes are praw things computed from author_fullname and subreddit_id

Which attributes take on dict values?

In [15]:
# Public attributes whose sampled (first non-None) value is a dict.
dict_attributes = attributes_for_type(dict)
dict_attributes

['gildings',
 'media',
 'media_embed',
 'preview',
 'secure_media',
 'secure_media_embed']

Which attributes take on list values?

In [16]:
# Public attributes whose sampled (first non-None) value is a list.
list_attributes = attributes_for_type(list)
list_attributes

['all_awardings',
 'author_flair_richtext',
 'awarders',
 'collections',
 'crosspost_parent_list',
 'link_flair_richtext',
 'mod_reports',
 'steward_reports',
 'user_reports']

This code can be used to "sample" interesting values for attributes. It skips Nones and tries to skip empty collections

In [17]:
# Attribute to sample; the cell that lists all values reads `a` as well.
a = "link_flair_richtext"
v = value_for_attribute_nonfalsy(a)
# Show the truthy sample, or the first non-None value when there is none.
v or value_for_attribute(a)

[{'e': 'text', 't': 'Non-whitelisted domain'}]

This code collects all values (optionally, only the non-falsy ones) taken on by the attribute `a` chosen above.

In [18]:
# Set to False to keep falsy values (None, '', [], 0, ...) in the listing.
non_falsy = True
# Values of attribute `a` on every submission that carries it, in order.
[
    value
    for value in (getattr(s, a) for s in submissions if a in vars(s))
    if value or not non_falsy
]

[[{'e': 'text', 't': 'Non-whitelisted domain'}],
 [{'e': 'text', 't': 'No Social Media'}],
 [{'e': 'text', 't': 'Rule-Breaking Title'}],
 [{'e': 'text', 't': 'Rehosted Content'}],
 [{'e': 'text', 't': 'Rule-Breaking Title'}],
 [{'e': 'text', 't': 'Already Submitted'}],
 [{'e': 'text', 't': 'Already Submitted'}],
 [{'e': 'text', 't': 'Paywall'}],
 [{'e': 'text', 't': 'Non-whitelisted domain'}],
 [{'e': 'text', 't': 'Already Submitted'}],
 [{'e': 'text', 't': 'Rule-Breaking Title'}],
 [{'e': 'text', 't': 'Non-whitelisted domain'}],
 [{'e': 'text', 't': 'No Social Media'}],
 [{'e': 'text', 't': 'Already Submitted'}],
 [{'e': 'text', 't': 'Blogging Platform'}],
 [{'e': 'text', 't': 'Non-whitelisted domain'}],
 [{'e': 'text', 't': 'Rule-Breaking Title'}],
 [{'e': 'text', 't': 'Non-whitelisted domain'}],
 [{'e': 'text', 't': 'Non-whitelisted domain'}],
 [{'e': 'text', 't': 'Non-whitelisted domain'}],
 [{'e': 'text', 't': 'Paywall'}],
 [{'e': 'text', 't': 'Non-whitelisted domain'}],
 [{'e': '

I recommend grabbing at least:
- media: the url attribute from the media dict; other subattributes of interest may be author_name, HTML, provider_name, provider_url
- preview: the URL of the highest-resolution preview image
- gildings: assume the coin types are stable and make integer columns, or pickle the entire dict?
- all_awardings: is the list of awards readers can give of any importance?
- author_flair_richtext: this is an array of decorations. Pickle?
- awarders: a list of redditor display names. Pickle?
- collections: big lists of submissions... don't know
- crosspost_parent_list: potentially a lot of info here...
- link_flair_richtext: typically there's one English phrase explaining a reason for submission rejection
- mod_reports, steward_reports, user_reports: these appear to be censored

In [19]:
# Types treated as primitive when classifying attribute values.
primitives = [bool, float, int, str]

In [21]:
# Public attributes whose sampled (first non-None) value has a primitive type.
primitive_attributes = [
    name
    for name in public_attributes
    if type(value_for_attribute(name)) in primitives
]
# Pair each one with the type of a truthy sample where one exists.
[(name, type(value_for_attribute_nonfalsy(name))) for name in primitive_attributes]

[('allow_live_comments', bool),
 ('archived', bool),
 ('author_cakeday', bool),
 ('author_flair_background_color', str),
 ('author_flair_css_class', str),
 ('author_flair_template_id', str),
 ('author_flair_text', str),
 ('author_flair_text_color', str),
 ('author_flair_type', str),
 ('author_fullname', str),
 ('author_patreon_flair', bool),
 ('can_gild', bool),
 ('can_mod_post', bool),
 ('clicked', bool),
 ('comment_limit', int),
 ('comment_sort', str),
 ('contest_mode', bool),
 ('created', float),
 ('created_utc', float),
 ('crosspost_parent', str),
 ('distinguished', str),
 ('domain', str),
 ('downs', int),
 ('edited', float),
 ('event_end', float),
 ('event_is_live', bool),
 ('event_start', float),
 ('gilded', int),
 ('hidden', bool),
 ('hide_score', bool),
 ('id', str),
 ('is_crosspostable', bool),
 ('is_meta', bool),
 ('is_original_content', bool),
 ('is_reddit_media_domain', bool),
 ('is_robot_indexable', bool),
 ('is_self', bool),
 ('is_video', bool),
 ('link_flair_background_c

In [None]:
def longest_str(attribute, objects=None):
    '''Return the length of the longest truthy value stored under `attribute`.

    attribute: name looked up in each object's instance dict (vars()).
    objects: iterable of objects to scan; when omitted, the notebook-level
        `submissions` list is used.

    Falsy values (None, '') are skipped. Returns 0 when nothing is left to
    measure, e.g. for an attribute that is only ever the empty string;
    without the default, max() raises ValueError on that empty input.
    '''
    if objects is None:
        objects = submissions
    values = (getattr(obj, attribute) for obj in objects if attribute in vars(obj))
    return max((len(value) for value in values if value), default=0)

In [None]:
# Longest observed value for every primitive attribute sampled as a str.
# Built with a comprehension so the notebook-level name `a` (read by the cell
# that lists all values of one attribute) is not rebound by a loop variable.
attribute_lengths = {
    name: longest_str(name)
    for name in primitive_attributes
    if type(value_for_attribute(name)) is str
}
attribute_lengths