In [1]:
import datetime as dt
import os
from collections import Counter

import praw
from psaw import PushshiftAPI

In [2]:
# Credentials are read from the environment so that no secret is stored in the
# notebook itself. Set REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USERNAME
# and REDDIT_PASSWORD before starting the kernel; a missing variable raises
# KeyError right here instead of failing later during a request.
# NOTE(review): the credentials that used to be hardcoded in this cell should
# be treated as leaked and rotated.
reddit = praw.Reddit(
    client_id=os.environ['REDDIT_CLIENT_ID'],
    client_secret=os.environ['REDDIT_CLIENT_SECRET'],
    user_agent='research',
    username=os.environ['REDDIT_USERNAME'],
    password=os.environ['REDDIT_PASSWORD'],
)
# Hand the PRAW client to the Pushshift wrapper used for searching below.
api = PushshiftAPI(reddit)

Grab comments. This can take a while: roughly 15 seconds per 1,000 comments.

In [3]:
# Search the 'politics' subreddit (capped at 10,000 results) and materialise
# everything the search yields into a list.
comments = [*api.search_comments(subreddit='politics', limit=10_000)]

print("We grabbed %s comments" % (len(comments),))

We grabbed 10000 comments


Let's figure out which attributes exist on at least one of the comments and are not private (i.e., whose names do not start with an underscore).

In [4]:
# Union of every instance-attribute name seen on any comment.
attributes = set()
for comment in comments:
    attributes.update(vars(comment))
# Names with a leading underscore are treated as private and dropped.
# startswith() also copes with an empty name, which indexing name[0] would not.
public_attributes = sorted(name for name in attributes if not name.startswith("_"))

How many are there?

In [5]:
# Number of distinct public attribute names collected above.
len(public_attributes)

62

Let's count how often each attribute appears, listing only the attributes that are missing from at least one comment.

In [6]:
# For every attribute name, count how many comments carry it. `Counter` is
# imported in the notebook's import cell at the top.
counter = Counter(name for comment in comments for name in vars(comment))
# Keep only public attributes that are missing from at least one comment and
# list them from rarest to most common (ties keep first-seen order).
sorted(
    ((name, count) for name, count in counter.items()
     if name in public_attributes and count < len(comments)),
    key=lambda item: item[1],
)

[('author_cakeday', 23),
 ('author_flair_type', 9165),
 ('author_fullname', 9165),
 ('author_patreon_flair', 9165),
 ('author_flair_richtext', 9165)]

In [7]:
# Number of public attributes that are missing from at least one comment.
# Computed explicitly rather than through `_` (the previous cell's output),
# which is only correct when this cell runs directly after the cell above.
sum(1 for name, count in counter.items()
    if name in public_attributes and count < len(comments))

5

Let's see which attributes never appear with any value other than None

In [8]:
def none_only_attribute(attribute):
    '''Return True if `attribute` never holds a non-None value on any comment.

    Only comments that carry the attribute in their instance dictionary are
    inspected; comments lacking it are ignored. Reads the notebook-level
    `comments` list.
    '''
    # `is not None` is an identity test, so a value's own __eq__/__ne__
    # implementation cannot influence the outcome.
    return not any(
        attribute in vars(comment) and getattr(comment, attribute) is not None
        for comment in comments
    )

# Public attributes for which none_only_attribute() holds, in the same order
# as public_attributes.
none_only_attributes = list(filter(none_only_attribute, public_attributes))
none_only_attributes

['approved_at_utc',
 'approved_by',
 'associated_award',
 'banned_at_utc',
 'banned_by',
 'likes',
 'mod_note',
 'mod_reason_by',
 'mod_reason_title',
 'num_reports',
 'removal_reason',
 'report_reasons']

In [9]:
# Number of public attributes that are None whenever they are present.
len(none_only_attributes)

12

Which attributes are not always present, but when present, always have the value None? (This may be an empty set.)

In [10]:
# None-only attributes that are missing from at least one comment.
# Presence is tested against the instance dictionary, exactly as the other
# cells do, instead of with hasattr(). hasattr() goes through the object's
# dynamic attribute lookup, which for PRAW models may trigger a lazy fetch
# (NOTE(review): confirm against the PRAW version in use).
{a for a in none_only_attributes if not all(a in vars(s) for s in comments)}

set()

In [11]:
def value_for_attribute(attribute):
    '''Return the first non-None value of `attribute` found among the comments.

    Comments that do not carry the attribute in their instance dictionary are
    skipped. Returns None when no comment holds a non-None value. Reads the
    notebook-level `comments` list.
    '''
    for comment in comments:
        if attribute not in vars(comment):
            continue
        value = getattr(comment, attribute)
        # Identity test: a value whose __eq__/__ne__ is overridden must still
        # count as "not None".
        if value is not None:
            return value
    return None

In [12]:
def value_for_attribute_nonfalsy(attribute):
    '''Return the first truthy value of `attribute` found among the comments.

    Comments without the attribute in their instance dictionary are skipped.
    When no truthy value exists, the result of value_for_attribute() is
    returned instead.
    '''
    for comment in comments:
        if attribute not in vars(comment):
            continue
        candidate = getattr(comment, attribute)
        if candidate:
            return candidate
    # Nothing truthy was found: fall back to the first non-None value, if any.
    return value_for_attribute(attribute)

Which types are taken on by the values of all the public attributes?

In [13]:
# Distinct types of the sampled (first non-None) value of each public attribute.
{type(value_for_attribute(name)) for name in public_attributes}

{NoneType,
 bool,
 dict,
 float,
 int,
 list,
 praw.models.reddit.redditor.Redditor,
 praw.models.reddit.subreddit.Subreddit,
 str}

In [14]:
def attributes_for_type(t):
    '''Return the public attributes whose sampled value has exactly type `t`.

    The sampled value is whatever value_for_attribute() returns; subclasses of
    `t` do not match. Order follows `public_attributes`.
    '''
    matching = []
    for name in public_attributes:
        if type(value_for_attribute(name)) is t:
            matching.append(name)
    return matching

Note that the `author` and `subreddit` attributes are PRAW objects (`Redditor` and `Subreddit`) computed from `author_fullname` and `subreddit_id`.

Which attributes take on dict values?

In [15]:
# Public attributes whose sampled (first non-None) value is exactly a dict.
dict_attributes = attributes_for_type(dict)
dict_attributes

['gildings']

Which attributes take on list values?

In [16]:
# Public attributes whose sampled (first non-None) value is exactly a list.
list_attributes = attributes_for_type(list)
list_attributes

['all_awardings',
 'author_flair_richtext',
 'awarders',
 'mod_reports',
 'steward_reports',
 'user_reports']

This code can be used to "sample" interesting values for attributes. It skips Nones and tries to skip empty collections

In [17]:
# Attribute to sample; the following cell reads `a` as well.
a = "author_flair_richtext"
# value_for_attribute_nonfalsy() already falls back to value_for_attribute()
# when no truthy value exists, so no second fallback is needed here.
v = value_for_attribute_nonfalsy(a)
v

[{'a': ':flag-nj:',
  'e': 'emoji',
  'u': 'https://emoji.redditmedia.com/k0kxh6bxoxe11_t5_2cneq/flag-nj'},
 {'e': 'text', 't': ' New Jersey'}]

This code collects all values (optionally, only the non-falsy ones) taken on by the attribute.

In [18]:
# Set to False to include falsy values (None, empty collections, ...) as well.
non_falsy = True
# Every value of attribute `a` across the comments that carry it, optionally
# restricted to truthy values.
[
    value
    for value in (getattr(comment, a) for comment in comments if a in vars(comment))
    if not non_falsy or value
]

[[{'a': ':flag-nj:',
   'e': 'emoji',
   'u': 'https://emoji.redditmedia.com/k0kxh6bxoxe11_t5_2cneq/flag-nj'},
  {'e': 'text', 't': ' New Jersey'}],
 [{'a': ':flag-pa:',
   'e': 'emoji',
   'u': 'https://emoji.redditmedia.com/mu4mvnpxoxe11_t5_2cneq/flag-pa'},
  {'e': 'text', 't': ' Pennsylvania'}],
 [{'a': ':flag-tx:',
   'e': 'emoji',
   'u': 'https://emoji.redditmedia.com/3yap2qzxoxe11_t5_2cneq/flag-tx'},
  {'e': 'text', 't': ' Texas'}],
 [{'a': ':flag-gb:',
   'e': 'emoji',
   'u': 'https://emoji.redditmedia.com/h0aaa8ce91n11_t5_2cneq/flag-gb'},
  {'e': 'text', 't': ' United Kingdom'}],
 [{'a': ':flag-mi:',
   'e': 'emoji',
   'u': 'https://emoji.redditmedia.com/k2igzd16oxe11_t5_2cneq/flag-mi'},
  {'e': 'text', 't': ' Michigan'}],
 [{'a': ':flag-wi:',
   'e': 'emoji',
   'u': 'https://emoji.redditmedia.com/1311899yoxe11_t5_2cneq/flag-wi'},
  {'e': 'text', 't': ' Wisconsin'}],
 [{'a': ':flag-nc:',
   'e': 'emoji',
   'u': 'https://emoji.redditmedia.com/onk5kb3xoxe11_t5_2cneq/flag-nc'

In [19]:
# Types treated as "primitive" when classifying attribute values below.
primitives = [bool, float, int, str]

In [20]:
# Public attributes whose sampled (first non-None) value is of a primitive type.
primitive_attributes = [
    name
    for name in public_attributes
    if type(value_for_attribute(name)) in primitives
]
# Pair each one with the type of its first truthy value (falling back to the
# first non-None value when nothing truthy exists).
[(name, type(value_for_attribute_nonfalsy(name))) for name in primitive_attributes]

[('archived', bool),
 ('author_cakeday', bool),
 ('author_flair_background_color', str),
 ('author_flair_css_class', str),
 ('author_flair_template_id', str),
 ('author_flair_text', str),
 ('author_flair_text_color', str),
 ('author_flair_type', str),
 ('author_fullname', str),
 ('author_patreon_flair', bool),
 ('body', str),
 ('body_html', str),
 ('can_gild', bool),
 ('can_mod_post', bool),
 ('collapsed', bool),
 ('collapsed_reason', str),
 ('controversiality', int),
 ('created', float),
 ('created_utc', float),
 ('distinguished', str),
 ('downs', int),
 ('edited', float),
 ('gilded', int),
 ('id', str),
 ('is_submitter', bool),
 ('link_id', str),
 ('locked', bool),
 ('name', str),
 ('no_follow', bool),
 ('parent_id', str),
 ('permalink', str),
 ('saved', bool),
 ('score', int),
 ('score_hidden', bool),
 ('send_replies', bool),
 ('stickied', bool),
 ('subreddit_id', str),
 ('subreddit_name_prefixed', str),
 ('subreddit_type', str),
 ('total_awards_received', int),
 ('ups', int)]

In [21]:
def longest_str(attribute):
    '''Return the length of the longest truthy value of `attribute` across comments.

    Comments lacking the attribute, or holding a falsy value for it (None, ''),
    are skipped. Returns 0 when no comment holds a truthy value. Reads the
    notebook-level `comments` list.
    '''
    present = (
        getattr(comment, attribute)
        for comment in comments
        if attribute in vars(comment)
    )
    # default=0 keeps max() from raising ValueError when every value is falsy,
    # e.g. an attribute that only ever holds the empty string.
    return max((len(value) for value in present if value), default=0)

In [22]:
# Longest observed value for every string-valued primitive attribute.
# A comprehension keeps its loop variable local, so the notebook-level `a`
# chosen in the sampling cell is not overwritten when this cell runs.
attribute_lengths = {
    name: longest_str(name)
    for name in primitive_attributes
    if type(value_for_attribute(name)) is str
}
attribute_lengths

{'author_flair_background_color': 7,
 'author_flair_css_class': 23,
 'author_flair_template_id': 36,
 'author_flair_text': 30,
 'author_flair_text_color': 4,
 'author_flair_type': 8,
 'author_fullname': 11,
 'body': 9908,
 'body_html': 10728,
 'collapsed_reason': 29,
 'distinguished': 9,
 'id': 7,
 'link_id': 9,
 'name': 10,
 'parent_id': 10,
 'permalink': 87,
 'subreddit_id': 8,
 'subreddit_name_prefixed': 10,
 'subreddit_type': 6}