In [1]:
import pandas as pd
import time
import datetime
import requests

#### Step one: make a smiple request to the PushShift API

In [2]:
# define url
url = "https://api.pushshift.io/reddit/search/submission?subreddit=physics"

In [3]:
# use requests to submit your call
res = requests.get(url)

In [4]:
res.status_code

200

#### Step two: extract the data

In [8]:
# extract
res.json()['data']

[{'subreddit': 'Physics',
  'selftext': '',
  'author_fullname': 't2_dajlcs32',
  'gilded': 0,
  'title': 'Is perspective a dimension?',
  'link_flair_richtext': [],
  'subreddit_name_prefixed': 'r/Physics',
  'hidden': False,
  'pwls': 6,
  'link_flair_css_class': 'question',
  'thumbnail_height': None,
  'top_awarded_type': None,
  'hide_score': True,
  'quarantine': False,
  'link_flair_text_color': 'dark',
  'upvote_ratio': 1.0,
  'author_flair_background_color': None,
  'subreddit_type': 'public',
  'total_awards_received': 0,
  'media_embed': {},
  'thumbnail_width': None,
  'author_flair_template_id': None,
  'is_original_content': False,
  'secure_media': None,
  'is_reddit_media_domain': False,
  'is_meta': False,
  'category': None,
  'secure_media_embed': {},
  'link_flair_text': 'Question',
  'score': 1,
  'is_created_from_ads_ui': False,
  'author_premium': False,
  'thumbnail': 'self',
  'edited': False,
  'author_flair_css_class': None,
  'author_flair_richtext': [],
  '

In [9]:
df = pd.DataFrame(res.json()['data'])
df

Unnamed: 0,subreddit,selftext,author_fullname,gilded,title,link_flair_richtext,subreddit_name_prefixed,hidden,pwls,link_flair_css_class,...,is_video,retrieved_utc,updated_utc,utc_datetime_str,post_hint,url_overridden_by_dest,preview,is_gallery,media_metadata,gallery_data
0,Physics,,t2_dajlcs32,0,Is perspective a dimension?,[],r/Physics,False,6,question,...,False,1671152800,1671152801,2022-12-16 01:06:28,,,,,,
1,Physics,,t2_7ozb5t3u,0,"can someone explain this, the ice froze upward...",[],r/Physics,False,6,image,...,False,1671151062,1671151063,2022-12-16 00:37:24,image,https://i.redd.it/lhp9xg2fo56a1.png,{'images': [{'source': {'url': 'https://previe...,,,
2,Physics,Can a current applied to a material affect its...,t2_8qh8gs8h,0,If heat can change the conductivity of a mater...,[],r/Physics,False,6,question,...,False,1671149967,1671149968,2022-12-16 00:19:12,,,,,,
3,Physics,"Hey everyone, with the new breakthrough in fus...",t2_kcl9ib2z,0,Tritium breeding/ production?,[],r/Physics,False,6,question,...,False,1671147514,1671147515,2022-12-15 23:38:18,,,,,,
4,Physics,,t2_t6enp,0,Built a silicon diode based particle detector,[],r/Physics,False,6,,...,False,1671138390,1671138391,2022-12-15 21:06:13,,https://www.reddit.com/gallery/zmw3bu,,True,"{'1gf96zeum46a1': {'status': 'valid', 'e': 'Im...","{'items': [{'media_id': 'rnps2r8um46a1', 'id':..."
5,Physics,"yes, i couldn't get into a level maths, so i k...",t2_ca06m57d,0,want to go to uni for physics but couldn't get...,[],r/Physics,False,6,,...,False,1671138271,1671138272,2022-12-15 21:04:14,,,,,,
6,Physics,I got into a debate with someone about free wi...,t2_bb2jeaw6,0,Counter argument to reductionist materialism?,[],r/Physics,False,6,question,...,False,1671136225,1671136226,2022-12-15 20:30:11,,,,,,
7,Physics,"Hi everyone, hope you are well. \n\nDo you kno...",t2_37twd9i2,0,Where is Hubert Reeves ?,[],r/Physics,False,6,question,...,False,1671134270,1671134270,2022-12-15 19:57:38,,,,,,
8,Physics,One of the prerequisites of the masters projec...,t2_acc4gp6h,0,Recommended textbooks for learning Yang Mills ...,[],r/Physics,False,6,question,...,False,1671134211,1671134212,2022-12-15 19:56:39,,,,,,
9,Physics,"You are standing on Col d’Aubisque, 1710m abov...",t2_6nbqqo46,0,"Tomorrow exam, please help",[],r/Physics,False,6,,...,False,1671131969,1671131969,2022-12-15 19:19:15,,,,,,


In [10]:
df['id'].nunique() == len(df)

True

Take some time to look through all the different info (`fields`) that were retrieved.

#### Step three: let's make it a bit more complex
Instead of joining all different parameters together in a string, let's make use of the fact that we can pass a dictionary of parameters in the `.get()` function. Let's also use some error handling to only construct the dataframe if the request is successful. <br>
<br>
Take a minute to look through the parameters for the submissions endpoint and see what might be useful...

In [8]:
base_url = "https://api.pushshift.io/reddit/search/submission"
params = {
    'subreddit': 'physics', 
    'size': 1000,
}

res = requests.get(base_url, params)

if res.status_code == 200:
    posts = pd.DataFrame(res.json()['data'])
    print(len(posts))
else:
    print(f'status: {res.status_code}')

1000


In [9]:
# if you want to use the after parameter
time_min = posts['created_utc'].min()

Let's make a first loop

In [4]:
# make sure to hard code this before project submission
time_now = int(datetime.datetime.now().timestamp())
time_now

1671230994

In [5]:
# subreddit was created 3/16/08, let's start at 3/17/08 to be safe
time_min = int(datetime.datetime(2008, 3, 17).timestamp())
time_min

1205737200

In [61]:
base_url = "https://api.pushshift.io/reddit/search/submission"
params = {
    'subreddit': 'nostupidquestions', 
    'size'     : 1000,
    'before'   : time_now,
    'after'    : time_min    
}

all_posts = []
for _ in range(100):
# while time_to_pull>time_min
    res = requests.get(base_url, params)
    try:
        posts = pd.DataFrame(res.json()['data'])
        all_posts.append(posts)
        params['before'] = posts['created_utc'].min()
    except:
        print(f'status: {res.status_code}')
        break
        
all_posts = pd.concat(all_posts)

status: 504


In [62]:
all_posts.shape

(12987, 91)

In [24]:
all_posts['id'].nunique()

15984

In [25]:
all_posts.columns

Index(['subreddit', 'selftext', 'author_fullname', 'gilded', 'title',
       'link_flair_richtext', 'subreddit_name_prefixed', 'hidden', 'pwls',
       'link_flair_css_class', 'thumbnail_height', 'top_awarded_type',
       'hide_score', 'quarantine', 'link_flair_text_color', 'upvote_ratio',
       'author_flair_background_color', 'subreddit_type',
       'total_awards_received', 'media_embed', 'thumbnail_width',
       'author_flair_template_id', 'is_original_content', 'secure_media',
       'is_reddit_media_domain', 'is_meta', 'category', 'secure_media_embed',
       'link_flair_text', 'score', 'is_created_from_ads_ui', 'author_premium',
       'thumbnail', 'edited', 'author_flair_css_class',
       'author_flair_richtext', 'gildings', 'content_categories', 'is_self',
       'link_flair_type', 'wls', 'removed_by_category', 'author_flair_type',
       'domain', 'allow_live_comments', 'suggested_sort', 'view_count',
       'archived', 'no_follow', 'is_crosspostable', 'pinned', 'over_18'

In [26]:
all_posts['created_utc'].min()

1670203660

In [40]:
all_posts['id'].nunique() - (all_posts['title']=='').sum()

15984

In [58]:
base_url = "https://api.pushshift.io/reddit/search/submission"
params = {
    'subreddit': 'explainlikeimfive', 
    'size'     : 1000,
    'before'   : time_now,
    'after'    : time_min    
}

all_posts2 = []
time_to_pull = time_now
for _ in range(100):
# while time_to_pull>time_min
    params['before'] = time_to_pull
    res = requests.get(base_url, params)
    try:
        posts = pd.DataFrame(res.json()['data'])
        all_posts2.append(posts)
        time_to_pull = posts['created_utc'].min()
    except:
        print(f'status: {res.status_code}')
        break
        
all_posts2 = pd.concat(all_posts2)

status: 200


In [59]:
all_posts2.shape

(10003, 91)

In [60]:
all_posts2['created_utc'].min()

1667503942

In [37]:
(all_posts2['selftext']=='').sum()

13404

In [39]:
all_posts2['id'].nunique() - (all_posts2['title']=='').sum()

13997

In [43]:
all_posts.columns

Index(['subreddit', 'selftext', 'author_fullname', 'gilded', 'title',
       'link_flair_richtext', 'subreddit_name_prefixed', 'hidden', 'pwls',
       'link_flair_css_class', 'thumbnail_height', 'top_awarded_type',
       'hide_score', 'quarantine', 'link_flair_text_color', 'upvote_ratio',
       'author_flair_background_color', 'subreddit_type',
       'total_awards_received', 'media_embed', 'thumbnail_width',
       'author_flair_template_id', 'is_original_content', 'secure_media',
       'is_reddit_media_domain', 'is_meta', 'category', 'secure_media_embed',
       'link_flair_text', 'score', 'is_created_from_ads_ui', 'author_premium',
       'thumbnail', 'edited', 'author_flair_css_class',
       'author_flair_richtext', 'gildings', 'content_categories', 'is_self',
       'link_flair_type', 'wls', 'removed_by_category', 'author_flair_type',
       'domain', 'allow_live_comments', 'suggested_sort', 'view_count',
       'archived', 'no_follow', 'is_crosspostable', 'pinned', 'over_18'

In [44]:
all_posts2.columns

Index(['subreddit', 'selftext', 'author_fullname', 'gilded', 'title',
       'link_flair_richtext', 'subreddit_name_prefixed', 'hidden', 'pwls',
       'link_flair_css_class', 'thumbnail_height', 'top_awarded_type',
       'hide_score', 'quarantine', 'link_flair_text_color', 'upvote_ratio',
       'author_flair_background_color', 'subreddit_type',
       'total_awards_received', 'media_embed', 'thumbnail_width',
       'author_flair_template_id', 'is_original_content', 'secure_media',
       'is_reddit_media_domain', 'is_meta', 'category', 'secure_media_embed',
       'link_flair_text', 'score', 'is_created_from_ads_ui', 'author_premium',
       'thumbnail', 'edited', 'author_flair_css_class',
       'author_flair_richtext', 'gildings', 'content_categories', 'is_self',
       'link_flair_type', 'wls', 'removed_by_category', 'author_flair_type',
       'domain', 'allow_live_comments', 'suggested_sort', 'view_count',
       'archived', 'no_follow', 'is_crosspostable', 'pinned', 'over_18'

## Exceptional, going to write these to csv and try them out!

In [45]:
all_posts.to_csv('../data/nostupidquestions_new.csv')
all_posts2.to_csv('../data/askreddit_new.csv')