# Reddit API Data Scraping
---

In this notebook, I use Reddit's built-in `.json` API endpoints to scrape post data from four subreddits. I then export this data to .csv files for use in my analysis notebook. 

My chosen subreddits are as follows:

- r/nba
- r/nfl
- r/cfb
- r/CollegeBasketball

I have taken mostly new posts from the subreddits, but I have also supplemented each dataset with the top 500 posts from the past year.

In [2]:
# import libraries
import requests
import time
import pandas as pd

In [3]:
# lift pandas' display limits so rendered dataframes show every column and row
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [4]:
# load the previously saved posts for each subreddit, in the order nba, nfl, cfb, cbb
nba_df, nfl_df, cfb_df, cbb_df = map(
    pd.read_csv,
    [
        './data/nba_subreddit_data.csv',
        './data/nfl_subreddit_data.csv',
        './data/cfb_subreddit_data.csv',
        './data/cbb_subreddit_data.csv',
    ],
)

In [5]:
# show (rows, columns) of each loaded dataframe, one per line: nba, nfl, cfb, cbb
print(nba_df.shape, nfl_df.shape, cfb_df.shape, cbb_df.shape, sep='\n')

(2507, 100)
(2976, 100)
(1984, 103)
(1995, 104)


In [6]:
# enter subreddit urls
# each url is a subreddit address with '.json' appended
nba_url = 'https://www.reddit.com/r/nba.json'
nfl_url = 'https://www.reddit.com/r/nfl.json'
cfb_url = 'https://www.reddit.com/r/cfb.json'
# r/CollegeBasketball is addressed here in all lowercase
cbb_url = 'https://www.reddit.com/r/collegebasketball.json'
# establish our header
# passed as the `headers` argument to the requests.get calls below
header = {'User-agent': 'subreddit get requests'}

In [7]:
# initial get request to test API
# fetch the r/cfb listing once; `res` is kept so its status can be inspected afterwards
res = requests.get(cfb_url, headers=header)
# parse the response body as JSON so its structure can be explored
cfb_res = res.json()

In [8]:
# check request status
# reddit_scraper only keeps a page when this code is 200
res.status_code

200

In [9]:
# explore keys for test request
# list the field names available on the first post of the returned listing
cfb_res['data']['children'][0]['data'].keys()

dict_keys(['approved_at_utc', 'subreddit', 'selftext', 'author_fullname', 'saved', 'mod_reason_title', 'gilded', 'clicked', 'title', 'link_flair_richtext', 'subreddit_name_prefixed', 'hidden', 'pwls', 'link_flair_css_class', 'downs', 'thumbnail_height', 'hide_score', 'name', 'quarantine', 'link_flair_text_color', 'author_flair_background_color', 'subreddit_type', 'ups', 'total_awards_received', 'media_embed', 'thumbnail_width', 'author_flair_template_id', 'is_original_content', 'user_reports', 'secure_media', 'is_reddit_media_domain', 'is_meta', 'category', 'secure_media_embed', 'link_flair_text', 'can_mod_post', 'score', 'approved_by', 'thumbnail', 'edited', 'author_flair_css_class', 'author_flair_richtext', 'gildings', 'content_categories', 'is_self', 'mod_note', 'created', 'link_flair_type', 'wls', 'banned_by', 'author_flair_type', 'domain', 'allow_live_comments', 'selftext_html', 'likes', 'suggested_sort', 'banned_at_utc', 'view_count', 'archived', 'no_follow', 'is_crosspostable', 

In [10]:
# define function to get num pages of posts from a subreddit, start collecting at a defined after
def reddit_scraper(url, num, after=None):
    """Download up to `num` listing pages from a subreddit .json url.

    Parameters
    ----------
    url : str
        Listing url to request, e.g. 'https://www.reddit.com/r/nba.json'.
    num : int
        Maximum number of pages (get requests) to make.
    after : str, optional
        Value sent as the `after` query parameter on the first request, so
        collection resumes from that point. When None, the first request
        is sent without any query parameters.

    Returns
    -------
    pandas.DataFrame
        One row per collected post, built from each post's 'data'
        dictionary. Empty when no page was downloaded successfully.

    Notes
    -----
    Request headers come from the module-level `header` variable.
    Paging stops early in two cases:

    * a response has a status code other than 200 (the code is printed);
    * a response reports `after` as None. Without an `after` value the next
      request would carry no query parameters and would download the first
      page again, adding duplicate posts.

    The last `after` value seen is printed before returning.
    """
    posts = []
    for _ in range(num):
        # only send `after` once there is one; the first page needs no params
        params = {} if after is None else {'after': after}
        res = requests.get(url, params=params, headers=header)

        # anything other than 200 means the page was not downloaded
        if res.status_code != 200:
            print(res.status_code)
            break

        listing = res.json()['data']
        posts.extend(listing['children'])
        after = listing['after']

        # no further page to ask for: stop instead of restarting at page one
        if after is None:
            break

        # wait 1 second between requests
        time.sleep(1)

    # create a new dataframe with the 'data' from each post
    new_df = pd.DataFrame([post['data'] for post in posts])

    # print final value of after
    print(f'Final value of after parameter: {after}')

    return new_df

## Data from r/nba
---

In [11]:
# request 10 pages of r/nba posts, starting from the first page
new_nba_df = reddit_scraper(url=nba_url, num=10)

Final value of after parameter: t3_cc3vwn


In [12]:
# check shape of scraped dataframe
# (rows, columns): one row per collected post, one column per key found in the posts' 'data' dicts
new_nba_df.shape

(252, 98)

In [13]:
# stack the new scrape below the previously saved posts (columns sorted by name)
# note: running this cell again stacks nba_df onto the already combined frame
new_nba_df = pd.concat(objs=(nba_df, new_nba_df), axis='index', sort=True)

In [14]:
# confirm concatenation
# the row count should equal the rows of nba_df plus the rows scraped above
new_nba_df.shape

(2759, 100)

In [15]:
# renumber the rows 0..n-1, discarding the index values carried over from the two source frames
new_nba_df = new_nba_df.reset_index(drop=True)

In [16]:
# count number of unique posts
# treats the 'name' column as the post identifier; a count below the number
# of rows means the frame holds repeated posts
new_nba_df['name'].nunique()

2062

In [17]:
# export CSV of original and new data
# write the backup of the previously loaded data first, so an intact copy
# exists on disk before the main file is overwritten
nba_df.to_csv("./data/nba_subreddit_data - backup.csv", index=False)
new_nba_df.to_csv("./data/nba_subreddit_data.csv", index=False)

## Data from r/nfl
---

In [33]:
# request 40 pages of r/nfl posts, starting from the first page
new_nfl_df = reddit_scraper(url=nfl_url, num=40)

Final value of after parameter: None


In [34]:
# check shape of scraped dataframe
# (rows, columns): one row per collected post, one column per key found in the posts' 'data' dicts
new_nfl_df.shape

(995, 99)

In [35]:
# stack the new scrape below the saved posts, then renumber the rows from 0
# note: running this cell again stacks nfl_df onto the already combined frame
new_nfl_df = (
    pd.concat(objs=(nfl_df, new_nfl_df), axis='index', sort=True)
    .reset_index(drop=True)
)

In [36]:
# confirm concatenation
# the row count should equal the rows of nfl_df plus the rows scraped above
new_nfl_df.shape

(2976, 100)

In [37]:
# count number of unique posts
# treats the 'name' column as the post identifier; a count below the number
# of rows means the frame holds repeated posts
new_nfl_df['name'].nunique()

1787

In [38]:
# export CSV of original and new data
# write the backup of the previously loaded data first, so an intact copy
# exists on disk before the main file is overwritten
nfl_df.to_csv("./data/nfl_subreddit_data - backup.csv", index=False)
new_nfl_df.to_csv("./data/nfl_subreddit_data.csv", index=False)

## Data from r/cfb
---

In [35]:
# request 20 pages of r/cfb posts, starting from the first page
new_cfb_df = reddit_scraper(url=cfb_url, num=20)

Final value of after parameter: t3_capdm9


In [36]:
# check shape of scraped dataframe
# (rows, columns): one row per collected post, one column per key found in the posts' 'data' dicts
new_cfb_df.shape

(984, 103)

In [26]:
# stack the new scrape below the saved posts, then renumber the rows from 0
# note: running this cell again stacks cfb_df onto the already combined frame
new_cfb_df = (
    pd.concat(objs=(cfb_df, new_cfb_df), axis='index', sort=True)
    .reset_index(drop=True)
)

In [27]:
# confirm concatenation
# the row count should equal the rows of cfb_df plus the rows scraped above
new_cfb_df.shape

(1984, 103)

In [28]:
# count number of unique posts
# treats the 'name' column as the post identifier; a count below the number
# of rows means the frame holds repeated posts
new_cfb_df['name'].nunique()

1861

In [29]:
# export CSV of original and new data
# write the backup of the previously loaded data first, so an intact copy
# exists on disk before the main file is overwritten
cfb_df.to_csv("./data/cfb_subreddit_data - backup.csv", index=False)
new_cfb_df.to_csv("./data/cfb_subreddit_data.csv", index=False)

## Data from r/CollegeBasketball
---

In [37]:
# request 40 pages of r/CollegeBasketball posts, starting from the first page
new_cbb_df = reddit_scraper(url=cbb_url, num=40)

Final value of after parameter: t3_car435


In [38]:
# check shape of scraped dataframe
# (rows, columns): one row per collected post, one column per key found in the posts' 'data' dicts
new_cbb_df.shape

(995, 104)

In [39]:
# stack the new scrape below the saved posts, then renumber the rows from 0
# note: running this cell again stacks cbb_df onto the already combined frame
new_cbb_df = (
    pd.concat(objs=(cbb_df, new_cbb_df), axis='index', sort=True)
    .reset_index(drop=True)
)

In [40]:
# confirm concatenation
# the row count should equal the rows of cbb_df plus the rows scraped above
new_cbb_df.shape

(1995, 104)

In [41]:
# count number of unique posts
# treats the 'name' column as the post identifier; a count below the number
# of rows means the frame holds repeated posts
new_cbb_df['name'].nunique()

1885

In [42]:
# export CSV of original and new data
# write the backup of the previously loaded data first, so an intact copy
# exists on disk before the main file is overwritten
cbb_df.to_csv("./data/cbb_subreddit_data - backup.csv", index=False)
new_cbb_df.to_csv("./data/cbb_subreddit_data.csv", index=False)