# Reddit API Data Collection
###### By: Nick Gayliard

In [1]:
import requests
import time
import pandas as pd
import numpy as np
import re
import json
import pdb

### GET requests

In [2]:
url = 'https://www.reddit.com/r/nba.json'

req = requests.get(url)

In [3]:
req

<Response [429]>

https://httpstatuses.com/429

### Requests with parameters / queries

The Reddit API returned a 429 (Too Many Requests) error because our request had no 'User-agent' header. For the Reddit API that value can be anything. Requirements differ from API to API: some need no extra headers at all, while many require a private key issued by the company. Be sure to PROTECT your API keys, especially ones attached to bank accounts / credit cards (e.g. Amazon Web Services and Google API keys).

In [4]:
req = requests.get(url, headers = {'User-agent' : 'Nico'})

In [5]:
req.status_code

200

In [6]:
req.content

b'{"kind": "Listing", "data": {"modhash": "", "dist": 26, "children": [{"kind": "t3", "data": {"approved_at_utc": null, "subreddit": "nba", "selftext": "Hello All, if you missed it, we\'re now in the voting round of our first [r/nba Fan Art contest!](https://www.reddit.com/r/nba/comments/cjfwxc/rnba_fan_art_contest_1_submissions_open/)\\n\\nAll submissions have been consolidated by submitter, so anyone that provided multiple submissions, will have one entry to vote on. This post will be put up once I have put up all the options, so everyone\'s submission will have the same amount of time to gather votes. Voting will last for 3 days!\\n\\nPlease vote for your favorite submissions!\\n\\nThanks to everyone who submitted, and best of luck!", "author_fullname": "t2_eub4w", "saved": false, "mod_reason_title": null, "gilded": 0, "clicked": false, "title": "r/nba Fan Art Contest #1 - Voting Open!", "link_flair_richtext": [], "subreddit_name_prefixed": "r/nba", "hidden": false, "pwls": 6, "link

#### Sample URL with a query

In [6]:
req2 = requests.get(url, headers = {'User-agent' : 'Jonnel'}, params = {'before' : 't3_c5rayb'})

In [7]:
req2.url

'https://www.reddit.com/r/nba.json?before=t3_c5rayb'

##### Everything after the '?' symbol in the URL is a query for specific information from the API. You need to check the API documentation to see what variables you can use to grab what information.

In [8]:
req2.url

'https://www.reddit.com/r/nba.json?before=t3_c5rayb'

In [9]:
req2.text

'{"kind": "Listing", "data": {"modhash": "", "dist": 0, "children": [], "after": null, "before": null}}'

### Another reason to not use pd.read_json()

In [10]:
req2.text

'{"kind": "Listing", "data": {"modhash": "", "dist": 0, "children": [], "after": null, "before": null}}'

In [11]:
df = pd.read_json(req.text)

In [12]:
df

Unnamed: 0,kind,data
after,Listing,t3_cmrfik
before,Listing,
children,Listing,"[{'kind': 't3', 'data': {'approved_at_utc': No..."
dist,Listing,26
modhash,Listing,


In [13]:
json.loads(req.content).keys()

dict_keys(['kind', 'data'])

### Let's check out our request content

In [14]:
# .content is the raw response body as a bytes object (note the b'' prefix), not yet parsed

req2.content

b'{"kind": "Listing", "data": {"modhash": "", "dist": 0, "children": [], "after": null, "before": null}}'

#### Convert it to json and navigate through the json to the data we want

In [21]:
page_pull = req.json()

In [22]:
page_pull

{'kind': 'Listing',
 'data': {'modhash': '',
  'dist': 26,
  'children': [{'kind': 't3',
    'data': {'approved_at_utc': None,
     'subreddit': 'nba',
     'selftext': "Hello All, if you missed it, we're now in the voting round of our first [r/nba Fan Art contest!](https://www.reddit.com/r/nba/comments/cjfwxc/rnba_fan_art_contest_1_submissions_open/)\n\nAll submissions have been consolidated by submitter, so anyone that provided multiple submissions, will have one entry to vote on. This post will be put up once I have put up all the options, so everyone's submission will have the same amount of time to gather votes. Voting will last for 3 days!\n\nPlease vote for your favorite submissions!\n\nThanks to everyone who submitted, and best of luck!",
     'author_fullname': 't2_eub4w',
     'saved': False,
     'mod_reason_title': None,
     'gilded': 0,
     'clicked': False,
     'title': 'r/nba Fan Art Contest #1 - Voting Open!',
     'link_flair_richtext': [],
     'subreddit_name_pref

In [23]:
page_pull.keys()

dict_keys(['kind', 'data'])

In [24]:
page_pull['data']

{'modhash': '',
 'dist': 26,
 'children': [{'kind': 't3',
   'data': {'approved_at_utc': None,
    'subreddit': 'nba',
    'selftext': "Hello All, if you missed it, we're now in the voting round of our first [r/nba Fan Art contest!](https://www.reddit.com/r/nba/comments/cjfwxc/rnba_fan_art_contest_1_submissions_open/)\n\nAll submissions have been consolidated by submitter, so anyone that provided multiple submissions, will have one entry to vote on. This post will be put up once I have put up all the options, so everyone's submission will have the same amount of time to gather votes. Voting will last for 3 days!\n\nPlease vote for your favorite submissions!\n\nThanks to everyone who submitted, and best of luck!",
    'author_fullname': 't2_eub4w',
    'saved': False,
    'mod_reason_title': None,
    'gilded': 0,
    'clicked': False,
    'title': 'r/nba Fan Art Contest #1 - Voting Open!',
    'link_flair_richtext': [],
    'subreddit_name_prefixed': 'r/nba',
    'hidden': False,
    '

In [25]:
page_pull['data'].keys()

dict_keys(['modhash', 'dist', 'children', 'after', 'before'])

In [26]:
page_pull['data']['children']

[{'kind': 't3',
  'data': {'approved_at_utc': None,
   'subreddit': 'nba',
   'selftext': "Hello All, if you missed it, we're now in the voting round of our first [r/nba Fan Art contest!](https://www.reddit.com/r/nba/comments/cjfwxc/rnba_fan_art_contest_1_submissions_open/)\n\nAll submissions have been consolidated by submitter, so anyone that provided multiple submissions, will have one entry to vote on. This post will be put up once I have put up all the options, so everyone's submission will have the same amount of time to gather votes. Voting will last for 3 days!\n\nPlease vote for your favorite submissions!\n\nThanks to everyone who submitted, and best of luck!",
   'author_fullname': 't2_eub4w',
   'saved': False,
   'mod_reason_title': None,
   'gilded': 0,
   'clicked': False,
   'title': 'r/nba Fan Art Contest #1 - Voting Open!',
   'link_flair_richtext': [],
   'subreddit_name_prefixed': 'r/nba',
   'hidden': False,
   'pwls': 6,
   'link_flair_css_class': None,
   'downs': 

In [27]:
page_pull['data']['children'][0]

{'kind': 't3',
 'data': {'approved_at_utc': None,
  'subreddit': 'nba',
  'selftext': "Hello All, if you missed it, we're now in the voting round of our first [r/nba Fan Art contest!](https://www.reddit.com/r/nba/comments/cjfwxc/rnba_fan_art_contest_1_submissions_open/)\n\nAll submissions have been consolidated by submitter, so anyone that provided multiple submissions, will have one entry to vote on. This post will be put up once I have put up all the options, so everyone's submission will have the same amount of time to gather votes. Voting will last for 3 days!\n\nPlease vote for your favorite submissions!\n\nThanks to everyone who submitted, and best of luck!",
  'author_fullname': 't2_eub4w',
  'saved': False,
  'mod_reason_title': None,
  'gilded': 0,
  'clicked': False,
  'title': 'r/nba Fan Art Contest #1 - Voting Open!',
  'link_flair_richtext': [],
  'subreddit_name_prefixed': 'r/nba',
  'hidden': False,
  'pwls': 6,
  'link_flair_css_class': None,
  'downs': 0,
  'hide_score

In [28]:
len(page_pull['data']['children'])

26

Fields of interest inside each post's `data` dict: name, subreddit, selftext, title, num_comments, url, score

In [29]:
# When you are indexing deeply into json, it can help to make variable names for certain levels of indexing
# that you plan on reusing, to improve readability and make sure you don't make indexing errors as often

post_list = page_pull['data']['children']

In [30]:
post_list[1].keys()

dict_keys(['kind', 'data'])

In [31]:
for post in post_list:
    print(post['data']['name'])

t3_cmbyac
t3_cmorxb
t3_cmqlzt
t3_cmq8hj
t3_cmp2df
t3_cmrb4d
t3_cmrlb0
t3_cmrwzz
t3_cmqjav
t3_cmrzn9
t3_cmnr51
t3_cmrjca
t3_cmvagp
t3_cmrsi3
t3_cmsn3v
t3_cmh7nn
t3_cmqnca
t3_cmrvat
t3_cmjvl4
t3_cmvjf3
t3_cmspgg
t3_cmhtbc
t3_cmpnap
t3_cml15n
t3_cmq9jb
t3_cmrfik


In [32]:
post_list[0]['data']['title']

'r/nba Fan Art Contest #1 - Voting Open!'

In [37]:
for post in post_list:
    print(post['data']['title'])

r/nba Fan Art Contest #1 - Voting Open!
Vince Carter is older than teammate, Trae Young’s dad
When the Raptors wore crip blue, DeMar DeRozan averaged 31.8 points, 7.3 rebounds, 3.5 assists and 1.2 steals
Shane Battier's real name is Shane Battle. His father's name was written incorrectly by an army recruiter and he decided to go by that after the war
P.J. Tucker falls asleep on inbound pass.
The James Family reminds us what day it is
Pending free agent Zach Lowe with no shortage of suitors for. ESPN is committed to keeping him, but with so much money already committed to Adrian Wojnarowski and Brian Windhorst, there may be a limit on what the company is willing to spend.
NBA, Twitch announce deal for digital rights to USA Basketball
[Marks] In 2021, 26 teams will have max cap space (only Brooklyn, Golden State, Houston and Philadelphia likely will be capped out) and with the cap increasing to $125 million (from $109.1 million this season), there could be significant roster turnover yet

### Scrape and build a dictionary to make a dataframe

In [54]:
# Sloppy way! Too much indexing in loop


post_dict = {}

for count, post in enumerate(post_list):
    post_dict[post_list[count]['data']['name']] = [post_list[count]['data']['title'], post_list[count]['data']['num_comments']]

In [38]:
# CLEAN WAY - using an indexer variable!!
# Iterate over the posts directly: the loop variable already IS the element,
# so there is no need for enumerate() or for indexing back into post_list.

post_dict = {}

for post in post_list:
    # one short name for the nested level we keep reaching into
    post_indexer = post['data']
    # keyed by the post's 'name'; value is [title, num_comments]
    post_dict[post_indexer['name']] = [post_indexer['title'], post_indexer['num_comments']]

In [39]:
df = pd.DataFrame(post_dict).T
df.columns = ['title', 'num_comments']
df

Unnamed: 0,title,num_comments
t3_cmbyac,r/nba Fan Art Contest #1 - Voting Open!,43
t3_cmorxb,"Vince Carter is older than teammate, Trae Youn...",354
t3_cmqlzt,"When the Raptors wore crip blue, DeMar DeRozan...",217
t3_cmq8hj,Shane Battier's real name is Shane Battle. His...,137
t3_cmp2df,P.J. Tucker falls asleep on inbound pass.,193
t3_cmrb4d,The James Family reminds us what day it is,114
t3_cmrlb0,Pending free agent Zach Lowe with no shortage ...,366
t3_cmrwzz,"NBA, Twitch announce deal for digital rights t...",105
t3_cmqjav,"[Marks] In 2021, 26 teams will have max cap sp...",167
t3_cmrzn9,TJ McConnell says he's excited to play with te...,58


## Put it in a function!

In [41]:
# function to scrape reddit listing pages (takes a reddit .json url)
# returns the raw post dicts collected from every page fetched

headers = {'User-agent' : 'Jonnel'}

def scraper_bike(url, max_pages=40, pause=1.0):
    """Collect posts from a reddit ``.json`` listing, following pagination.

    Parameters
    ----------
    url : str
        Listing url ending in ``.json`` (e.g. 'https://www.reddit.com/r/nba.json').
    max_pages : int, default 40
        Upper bound on the number of pages requested.
    pause : float, default 1.0
        Seconds to sleep between consecutive requests.

    Returns
    -------
    list
        The ``children`` entries of every page, in the order received.

    Raises
    ------
    requests.HTTPError
        If a response has a 4xx/5xx status (e.g. the 429 reddit sends when
        it rejects the request).
    """
    posts = []
    # No cursor yet. requests leaves out params whose value is None, so the
    # first call asks for the first page.
    after = None

    for page in range(max_pages):
        if page:
            # be polite to the API: wait between requests, not before the first
            time.sleep(pause)

        pagepull = requests.get(url=url, params={'after': after}, headers=headers)
        pagepull.raise_for_status()  # fail loudly instead of parsing an error body

        listing = pagepull.json()['data']  # remember to turn it into json
        posts.extend(listing['children'])

        after = listing['after']
        if after is None:
            # No further page. Without this stop the next request would carry
            # no cursor and fetch the first page again, duplicating posts.
            break

    return posts

In [42]:
nba_post_list = scraper_bike('https://www.reddit.com/r/nba.json')

In [43]:
len(nba_post_list)

977

In [44]:
# function to convert posts to DataFrame - won't allow duplicate posts since unique id 'name' is set as index
# Extract: name (as index) and subreddit, selftext, title (as columns)

def posts_to_df(post_list):
    """Build a DataFrame of subreddit / title / selftext indexed by post name.

    Parameters
    ----------
    post_list : list of dict
        Post entries, each holding a ``'data'`` dict with the keys
        ``'name'``, ``'subreddit'``, ``'title'`` and ``'selftext'``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``name`` with columns
        ``['subreddit', 'title', 'selftext']``. An empty ``post_list`` gives
        an empty frame that still has those three columns.

    Raises
    ------
    KeyError
        If a post lacks ``'data'`` or one of the extracted keys.
    """
    columns = ['subreddit', 'title', 'selftext']

    # Keyed by 'name': a post seen again overwrites its earlier values while
    # keeping its original position, so the resulting index is unique.
    rows_by_name = {}
    for post in post_list:
        data = post['data']
        rows_by_name[data['name']] = [data[col] for col in columns]

    # Build row-wise in one step; passing `columns` explicitly keeps the
    # schema intact even when there are no rows at all.
    return pd.DataFrame(
        list(rows_by_name.values()),
        index=list(rows_by_name.keys()),
        columns=columns,
    )

In [45]:
posts_to_df(nba_post_list)

Unnamed: 0,subreddit,title,selftext
t3_cmbyac,nba,r/nba Fan Art Contest #1 - Voting Open!,"Hello All, if you missed it, we're now in the ..."
t3_cmorxb,nba,"Vince Carter is older than teammate, Trae Youn...",Vince Carter age: 42\n\nBorn: 26 January 1977\...
t3_cmqlzt,nba,"When the Raptors wore crip blue, DeMar DeRozan...",https://www.basketball-reference.com/players/d...
t3_cmq8hj,nba,Shane Battier's real name is Shane Battle. His...,
t3_cmp2df,nba,P.J. Tucker falls asleep on inbound pass.,
t3_cmrb4d,nba,The James Family reminds us what day it is,
t3_cmrlb0,nba,Pending free agent Zach Lowe with no shortage ...,
t3_cmrwzz,nba,"NBA, Twitch announce deal for digital rights t...",
t3_cmqjav,nba,"[Marks] In 2021, 26 teams will have max cap sp...",
t3_cmrzn9,nba,TJ McConnell says he's excited to play with te...,


## Couple extra functions for simplicity in running

In [86]:
# takes scraper function and url - outputs dataframe

def scrape_to_df(scrape_func, url):
    """Call ``scrape_func(url)`` and pass what it returns to ``posts_to_df``."""
    scraped_posts = scrape_func(url)
    return posts_to_df(scraped_posts)

### Function to scrape and save to CSV. HIGHLY recommended when gathering data online: keep a local copy of what you collect (and a remote backup if you want to be safe)

In [87]:
# NOTE: the csv to import does not have to exist yet - if it is missing (or
# completely empty) the export simply contains the freshly scraped posts.

# scrape, import csv, concat, drop duplicate, and output to csv

# takes in scraper function, url, csv filename to import, csv filename to output

# Outputs - Concatenated DataFrame as csv

def scrape_add(scrape_func, url, import_file, export_file):
    """Scrape ``url``, merge with a saved csv, de-duplicate, and write a csv.

    Parameters
    ----------
    scrape_func : callable
        Called as ``scrape_func(url)``; its result is passed to ``posts_to_df``.
    url : str
        Url handed to ``scrape_func``.
    import_file : str
        Csv of previously saved posts whose first column is the index.
        May be missing or empty.
    export_file : str
        Path the combined csv is written to.

    Returns
    -------
    None
        The result is only written to ``export_file``.
    """
    scrape_df = posts_to_df(scrape_func(url))

    try:
        # index_col=0 reads the first column as the index whether or not the
        # saved file gave that column a header.
        imported_df = pd.read_csv(import_file, index_col=0)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # first run: nothing saved yet, so start from the scrape alone
        concat_df = scrape_df
    else:
        concat_df = pd.concat([imported_df, scrape_df])

    # keep='first' with the imported rows placed first means an already-saved
    # row wins over a re-scraped row with the same index label
    concat_df = concat_df[~concat_df.index.duplicated(keep='first')]

    concat_df.to_csv(export_file)