# Reddit API Data Collection
###### By: Nick Gayliard

In [2]:
import json
import pdb
import re
import time
from io import StringIO

import numpy as np
import pandas as pd
import requests

### GET requests

In [3]:
# No custom 'User-agent' header is sent with this first request;
# the response shown below comes back with status 429.
url = 'https://www.reddit.com/r/nba.json'

req = requests.get(url)

In [4]:
req

<Response [429]>

https://httpstatuses.com/429

### Requests with parameters / queries

The Reddit API returned a 429 (Too Many Requests) error because our request did not set a 'User-agent' header of its own. For the Reddit API almost any custom value works, although a unique, descriptive one is the polite choice. This requirement differs from API to API and may be completely unneeded. Many APIs will instead require a private key, given to you by the company. Be sure to PROTECT your API keys, especially ones attached to bank accounts / credit cards (e.g. Amazon Web Services and Google API keys).

In [5]:
req = requests.get(url, headers = {'User-agent' : 'Nick'})

In [6]:
req.status_code

200

In [7]:
req.content

b'{"kind": "Listing", "data": {"modhash": "", "dist": 27, "children": [{"kind": "t3", "data": {"approved_at_utc": null, "subreddit": "nba", "selftext": "# Today\'s Games:\\n\\n|Tip-off|Away||Home||GDT|PGT|\\n|:--|:--|:-:|:--|--:|:-:|:-:|\\n\\n# Yesterday\'s Games:\\n\\n|Tip-off|Away||Home||GDT|PGT|\\n|:--|:--|:-:|:--|--:|:-:|:-:|\\n\\n# Top Highlights:\\n\\n0. [Michael Jordan hits a Triple Clutch Layup](https://gfycat.com/leafydimwittedarcticfox) | [(Comments)](https://reddit.com/r/nba/comments/c5pw9l)\\n\\n0. [Donovan Mitchell\'s luggage gets mixed up with a tourist\'s](https://streamable.com/xt8p8) | [(Comments)](https://reddit.com/r/nba/comments/c5fw47)\\n\\n0. [Iguodala says Mark Jackson was blackballed from the league for homophobic views; conflicted with Rick Welts (Warriors President) who is gay.](https://streamable.com/1hf45) | [(Comments)](https://reddit.com/r/nba/comments/c5e743)\\n\\n0. [Andre Iguodala asked: Who\'s tougher to guard, Kawhi Leonard or LeBron James? \\"Kobe Br

#### Sample URL with a query

- read reddit api documentation: https://www.reddit.com/dev/api/

In [8]:
req2 = requests.get(url, headers = {'User-agent' : 'Nick'}, params = {'after' : 't3_bor3tn'})

In [9]:
req2.url

'https://www.reddit.com/r/nba.json?after=t3_bor3tn'

##### Everything after the '?' symbol in the URL is the query string: the parameters that ask the API for specific information. Check the API documentation to see which parameters you can use and what information each one returns.

In [10]:
req2.url

'https://www.reddit.com/r/nba.json?after=t3_bor3tn'

In [11]:
req2.headers

{'Content-Type': 'application/json; charset=UTF-8', 'x-ua-compatible': 'IE=edge', 'x-frame-options': 'SAMEORIGIN', 'x-content-type-options': 'nosniff', 'x-xss-protection': '1; mode=block', 'set-cookie': 'loid=000000000040rx2mdj.2.1561579862540.Z0FBQUFBQmRFOUZXV2RoN0xpclJ0N18yR3NBMFpXajBEMGpCSnFRMDZvdi1vWURoT2N4WU5EZkZYMHNGM056Z3FVTGRHeFkyeHp3aE5WRFRubTFUUDRQWUJoV19NN2VlOVNlZXpwQTZ3SEx2WlBiZ1ZlRDBxcTNEVXBFdmpHQjBFUFhsd2ZxXzRoalg; Domain=reddit.com; Max-Age=63071999; Path=/; expires=Fri, 25-Jun-2021 20:11:02 GMT; secure, session_tracker=BqMMKVnHd6xMGCvYMN.0.1561579862540.Z0FBQUFBQmRFOUZXR0trdHE5SUgxNXlsRkdqaHhFN0g0UEdGU0hIYldzMmR5MFpsems5OGZDLXE3MndyYzVHeFNiUW9EUFFiVUNQSWdkZkFzQTl1b1YzVEN1M3F2bGVodEdvcnR6X3RXenVFdTZTdE5lclliWHZRTUNnVEF6alFKbWQwMHFnV1BjN08; Domain=reddit.com; Max-Age=7199; Path=/; expires=Wed, 26-Jun-2019 22:11:02 GMT; secure, edgebucket=QhKNOE8I6q92TegZfc; Domain=reddit.com; Max-Age=63071999; Path=/;  secure', 'access-control-allow-origin': '*', 'access-control-expose-he

### Another reason to not use pd.read_json()

In [97]:
req.text

'{"kind": "Listing", "data": {"modhash": "", "dist": 27, "children": [{"kind": "t3", "data": {"approved_at_utc": null, "subreddit": "nba", "selftext": "# Today\'s Games:\\n\\n|Tip-off|Away||Home||GDT|PGT|\\n|:--|:--|:-:|:--|--:|:-:|:-:|\\n\\n# Yesterday\'s Games:\\n\\n|Tip-off|Away||Home||GDT|PGT|\\n|:--|:--|:-:|:--|--:|:-:|:-:|\\n\\n# Top Highlights:\\n\\n0. [Danny Green on why Marc Gasol didn\'t have a speech at the Raptors parade: \\"He\'s like \'I\'m drunk bro, that\'s my song!\' The real Memphis came out in Marc when he started drinking. I think he bit me at one point too. That\'s when I told Matt, \'You can\'t give Marc the mic. He might say something crazy.\\"](https://streamable.com/73m1y) | [(Comments)](https://reddit.com/r/nba/comments/c5crq0)\\n\\n0. [Donovan Mitchell\'s luggage gets mixed up with a tourist\'s](https://streamable.com/xt8p8) | [(Comments)](https://reddit.com/r/nba/comments/c5fw47)\\n\\n0. [Michael Jordan hits a Triple Clutch Layup](https://gfycat.com/leafydim

In [15]:
# pd.read_json wants a path or file-like object; handing it a raw JSON string is
# deprecated in newer pandas, so wrap the response text in StringIO first.
df = pd.read_json(StringIO(req.text))

In [13]:
df

Unnamed: 0,kind,data
after,Listing,t3_c5p5zn
before,Listing,
children,Listing,"[{'kind': 't3', 'data': {'approved_at_utc': No..."
dist,Listing,27
modhash,Listing,


In [16]:
json.loads(req.content).keys()

dict_keys(['kind', 'data'])

### Let's check out our request content

In [17]:
# req.content is the raw response body as a bytes object (note the b'...' prefix in the output),
# so it is hard to read until it has been parsed

req.content

b'{"kind": "Listing", "data": {"modhash": "", "dist": 27, "children": [{"kind": "t3", "data": {"approved_at_utc": null, "subreddit": "nba", "selftext": "# Today\'s Games:\\n\\n|Tip-off|Away||Home||GDT|PGT|\\n|:--|:--|:-:|:--|--:|:-:|:-:|\\n\\n# Yesterday\'s Games:\\n\\n|Tip-off|Away||Home||GDT|PGT|\\n|:--|:--|:-:|:--|--:|:-:|:-:|\\n\\n# Top Highlights:\\n\\n0. [Michael Jordan hits a Triple Clutch Layup](https://gfycat.com/leafydimwittedarcticfox) | [(Comments)](https://reddit.com/r/nba/comments/c5pw9l)\\n\\n0. [Donovan Mitchell\'s luggage gets mixed up with a tourist\'s](https://streamable.com/xt8p8) | [(Comments)](https://reddit.com/r/nba/comments/c5fw47)\\n\\n0. [Iguodala says Mark Jackson was blackballed from the league for homophobic views; conflicted with Rick Welts (Warriors President) who is gay.](https://streamable.com/1hf45) | [(Comments)](https://reddit.com/r/nba/comments/c5e743)\\n\\n0. [Andre Iguodala asked: Who\'s tougher to guard, Kawhi Leonard or LeBron James? \\"Kobe Br

#### Parse the JSON into Python dictionaries and lists, then navigate through them to the data we want

In [18]:
page_pull = req.json()

In [51]:
page_pull

{'kind': 'Listing',
 'data': {'modhash': '',
  'dist': 27,
  'children': [{'kind': 't3',
    'data': {'approved_at_utc': None,
     'subreddit': 'nba',
     'selftext': "Yo back again with another post for this offseason. I'll be updating it basically every night till the beginning of the season. Hit me up if I've missed anybody or anything.\n  \n[Last year's post](https://www.reddit.com/r/nba/comments/8skfre/201819_nba_free_agents_and_team_rosters/)\n  \nUFA - Unrestricted Free Agent  \nRFA - Restricted Free Agent  \nTO - Team Option  \nPO - Player Option  \nS&amp;T - Sign &amp; Trade    \n  \n(INT) - Playing Internationally  \n(FA) - Ended last season not on an active roster \n  \n*****\n\n####ATLANTIC\n  \n[](/BOS) **Boston Celtics**\n  \nCoach: Brad Stevens\n  \nPicks: *14* Romeo Langford | *22* Grant Williams | *33* Carsen Edwards | *51* Tremont Waters\n  \nUFA: Al Horford | RJ Hunter | Kyrie Irving | Marcus Morris Sr\n  \nRFA: PJ Dozier | Jonathan Gibson | Terry Rozier III | Dani

In [19]:
page_pull.keys()

dict_keys(['kind', 'data'])

In [20]:
page_pull['data']

{'modhash': '',
 'dist': 27,
 'children': [{'kind': 't3',
   'data': {'approved_at_utc': None,
    'subreddit': 'nba',
    'selftext': '# Today\'s Games:\n\n|Tip-off|Away||Home||GDT|PGT|\n|:--|:--|:-:|:--|--:|:-:|:-:|\n\n# Yesterday\'s Games:\n\n|Tip-off|Away||Home||GDT|PGT|\n|:--|:--|:-:|:--|--:|:-:|:-:|\n\n# Top Highlights:\n\n0. [Michael Jordan hits a Triple Clutch Layup](https://gfycat.com/leafydimwittedarcticfox) | [(Comments)](https://reddit.com/r/nba/comments/c5pw9l)\n\n0. [Donovan Mitchell\'s luggage gets mixed up with a tourist\'s](https://streamable.com/xt8p8) | [(Comments)](https://reddit.com/r/nba/comments/c5fw47)\n\n0. [Iguodala says Mark Jackson was blackballed from the league for homophobic views; conflicted with Rick Welts (Warriors President) who is gay.](https://streamable.com/1hf45) | [(Comments)](https://reddit.com/r/nba/comments/c5e743)\n\n0. [Andre Iguodala asked: Who\'s tougher to guard, Kawhi Leonard or LeBron James? "Kobe Bryant"](https://streamable.com/yocnw) 

In [21]:
page_pull['data'].keys()

dict_keys(['modhash', 'dist', 'children', 'after', 'before'])

In [22]:
page_pull['data']['children']

[{'kind': 't3',
  'data': {'approved_at_utc': None,
   'subreddit': 'nba',
   'selftext': '# Today\'s Games:\n\n|Tip-off|Away||Home||GDT|PGT|\n|:--|:--|:-:|:--|--:|:-:|:-:|\n\n# Yesterday\'s Games:\n\n|Tip-off|Away||Home||GDT|PGT|\n|:--|:--|:-:|:--|--:|:-:|:-:|\n\n# Top Highlights:\n\n0. [Michael Jordan hits a Triple Clutch Layup](https://gfycat.com/leafydimwittedarcticfox) | [(Comments)](https://reddit.com/r/nba/comments/c5pw9l)\n\n0. [Donovan Mitchell\'s luggage gets mixed up with a tourist\'s](https://streamable.com/xt8p8) | [(Comments)](https://reddit.com/r/nba/comments/c5fw47)\n\n0. [Iguodala says Mark Jackson was blackballed from the league for homophobic views; conflicted with Rick Welts (Warriors President) who is gay.](https://streamable.com/1hf45) | [(Comments)](https://reddit.com/r/nba/comments/c5e743)\n\n0. [Andre Iguodala asked: Who\'s tougher to guard, Kawhi Leonard or LeBron James? "Kobe Bryant"](https://streamable.com/yocnw) | [(Comments)](https://reddit.com/r/nba/comme

In [23]:
page_pull['data']['children'][7]

{'kind': 't3',
 'data': {'approved_at_utc': None,
  'subreddit': 'nba',
  'selftext': 'Literally every instagram post where the winner was announced has these comments, it’s either “How was Kawhi not nominated” or “NURSE ROBBED” and it’s doing my head in. Even Michael B Jordan was a victim regarding Kawhi. It seems more people than i thought doesn’t know that playoff success is completely irrelevant and this confusion needs to be addressed, or even considered.',
  'author_fullname': 't2_wwobg',
  'saved': False,
  'mod_reason_title': None,
  'gilded': 1,
  'clicked': False,
  'title': 'The amount of confusion surrounding casual fans asking why Kawhi or Nurse didn’t win MVP/COTY suggests the Awards placement needs changing',
  'link_flair_richtext': [],
  'subreddit_name_prefixed': 'r/nba',
  'hidden': False,
  'pwls': 6,
  'link_flair_css_class': None,
  'downs': 0,
  'hide_score': False,
  'name': 't3_c5olzb',
  'quarantine': False,
  'link_flair_text_color': 'dark',
  'author_flair_b

In [24]:
len(page_pull['data']['children'])

27

Fields we want from each post: name, subreddit, selftext, title, num_comments, url, score

In [25]:
# When you are indexing deeply into json, it can help to make variable names for certain levels of indexing
# that you plan on reusing, to improve readability and make sure you don't make indexing errors as often

post_list = page_pull['data']['children']

In [26]:
post_list[1].keys()

dict_keys(['kind', 'data'])

In [27]:
for post in post_list:
    print(post['data']['name'])

t3_c5qx6b
t3_c5oy5m
t3_c5pw9l
t3_c5rayb
t3_c5shdu
t3_c5qc2k
t3_c5oy6t
t3_c5olzb
t3_c5q8u4
t3_c5ta9z
t3_c5qwrb
t3_c5q4pb
t3_c5rax5
t3_c5pfjd
t3_c5p68t
t3_c5ra92
t3_c5qlr9
t3_c5iu4g
t3_c5oadk
t3_c5ufj8
t3_c5rnzj
t3_c5rnbu
t3_c5pnwq
t3_c5pcj6
t3_c5phw1
t3_c5te75
t3_c5p5zn


In [28]:
post_list[0]['data']['title']

'Game Threads Index + Daily Discussion (June 26, 2019)'

### Scrape and build a dictionary to make a dataframe

In [59]:
# Sloppy way! Too much indexing in loop:
# the same post_list[count]['data'] lookup chain is written out three times on one line,
# and the loop variable `post` is never used.

post_dict = {}

for count, post in enumerate(post_list):
    post_dict[post_list[count]['data']['name']] = [post_list[count]['data']['title'], post_list[count]['data']['num_comments']]

In [29]:
# CLEAN WAY - using an indexer variable!!
# Iterate over the posts themselves (no positional index needed) and keep a single
# reference to each post's 'data' dict so the lookup chain is written only once.

post_dict = {}

for post in post_list:
    post_indexer = post['data']
    # key: the post's unique 'name'; value: the fields we want as a list
    post_dict[post_indexer['name']] = [post_indexer['title'], post_indexer['num_comments']]

In [30]:
# One row per post, indexed by the post name. from_dict(orient='index') lets each column
# keep its own dtype (num_comments stays numeric); building names-as-columns and then
# transposing would leave every column as object.
df = pd.DataFrame.from_dict(post_dict, orient='index', columns=['title', 'num_comments'])
df

Unnamed: 0,title,num_comments
t3_c5qx6b,Game Threads Index + Daily Discussion (June 26...,20
t3_c5oy5m,[Serious Discussion] Season Review: Portland T...,114
t3_c5pw9l,Michael Jordan hits a Triple Clutch Layup,1076
t3_c5rayb,[Wojnarowski] Golden State Warriors star Kevin...,922
t3_c5shdu,PSA: Carmelo Anthony is at the same age as Vin...,181
t3_c5qc2k,DeMar DeRozan delivers a powerful message on m...,87
t3_c5oy6t,"Andre Iguodala asked: Who's tougher to guard, ...",919
t3_c5olzb,The amount of confusion surrounding casual fan...,226
t3_c5q8u4,Kobe Bryant makes two defenders collide into e...,160
t3_c5ta9z,[Enes Kanter] I kind of feel like Zion is over...,410


## Put it in a function!

In [34]:
# function to scrape reddit page (takes a reddit .json url)
# returns posts

headers = {'User-agent' : 'Nick'}

def scraper_bike(url, pages=40, pause=0.2, request_headers=None):
    """Collect posts from consecutive pages of a reddit ``.json`` listing.

    Each response is expected to look like the listing explored above:
    ``{'data': {'children': [...], 'after': <token or None>, ...}}``.
    The ``after`` token of one page is sent as the ``after`` query parameter
    of the next request.

    Parameters
    ----------
    url : str
        Listing URL, e.g. ``'https://www.reddit.com/r/nba.json'``.
    pages : int, default 40
        Maximum number of pages to request.
    pause : float, default 0.2
        Seconds to sleep between requests.
    request_headers : dict, optional
        Headers to send with every request. Defaults to the module-level
        ``headers`` dict.

    Returns
    -------
    list
        The ``children`` entries of every page fetched, in request order.

    Raises
    ------
    requests.HTTPError
        If any page comes back with a 4xx/5xx status (e.g. 429).
    """
    if request_headers is None:
        request_headers = headers

    posts = []
    after = None  # no token yet: the first request asks for the first page

    for _ in range(pages):
        params = {} if after is None else {'after': after}
        pagepull = requests.get(url=url, params=params, headers=request_headers)
        # fail loudly on an error status instead of on a confusing JSON/KeyError later
        pagepull.raise_for_status()

        listing = pagepull.json()['data']
        posts.extend(listing['children'])
        after = listing['after']

        # No token means there is no further page. Requesting again without one
        # would return the first page a second time and duplicate posts.
        if after is None:
            break

        # sleep is a best practice (probably not necessary for such a small scrape)
        time.sleep(pause)

    return posts

In [35]:
nba_post_list = scraper_bike('https://www.reddit.com/r/nba.json')

In [36]:
len(nba_post_list)

982

In [31]:
# function to convert posts to DataFrame - won't allow duplicate posts since unique id 'name' is set as index
# Extract: name (as index) and subreddit, selftext, title (as columns)

def posts_to_df(post_list):
    """Turn a list of scraped posts into a DataFrame indexed by post name.

    Parameters
    ----------
    post_list : list of dict
        Posts shaped like the listing's ``children`` entries; each must have a
        ``'data'`` dict containing ``'name'``, ``'subreddit'``, ``'title'``
        and ``'selftext'``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``name`` with columns ``subreddit``, ``title``,
        ``selftext``. When a name occurs more than once, the row keeps the
        position of its first occurrence and the values of its last. An empty
        ``post_list`` gives an empty frame that still has the three columns.
    """
    columns = ['subreddit', 'title', 'selftext']
    post_dict = {}

    for post in post_list:
        ind = post['data']
        # keying by the unique post name is what removes duplicate posts
        post_dict[ind['name']] = [ind[column] for column in columns]

    # orient='index' makes the dict keys the row labels directly, and passing
    # `columns` keeps the column names even when there are no rows
    return pd.DataFrame.from_dict(post_dict, orient='index', columns=columns)

In [32]:
posts_to_df(nba_post_list)

NameError: name 'nba_post_list' is not defined

## Couple extra functions for simplicity in running

In [None]:
# takes scraper function and url - outputs dataframe

def scrape_to_df(scrape_func, url):
    """Call ``scrape_func(url)`` and pass what it returns to ``posts_to_df``."""
    scraped_posts = scrape_func(url)
    return posts_to_df(scraped_posts)

### Function to scrape and save to CSV. HIGHLY recommended when gathering data online: keep a local copy of anything you collect (and a remote backup as well if you want to be safe)

In [None]:
# NOTE: if import_file does not exist yet (e.g. the very first run), the export
# is built from the fresh scrape alone, so no pre-made CSV is required.

# scrape, import csv, concat, drop duplicate, and output to csv

# takes in scraper function, url, csv filename to import, csv filename to output

# Outputs - Concatenated DataFrame as csv

def scrape_add(scrape_func, url, import_file, export_file):
    """Scrape ``url``, merge with previously saved posts, and write a CSV.

    Parameters
    ----------
    scrape_func : callable
        Called as ``scrape_func(url)``; its result is passed to ``posts_to_df``.
    url : str
        URL handed to ``scrape_func``.
    import_file : str
        CSV of previously saved posts whose first column holds the index.
        If the file does not exist, only the new scrape is exported.
    export_file : str
        Path the combined CSV is written to.

    Rows whose index already appears in ``import_file`` are dropped from the
    new scrape (``keep='first'``), so previously saved rows win.
    """
    scrape_df = posts_to_df(scrape_func(url))

    try:
        # index_col=0: the index is the first column whatever header label it carries
        imported_df = pd.read_csv(import_file, index_col=0)
    except FileNotFoundError:
        frames = [scrape_df]
    else:
        # imported rows go first so they are the ones kept on duplicates
        frames = [imported_df, scrape_df]

    concat_df = pd.concat(frames)

    concat_df = concat_df[~concat_df.index.duplicated(keep='first')]

    concat_df.to_csv(export_file)