# Reddit API Web Scraper
I started building a true web scraper using BeautifulSoup, and later discovered Reddit has an API to grab content, so I'm going to take advantage of that instead.

In [5]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import time
import requests
from tqdm import tqdm_notebook

# Local file that stores my credentials for accessing the API
from reddit_oauth import username, password, client_id, client_secret

In [8]:
subreddit = 'summonerschool'

# Request an OAuth2 bearer token with the "password" grant, authenticating the
# app itself via HTTP basic auth (client id / client secret).
client_auth = requests.auth.HTTPBasicAuth(client_id, client_secret)
post_data = {"grant_type": "password", "username": username, "password": password,"redirect_uri":'http://localhost'}
user_agent = {"User-Agent": "Comment Scraper app by /u/" + username}
response = requests.post("https://www.reddit.com/api/v1/access_token", auth=client_auth, data=post_data, headers=user_agent)

# Fail loudly here instead of with an opaque KeyError further down: check the
# HTTP status, then make sure the payload actually carries a token.
response.raise_for_status()
token = response.json()
if 'access_token' not in token:
    raise RuntimeError("Reddit token request failed: " + str(token.get('error', 'unknown error')))

# Headers for every call against oauth.reddit.com.
headers = {"Authorization": token['token_type'] + " " + token['access_token'],
           "User-Agent": user_agent["User-Agent"]}

# This is the webpage I want to scrape. For this initial scrape only the custom
# user-agent is sent, because this request does not go through the OAuth API yet.
main_json = requests.get('https://www.reddit.com/r/' + subreddit + '/.json',headers=user_agent)
main_json.raise_for_status()

# Create skeleton dictionary with thread titles as keys. setdefault keeps the
# first entry if two threads on the page share the same title.
master = {}
for post in main_json.json()['data']['children']:
    thread = post['data']
    master.setdefault(thread['title'], {
        'id': thread['id'],
        'json': [],
        'comments': [],
        'more': [],
        'expected_comments': thread['num_comments'],
        'actual_comments': 0,
    })

In [9]:
def check_more(comment):
    """Tell whether *comment* is a "more" placeholder instead of a real comment.

    Side effect: when the placeholder's parent_id starts with "t3" (Reddit's
    fullname prefix for a submission, i.e. the placeholder stands in for
    top-level comments), the comment ids it lists are queued on v['more'].

    Relies on the module-level ``v``, the record of the thread currently being
    processed.

    Returns:
        True for a "more" placeholder, False for anything else.
    """
    if comment['kind'] != 'more':
        return False
    if comment['data']['parent_id'].startswith("t3"):
        v['more'].extend(comment['data']['children'])
    return True

def get_more_children(parent,comment):
    """Expand a "more" placeholder and append the fetched comment bodies to *parent*.

    Args:
        parent: list of lower-cased comment bodies for the current top-level
            comment chain; extended in place.
        comment: the "more" placeholder whose data.children lists the comment
            ids to fetch.

    Relies on the module-level ``headers`` (OAuth headers) and ``link_id``
    (fullname of the thread currently being processed).

    Raises:
        requests.HTTPError: if the morechildren request does not succeed.
    """
    more_comment_ids = comment['data']['children']
    response = requests.get('https://oauth.reddit.com/api/morechildren',headers=headers,
                            params={'api_type':'json',
                                    'children':','.join(more_comment_ids),
                                    'link_id':link_id})
    response.raise_for_status()
    # With api_type=json the comments come back under json.data.things, so the
    # positional lookup into the legacy "jquery" call list is no longer needed.
    things = response.json()['json']['data']['things']
    for thing in things:
        if check_more(thing):
            # A placeholder with count 0 has nothing further to load.
            if thing['data']['count'] != 0:
                get_more_children(parent,thing)
            continue
        parent.append(thing['data']['body'].lower())
        get_children(parent,thing)

def get_children(parent,comment):
    """Walk the replies nested under *comment*, collecting bodies into *parent*.

    Every real reply has its lower-cased body appended to *parent* and is then
    descended into recursively. "more" placeholders are expanded through
    get_more_children unless their count is 0. A comment whose replies field
    is the empty string has nothing to walk.
    """
    replies = comment['data']['replies']
    if replies == '':
        return
    for reply in replies['data']['children']:
        if not check_more(reply):
            parent.append(reply['data']['body'].lower())
            get_children(parent,reply)
        elif reply['data']['count'] != 0:
            get_more_children(parent,reply)
            
def get_parents(comments):
    """Start one comment chain per top-level comment in *comments*.

    Each chain is a list that begins with the top-level comment's lower-cased
    body; it is stored on v['comments'] (module-level ``v``) and then filled
    with the bodies of its replies. "more" placeholders are skipped here;
    check_more takes care of queueing them.
    """
    progress = tqdm_notebook(comments,desc='Parents',leave=False)
    for top_level in progress:
        if not check_more(top_level):
            chain = [top_level['data']['body'].lower()]
            v['comments'].append(chain)
            get_children(chain,top_level)

def get_more_parents(more_parents):
    """Fetch each queued top-level comment id and process it like a parent.

    For every id in *more_parents* the current thread (module-level ``url``) is
    requested again with that id as the ``comment`` parameter, and the second
    listing of the response is handed to get_parents. A 0.2 second pause
    follows each request.
    """
    progress = tqdm_notebook(more_parents,desc='More Parents',leave=False)
    for comment_id in progress:
        query = {'comment':comment_id,'showmore':True}
        reply = requests.get(url,headers=headers,params=query)
        listing = reply.json()[1]
        get_parents(listing['data']['children'])
        time.sleep(0.2)

In [10]:
# Scrape every thread in `master`. The loop variable has to stay named `v`, and
# `url` / `link_id` have to stay module-level: the helper functions read all
# three as globals.
for k,v in tqdm_notebook(master.items(),desc='Thread'):

    url = 'https://oauth.reddit.com/r/' + subreddit + '/comments/' + v['id']
    v['json'] = requests.get(url,headers=headers,
                    params={'showmore':True,'limit':100000})

    # Record the outcome and skip threads whose request failed, instead of
    # trying to parse an error response as a comment listing.
    v['success'] = v['json'].status_code == 200
    if not v['success']:
        continue

    # Parse once: element 0 holds the submission, element 1 its comment listing.
    listing = v['json'].json()
    link_id = listing[0]['data']['children'][0]['data']['name']
    comments = listing[1]['data']['children']
    get_parents(comments)
    get_more_parents(v['more'])

    # Recompute from the collected chains rather than accumulating, so the
    # count always matches v['comments'] even if this cell is run again.
    v['actual_comments'] = sum(len(chain) for chain in v['comments'])




In [12]:
# One line per thread: title, comments collected, and the count Reddit reported.
for k, v in master.items():
    summary = "{} {} / {}".format(k, v['actual_comments'], v['expected_comments'])
    print(summary)

Simple Questions Simple Answers: 60 1484 / 1479
What team comps are the following mid laners good in or against? 0 / 0
Team Comps, and Different Ways to End? 6 / 6
Things you're doing wrong that you probably didn't realise: Support Edition 39 / 38
In lower ELO, why do teams group mid so fast? 38 / 38
Weekly Replay Review Thread: Week 38 10 / 10
When to group and when not too. 6 / 6
Shyvana micro and skins... Hm? 0 / 0
Chogath Feast indicator not showing in-game 5 / 5
A guide to being an One Trick Pony 132 / 133
wave managment 2 / 2
Crazy idea I came upon. And I wonder if this could work? 1 / 1
would this warwick idea work (mechanics, interactions) 6 / 6
ADC's situation after changing botlane turrets - having problem and looking for solution 7 / 7
Is it OK to /mute all on every single game? 32 / 32
Where am I supposed to go as an ADC? 3 / 3
Looking for some Shyvana tips 17 / 16
I ban Yasuo when someone hovers it. Is it normal to do? 15 / 15
New Free Champion Rotation 17 / 17
Any simple 

# Get more pages of threads

In [None]:
# Get after key. Passing it back as the `after` parameter moves the subreddit
# listing to the next page.
after = main_json.json()['data']['after']
# Same public endpoint as the first page: HTTPS with only the user-agent, so
# the OAuth bearer token is never sent over an unencrypted connection.
test = requests.get('https://www.reddit.com/r/' + subreddit + '/.json',headers=user_agent,
                   params={'after':after})