# Reddit API Web Scraper
I started building a true web scraper using BeautifulSoup, and later discovered Reddit has an API to grab content, so I'm going to take advantage of that instead.

In [1]:
import pandas as pd
import numpy as np
import re
import time
import requests
from tqdm import tqdm_notebook

# Local file that stores my credentials for accessing the API
from reddit_oauth import username, password, client_id, client_secret

In [5]:
def check_more(comment):
    """Tell whether ``comment`` is a "more" placeholder instead of a real comment.

    Side effect: when the placeholder's ``parent_id`` starts with ``t3``
    (presumably Reddit's prefix for the thread itself, i.e. the placeholder
    stands for further top-level comments -- confirm against the API docs),
    the ids in its ``children`` list are queued on ``v['more']``. ``v`` is the
    module-level record of the thread currently being processed.

    Parameters:
        comment: a listing item dict with a ``'kind'`` key and, for "more"
            items, ``'data'`` holding ``'parent_id'`` and ``'children'``.

    Returns:
        True if ``comment['kind']`` is ``'more'``, False otherwise.
    """
    if comment['kind'] != 'more':
        # Explicit False (rather than falling through to None) keeps this a
        # proper boolean predicate for every caller.
        return False
    if comment['data']['parent_id'].startswith('t3'):
        v['more'].extend(comment['data']['children'])
    return True

def get_more_children(parent,comment):
    """Expand a "more" placeholder and append the comment bodies it hides.

    Requests the ids listed in ``comment['data']['children']`` from the
    ``morechildren`` endpoint, using the module-level ``headers`` and
    ``link_id``. Each returned comment body is lower-cased and appended to
    ``parent``, and its replies are walked with ``get_children``. Nested
    "more" placeholders are expanded recursively unless their ``count`` is 0.

    Parameters:
        parent: list of comment bodies for the current top-level comment;
            mutated in place.
        comment: the "more" placeholder dict to expand.
    """
    # NOTE(review): the ids are passed as a list; an earlier revision sent them
    # as a single comma-joined string. Confirm which form the endpoint expects.
    response = requests.get(
        'https://oauth.reddit.com/api/morechildren',
        headers=headers,
        params={'children': comment['data']['children'], 'link_id': link_id},
    )
    # The comment list is read from a fixed position inside the 'jquery' payload.
    for item in response.json()['jquery'][14][3][0]:
        if not check_more(item):
            parent.append(item['data']['body'].lower())
            get_children(parent, item)
        elif item['data']['count'] != 0:
            get_more_children(parent, item)

def get_children(parent,comment):
    """Walk the replies under ``comment`` and collect their bodies in ``parent``.

    Each reply body is lower-cased and appended to ``parent`` before its own
    replies are visited. "more" placeholders are expanded through
    ``get_more_children`` unless their ``count`` is 0.

    Parameters:
        parent: list of comment bodies for the current top-level comment;
            mutated in place.
        comment: a comment dict whose ``['data']['replies']`` is either an
            empty string (no replies) or a nested listing.
    """
    replies = comment['data']['replies']
    if replies == '':
        return
    for reply in replies['data']['children']:
        if not check_more(reply):
            parent.append(reply['data']['body'].lower())
            get_children(parent, reply)
        elif reply['data']['count'] != 0:
            get_more_children(parent, reply)
            
def get_parents(comments):
    """Record every top-level comment in ``comments`` together with its replies.

    For each item that is not a "more" placeholder, a new list starting with
    the lower-cased comment body is appended to ``v['comments']`` (``v`` is the
    module-level record of the current thread) and then filled with the bodies
    of its replies by ``get_children``. Placeholders are left to
    ``check_more``, which is called on every item.
    """
    for top_level in tqdm_notebook(comments, desc='Parents', leave=False):
        if not check_more(top_level):
            chain = [top_level['data']['body'].lower()]
            v['comments'].append(chain)
            get_children(chain, top_level)

def get_more_parents(more_parents):
    """Fetch and process the comments whose ids are listed in ``more_parents``.

    Each id is requested from the module-level ``url`` (with ``headers``) via
    the ``comment`` query parameter, and the second listing of the response is
    handed to ``get_parents``. The loop sleeps 0.3 s after every request.
    """
    progress = tqdm_notebook(more_parents, desc='More Parents', leave=False)
    for parent_id in progress:
        listing = requests.get(url, headers=headers, params={'comment': parent_id}).json()
        get_parents(listing[1]['data']['children'])
        time.sleep(0.3)

In [6]:
# Request an access token using the "password" grant, authenticating the app
# itself with HTTP basic auth.
user_agent = {"User-Agent": "Comment Scraper app by /u/" + username}
client_auth = requests.auth.HTTPBasicAuth(client_id, client_secret)
post_data = {
    "grant_type": "password",
    "username": username,
    "password": password,
    "redirect_uri": 'http://localhost',
}
response = requests.post(
    "https://www.reddit.com/api/v1/access_token",
    auth=client_auth,
    data=post_data,
    headers=user_agent,
)

# Headers for the authenticated requests: "<token_type> <access_token>" plus
# the same User-Agent used above.
headers = {
    "Authorization": "{token_type} {access_token}".format_map(response.json()),
    **user_agent,
}

In [7]:
subreddit = 'summonerschool'
num_pages = 4

# Skeleton dictionary keyed by thread title, one record per thread.
master = {}
after = None

# The listing pages come from the public .json endpoint rather than the OAuth
# API, so only the custom User-Agent header is sent for these requests.
url = 'http://www.reddit.com/r/' + subreddit + '/.json'
for pages in range(num_pages):
    main_json = requests.get(url, headers=user_agent, params={'after': after})
    listing = main_json.json()['data']
    # 'after' is passed back on the next request to fetch the following page.
    after = listing['after']

    for each in listing['children']:
        post = each['data']
        # setdefault keeps the first record when a title appears more than once.
        master.setdefault(
            post['title'],
            {
                'id': post['id'],
                'json': [],
                'comments': [],
                'more': [],
                'expected_comments': post['num_comments'],
                'actual_comments': 0,
            },
        )

# NOTE: `v`, `url` and `link_id` must keep these exact module-level names --
# check_more/get_parents read `v`, get_more_parents reads `url`, and
# get_more_children reads `link_id`.
for k, v in tqdm_notebook(master.items(), desc='Thread', leave=False):

    url = 'https://oauth.reddit.com/r/' + subreddit + '/comments/' + v['id']
    v['json'] = requests.get(url, headers=headers)

    if v['json'].status_code != 200:
        print("JSON Retrieval Error")
        raise KeyboardInterrupt

    # Element 0 of the response holds the post itself, element 1 its comments.
    thread = v['json'].json()
    link_id = thread[0]['data']['children'][0]['data']['name']
    parents = thread[1]['data']['children']
    get_parents(parents)
    get_more_parents(v['more'])

    # Every entry of v['comments'] is one top-level comment plus its replies.
    v['actual_comments'] += sum(len(chain) for chain in v['comments'])



Check how many comments I have for each thread

In [8]:
# One line per thread: the number of comments collected next to the count the
# listing reported.
for title, record in master.items():
    collected = record['actual_comments']
    reported = record['expected_comments']
    print(title, "Actual:", collected, "Expected:", reported)

I often don't know what to do with myself past laning. Actual: 12 Expected: 12
Help me please. [Plat 5] Actual: 8 Expected: 8
How do you defend turrets as a full melee champion? Actual: 20 Expected: 19
Increase the mini map size. Actual: 2 Expected: 2
What are some tips to beating Orianna in laning phase? Actual: 8 Expected: 8
I want to be a OTP (or two) Actual: 31 Expected: 31
Veigar Support at Low Elo? Actual: 8 Expected: 8
How to become a better solo laner? Actual: 4 Expected: 4
Tips on how to stay on / keep attacking champions that blink untargetable Actual: 6 Expected: 6
Looking for some Shyvana tips Actual: 20 Expected: 20
ADC losing late game Actual: 2 Expected: 2
what should I improve if i want to achieve master tier ? Actual: 2 Expected: 2
Most effective bot lane combos for duos/coordinated players? Actual: 16 Expected: 16
Post your OP.GG seeking advice! Actual: 0 Expected: 0
In lower ELO, why do teams group mid so fast? Actual: 58 Expected: 58
How to use Kled unmounted? Actua

The counts are mostly right, but some are a bit off, for a couple of reasons:

1. Sometimes more comments are posted after I pull the initial count.
2. If a thread is especially long, the comments are stored in a "continue thread" link, which I'm not particularly interested in.