# Reddit API Web Scraper
I started building a true web scraper using BeautifulSoup, and later discovered Reddit has an API to grab content, so I'm going to take advantage of that instead.

In [1]:
import pandas as pd
import numpy as np
import time
import requests
import json
from tqdm import tqdm_notebook

# Local file that stores my credentials for accessing the API
from reddit_oauth import username, password, client_id, client_secret

In [2]:
def check_more(comment):
    """Report whether ``comment`` is a Reddit "more" placeholder.

    A placeholder is any listing item whose ``kind`` is ``'more'``. When the
    placeholder's ``parent_id`` starts with ``t3`` (i.e. it hangs directly off
    the submission rather than off another comment), the IDs in its
    ``children`` list are queued on the module-level ``v['more']`` so the
    caller can fetch those top-level comments later.

    Args:
        comment: One item dict from a listing. Must contain ``kind``; for
            placeholders it must also contain ``data['parent_id']`` and
            ``data['children']``.

    Returns:
        True for a "more" placeholder, False for anything else.
    """
    if comment['kind'] != 'more':
        # Explicit False instead of falling off the end and returning None.
        return False

    data = comment['data']
    if data['parent_id'].startswith('t3'):
        # ``v`` is the module-level entry of the thread being processed.
        v['more'].extend(data['children'])
    return True

def get_more_children(parent, comment):
    """Fetch the comments behind a "more" placeholder and record them.

    Sends the placeholder's child IDs to ``/api/morechildren`` and walks the
    returned items. Nested placeholders with a non-zero ``count`` are expanded
    recursively; every other item has its lower-cased body appended to
    ``parent`` and its replies walked with ``get_children``.

    Reads the module-level ``headers`` and ``link_id``.

    Args:
        parent: List of comment bodies for the current top-level comment;
            extended in place.
        comment: The "more" placeholder dict; ``data['children']`` holds the
            IDs to request.
    """
    child_ids = comment['data']['children']
    # NOTE(review): an earlier variant joined the IDs with ','. Here the list
    # is handed to requests unchanged -- confirm which encoding the endpoint
    # expects for ``children``.
    response = requests.get(
        'https://oauth.reddit.com/api/morechildren',
        headers=headers,
        params={'children': child_ids, 'link_id': link_id},
    )
    # NOTE(review): the fixed indices [14][3][0] assume a particular layout of
    # the 'jquery' payload -- confirm against a live response.
    fetched = response.json()['jquery'][14][3][0]

    for item in fetched:
        if not check_more(item):
            parent.append(item['data']['body'].lower())
            get_children(parent, item)
        elif item['data']['count'] != 0:
            # Placeholders with a count of 0 are skipped.
            get_more_children(parent, item)

def get_children(parent, comment):
    """Append the bodies of all replies under ``comment`` to ``parent``.

    Walks ``comment['data']['replies']`` depth-first. Ordinary replies have
    their lower-cased body appended and are then walked themselves; "more"
    placeholders with a non-zero ``count`` are expanded through
    ``get_more_children``.

    Args:
        parent: List of comment bodies for the current top-level comment;
            extended in place.
        comment: A comment dict. ``data['replies']`` is either the empty
            string (no replies) or a nested listing.
    """
    replies = comment['data']['replies']
    if replies == '':
        # An empty string marks a comment without replies.
        return

    for reply in replies['data']['children']:
        if not check_more(reply):
            parent.append(reply['data']['body'].lower())
            get_children(parent, reply)
        elif reply['data']['count'] != 0:
            get_more_children(parent, reply)
            
def get_parents(comments):
    """Record every top-level comment in ``comments`` with its descendants.

    For each item that is not a "more" placeholder, a new list starting with
    the comment's lower-cased body is appended to the module-level
    ``v['comments']`` and then filled with the bodies of its replies.

    Args:
        comments: Iterable of top-level listing items for one thread.
    """
    progress = tqdm_notebook(comments, desc='Parents', leave=False)
    for top_level in progress:
        if not check_more(top_level):
            bodies = [top_level['data']['body'].lower()]
            v['comments'].append(bodies)
            get_children(bodies, top_level)

def get_more_parents(more_parents):
    """Fetch and process top-level comments that were left out of the listing.

    Requests the current thread (module-level ``url`` and ``headers``) once
    per ID with ``comment=<id>``, passes the returned comment listing to
    ``get_parents``, and pauses 0.3 s between requests.

    Args:
        more_parents: Iterable of comment IDs to fetch.
    """
    progress = tqdm_notebook(more_parents, desc='More Parents', leave=False)
    for parent_id in progress:
        response = requests.get(url, headers=headers, params={'comment': parent_id})
        # Element 1 of the response holds the comment listing.
        comment_listing = response.json()[1]
        get_parents(comment_listing['data']['children'])
        time.sleep(0.3)

In [3]:
# Exchange the app credentials for an OAuth2 token (password grant), then
# build the headers that every authenticated API request will send.
client_auth = requests.auth.HTTPBasicAuth(client_id, client_secret)
post_data = {"grant_type": "password", "username": username, "password": password,"redirect_uri":'http://localhost'}
user_agent = {"User-Agent": "Comment Scraper app by /u/" + username}
response = requests.post("https://www.reddit.com/api/v1/access_token", auth=client_auth, data=post_data, headers=user_agent)

# Fail here with a clear error instead of a KeyError when building headers.
response.raise_for_status()
token = response.json()  # parse the body once
if 'access_token' not in token or 'token_type' not in token:
    raise RuntimeError("Reddit token request failed: {}".format(token))

headers = {"Authorization": token['token_type'] + " " + token['access_token'],
           "User-Agent": user_agent["User-Agent"]}

In [4]:
subreddit = 'summonerschool'
num_pages = 25

# Create skeleton dictionary with thread titles as keys.
# NOTE: threads that share a title collapse into one entry (first one wins).
master = {}
after = None

# This is the webpage I want to scrape. For this initial scrape I only send
# my user-agent, because I'm not accessing the Reddit API just yet.
url = 'https://www.reddit.com/r/' + subreddit + '/.json'

for _ in range(num_pages):
    main_json = requests.get(url,headers=user_agent,params={'after':after})
    main_json.raise_for_status()
    listing = main_json.json()['data']  # parse the body once per page

    for each in listing['children']:
        master.setdefault(each['data']['title'],{'id':each['data']['id'],'json':[],'comments':[],'more':[],'expected_comments':each['data']['num_comments'],'actual_comments':0})

    after = listing['after']
    if after is None:
        # No further page: another request would start over from page one.
        break

# NOTE: ``url``, ``v`` and ``link_id`` are read as globals by the helper
# functions (check_more, get_parents, get_more_children, get_more_parents),
# so these names must not be changed.
for k,v in tqdm_notebook(master.items(),desc='Thread',leave=False):

    url = 'https://oauth.reddit.com/r/' + subreddit + '/comments/' + v['id']
    v['json'] = requests.get(url,headers=headers)

    if v['json'].status_code!=200:
        raise RuntimeError("JSON Retrieval Error: HTTP {} for {}".format(v['json'].status_code, url))

    # Element 0 describes the submission, element 1 holds its comments.
    thread_json = v['json'].json()
    link_id = thread_json[0]['data']['children'][0]['data']['name']
    parents = thread_json[1]['data']['children']
    get_parents(parents)
    get_more_parents(v['more'])

    # Each entry in v['comments'] is one top-level comment plus its replies.
    v['actual_comments'] += sum(len(comments) for comments in v['comments'])



Check how many comments I have for each thread

In [5]:
# Print, per thread title, the number of comments collected next to the
# number the listing reported.
for k, v in master.items():
    actual = v['actual_comments']
    expected = v['expected_comments']
    print(k, "Actual:", actual, "Expected:", expected)

Could Varus ADC work right now? Actual: 41 Expected: 45
Question on jungler play-styles... Actual: 11 Expected: 11
How do you defend turrets as a full melee champion? Actual: 21 Expected: 20
Tips on how to stay on / keep attacking champions that blink untargetable Actual: 6 Expected: 6
Champion Discussion of the Day: Kassadin Actual: 22 Expected: 24
I often don't know what to do with myself past laning. Actual: 12 Expected: 12
Need build help for AP Ezreal Support Actual: 12 Expected: 12
Why do people not run armor quints vs AD mids? Actual: 3 Expected: 3
How to teach yourself when you can win a trade or all in? Actual: 25 Expected: 25
Why do people smite gromp to kill rather than to start? Actual: 19 Expected: 19
When to shove, freeze, or keep the lane in the middle looking at matchups Actual: 31 Expected: 31
Any Shaco advice? Actual: 24 Expected: 24
Questions on MMR Actual: 7 Expected: 7
Am I holding myself back by playing Yasuo? Actual: 77 Expected: 84
Wich Support Do You Recommend 

The counts are mostly right, but some are a bit off. There are a couple of reasons:

1. Sometimes more comments are posted after I pull the initial count.
2. If a thread is especially long, the comments are stored in a "continue thread" link, which I'm not particularly interested in.

## Save file
I'll save the text data so I don't have to run this again unless I want more data. I'm going to dump it into a JSON file containing just the thread names and their comments.

In [6]:
# Keep only each thread's comment lists, keyed by thread title.
save = {title: thread['comments'] for title, thread in master.items()}

In [7]:
# Use a context manager so the file is flushed and closed once written.
with open('thread_comments_25pg.txt','w') as f:
    json.dump(save,f)