# Web Scraping Notebook
I want to get a list of the threads on the front page of a subreddit (here, r/politics) and also grab the comments for each thread.

In [47]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re
import time

In [53]:
# This is the webpage I want to scrape.
# Reddit asks clients to send a descriptive User-Agent (generic defaults tend to
# get rate-limited), and the timeout keeps a stalled connection from hanging
# the notebook forever.
# NOTE: despite its name, `url` holds the HTTP response object, not the URL string.
url = requests.get(
    'http://www.reddit.com/r/politics/',
    headers={'User-Agent': 'reddit-scraping-notebook/0.1'},
    timeout=30,
)

# Fail right here on an HTTP error (e.g. 429 Too Many Requests) instead of
# parsing an error page and tripping over a missing element further down.
url.raise_for_status()

# Load in the HTML data
data = url.text
soup = BeautifulSoup(data,"lxml")

It turns out the section I want has an ID (`siteTable`), so we can look it up directly in the soup!

In [54]:
# The thread listing lives in the div whose id is "siteTable", so fetch it directly.
site_table = soup.find("div", id="siteTable")


def _is_thread_class(css_class):
    """Return a truthy value when a class value contains the substring 'thing'."""
    return css_class and 'thing' in css_class


# The listing has extra divs in between the threads; only divs whose class
# contains 'thing' correspond to actual threads, so keep just those.
site_table2 = site_table.find_all("div", class_=_is_thread_class)

# Walk the threads and record, for each one, its title, the URL of its comment
# page, and the raw text of the comments link (e.g. "<n> comments").
titles, links, comment_count = [], [], []
for thread in site_table2:
    # The title is the text of the link inside the "title" paragraph
    titles.append(thread.find("p", class_="title").a.string)
    # The first link in the "flat-list buttons" list is the comments link
    comments_anchor = thread.find("ul", class_="flat-list buttons").a
    links.append(comments_anchor['href'])
    comment_count.append(comments_anchor.string)

def _parse_comment_count(label):
    """Return the integer that prefixes a comments-link label.

    The count is the text before the first whitespace character, e.g.
    "123 comments" -> 123. A label with no whitespace at all (no count prefix)
    or a missing label (None) is treated as 0 comments.

    Raises:
        ValueError: if the text before the first whitespace is not an integer.
    """
    if label is None:
        return 0
    # Raw string so that \s reaches the regex engine instead of being treated
    # as an (invalid) string escape sequence.
    match = re.match(r'(.*?)\s', label)
    if match is None:
        return 0
    # Group 1 is everything before the first whitespace; drop thousands
    # separators so a value such as "1,234" still converts.
    return int(match.group(1).replace(',', ''))


# Extract just the number of comments for each thread, as integers
comment_count = [_parse_comment_count(x) for x in comment_count]

data = pd.DataFrame({'Title':titles,'Number of Comments':comment_count,'URL':links})

# Get Comments
Now I want to grab the comments for each of the threads.

In [58]:
# Peek at the comment-page URL of the first thread (row label 0)
data.loc[0, 'URL']

'https://www.reddit.com/r/politics/comments/4uux1i/2016_democratic_national_convention_day_3/'

In [68]:
# Pause between requests so we do not hammer the server.
REQUEST_DELAY_SECONDS = 5
# Identify the scraper explicitly; generic default User-Agents tend to get rate-limited.
REQUEST_HEADERS = {'User-Agent': 'reddit-scraping-notebook/0.1'}

# Maps thread title -> the comment listing element of that thread's page, or
# None when the page could not be fetched or contains no such element.
# NOTE: keyed by title, so two threads with identical titles overwrite each other.
results = {}
for thread_title, thread_url in zip(data['Title'], data['URL']):
    try:
        comment_page = requests.get(thread_url, headers=REQUEST_HEADERS, timeout=30)
        comment_page.raise_for_status()
    except requests.RequestException as error:
        # One bad page should not throw away everything scraped so far.
        print("Skipping {}: {}".format(thread_url, error))
        results[thread_title] = None
    else:
        # Use a dedicated name so the front-page `soup` is not overwritten.
        comment_soup = BeautifulSoup(comment_page.text, 'lxml')
        results[thread_title] = comment_soup.find("div", class_="sitetable nestedlisting")
    time.sleep(REQUEST_DELAY_SECONDS)

In [69]:
# Display the scraped comment listings, keyed by thread title. Each value is
# rendered as its full HTML, so this output is very long.
results

{'"Bernie supporters say they\'re prepared to cause a scene Wednesday night"': <div class="sitetable nestedlisting" id="siteTable_t3_4ux3p8"><div class=" thing id-t1_d5tknfb noncollapsed comment score-hidden " data-author="Tlehmann22" data-author-fullname="t2_pyy2j" data-fullname="t1_d5tknfb" data-subreddit="politics" data-subreddit-fullname="t5_2cneq" data-type="comment" id="thing_t1_d5tknfb" onclick="click_thing(this)"><p class="parent"><a name="d5tknfb"></a></p><div class="midcol unvoted"><div aria-label="upvote" class="arrow up login-required access-required" data-event-action="upvote" role="button" tabindex="0"></div><div aria-label="downvote" class="arrow down login-required access-required" data-event-action="downvote" role="button" tabindex="0"></div></div><div class="entry unvoted"><p class="tagline"><a class="expand" href="javascript:void(0)" onclick="return togglecomment(this)">[–]</a><a class="author may-blank id-t2_pyy2j" href="https://www.reddit.com/user/Tlehmann22">Tlehm