# Web Scraping Notebook
I want to get a list of the threads on the front page of the r/politics subreddit and, for each thread, grab its title, the link to its comments page, and its number of comments.

In [17]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re

In [18]:
# This is the webpage I want to scrape.
# Send a descriptive User-Agent: Reddit's API rules ask clients to identify themselves
# and rate-limit generic library defaults.
headers = {"User-Agent": "reddit-scraping-notebook/0.1 (educational web-scraping example)"}

# NOTE: despite its name, `url` holds the Response object, not the address string.
# The timeout stops the cell from hanging forever if the server never answers.
url = requests.get('http://www.reddit.com/r/politics/', headers=headers, timeout=30)

# Fail loudly on 4xx/5xx responses instead of parsing an error page as if it were the listing.
url.raise_for_status()

# Load in the HTML data
data = url.text
soup = BeautifulSoup(data,"lxml")

It turns out the section I want has an `id` attribute (`siteTable`), so we can look it up directly from the soup!

In [25]:
# Jump straight to the site table (the div with id "siteTable")
site_table = soup.find("div", id="siteTable")
if site_table is None:
    # Stop here with an explicit message; otherwise the next line would fail with an
    # AttributeError on None that says nothing about the page being the problem.
    raise RuntimeError('No <div id="siteTable"> found in the downloaded page; '
                       'inspect the HTML in `soup` before parsing threads.')

# site_table contains an extra div in between all the threads, so I want to skip over that.
# That's why I'm selecting only divs that carry the CSS class 'thing', since those pertain
# directly to threads. Passing the class name as a string matches it against each of the
# div's classes as a whole word, so a class such as "something" is not picked up by accident.
site_table2 = site_table.find_all("div", class_="thing")

# Now loop through each thread and retrieve the thread title, comment URL and the
# text of the comments link (e.g. a count followed by a word).
titles = []
links = []
comment_count = []
for thread in site_table2:
    # The title is the link inside the title paragraph
    title_link = thread.find("p",class_="title").a
    # Comment data is located in the "flat-list buttons" list; its first link is used
    comments_link = thread.find("ul",class_="flat-list buttons").a

    # get_text() always returns a str. The .string attribute is None whenever the tag
    # does not have exactly one text child, which would leave None in `titles` and
    # break the regex that is later applied to the comment labels.
    # All lookups happen before any append, so a failure part-way through an
    # iteration cannot leave the three lists with different lengths.
    titles.append(title_link.get_text())
    links.append(comments_link['href'])
    comment_count.append(comments_link.get_text())

# Extract just the number of comments for each thread
def parse_comment_count(label):
    """Return the integer at the start of a comments-link label, or 0 if there is none.

    Parameters
    ----------
    label : str or None
        Text of the comments link, e.g. "12 comments". None and empty strings
        are accepted.

    Returns
    -------
    int
        The leading number (thousands separators such as "1,234" are removed),
        or 0 when the label does not start with a digit.
    """
    # Raw string: '\s' / '\d' inside a normal string literal are invalid escape sequences.
    # Only digits are captured, so int() cannot fail on a non-numeric first word.
    match = re.match(r'\s*(\d[\d,]*)', label or '')
    if match is None:
        # If there are no comments the label carries no number; count that as 0.
        return 0
    return int(match.group(1).replace(',', ''))

# Convert every label to an integer count.
comment_count = [parse_comment_count(label) for label in comment_count]

# Assemble the three per-thread lists into a single table, one row per thread.
# NOTE(review): this rebinds `data`, which held the raw page HTML earlier in the
# notebook; re-running the earlier cell will overwrite this table again.
columns = {
    'Title': titles,
    'Number of Comments': comment_count,
    'URL': links,
}
data = pd.DataFrame(columns)