In [6]:
# Parsing and Navigating HTML
# BeautifulSoup(html_string, 'html.parser') - parse HTML
# Can parse by Tag Name
# Use "Find" - returns one matching tag
# Use "find_all" - returns a list of matching tags
# Navigate with CSS selectors


In [5]:
from bs4 import BeautifulSoup

html = """
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <title>First HTML Page</title>
</head>
<body>
  <div id="first">
    <h3 data-example="yes">hi</h3>
    <p>more text.</p>
  </div>
  <ol>
    <li class="special">This list item is special.</li>
    <li class="special">This list item is also special.</li>
    <li>This list item is not special.</li>
  </ol>
  <div data-example="yes">bye</div>
</body>
</html>
"""

In [8]:
soup = BeautifulSoup(html, 'html.parser')

In [10]:
type(soup)

bs4.BeautifulSoup

In [11]:
print(soup.body)

<body>
<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text.</p>
</div>
<ol>
<li class="special">This list item is special.</li>
<li class="special">This list item is also special.</li>
<li>This list item is not special.</li>
</ol>
<div data-example="yes">bye</div>
</body>


In [12]:
print(soup.body.div)

<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text.</p>
</div>


In [13]:
print(soup.find('div'))

<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text.</p>
</div>


In [14]:
# find_all returns a list
d = soup.find_all('div')
print(d)

[<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text.</p>
</div>, <div data-example="yes">bye</div>]


In [15]:
d = soup.find(id='first')
print(d)

<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text.</p>
</div>


In [16]:
# when finding classes in html, must us "class_"
d = soup.find_all(class_='special')
print(d)

[<li class="special">This list item is special.</li>, <li class="special">This list item is also special.</li>]


In [17]:
# based on attributes - dictionary
d = soup.find_all(attrs={'data-example': 'yes'})
print(d)

[<h3 data-example="yes">hi</h3>, <div data-example="yes">bye</div>]


# CSS Style Selectors

In [18]:
# select - returns a lsit of elements matching a css selector
# select by id of foo: #foo
# select by class of bar: .bar
# select childre: div > p
# select descendents: div p

In [23]:
soup


<!DOCTYPE html>

<html lang="en">
<head>
<meta charset="utf-8"/>
<title>First HTML Page</title>
</head>
<body>
<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text.</p>
</div>
<ol>
<li class="special">This list item is special.</li>
<li class="special">This list item is also special.</li>
<li>This list item is not special.</li>
</ol>
<div data-example="yes">bye</div>
</body>
</html>

In [22]:
d = soup.select("#first")
print (d)

[<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text.</p>
</div>]


In [24]:
d = soup.select("#first")[0]
print (d)

<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text.</p>
</div>


In [25]:
d = soup.select("div")
print (d)

[<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text.</p>
</div>, <div data-example="yes">bye</div>]


# Accessing Data in Elements

In [26]:
# get_text - access the inner text in an element
# name - tag name
# attrs - dictionary attributes
# also access attribute values using brackets

In [27]:
from bs4 import BeautifulSoup

html = """
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <title>First HTML Page</title>
</head>
<body>
  <div id="first">
    <h3 data-example="yes">hi</h3>
    <p>more text.</p>
  </div>
  <ol>
    <li class="special">This list item is special.</li>
    <li class="special">This list item is also special.</li>
    <li>This list item is not special.</li>
  </ol>
  <div data-example="yes">bye</div>
</body>
</html>
"""

In [28]:
soup = BeautifulSoup(html, 'html.parser')

In [32]:
el = soup.select('.special')[0]
print(el.get_text())

This list item is special.


In [39]:
for el in soup.select('.special'):
    print(el.name)

li
li


In [40]:
for el in soup.select('.special'):
    print(el.attrs)

{'class': ['special']}
{'class': ['special']}


In [44]:
attr = soup.find('h3')['data-example']
print(attr)

yes


In [45]:
attr = soup.find('div')['id']
print(attr)

first


# Navigating with Beautiful Soup

In [47]:
from bs4 import BeautifulSoup

html = """
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <title>First HTML Page</title>
</head>
<body>
  <div id="first">
    <h3 data-example="yes">hi</h3>
    <p>more text.</p>
  </div>
  <ol>
    <li class="special">This list item is special.</li>
    <li class="special">This list item is also special.</li>
    <li>This list item is not special.</li>
  </ol>
  <div data-example="yes">bye</div>
</body>
</html>
"""

In [48]:
soup = BeautifulSoup(html, 'html.parser')

In [50]:
data = soup.body.contents
print(data)

['\n', <div id="first">
<h3 data-example="yes">hi</h3>
<p>more text.</p>
</div>, '\n', <ol>
<li class="special">This list item is special.</li>
<li class="special">This list item is also special.</li>
<li>This list item is not special.</li>
</ol>, '\n', <div data-example="yes">bye</div>, '\n']


In [54]:
data = soup.body.contents[1]
print(data)

<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text.</p>
</div>


In [56]:
data = soup.body.contents[1].next_sibling.next_sibling
print(data)

<ol>
<li class="special">This list item is special.</li>
<li class="special">This list item is also special.</li>
<li>This list item is not special.</li>
</ol>


In [59]:
#skips spaces
data = soup.find(id='first').find_next_sibling()
print(data)

<ol>
<li class="special">This list item is special.</li>
<li class="special">This list item is also special.</li>
<li>This list item is not special.</li>
</ol>


In [60]:
data = soup.select('[data-example]')

# Web Scraping Example with Beautiful Soup

In [63]:
import requests
from bs4 import BeautifulSoup
import csv

In [64]:
response = requests.get('https://www.rithmschool.com/blog')
print(response.text)

<!DOCTYPE html>
<html lang="en">
<head>

  <meta property="og:image" content="https://www.rithmschool.com/assets/logos/rithm_logo-52c2ff4eb53876f905ff2d8b1d46b5ec737caa4d9f9acf4790dcd856f3ccc638.svg"/>
  <meta charset="utf-8">
  <meta http-equiv="X-UA-Compatible" content="IE=edge">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <meta name="description" content="Spend 16 weeks learning JavaScript, Python, and React with 8-15 other students. Learn quickly with experienced instructors. No tuition until you&#39;re hired">
  <meta name="author" content="Rithm School">


  <title>The Rithm Blog</title>



  <link rel="stylesheet" media="all" href="/assets/application-84f90791da100b26ea40e9ac25a82c79691a15378b836bb8ed22a886c6a23e0a.css" />

  <!-- Add a stylesheet tag for the name of the action -->
  <link rel="stylesheet" media="screen" href="/assets/main-22cbb9ba12f0cba7c0bdefe70dac7638d3203e8c67d7e7088519c361e64a481e.css" />
  <link rel="stylesheet" media="screen"

In [65]:
soup = BeautifulSoup(response.text, 'html.parser')

In [67]:
articles = soup.find_all('article')
print(articles)

[<article>
<h4 class="section-heading">
<a href="/blog/alumni-spotlight-tyler">Alumni Spotlight: Tyler Ketron</a>
</h4>
<div class="card">
<time datetime="2019-01-30" pubdate=""></time>
<p>Before joining us for Rithm’s 7th cohort, Tyler Ketron got his hands dirty (and sometimes still misses it!) as a hydrogeologist at government agencies and a consulting firm. When he’s not busy at his new job as a Front End Engineer, you can find Tyler competitively weight lifting, raising chickens in his backyard, or flexing his classical musician skills with the oboe and piano. Luckily, we were able to catch some time to sit down with Tyler and discuss his journey from growing up in a small town in Tennessee to becoming a software engineer.</p>
<p><a href="/blog/alumni-spotlight-tyler">Continue Reading</a></p>
<h4 class="service-heading"><small>January 30, 2019</small></h4>
</div>
</article>, <article>
<h4 class="section-heading">
<a href="/blog/avoiding-burnout">Avoiding Burnout</a>
</h4>
<div clas

In [70]:
for article in articles:
    print(article.find('a').get_text())

Alumni Spotlight: Tyler Ketron
Avoiding Burnout
Student Interview: Sarah + Zac on Company Projects
A New Tuition Option: We Make Money Only When You Do
MongoDB is Easy
Meet The Instructor: Alissa Renz
Error Handling in Express
Four Tips for Moving Faster as a Developer
Five Tips to Manage the Emotional Side of a Coding Program
Does Emotional Intelligence Get You Paid More in Tech?


In [73]:
for article in articles:
    title = article.find('a').get_text()
    url = article.find('a')['href']
    date = article.find('time')['datetime']
    print (title, url, date)

Alumni Spotlight: Tyler Ketron /blog/alumni-spotlight-tyler 2019-01-30
Avoiding Burnout /blog/avoiding-burnout 2019-01-09
Student Interview: Sarah + Zac on Company Projects /blog/student-interview-sarah-zac 2018-12-18
A New Tuition Option: We Make Money Only When You Do /blog/new-tuition-option 2018-12-11
MongoDB is Easy /blog/mongodb-is-easy 2018-12-06
Meet The Instructor: Alissa Renz /blog/meet-the-instructor-alissa-renz 2018-11-14
Error Handling in Express /blog/error-handling-express 2018-11-05
Four Tips for Moving Faster as a Developer /blog/developer-productivity 2018-10-16
Five Tips to Manage the Emotional Side of a Coding Program /blog/five-tips-emotional-management 2018-10-08
Does Emotional Intelligence Get You Paid More in Tech? /blog/emotional-intelligence 2018-10-03


In [76]:
from csv import writer, reader

with open('blog_data.csv', 'w') as csv_file:
    csv_writer = writer(csv_file)
    csv_writer.writerow(['title', 'link', 'date'])
    for article in articles:
        title = article.find('a').get_text()
        url = article.find('a')['href']
        date = article.find('time')['datetime']
        csv_writer.writerow([title, url, date])

In [77]:
with open('blog_data.csv', 'r') as csv_file:
    csv_reader = reader(csv_file)
    for row in csv_reader:
        print(row)

['title', 'link', 'date']
[]
['Alumni Spotlight: Tyler Ketron', '/blog/alumni-spotlight-tyler', '2019-01-30']
[]
['Avoiding Burnout', '/blog/avoiding-burnout', '2019-01-09']
[]
['Student Interview: Sarah + Zac on Company Projects', '/blog/student-interview-sarah-zac', '2018-12-18']
[]
['A New Tuition Option: We Make Money Only When You Do', '/blog/new-tuition-option', '2018-12-11']
[]
['MongoDB is Easy', '/blog/mongodb-is-easy', '2018-12-06']
[]
['Meet The Instructor: Alissa Renz', '/blog/meet-the-instructor-alissa-renz', '2018-11-14']
[]
['Error Handling in Express', '/blog/error-handling-express', '2018-11-05']
[]
['Four Tips for Moving Faster as a Developer', '/blog/developer-productivity', '2018-10-16']
[]
['Five Tips to Manage the Emotional Side of a Coding Program', '/blog/five-tips-emotional-management', '2018-10-08']
[]
['Does Emotional Intelligence Get You Paid More in Tech?', '/blog/emotional-intelligence', '2018-10-03']
[]


# Web Scraping Project

In [None]:
# scrapes http://quotes.toscrape.com
# text of the quote, name of the person, href of the link to the persons bio
# Create a file called `scraping_project.py` which, when run, grabs data on every quote from the website http://quotes.toscrape.com

# You can use `bs4` and `requests` to get the data. For each quote you should grab the text of the quote, the name of the person who said the quote, and the href of the link to the person's bio. Store all of this information in a list.

# Next, display the quote to the user and ask who said it. The player will have four guesses remaining.

# After each incorrect guess, the number of guesses remaining will decrement. If the player gets to zero guesses without identifying the author, the player loses and the game ends. If the player correctly identifies the author, the player wins!

# After every incorrect guess, the player receives a hint about the author. 

# For the first hint, make another request to the author's bio page (this is why we originally scrape this data), and tell the player the author's birth date and location.

# The next two hints are up to you! Some ideas: the first letter of the author's first name, the first letter of the author's last name, the number of letters in one of the names, etc.

In [39]:
import requests
from bs4 import BeautifulSoup


quote_list = []
author_info = {}


def scraper():
    global response
    response = requests.get('http://quotes.toscrape.com')
    
    global soup
    soup = BeautifulSoup(response.text, 'html.parser')
    
    scrape_quotes()
    
    page_counter = 0
    
    # This loops through pages 2 -> end
    while soup.find(class_='next') is not None:
        page_counter += 1
        response = requests.get('http://quotes.toscrape.com/page/' 
                                + str(page_counter))
        soup = BeautifulSoup(response.text, 'html.parser')
        scrape_quotes()
        

# Scrapes everything on a page and puts into a list        
def scrape_quotes():
    
    page = soup.find_all(class_ = 'quote')
    
    for item in page:
        text = item.find(class_ = 'text').get_text()
        author = item.find(class_ = 'author').get_text()
        global link
        link = 'http://quotes.toscrape.com' + item.find('a')['href']
        quote_list.append([text, author, link])
        
        scrape_author()
        
# Scrapes the author page and puts into dictionary, where author is the key
# take a minute or two to run
def scrape_author():
    author_response = requests.get(link)
    author_soup = BeautifulSoup(author_response.text, 'html.parser')
    
    author = author_soup.find(class_='author-title').get_text()   #author
    born = author_soup.find(class_='author-born-date').get_text()   #clue 1
    first_initial = author[0]        # clue 2    
    split_name = author.split()
    last_name = split_name[(len(split_name)-1)]     # clue 3
    
    if author not in author_info:
        author_info[author]= ([author, born, first_initial, last_name])
    else:
        pass

In [40]:
scraper()

In [41]:
quote_list

[['“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”',
  'Albert Einstein',
  'http://quotes.toscrape.com/author/Albert-Einstein'],
 ['“It is our choices, Harry, that show what we truly are, far more than our abilities.”',
  'J.K. Rowling',
  'http://quotes.toscrape.com/author/J-K-Rowling'],
 ['“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”',
  'Albert Einstein',
  'http://quotes.toscrape.com/author/Albert-Einstein'],
 ['“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”',
  'Jane Austen',
  'http://quotes.toscrape.com/author/Jane-Austen'],
 ["“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”",
  'Marilyn Monroe',
  'http://quotes.toscrape.com/author/Marilyn-Monroe'],
 ['“Try not to become a man of success. Rather become a 

In [42]:
author_info

{'Albert Einstein': ['Albert Einstein', 'March 14, 1879', 'A', 'Einstein'],
 'J.K. Rowling': ['J.K. Rowling', 'July 31, 1965', 'J', 'Rowling'],
 'Jane Austen': ['Jane Austen', 'December 16, 1775', 'J', 'Austen'],
 'Marilyn Monroe': ['Marilyn Monroe', 'June 01, 1926', 'M', 'Monroe'],
 'André Gide': ['André Gide', 'November 22, 1869', 'A', 'Gide'],
 'Thomas A. Edison': ['Thomas A. Edison', 'February 11, 1847', 'T', 'Edison'],
 'Eleanor Roosevelt': ['Eleanor Roosevelt',
  'October 11, 1884',
  'E',
  'Roosevelt'],
 'Steve Martin': ['Steve Martin', 'August 14, 1945', 'S', 'Martin'],
 'Bob Marley': ['Bob Marley', 'February 06, 1945', 'B', 'Marley'],
 'Dr. Seuss': ['Dr. Seuss', 'March 02, 1904', 'D', 'Seuss'],
 'Douglas Adams': ['Douglas Adams', 'March 11, 1952', 'D', 'Adams'],
 'Elie Wiesel': ['Elie Wiesel', 'September 30, 1928', 'E', 'Wiesel'],
 'Friedrich Nietzsche': ['Friedrich Nietzsche',
  'October 15, 1844',
  'F',
  'Nietzsche'],
 'Mark Twain': ['Mark Twain', 'November 30, 1835', 'M'

In [86]:
import random

def play_game():
    quote_num = random.randint(0, (len(quote_list)-1))
    print(quote_list[quote_num][0])
    auth_name = quote_list[quote_num][1]
    user_input = input('Guess who made the quote: \n')
    if user_input == auth_name:
        print ('Congrats you are right')
    else:
        print(f'Try again, the author was born: {author_info[auth_name][1]}')
        user_input = input('Guess #2: \n')
        if user_input == auth_name:
            print ('Congrats you are right')
        else:
            print(f'Try again, the authors first initial is: {author_info[auth_name][2]}')
            user_input = input('Guess #3: \n')
            if user_input == auth_name:
                print ('Congrats you are right')
            else:
                print(f'Try again, the authors last name is: {author_info[auth_name][3]}')
                user_input = input('Guess #4: \n')
                if user_input == auth_name:
                    print ('Congrats you are right')
                else:
                    print ('Sorry play again')

In [87]:
play_game()

“I am free of all prejudice. I hate everyone equally. ”
Guess who made the quote: 
a
Try again, the author was born: January 29, 1880
Guess #2: 
a
Try again, the authors first initial is: W
Guess #3: 
a
Try again, the authors last name is: Fields
Guess #4: 
a
Sorry play again


In [88]:
play_game()

“To the well-organized mind, death is but the next great adventure.”
Guess who made the quote: 
a
Try again, the author was born: July 31, 1965
Guess #2: 
a
Try again, the authors first initial is: J
Guess #3: 
J.K. Rowling
Congrats you are right


# Scrapy

In [1]:
# need to install
import scrapy

In [None]:
class BookSpider(scrapy.Spider):
    name = 'bookspider'
    start_urls = ['http://books.toscrape.com/']
    
    def parse(self, response):
        for article in response.css('article.product_pod')
            yield {
                'price': article.css('.price_color::text').extract_first()
                
            }
        