In [1]:
!pip install requests



In [6]:
from bs4 import BeautifulSoup as bs
import requests

# Load our first page


In [7]:
# Fetch the example page. The timeout stops the cell from hanging forever on a
# stalled connection, and raise_for_status() surfaces HTTP errors here instead
# of letting an error page be parsed further down.
r = requests.get('https://keithgalli.github.io/web-scraping/example.html', timeout=10)
r.raise_for_status()

In [10]:
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed, emits a warning, and may build a slightly different
# tree on another machine. "html.parser" ships with Python itself.
soup = bs(r.content, "html.parser")
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



# Start using BeautifulSoup to scrape

# find and find_all (the most important methods, in my opinion)


In [11]:
# find() returns only the first tag that matches
first_header = soup.find("h2")

In [12]:
# A bare last expression makes the notebook display the tag
first_header

<h2>A Header</h2>

In [13]:
# find_all() returns all matching tags, not just the first one
headers = soup.find_all("h2")

In [14]:
# Display every <h2> that was collected
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

In [15]:
# Given a list of tag names, find() returns the first tag matching any of them
first_headers = soup.find(['h1', 'h2'])
first_headers

<h1>HTML Webpage</h1>

In [17]:
# With a list of names, find_all() collects every <h1> and <h2> in document order
headers = soup.find_all(['h1', 'h2'])
headers

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [20]:
# Narrow the search by attribute: only <p> tags whose id is "paragraph-id"
paragraph = soup.find_all('p', attrs={'id': 'paragraph-id'})
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [21]:
# Tags support find()/find_all() as well, so a search can be scoped to a subtree:
# first locate <body>, then look for a <div> inside it
body = soup.find("body")
div = body.find("div")
div

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

In [22]:
# Search inside the <div> located in the previous cell
header = div.find("h1")
header

<h1>HTML Webpage</h1>

In [24]:
# Filter on text content: keep the <p> tags whose string matches the regex
import re

paragraph = soup.find_all('p', string=re.compile('Some'))
paragraph

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [27]:
# The alternation lets a single search match both "Header" and "header"
headers = soup.find_all("h2", string=re.compile('(H|h)eader'))

In [28]:
# Display the <h2> tags matched by the regex search
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

In [32]:
# select() takes a CSS selector; "div p" matches any <p> nested inside a <div>
content = soup.select("div p")
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [33]:
# "~" is the general-sibling combinator: <p> tags that come after an <h2>
# under the same parent
paragraphs = soup.select('h2~p')
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [34]:
# <b> tags nested inside the <p> whose id is "paragraph-id"
bold_text = soup.select('p#paragraph-id b')
bold_text

[<b>Some bold text</b>]

In [35]:
# ">" is the child combinator: only <p> tags that are direct children of <body>
paragraphs = soup.select('body > p')
paragraphs


[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [37]:
# select() can be called on an individual tag too; a tag with no match
# yields an empty list
for paragraph in paragraphs:
    print(paragraph.select('i'))

[<i>Some italicized text</i>]
[]


In [40]:
# Print the parsed document again, indented for readability
print(soup.prettify())


<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



# Get different properties of HTML elements

In [42]:
# .string gives the text held by the first <h2>
header = soup.find("h2")
header.string

'A Header'

In [44]:
# get_text() gathers the text of everything nested inside the <div>
div = soup.find("div")
print(div.get_text())


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [45]:
# Attributes are read with dictionary-style indexing on the tag
link = soup.find('a')
link["href"]

'https://keithgalli.github.io/web-scraping/webpage.html'

In [47]:
# select() returns a list even when the selector targets a single id
paragraphs = soup.select("p#paragraph-id")
paragraphs

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [48]:
# Take the first element of the result list, then read that tag's id attribute
paragraphs[0]['id']

'paragraph-id'

# Code navigation

In [50]:
# Attribute-style navigation: each dotted name steps to the first tag with that
# name beneath the current one
soup.body.div.h1.string

'HTML Webpage'

In [53]:
# Know the terms: parent, sibling, child

# Siblings share a parent; collect the tags that come after the first <div> in <body>
soup.body.find("div").find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]