In [1]:
import re

import requests
from bs4 import BeautifulSoup as bs

In [2]:
# Download the example page. The timeout keeps the cell from hanging forever on
# a dead connection, and raise_for_status() stops us from silently parsing an
# HTTP error page as if it were the real document.
r = requests.get("https://keithgalli.github.io/web-scraping/example.html", timeout=10)
r.raise_for_status()

# Name the parser explicitly: without it Beautiful Soup picks whichever parser
# happens to be installed (and emits a warning), so the resulting tree can
# differ from one machine to the next.
soup = bs(r.content, "html.parser")
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



# find and find_all

In [3]:
# find() returns only the first matching tag; find_all() returns a list of
# every matching tag.
headers = soup.find_all("h2")
first_header = soup.find("h2")
print(first_header, headers, sep="\n")

<h2>A Header</h2>
[<h2>A Header</h2>, <h2>Another header</h2>]


In [4]:
# Passing a list of tag names matches any of them; results come back in
# document order, so the <h1> is found before the <h2> elements.
header_tags = ["h1", "h2"]
first_header = soup.find(header_tags)
headers = soup.find_all(header_tags)
print(first_header, headers, sep="\n")

<h1>HTML Webpage</h1>
[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]


# Passing Attributes

In [5]:
# attrs maps attribute names to the value each matching tag must carry.
wanted_attrs = {"id": "paragraph-id"}
paragraph = soup.find_all("p", attrs=wanted_attrs)
print(paragraph)

[<p id="paragraph-id"><b>Some bold text</b></p>]


# Nested Elements

In [6]:
# find() can be called on any tag, not just the soup: each call below searches
# only inside the element returned by the previous one (body -> div -> h1).
body= soup.find('body')
div= body.find('div')
header= div.find('h1')
print(header)

<h1>HTML Webpage</h1>


# Finding elements by their text content

In [7]:
# string= filters tags by their text. A plain str must match the whole text;
# as the output shows, the <p> is returned even though the text sits inside
# its single <b> child.
paragraph= soup.find_all('p', string= 'Some bold text')
print(paragraph)

[<p id="paragraph-id"><b>Some bold text</b></p>]


In [8]:
# `re` is imported in the notebook's import cell rather than here, so every
# later cell can rely on it after a fresh "Restart & Run All".
# A compiled pattern selects every <p> whose text contains a match for it,
# instead of requiring the whole text to be equal as a plain string does.
paragraph = soup.find_all('p', string=re.compile(r'Some'))
print(paragraph)

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]


In [9]:
# Match "Header" or "header". The character class is deliberate: the
# alternation 'H|header' would mean "a capital H anywhere, OR the word header"
# and would therefore also select unrelated headings such as "Hello".
headers = soup.find_all('h2', string=re.compile(r'[Hh]eader'))
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

# Select

In [12]:
# Re-print just the <body> so the structure targeted by the CSS selectors in
# the following cells is visible right next to them.
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [13]:
# select() takes a CSS selector and returns a list of matches.
# 'div p' (space = descendant combinator): every <p> located inside a <div>.
content= soup.select('div p')
print(content)


[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]


In [16]:
# 'h2 ~ p' (sibling combinator): every <p> that comes after an <h2> and shares
# the same parent; the <p> does not have to follow the <h2> immediately.
paragraphs= soup.select('h2 ~ p')
print(paragraphs)

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]


In [17]:
# 'p#paragraph-id b': every <b> inside the <p> whose id is "paragraph-id".
bold_text= soup.select('p#paragraph-id b')
print(bold_text)

[<b>Some bold text</b>]


In [18]:
# 'body > p' (child combinator): only <p> tags that are direct children of
# <body>, so the <p> nested inside the <div> is not included.
# `paragraphs` is reused by the loop in the next cell.
paragraphs= soup.select('body > p')
print(paragraphs)

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]


In [19]:
# Elements returned by select() can be searched again with select(); the
# search is scoped to that element, so a paragraph without an <i> gives [].
for paragraph in paragraphs:
    print(paragraph.select('i'))

[<i>Some italicized text</i>]
[]


In [20]:
# '[align=middle]' (attribute selector): any tag whose align attribute
# equals "middle", whatever the tag name is.
soup.select('[align=middle]')

[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>]

# Getting Properties of an element

In [22]:
# .string yields the text of an element that holds a single piece of text.
header = soup.find("h2")
header_text = header.string
print(header_text)

A Header


If an element contains more than one child (for example several nested tags), `.string` returns `None`; use `get_text()` instead to collect the text of all its descendants.

In [23]:
# The <div> holds several child tags, so gather all of its nested text with
# get_text() rather than reading .string.
div = soup.find("div")
div_text = div.get_text()
print(div_text)


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [24]:
# A tag's attributes are read with dictionary-style indexing.
a = soup.find("a")
link_target = a["href"]
print(link_target)

https://keithgalli.github.io/web-scraping/webpage.html


In [28]:
# select() always returns a list, so take element [0] before reading the
# tag's 'id' attribute.
paragraph= soup.select('p#paragraph-id')
print(paragraph[0]['id'])

paragraph-id


# Code Navigation

In [29]:
# Dot navigation: accessing a tag name as an attribute returns the first tag
# with that name below the current element.
soup.body

<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>

In [30]:
# Attribute access can be chained: the first <div> inside <body>.
soup.body.div

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

In [31]:
# One level deeper: the first <h1> inside that <div>.
soup.body.div.h1

<h1>HTML Webpage</h1>

In [32]:
# .string at the end of the chain returns the heading's text instead of the tag.
soup.body.div.h1.string

'HTML Webpage'

# Terms to know
1. Parent
2. Sibling
3. Child

The Beautiful Soup documentation describes many more navigation methods than the few shown below.

In [34]:
# All tags that come after the <div> at the same nesting level (same parent).
soup.body.find('div').find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [35]:
# All tags that come before the <div> at the same level; the result is empty
# here because the <div> is the first element inside <body>.
soup.body.find('div').find_previous_siblings()

[]

In [36]:
# find_parent() climbs one level: the element that directly contains the <div>.
div = soup.body.find("div")
parent_of_div = div.find_parent()
print(parent_of_div)

<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>


In [40]:
# select() returns a list, so index [0] to get the <i> tag itself before
# asking for its direct parent. `i` is reused by the next cell.
i= soup.select('i')
print(i[0].find_parent())

<p><i>Some italicized text</i></p>


In [42]:
# find_parents() returns every ancestor of the <i> tag, ordered from the
# nearest one outwards; each ancestor is printed in full, so the output is long.
print(i[0].find_parents())

[<p><i>Some italicized text</i></p>, <body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>, <html>
<head>
<title>HTML Example</title>
</head>
<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>
</html>, <html>
<head>
<title>HTML Example</title>
</head>
<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgall

# Tasks