In [8]:
## source code and tutorial taken from https://www.dataquest.io/blog/web-scraping-tutorial-python/

## requests library documentation - https://requests.readthedocs.io/
import requests
## beautiful soup library documentation - https://www.crummy.com/software/BeautifulSoup/
from bs4 import BeautifulSoup

## fetch the example page and keep the Response object in `page`
## a timeout is passed because requests.get() otherwise waits indefinitely
## when the server never answers, which would hang the notebook
page = requests.get('http://dataquestio.github.io/web-scraping-pages/simple.html', timeout=10)
page


<Response [200]>

In [6]:
## status_code is the HTTP status of the response;
## a status of 200 means that the page downloaded successfully
page.status_code

200

In [7]:
## displays the raw body of the response as a bytes object (note the b'' prefix
## and the escaped \n characters in the output) -- this is the unparsed html
page.content

b'<!DOCTYPE html>\n<html>\n    <head>\n        <title>A simple example page</title>\n    </head>\n    <body>\n        <p>Here is some simple content for this page.</p>\n    </body>\n</html>'

In [12]:
## build a BeautifulSoup object from the raw page bytes using the 'html.parser' backend
## displaying it shows the parsed document -- compare with the raw bytes above
soup = BeautifulSoup(markup=page.content, features='html.parser')
soup

<!DOCTYPE html>

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<p>Here is some simple content for this page.</p>
</body>
</html>

In [15]:
## prettify() re-indents the parsed document with one tag or text node per line;
## print() is used so the line breaks are rendered instead of being shown as \n
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <p>
   Here is some simple content for this page.
  </p>
 </body>
</html>


In [17]:
## soup.children yields the top-level nodes of the document one by one
## it is an iterator rather than a list, which is why we wrap it in list() to see them all
list(soup.children)

['html', '\n', <html>
 <head>
 <title>A simple example page</title>
 </head>
 <body>
 <p>Here is some simple content for this page.</p>
 </body>
 </html>]

In [18]:
## map type() over the top-level nodes of the soup to see what kind of object each one is:
##   Doctype         - holds information about the type of document (here: html)
##   NavigableString - represents text found in the html document (here: the \n)
##   Tag             - corresponds to an XML or HTML tag in the original document (here: all the rest)
list(map(type, soup.children))


[bs4.element.Doctype, bs4.element.NavigableString, bs4.element.Tag]

In [36]:
## the third top-level node (index 2) is the Tag found above, i.e. the whole <html> element
html = list(soup.children)[2]
html

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<p>Here is some simple content for this page.</p>
</body>
</html>

In [44]:
## list(html.children) shows all the child nodes of the html object
## the whitespace between tags counts as text nodes, so the children are
## [text, <head>, text, <body>, text] and the body tag sits at index 3
body = list(html.children)[3]

In [52]:
## now we can get the p tag within the body by listing body's children
## index 0 is the whitespace text before <p>, so the p tag itself is at index 1
p = list(body.children)[1]
p

<p>Here is some simple content for this page.</p>

In [49]:
## once we have isolated the p tag, get_text() gives us the text inside it
p.get_text()

'Here is some simple content for this page.'

In [56]:
## all the cells above illustrate how to walk the structure of the web page step by step,
## but this is time-intensive
## instead, we can use the find_all method, which searches the entire beautiful soup object
## and returns a list of all the tags with the name that we specified
soup.find_all('p')

[<p>Here is some simple content for this page.</p>]

In [57]:
## find_all returns a list of 'p' tags, so we index the first one and call its get_text method
soup.find_all('p')[0].get_text()

'Here is some simple content for this page.'

In [59]:
## if we only want the first instance of a p tag, we can use find(),
## which returns that single tag instead of a list
soup.find('p')

<p>Here is some simple content for this page.</p>