In [2]:
from urllib.request import urlopen

# Fetch the page; the with-block closes the HTTP response once the body has been read.
with urlopen('http://pythonscraping.com/pages/page1.html') as html:
    # read() hands back the raw response body as bytes (note the b'...' in the output)
    print(html.read())

b'<html>\n<head>\n<title>A Useful Page</title>\n</head>\n<body>\n<h1>An Interesting Title</h1>\n<div>\nLorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\n</div>\n</body>\n</html>\n'


In [14]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Download the page and parse it with the 'html.parser' backend.
# The with-block closes the HTTP response as soon as the body has been read.
with urlopen('http://pythonscraping.com/pages/page1.html') as html:
    bs = BeautifulSoup(html.read(), 'html.parser')

# Attribute access on the soup looks up the tag of that name (here the <h1>)
print(bs.h1)

<h1>An Interesting Title</h1>


If you want to use a more advanced parser, you can use lxml. Note, you will have to install this first:
```
pip install lxml
```

Then you can parse the same page with lxml (you must restart the Jupyter kernel first so the newly installed package is picked up):

In [2]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Same page, parsed with the third-party 'lxml' backend.
# The with-block closes the HTTP response as soon as the body has been read.
with urlopen('http://pythonscraping.com/pages/page1.html') as html:
    bs = BeautifulSoup(html.read(), 'lxml')

# Attribute access on the soup looks up the tag of that name (here the <h1>)
print(bs.h1)

<h1>An Interesting Title</h1>


Another popular parser to use is html5lib. It is, however, slower than lxml and html.parser.

It can be installed using:
```
pip install html5lib
```

Then you can do the following (must restart Jupyter first):

In [2]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Same page, parsed with the third-party 'html5lib' backend.
# The with-block closes the HTTP response as soon as the body has been read.
with urlopen('http://pythonscraping.com/pages/page1.html') as html:
    bs = BeautifulSoup(html.read(), 'html5lib')

# Attribute access on the soup looks up the tag of that name (here the <h1>)
print(bs.h1)

<h1>An Interesting Title</h1>


Handling errors:

In [7]:
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.error import URLError

# HTTPError must be listed before URLError: it is a subclass of URLError, so the
# reverse order would make the HTTPError branch unreachable.
try:
    html = urlopen('http://www.pythonscraping.com/pages/page1.html')
except HTTPError as e:
    # The server answered, but with an HTTP error status
    print(e)
except URLError:
    # The request never reached a server at all
    print("The server could not be found!")
else:
    print("It Worked!")
    # Only the success of the request matters here, so release the connection
    html.close()

It Worked!


In [16]:
# Looking up a tag that is not in the document yields None
print(bs.someBullshitTagThatDoesntExist)

# Looking up a child of that None raises AttributeError. The error is caught and
# displayed so the demonstration does not halt a "Run All" of the notebook.
try:
    print(bs.someBullshitTagThatDoesntExist.someTag)
except AttributeError as e:
    print("AttributeError:", e)

None


AttributeError: 'NoneType' object has no attribute 'someTag'

In [21]:
# Handling the problem of a non-existent tag.
# Two failure modes are covered:
#   * the outer tag is missing -> bs.someBullshit is None, so reaching for
#     .anotherTag raises AttributeError
#   * the outer tag exists but the inner one does not -> the lookup yields None
try:
    badContent = bs.someBullshit.anotherTag
except AttributeError:
    print("Tag was not found")
else:
    # Identity check: "is None" is the idiomatic test and cannot be affected by
    # a custom __eq__ on the returned object
    if badContent is None:
        print("Tag was not found")
    else:
        print(badContent)

Tag was not found


The same scraper as above, rewritten as a reusable function that handles both kinds of error:

In [38]:
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.error import URLError
from bs4 import BeautifulSoup

def getTitle(url):
    """Fetch ``url`` and return the <h1> tag found inside its <body>.

    Args:
        url: Address of the page to download.

    Returns:
        The ``bs.body.h1`` lookup result for the parsed page, or ``None`` when
        the request fails with ``HTTPError``/``URLError`` or when the lookup
        raises ``AttributeError`` (e.g. the document has no <body>). A page
        with a <body> but no <h1> also results in ``None``.
    """
    try:
        html = urlopen(url)
    except (HTTPError, URLError):
        # Server returned an error status, or could not be reached at all
        return None
    try:
        # Close the response as soon as its body has been read
        with html:
            bs = BeautifulSoup(html.read(), 'html.parser')
        title = bs.body.h1
    except AttributeError:
        # bs.body was None, so there is nothing to look the <h1> up on
        return None
    return title

title = getTitle('http://www.pythonscraping.com/pages/page1.html')
# getTitle reports every failure as None; "is None" is the idiomatic identity check
if title is None:
    print("Title could not be found")
else:
    print(title)

<h1>An Interesting Title</h1>
