In [1]:
# Our jupyter/datascience-notebook Docker container comes with 
# BeautifulSoup4 and requests, both popular libraries!

from bs4 import BeautifulSoup
import requests

In [2]:
# First page of Brickset's listing of sets for the year 2016; the scrape starts here.
START_URL = 'https://brickset.com/sets/year-2016'

In [3]:
# Download the start page; `page.content` holds the raw response body as bytes.
page = requests.get(START_URL)

In [4]:
page.content

b'\n<!DOCTYPE html>\n<!--[if lt IE 7]>      <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->\n<!--[if IE 7]>         <html class="no-js lt-ie9 lt-ie8"> <![endif]-->\n<!--[if IE 8]>         <html class="no-js lt-ie9"> <![endif]-->\n<!--[if gt IE 8]><!-->\n<html class="no-js" lang="en">\n<!--<![endif]-->\n<head>\n<meta http-equiv="X-UA-Compatible" content="IE=edge" />\n<meta http-equiv="content-type" content="text/html; charset=utf-8" />\n<meta name="description" content="LEGO set database: 2016 " />\n<meta name="viewport" id="viewport" content="width=device-width, minimum-scale=1.0, maximum-scale=1.0" />\n<link rel="alternate" type="application/rss+xml" title="Brickset news and activity feed" href="//brickset.com/feed/" />\n<link rel="canonical" href="https://brickset.com/sets/year-2016" />\n<meta name="temp" content="New server" />\n<link rel="apple-touch-icon" sizes="57x57" href="/assets/images/icons/apple-icon-57x57.png">\n<link rel="apple-touch-icon" sizes="60x60" href="/asset

In [5]:
# Parse the downloaded bytes into a navigable tree using the 'html.parser' backend.
soup = BeautifulSoup(page.content, 'html.parser')

In [6]:
soup


<!DOCTYPE html>

<!--[if lt IE 7]>      <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]>         <html class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]>         <html class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!-->
<html class="no-js" lang="en">
<!--<![endif]-->
<head>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="text/html; charset=utf-8" http-equiv="content-type"/>
<meta content="LEGO set database: 2016 " name="description"/>
<meta content="width=device-width, minimum-scale=1.0, maximum-scale=1.0" id="viewport" name="viewport"/>
<link href="//brickset.com/feed/" rel="alternate" title="Brickset news and activity feed" type="application/rss+xml"/>
<link href="https://brickset.com/sets/year-2016" rel="canonical"/>
<meta content="New server" name="temp"/>
<link href="/assets/images/icons/apple-icon-57x57.png" rel="apple-touch-icon" sizes="57x57"/>
<link href="/assets/images/icons/apple-icon-60x60.png" rel="apple-touch-icon" s

In [7]:
# CSS selector: every <a> nested inside an <h1>. On this page those are the set-title links.
raw_titles = soup.select('h1 a')

In [8]:
# Keep only the visible text of each title link.
titles = [link.text for link in raw_titles]

In [9]:
titles

['10251:  Brick Bank',
 '10252:  Volkswagen Beetle',
 '10253:  Big Ben',
 '10254:  Winter Holiday Train',
 '10654:  XL Creative Brick Box',
 '10702:  Creative Building Set',
 '10705:  Creative Building Basket',
 '10720:  Police Helicopter Chase',
 '10721:  Iron Man vs. Loki',
 '10722:  Snake Showdown',
 "10723:  Ariel's Dolphin Carriage",
 '10724:  Batman & Superman vs. Lex Luthor',
 '10725:  Lost Temple',
 "10726:  Stephanie's Horse Carriage",
 "10727:  Emma's Ice Cream Truck",
 "10728:  Mia's Vet Clinic",
 "10729:  Cinderella's Carriage",
 '10801:  Baby Animals',
 '10802:  Savanna',
 '10803:  Arctic',
 '10804:  Jungle',
 '10805:  Around the World',
 '10806:  Horses',
 '10807:  Horse Trailer',
 '10808:  Little Plane']

In [10]:
# Exercise #1: Get the titles for each "brickset" on the first page

def get_titles(soup):
    """Collect the title text of every LEGO set listed on a parsed page.

    Args:
        soup: A parsed page, as returned by BeautifulSoup.

    Returns:
        list of str: The text of each link found inside an <h1>, in
        document order, e.g. "10252:  Volkswagen Beetle". The text is
        returned exactly as it appears in the page (no whitespace cleanup).
    """
    titles = []
    # On the listing pages each set title is an <a> wrapped in an <h1>.
    for link in soup.select('h1 a'):
        titles.append(link.text)
    return titles

def parse_bricks(url, timeout=10):
    """Fetch one Brickset listing page and extract its set titles.

    Args:
        url (str): Address of the page to download.
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout, requests waits indefinitely on a stalled
            connection.

    Returns:
        list of str: The titles found on the page, as produced by
        ``get_titles``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    page = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page and silently
    # returning an empty (or misleading) list of titles.
    page.raise_for_status()

    soup = BeautifulSoup(page.content, 'html.parser')
    return get_titles(soup)

In [11]:
# One fresh request through the reusable helpers; should reproduce `titles` from above.
bricks = parse_bricks(START_URL)

In [12]:
bricks

['10251:  Brick Bank',
 '10252:  Volkswagen Beetle',
 '10253:  Big Ben',
 '10254:  Winter Holiday Train',
 '10654:  XL Creative Brick Box',
 '10702:  Creative Building Set',
 '10705:  Creative Building Basket',
 '10720:  Police Helicopter Chase',
 '10721:  Iron Man vs. Loki',
 '10722:  Snake Showdown',
 "10723:  Ariel's Dolphin Carriage",
 '10724:  Batman & Superman vs. Lex Luthor',
 '10725:  Lost Temple',
 "10726:  Stephanie's Horse Carriage",
 "10727:  Emma's Ice Cream Truck",
 "10728:  Mia's Vet Clinic",
 "10729:  Cinderella's Carriage",
 '10801:  Baby Animals',
 '10802:  Savanna',
 '10803:  Arctic',
 '10804:  Jungle',
 '10805:  Around the World',
 '10806:  Horses',
 '10807:  Horse Trailer',
 '10808:  Little Plane']

In [13]:
# Spot-check two known entries of the first page (note the double space after the colon).
assert bricks[0] == '10251:  Brick Bank'
assert bricks[9] == '10722:  Snake Showdown'

## Exercise 2

In [14]:
# The "next" button in the pagination bar is a link; its href is the URL of the following page.
next_page = soup.select("div .pagination .next a")[0].get('href')
next_page

'https://brickset.com/sets/year-2016/page-2'

In [15]:
parse_bricks(next_page)

['10809:  Police Patrol',
 '10810:  Push Train',
 '10811:  Backhoe Loader',
 '10812:  Truck & Tracked Excavator',
 '10813:  Big Construction Site',
 '10814:  Tow Truck',
 '10815:  My First Rocket',
 '10816:  My First Cars and Trucks',
 '10817:  Creative Chest',
 '10818:  My First Truck',
 '10819:  My First Garden',
 '10820:  Creative Construction Basket',
 '10822:  Sofia the First Magical Carriage',
 "10824:  Miles' Space Adventures",
 "10825:  Miles' Exo-Flex Suit",
 "10826:  Miles' Stellosphere Hangar",
 '10827:  Mickey & Friends Beach House',
 "10828:  Doc McStuffins' Pet Vet Care",
 "10829:  Mickey's Workshop",
 "10830:  Minnie's Café",
 '10831:  My First Caterpillar',
 '11911:  City: Build Your Own Adventure parts',
 '11912:  LEGO Star Wars: Build Your Own Adventure',
 '21026:  Venice',
 '21027:  Berlin']

Now I want to explore what happens when we ask for the next page while we are on the last page.

In [26]:
# NOTE: page-34 is hard-coded here as the last page of the 2016 listing (the
# "next" lookup on it below comes back empty); the number will be wrong if
# the listing grows or shrinks.
last_page = requests.get('https://brickset.com/sets/year-2016/page-34')

In [27]:
last_soup = BeautifulSoup(last_page.content, 'html.parser')

In [28]:
# Expected to fail: the last page has no "next" link, so select() returns an
# empty list and [0] raises IndexError. Be aware this stops a "Run All" here.
next_page_last = last_soup.select("div .pagination .next a")[0].get('href')
next_page_last

IndexError: list index out of range

In [39]:
# Exercise #2

# Now write code that gets you all the links from ALL the pages.

# HINT: you will probably want to extract the URL in the "next" button on 
# the bottom of the search pagination, which looks like ">".

# HINT HINT: Think of the previous exercise on APIs and internet data.
# The Pokemon API returned JSON, that we converted to a dictionary, that
# had a nice structure. In particular, there were two top-level keys of interest, 
# one had the "results" in a list, the other was the "next" url to call to get
# more items. If you can replicate this return structure, you will be able to 
# almost reuse the while loop you had there!

# HINT HINT HINT: There's no reason you shouldn't be able to reuse the previous 
# functions (get_titles and parse_bricks)


def get_all_titles(url, timeout=10):
    """Extract the titles from every page reachable through the "next" links.

    Starting at ``url``, each page is downloaded and parsed, its titles are
    collected, and the crawl follows the pagination "next" link until a page
    has none.

    Args:
        url (str): The initial URL with the titles to be extracted.
        timeout (float): Seconds to wait for each HTTP response before
            giving up, so a stalled connection cannot hang the crawl.

    Returns:
        A list with all the titles from all the pages that could be accessed,
        in the order the pages were visited.

    Raises:
        requests.RequestException: If a page cannot be fetched (network
            failure, timeout, or an HTTP error status).
    """
    titles = []
    visited = set()

    # Stop when there is no next URL, or when a "next" link points back to a
    # page already crawled (which would otherwise loop forever).
    while url and url not in visited:
        visited.add(url)

        page = requests.get(url, timeout=timeout)
        # Surface HTTP errors instead of treating an error page as "last page".
        page.raise_for_status()

        soup = BeautifulSoup(page.content, 'html.parser')
        titles.extend(get_titles(soup))

        # The last page has no "next" link: select() returns an empty list.
        # Checking for that explicitly replaces the former bare `except:`,
        # which also hid unrelated errors.
        next_links = soup.select("div .pagination .next a")
        url = next_links[0].get('href') if next_links else None

    return titles

In [40]:
# Crawls every page of the listing (one HTTP request per page), so this cell takes a while.
all_titles = get_all_titles(START_URL)

In [41]:
all_titles

['10251:  Brick Bank',
 '10252:  Volkswagen Beetle',
 '10253:  Big Ben',
 '10254:  Winter Holiday Train',
 '10654:  XL Creative Brick Box',
 '10702:  Creative Building Set',
 '10705:  Creative Building Basket',
 '10720:  Police Helicopter Chase',
 '10721:  Iron Man vs. Loki',
 '10722:  Snake Showdown',
 "10723:  Ariel's Dolphin Carriage",
 '10724:  Batman & Superman vs. Lex Luthor',
 '10725:  Lost Temple',
 "10726:  Stephanie's Horse Carriage",
 "10727:  Emma's Ice Cream Truck",
 "10728:  Mia's Vet Clinic",
 "10729:  Cinderella's Carriage",
 '10801:  Baby Animals',
 '10802:  Savanna',
 '10803:  Arctic',
 '10804:  Jungle',
 '10805:  Around the World',
 '10806:  Horses',
 '10807:  Horse Trailer',
 '10808:  Little Plane',
 '10809:  Police Patrol',
 '10810:  Push Train',
 '10811:  Backhoe Loader',
 '10812:  Truck & Tracked Excavator',
 '10813:  Big Construction Site',
 '10814:  Tow Truck',
 '10815:  My First Rocket',
 '10816:  My First Cars and Trucks',
 '10817:  Creative Chest',
 '10818: 