# Set-up and Workflow

### Importing the packages

In [1]:
# Load the packages
import requests
from bs4 import BeautifulSoup

### Making a GET request

In [2]:
# Defining the url of the site
base_site = "https://en.wikipedia.org/wiki/Music"

# Making a get request
# The timeout (in seconds) keeps this cell from hanging indefinitely if the server never answers;
# without it, requests waits forever by default
response = requests.get(base_site, timeout=10)
response.status_code

200

In [3]:
# Extracting the HTML (response.content is the raw response body as bytes)
html = response.content

# Checking that the reply is indeed HTML by inspecting its first 100 bytes
html[:100]

b'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title'

### Making the soup

In [4]:
# Convert the HTML to a BeautifulSoup object. This will allow us to parse out content from the HTML more easily.
# Using "html.parser" because it is included with Python, so nothing extra has to be installed
soup = BeautifulSoup(html, "html.parser")

### Exporting the HTML to a file

In [5]:
# It is extremely useful to be able to check this file when searching for where some info is located
# or to see how the document was parsed

# Exporting the HTML to a file
with open('Wiki_response.html', 'wb') as file:
    file.write(soup.prettify('utf-8'))


# the 'with' statement closes the file automatically, much like a 'try-finally' block would
# open is a function for opening/creating a file to edit
# the 'wb' argument signifies the mode in which to open the file - Writing in Binary (bytes) format
# .prettify() re-indents the HTML code for better readability; given an encoding such as 'utf-8'
# it returns bytes, which is why the file is opened in binary mode

# Searching and navigating the HTML tree

## Searching - find() and find_all()

In [5]:
# The soup variable (BeautifulSoup object) we defined earlier can be seen as representing the whole document
soup;

In [6]:
# We can search by tag name
# This returns the element with all its contents and nested elements inside
soup.find('head');

In [8]:
# If there is no result it returns None
# Note: None is not displayed in IPython unless print() or repr() is used
soup.find('video')

In [9]:
# Display the None value
print(soup.find('video'))

None


In [10]:
# verify the type of output
type(soup.find('video'))

NoneType

In [11]:
# .find() returns only the first such result
soup.find('a')

<a id="top"></a>

In [7]:
# If we want all the results we use find_all() 
links = soup.find_all('a')
links;

In [13]:
# find_all returns all results in a ResultSet, which is a subclass of list
isinstance(links, list)

True

In [14]:
# We must be careful when using find_all()
# If no result is found it returns an empty list, not None as find() does
soup.find_all('video')

[]

In [15]:
# How many links are on the page?
len(links)

2353

In [9]:
# Usually, we prefer to store the result in a variable
# Let's store the body of a table (the first <tbody> on the page, since find() returns
# only the first match) in a table variable
table = soup.find('tbody')

In [10]:
# Inspect the value of the variable
table;

In [18]:
# Inspect the type of the variable
type(table)

bs4.element.Tag

In [11]:
# A tag can be searched in the same way we search the whole document
table.find_all('td');

In [20]:
# Since we used find_all, the result is a list
len(table.find_all('td'))

4

## Navigating the tree

In [12]:
# A tag's children are stored in a list, accessed with .contents
table.contents;

In [22]:
len(table.contents)

5

In [13]:
table.contents[1];

In [14]:
# We can also go up the tree with .parent
table.parent;

In [15]:
# table.parent is also a tag
# Thus, we can use .parent on it as well
table.parent.parent;

In [26]:
# We use .parent to go up the tree
# But what about .children?
table.children

<list_iterator at 0x2c86787d9e8>

In [17]:
# If we want a list of an element's children, we need to use table.contents as shown before
# .children is an iterator over that list, 
# which means we can use it in a for loop to iterate over all the children

for child in table.children:
    print(child)

<tr><th colspan="2" style="text-align:center;font-size:125%;font-weight:bold;background:antiquewhite;">Music</th></tr>
<tr><td colspan="2" style="text-align:center"><a class="image" href="/wiki/File:Fran%C3%A7ois_Boucher,_Allegory_of_Music,_1764,_NGA_32680.jpg"><img alt="François Boucher, Allegory of Music, 1764, NGA 32680.jpg" data-file-height="3189" data-file-width="4000" decoding="async" height="175" src="//upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Fran%C3%A7ois_Boucher%2C_Allegory_of_Music%2C_1764%2C_NGA_32680.jpg/220px-Fran%C3%A7ois_Boucher%2C_Allegory_of_Music%2C_1764%2C_NGA_32680.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Fran%C3%A7ois_Boucher%2C_Allegory_of_Music%2C_1764%2C_NGA_32680.jpg/330px-Fran%C3%A7ois_Boucher%2C_Allegory_of_Music%2C_1764%2C_NGA_32680.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Fran%C3%A7ois_Boucher%2C_Allegory_of_Music%2C_1764%2C_NGA_32680.jpg/440px-Fran%C3%A7ois_Boucher%2C_Allegory_of_Music%2C_1764%2C_NGA_3

## Searching by attributes

In [28]:
# We can search for tags based on their attributes, in addition to their name
soup.find('div', id = 'siteSub')

<div class="noprint" id="siteSub">From Wikipedia, the free encyclopedia</div>

In [29]:
# There are two ways in which we can do that:

### Passing attributes as function parameters

In [30]:
# By writing them as function parameters
# Notice that since class is a reserved word, we write class_
soup.find_all('a', class_ = 'mw-jump-link')

[<a class="mw-jump-link" href="#mw-head">Jump to navigation</a>,
 <a class="mw-jump-link" href="#p-search">Jump to search</a>]

In [31]:
# We can filter against multiple attributes at once
soup.find('a', class_ = 'mw-jump-link', href = '#p-search')

<a class="mw-jump-link" href="#p-search">Jump to search</a>

### Placing the attributes in a dictionary

In [32]:
# By writing the attributes in a dictionary
soup.find('a', attrs={ 'class':'mw-jump-link', 'href':'#p-search' })

<a class="mw-jump-link" href="#p-search">Jump to search</a>

In [18]:
# The attribute dictionary can also be passed positionally as the second argument
soup.find('div', {'id' : 'footer'});

# Extracting data from the HTML tree

In [34]:
# Let's use some placeholder object to manipulate in the examples below
a = soup.find('a', class_ = 'mw-jump-link')
a

<a class="mw-jump-link" href="#mw-head">Jump to navigation</a>

In [35]:
# We can obtain the name of the tag with the .name attribute
a.name

'a'

## Getting the attribute value

In [36]:
# We can access a tag’s attributes by treating the tag just like a dictionary

In [37]:
# First way
a['href']

'#mw-head'

In [38]:
# Notice how multi-valued attributes, such as class, return a list
a['class']

['mw-jump-link']

In [39]:
# Second way
a.get('href')

'#mw-head'

In [40]:
# Again, class returns a list
a.get('class')

['mw-jump-link']

#### Differences between these methods manifest when the key is missing

In [41]:
# tag['missing-key'] raises a KeyError
# a['id'] will raise a KeyError, if uncommented

In [42]:
# tag.get('missing-key') returns None by default instead of raising an error
a.get('id')

In [43]:
# We can use repr() function to display all special characters and combinations (None, \n...)
repr(a.get('id'))

'None'

In [44]:
# We can also get all attribute name-value pairs in a dictionary
a.attrs

{'class': ['mw-jump-link'], 'href': '#mw-head'}