# Set-up and Workflow

### Importing the packages

In [3]:
# Load the packages
import requests
from bs4 import BeautifulSoup

### Making a GET request

In [4]:
# Defining the url of the site
base_site = "https://en.wikipedia.org/wiki/Music"

# Making a get request.
# requests applies no timeout by default, so without one this cell could hang
# forever if the server never answers; give up after 10 seconds instead.
response = requests.get(base_site, timeout=10)

# Last expression of the cell: display the HTTP status code (200 means success)
response.status_code

200

In [5]:
# Extracting the raw body of the response (a bytes object, as the b'...' output shows)
html = response.content

# Checking that the reply is indeed HTML by inspecting its first 100 bytes
html[:100]

b'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title'

### Making the soup

In [6]:
# Convert the HTML to a BeautifulSoup object. This lets us search and navigate the document far more easily.
# "html.parser" is the parser that ships with Python, so no extra installation is needed.
soup = BeautifulSoup(html, "html.parser")

### Exporting the HTML to a file

In [7]:
# It is extremely useful to be able to check this file when searching for where some info is located
# or to see how the document was parsed

# Exporting the HTML to a file.
# The content is HTML, so the file gets an .html extension (a .pdf extension
# would produce a file that PDF viewers cannot open).
with open('Wiki_response.html', 'wb') as file:
    file.write(soup.prettify('utf-8'))


# the 'with' statement guarantees the file is closed afterwards, like a 'try-finally' block
# open() is the function for opening/creating a file
# the 'wb' argument is the mode in which the file is opened - Write, Binary (bytes)
# .prettify() re-indents the HTML for better readability; passing 'utf-8' makes it return bytes

# Searching and Navigating the html tree with find() and find_all() by attribute

In [8]:
# Look up the <head> element of the page by its tag name
soup.find("head")

<head>
<meta charset="utf-8"/>
<title>Music - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"09609819-0420-43f8-b961-75d4bf319b0e","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Music","wgTitle":"Music","wgCurRevisionId":1119417309,"wgRevisionId":1119417309,"wgArticleId":18839,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles containing Ancient Greek (to 1453)-language text","Webarchive template wayback links","Pages containing links to subscription-only content","Wikipedia articles needing time reference citations from February 2020","CS1: Julian–Gregori

In [10]:
# find() returns None when the document contains no such tag (so this cell shows no output)
soup.find("video")

In [11]:
# find() returns only the first matching tag it encounters

soup.find("a")

<a id="top"></a>

In [13]:
# All tags of a given kind can be collected with find_all()

links = soup.find_all("a")

In [14]:
# Number of <a> tags collected in links
len(links)

2471

In [41]:
# Keep the first <tbody> element of the page for the navigation examples below
table1 = soup.find('tbody')



In [38]:
# Inspect the type of the variable.
# The tag found above is stored in table1; the name `table` is never defined in
# this notebook, so referring to it fails with NameError on a fresh kernel.
type(table1)

bs4.element.Tag

In [56]:
# We can also search inside a tag: calling find_all() on table1 looks for <td> tags within it

table2 = table1.find_all("td")
len(table2)

6

# Navigating the tree

In [59]:
# A tag's children are stored in a list, which is accessed with the .contents attribute
table1.contents

[<tr><td class="mbox-image"><div class="mbox-image-div"><img alt="" data-file-height="40" data-file-width="40" decoding="async" height="40" src="//upload.wikimedia.org/wikipedia/en/thumb/b/b4/Ambox_important.svg/40px-Ambox_important.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/b/b4/Ambox_important.svg/60px-Ambox_important.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/b/b4/Ambox_important.svg/80px-Ambox_important.svg.png 2x" width="40"/></div></td><td class="mbox-text"><div class="mbox-text-span"><div class="multiple-issues-text mw-collapsible"><b>This article has multiple issues.</b> Please help <b><a class="external text" href="https://en.wikipedia.org/w/index.php?title=Music&amp;action=edit">improve it</a></b> or discuss these issues on the <b><a href="/wiki/Talk:Music" title="Talk:Music">talk page</a></b>. <small><i>(<a href="/wiki/Help:Maintenance_template_removal" title="Help:Maintenance template removal">Learn how and when to remove these template messages

In [63]:
# .parent moves one level up the tree: here it is the <table> element enclosing the <tbody>
table1.parent

<table class="box-Multiple_issues plainlinks metadata ambox ambox-content ambox-multiple_issues compact-ambox" role="presentation"><tbody><tr><td class="mbox-image"><div class="mbox-image-div"><img alt="" data-file-height="40" data-file-width="40" decoding="async" height="40" src="//upload.wikimedia.org/wikipedia/en/thumb/b/b4/Ambox_important.svg/40px-Ambox_important.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/b/b4/Ambox_important.svg/60px-Ambox_important.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/b/b4/Ambox_important.svg/80px-Ambox_important.svg.png 2x" width="40"/></div></td><td class="mbox-text"><div class="mbox-text-span"><div class="multiple-issues-text mw-collapsible"><b>This article has multiple issues.</b> Please help <b><a class="external text" href="https://en.wikipedia.org/w/index.php?title=Music&amp;action=edit">improve it</a></b> or discuss these issues on the <b><a href="/wiki/Talk:Music" title="Talk:Music">talk page</a></b>. <small><i>(<a href

# Searching by attribute

In [64]:
# Narrow the search with an attribute filter: the first <div> whose id is 'siteSub'
soup.find('div', id = 'siteSub')

<div class="noprint" id="siteSub">From Wikipedia, the free encyclopedia</div>

In [67]:
# 'class' is a reserved word in Python, so the CSS class filter is passed as class_
soup.find_all('a', class_ = 'mw-jump-link')

[<a class="mw-jump-link" href="#mw-head">Jump to navigation</a>,
 <a class="mw-jump-link" href="#searchInput">Jump to search</a>]