# Set-up and Workflow

### Importing the packages

In [1]:
# Load the packages
import requests
from bs4 import BeautifulSoup

### Making a GET request

In [2]:
# URL of the page we are going to scrape
base_site = "https://en.wikipedia.org/wiki/Music"

# Make the GET request.
# A timeout is passed explicitly: without one, requests waits indefinitely
# when the server never answers, which would hang the notebook.
response = requests.get(base_site, timeout=10)

# Display the HTTP status code of the reply (200 means success)
response.status_code

200

In [3]:
# Extracting the HTML
html = response.content

# Checking that the reply is indeed HTML by inspecting the first 100 bytes
html[:100]

b'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title'

### Making the soup

In [4]:
# Convert HTML to a BeautifulSoup object. This will allow us to parse out content from the HTML more easily.
# Using the default parser as it is included in Python
soup = BeautifulSoup(html, "html.parser")

### Exporting the HTML to a file

In [5]:
# Export the parsed HTML to a file.
# Being able to open this file is extremely useful when searching for where
# some piece of information is located, or when checking how the document was parsed.
#
# - open() opens (or creates) the file to be edited
# - 'wb' is the mode in which the file is edited: Writing, in Bytes format
# - .prettify() adds indentation to the HTML code for better readability
# - the 'with' statement is shorthand for a 'try-finally' block
with open('Wiki_response.html', mode='wb') as html_file:
    html_file.write(soup.prettify('utf-8'))

# Searching and navigating the HTML tree

## Searching - find() and find_all()

In [6]:
# The soup variable (BeautifulSoup object) we defined earlier can be seen as representing the whole document
soup

<!DOCTYPE html>

<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Music - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgMonthNamesShort":["","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],"wgRequestId":"Xh0NaApAAEYAAEMYOjcAAAAY","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Music","wgTitle":"Music","wgCurRevisionId":932061087,"wgRevisionId":932061087,"wgArticleId":18839,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles needing more detailed references","CS1 maint: archived copy as title","CS1: Julian–Gregorian uncerta

In [7]:
# We can search by tag name
# This returns the element with all its contents and nested elements inside
soup.find('head')

<head>
<meta charset="utf-8"/>
<title>Music - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgMonthNamesShort":["","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],"wgRequestId":"Xh0NaApAAEYAAEMYOjcAAAAY","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Music","wgTitle":"Music","wgCurRevisionId":932061087,"wgRevisionId":932061087,"wgArticleId":18839,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles needing more detailed references","CS1 maint: archived copy as title","CS1: Julian–Gregorian uncertainty","Webarchive template wayback links","Pages containing link

In [8]:
# If there is no result it returns None
# Note: None is not displayed in IPython unless print() or repr() is used
soup.find('video')

In [9]:
# Display the None value
print(soup.find('video'))

None


In [10]:
# verify the type of output
type(soup.find('video'))

NoneType

In [11]:
# .find() returns only the first such result
soup.find('a')

<a id="top"></a>

In [12]:
# If we want all the results we use find_all() 
links = soup.find_all('a')
links

[<a id="top"></a>,
 <a href="/wiki/Wikipedia:Protection_policy#semi" title="This article is semi-protected."><img alt="Page semi-protected" data-file-height="512" data-file-width="512" decoding="async" height="20" src="//upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/20px-Semi-protection-shackle.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/30px-Semi-protection-shackle.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/40px-Semi-protection-shackle.svg.png 2x" width="20"/></a>,
 <a class="mw-jump-link" href="#mw-head">Jump to navigation</a>,
 <a class="mw-jump-link" href="#p-search">Jump to search</a>,
 <a class="mw-disambig" href="/wiki/Music_(disambiguation)" title="Music (disambiguation)">Music (disambiguation)</a>,
 <a class="image" href="/wiki/File:Music_lesson_Staatliche_Antikensammlungen_2421.jpg"><img alt="Music lesson Staatliche Antikensammlungen 2421.jpg" data-file-hei

In [13]:
# find_all returns a ResultSet, which is a subclass of list, containing all results
isinstance(links, list)

True

In [14]:
# We must be careful when using find_all()
# If no result is found it returns an empty list
soup.find_all('video')

[]

In [15]:
# How many links are on the page?
len(links)

2353

In [16]:
# Usually, we prefer to store the result in a variable
# Let's store the body of a table in a table variable
table = soup.find('tbody')

In [17]:
# Inspect the value of the variable
table

<tbody><tr><th colspan="2" style="text-align:center;font-size:125%;font-weight:bold;background:antiquewhite;">Music</th></tr><tr><td colspan="2" style="text-align:center"><a class="image" href="/wiki/File:Music_lesson_Staatliche_Antikensammlungen_2421.jpg"><img alt="Music lesson Staatliche Antikensammlungen 2421.jpg" data-file-height="1849" data-file-width="2952" decoding="async" height="138" src="//upload.wikimedia.org/wikipedia/commons/thumb/9/9b/Music_lesson_Staatliche_Antikensammlungen_2421.jpg/220px-Music_lesson_Staatliche_Antikensammlungen_2421.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/9/9b/Music_lesson_Staatliche_Antikensammlungen_2421.jpg/330px-Music_lesson_Staatliche_Antikensammlungen_2421.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/9/9b/Music_lesson_Staatliche_Antikensammlungen_2421.jpg/440px-Music_lesson_Staatliche_Antikensammlungen_2421.jpg 2x" width="220"/></a><div>A painting on an ancient Greek vase depicts a music lesson (c. 510 BC).</div><

In [18]:
# Inspect the type of the variable
type(table)

bs4.element.Tag

In [19]:
# A tag can be searched in the same way we search the whole document
table.find_all('td')

[<td colspan="2" style="text-align:center"><a class="image" href="/wiki/File:Music_lesson_Staatliche_Antikensammlungen_2421.jpg"><img alt="Music lesson Staatliche Antikensammlungen 2421.jpg" data-file-height="1849" data-file-width="2952" decoding="async" height="138" src="//upload.wikimedia.org/wikipedia/commons/thumb/9/9b/Music_lesson_Staatliche_Antikensammlungen_2421.jpg/220px-Music_lesson_Staatliche_Antikensammlungen_2421.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/9/9b/Music_lesson_Staatliche_Antikensammlungen_2421.jpg/330px-Music_lesson_Staatliche_Antikensammlungen_2421.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/9/9b/Music_lesson_Staatliche_Antikensammlungen_2421.jpg/440px-Music_lesson_Staatliche_Antikensammlungen_2421.jpg 2x" width="220"/></a><div>A painting on an ancient Greek vase depicts a music lesson (c. 510 BC).</div></td>,
 <td>Sound, silence, time</td>,
 <td>Various</td>,
 <td><a href="/wiki/Paleolithic" title="Paleolithic">Paleolithic</a> er

In [20]:
# Since we used find_all, the result is a list
len(table.find_all('td'))

4

## Navigating the tree

In [21]:
# A tag's children are stored in a list, accessed with .contents
table.contents

[<tr><th colspan="2" style="text-align:center;font-size:125%;font-weight:bold;background:antiquewhite;">Music</th></tr>,
 <tr><td colspan="2" style="text-align:center"><a class="image" href="/wiki/File:Music_lesson_Staatliche_Antikensammlungen_2421.jpg"><img alt="Music lesson Staatliche Antikensammlungen 2421.jpg" data-file-height="1849" data-file-width="2952" decoding="async" height="138" src="//upload.wikimedia.org/wikipedia/commons/thumb/9/9b/Music_lesson_Staatliche_Antikensammlungen_2421.jpg/220px-Music_lesson_Staatliche_Antikensammlungen_2421.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/9/9b/Music_lesson_Staatliche_Antikensammlungen_2421.jpg/330px-Music_lesson_Staatliche_Antikensammlungen_2421.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/9/9b/Music_lesson_Staatliche_Antikensammlungen_2421.jpg/440px-Music_lesson_Staatliche_Antikensammlungen_2421.jpg 2x" width="220"/></a><div>A painting on an ancient Greek vase depicts a music lesson (c. 510 BC).</div></td

In [22]:
len(table.contents)

5

In [23]:
table.contents[1]

<tr><td colspan="2" style="text-align:center"><a class="image" href="/wiki/File:Music_lesson_Staatliche_Antikensammlungen_2421.jpg"><img alt="Music lesson Staatliche Antikensammlungen 2421.jpg" data-file-height="1849" data-file-width="2952" decoding="async" height="138" src="//upload.wikimedia.org/wikipedia/commons/thumb/9/9b/Music_lesson_Staatliche_Antikensammlungen_2421.jpg/220px-Music_lesson_Staatliche_Antikensammlungen_2421.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/9/9b/Music_lesson_Staatliche_Antikensammlungen_2421.jpg/330px-Music_lesson_Staatliche_Antikensammlungen_2421.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/9/9b/Music_lesson_Staatliche_Antikensammlungen_2421.jpg/440px-Music_lesson_Staatliche_Antikensammlungen_2421.jpg 2x" width="220"/></a><div>A painting on an ancient Greek vase depicts a music lesson (c. 510 BC).</div></td></tr>

In [24]:
# We can also go up the tree with .parent
table.parent

<table class="infobox" style="width:22em"><tbody><tr><th colspan="2" style="text-align:center;font-size:125%;font-weight:bold;background:antiquewhite;">Music</th></tr><tr><td colspan="2" style="text-align:center"><a class="image" href="/wiki/File:Music_lesson_Staatliche_Antikensammlungen_2421.jpg"><img alt="Music lesson Staatliche Antikensammlungen 2421.jpg" data-file-height="1849" data-file-width="2952" decoding="async" height="138" src="//upload.wikimedia.org/wikipedia/commons/thumb/9/9b/Music_lesson_Staatliche_Antikensammlungen_2421.jpg/220px-Music_lesson_Staatliche_Antikensammlungen_2421.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/9/9b/Music_lesson_Staatliche_Antikensammlungen_2421.jpg/330px-Music_lesson_Staatliche_Antikensammlungen_2421.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/9/9b/Music_lesson_Staatliche_Antikensammlungen_2421.jpg/440px-Music_lesson_Staatliche_Antikensammlungen_2421.jpg 2x" width="220"/></a><div>A painting on an ancient Greek vase 

In [25]:
# table.parent is also a tag
# Thus, we can use .parent on it as well
table.parent.parent

<div class="mw-parser-output"><div class="hatnote navigation-not-searchable" role="note">For other uses, see <a class="mw-disambig" href="/wiki/Music_(disambiguation)" title="Music (disambiguation)">Music (disambiguation)</a>.</div>
<p class="mw-empty-elt">
</p>
<div class="shortdescription nomobile noexcerpt noprint searchaux" style="display:none">form of art using sound and silence</div>
<table class="infobox" style="width:22em"><tbody><tr><th colspan="2" style="text-align:center;font-size:125%;font-weight:bold;background:antiquewhite;">Music</th></tr><tr><td colspan="2" style="text-align:center"><a class="image" href="/wiki/File:Music_lesson_Staatliche_Antikensammlungen_2421.jpg"><img alt="Music lesson Staatliche Antikensammlungen 2421.jpg" data-file-height="1849" data-file-width="2952" decoding="async" height="138" src="//upload.wikimedia.org/wikipedia/commons/thumb/9/9b/Music_lesson_Staatliche_Antikensammlungen_2421.jpg/220px-Music_lesson_Staatliche_Antikensammlungen_2421.jpg" src

In [26]:
# We use .parent to go up the tree
# But what about .children?
table.children

<list_iterator at 0x2c86787d9e8>

In [27]:
# If we want a list of an element's children, we need to use table.contents as shown before
# .children is an iterator over that list, 
# which means we can use it in a for loop to iterate over all the children

for child in table.children:
    print(child)

<tr><th colspan="2" style="text-align:center;font-size:125%;font-weight:bold;background:antiquewhite;">Music</th></tr>
<tr><td colspan="2" style="text-align:center"><a class="image" href="/wiki/File:Music_lesson_Staatliche_Antikensammlungen_2421.jpg"><img alt="Music lesson Staatliche Antikensammlungen 2421.jpg" data-file-height="1849" data-file-width="2952" decoding="async" height="138" src="//upload.wikimedia.org/wikipedia/commons/thumb/9/9b/Music_lesson_Staatliche_Antikensammlungen_2421.jpg/220px-Music_lesson_Staatliche_Antikensammlungen_2421.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/9/9b/Music_lesson_Staatliche_Antikensammlungen_2421.jpg/330px-Music_lesson_Staatliche_Antikensammlungen_2421.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/9/9b/Music_lesson_Staatliche_Antikensammlungen_2421.jpg/440px-Music_lesson_Staatliche_Antikensammlungen_2421.jpg 2x" width="220"/></a><div>A painting on an ancient Greek vase depicts a music lesson (c. 510 BC).</div></td></

## Searching by attributes

In [28]:
# We can search for tags based on their attributes, in addition to their name
soup.find('div', id = 'siteSub')

<div class="noprint" id="siteSub">From Wikipedia, the free encyclopedia</div>

In [29]:
# There are two ways in which we can do that:

### Passing attributes as function parameters

In [30]:
# By writing them as function parameters
# Notice that since class is a reserved word, we write class_
soup.find_all('a', class_ = 'mw-jump-link')

[<a class="mw-jump-link" href="#mw-head">Jump to navigation</a>,
 <a class="mw-jump-link" href="#p-search">Jump to search</a>]

In [31]:
# We can filter against multiple attributes at once
soup.find('a', class_ = 'mw-jump-link', href = '#p-search')

<a class="mw-jump-link" href="#p-search">Jump to search</a>

### Placing the attributes in a dictionary

In [32]:
# By writing the attributes in a dictionary
soup.find('a', attrs={ 'class':'mw-jump-link', 'href':'#p-search' })

<a class="mw-jump-link" href="#p-search">Jump to search</a>

In [33]:
soup.find('div', {'id' : 'footer'})

<div id="footer" role="contentinfo">
<ul id="footer-info">
<li id="footer-info-lastmod"> This page was last edited on 23 December 2019, at 04:25<span class="anonymous-show"> (UTC)</span>.</li>
<li id="footer-info-copyright">Text is available under the <a href="//en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License" rel="license">Creative Commons Attribution-ShareAlike License</a><a href="//creativecommons.org/licenses/by-sa/3.0/" rel="license" style="display:none;"></a>;
additional terms may apply.  By using this site, you agree to the <a href="//foundation.wikimedia.org/wiki/Terms_of_Use">Terms of Use</a> and <a href="//foundation.wikimedia.org/wiki/Privacy_policy">Privacy Policy</a>. Wikipedia® is a registered trademark of the <a href="//www.wikimediafoundation.org/">Wikimedia Foundation, Inc.</a>, a non-profit organization.</li>
</ul>
<ul id="footer-places">
<li id="footer-places-privacy"><a class="extiw" href="https://foundation.wikim

# Extracting data from the HTML tree

In [34]:
# Let's use some placeholder object to manipulate in the examples below
a = soup.find('a', class_ = 'mw-jump-link')
a

<a class="mw-jump-link" href="#mw-head">Jump to navigation</a>

In [35]:
# We can obtain the name of the tag with the .name attribute
a.name

'a'

## Getting the attribute value

In [36]:
# We can access a tag’s attributes by treating the tag just like a dictionary

In [37]:
# First way
a['href']

'#mw-head'

In [38]:
# Notice how multi-valued attributes, such as class, return a list
a['class']

['mw-jump-link']

In [39]:
# Second way
a.get('href')

'#mw-head'

In [40]:
# Again, class returns a list
a.get('class')

['mw-jump-link']

#### Differences between these methods manifest when the key is missing

In [41]:
# tag['missing-key'] raises a KeyError
# a['id'] will raise a KeyError if uncommented

In [42]:
# tag.get('missing-key') returns a default value None
a.get('id')

In [43]:
# We can use repr() function to display all special characters and combinations (None, \n...)
repr(a.get('id'))

'None'

In [44]:
# We can also get all attribute name-value pairs in a dictionary
a.attrs

{'class': ['mw-jump-link'], 'href': '#mw-head'}

## Extracting the text

### .string vs .text

In [45]:
# We can access the raw string of an element by using .string
a.string

'Jump to navigation'

In [46]:
# Alternatively, we can use .text
a.text

'Jump to navigation'

#### They exhibit different behaviour when the element contains more than one distinct string

In [47]:
# This paragraph has many nested elements, with lots of different fragments of text
p = soup.find_all('p')[1]
p

<p><b>Music</b> is an art form and <a href="/wiki/Culture" title="Culture">cultural</a> activity whose medium is sound organized in time. General <a class="mw-redirect" href="/wiki/Definitions_of_music" title="Definitions of music">definitions of music</a> include common elements such as <a href="/wiki/Pitch_(music)" title="Pitch (music)">pitch</a> (which governs <a href="/wiki/Melody" title="Melody">melody</a> and <a href="/wiki/Harmony" title="Harmony">harmony</a>), <a href="/wiki/Rhythm" title="Rhythm">rhythm</a> (and its associated concepts <a href="/wiki/Tempo" title="Tempo">tempo</a>, <a class="mw-redirect" href="/wiki/Meter_(music)" title="Meter (music)">meter</a>, and <a href="/wiki/Articulation_(music)" title="Articulation (music)">articulation</a>), <a href="/wiki/Dynamics_(music)" title="Dynamics (music)">dynamics</a> (loudness and softness), and the sonic qualities of <a href="/wiki/Timbre" title="Timbre">timbre</a> and <a href="/wiki/Texture_(music)" title="Texture (music)

In [48]:
# .text returns everything inside the element
p.text

'Music is an art form and cultural activity whose medium is sound organized in time. General definitions of music include common elements such as pitch (which governs melody and harmony), rhythm (and its associated concepts tempo, meter, and articulation), dynamics (loudness and softness), and the sonic qualities of timbre and texture (which are sometimes termed the "color" of a musical sound). Different styles or types of music may emphasize, de-emphasize or omit some of these elements. Music is performed with a vast range of instruments and vocal techniques ranging from singing to rapping; there are solely instrumental pieces, solely vocal pieces (such as songs without instrumental accompaniment) and pieces that combine singing and instruments. The word derives from Greek μουσική (mousike; "art of the Muses").[1]\nSee glossary of musical terminology.\n'

In [49]:
# .string returns None when the element contains more than one child, because it is unclear which string it should refer to
p.string

In [50]:
repr(p.string)

'None'

In [51]:
p.parent

<div class="mw-parser-output"><div class="hatnote navigation-not-searchable" role="note">For other uses, see <a class="mw-disambig" href="/wiki/Music_(disambiguation)" title="Music (disambiguation)">Music (disambiguation)</a>.</div>
<p class="mw-empty-elt">
</p>
<div class="shortdescription nomobile noexcerpt noprint searchaux" style="display:none">form of art using sound and silence</div>
<table class="infobox" style="width:22em"><tbody><tr><th colspan="2" style="text-align:center;font-size:125%;font-weight:bold;background:antiquewhite;">Music</th></tr><tr><td colspan="2" style="text-align:center"><a class="image" href="/wiki/File:Music_lesson_Staatliche_Antikensammlungen_2421.jpg"><img alt="Music lesson Staatliche Antikensammlungen 2421.jpg" data-file-height="1849" data-file-width="2952" decoding="async" height="138" src="//upload.wikimedia.org/wikipedia/commons/thumb/9/9b/Music_lesson_Staatliche_Antikensammlungen_2421.jpg/220px-Music_lesson_Staatliche_Antikensammlungen_2421.jpg" src

In [52]:
# We can stack different operations one after the other
p.parent.text

'For other uses, see Music (disambiguation).\n\n\nform of art using sound and silence\nMusicA painting on an ancient Greek vase depicts a music lesson (c. 510\xa0BC).MediumSound, silence, timeOriginating cultureVariousOriginating eraPaleolithic era\nPerforming arts\nAcrobatics\nBallet\nCircus skills\nClown\nDance\nGeneral Gymnastics\nMagic\nMime\nMusic\nOpera\nProfessional wrestling\nPuppetry\nSpeech\nTheatre\nVentriloquism\nvte\nMusic is an art form and cultural activity whose medium is sound organized in time. General definitions of music include common elements such as pitch (which governs melody and harmony), rhythm (and its associated concepts tempo, meter, and articulation), dynamics (loudness and softness), and the sonic qualities of timbre and texture (which are sometimes termed the "color" of a musical sound). Different styles or types of music may emphasize, de-emphasize or omit some of these elements. Music is performed with a vast range of instruments and vocal techniques r

In [53]:
# semi-properly displayed text
print(p.parent.text)

For other uses, see Music (disambiguation).


form of art using sound and silence
MusicA painting on an ancient Greek vase depicts a music lesson (c. 510 BC).MediumSound, silence, timeOriginating cultureVariousOriginating eraPaleolithic era
Performing arts
Acrobatics
Ballet
Circus skills
Clown
Dance
General Gymnastics
Magic
Mime
Music
Opera
Professional wrestling
Puppetry
Speech
Theatre
Ventriloquism
vte
Music is an art form and cultural activity whose medium is sound organized in time. General definitions of music include common elements such as pitch (which governs melody and harmony), rhythm (and its associated concepts tempo, meter, and articulation), dynamics (loudness and softness), and the sonic qualities of timbre and texture (which are sometimes termed the "color" of a musical sound). Different styles or types of music may emphasize, de-emphasize or omit some of these elements. Music is performed with a vast range of instruments and vocal techniques ranging from singing to rap

In [54]:
# We can also use .get_text() instead of .text
p.parent.get_text()

'For other uses, see Music (disambiguation).\n\n\nform of art using sound and silence\nMusicA painting on an ancient Greek vase depicts a music lesson (c. 510\xa0BC).MediumSound, silence, timeOriginating cultureVariousOriginating eraPaleolithic era\nPerforming arts\nAcrobatics\nBallet\nCircus skills\nClown\nDance\nGeneral Gymnastics\nMagic\nMime\nMusic\nOpera\nProfessional wrestling\nPuppetry\nSpeech\nTheatre\nVentriloquism\nvte\nMusic is an art form and cultural activity whose medium is sound organized in time. General definitions of music include common elements such as pitch (which governs melody and harmony), rhythm (and its associated concepts tempo, meter, and articulation), dynamics (loudness and softness), and the sonic qualities of timbre and texture (which are sometimes termed the "color" of a musical sound). Different styles or types of music may emphasize, de-emphasize or omit some of these elements. Music is performed with a vast range of instruments and vocal techniques r

In [55]:
print(p.parent.get_text())

For other uses, see Music (disambiguation).


form of art using sound and silence
MusicA painting on an ancient Greek vase depicts a music lesson (c. 510 BC).MediumSound, silence, timeOriginating cultureVariousOriginating eraPaleolithic era
Performing arts
Acrobatics
Ballet
Circus skills
Clown
Dance
General Gymnastics
Magic
Mime
Music
Opera
Professional wrestling
Puppetry
Speech
Theatre
Ventriloquism
vte
Music is an art form and cultural activity whose medium is sound organized in time. General definitions of music include common elements such as pitch (which governs melody and harmony), rhythm (and its associated concepts tempo, meter, and articulation), dynamics (loudness and softness), and the sonic qualities of timbre and texture (which are sometimes termed the "color" of a musical sound). Different styles or types of music may emphasize, de-emphasize or omit some of these elements. Music is performed with a vast range of instruments and vocal techniques ranging from singing to rap

In [56]:
# We can also extract the whole text of the webpage
# CAUTION: This includes JavaScript text, CSS and other not directly displayed text
print(soup.text)





Music - Wikipedia
document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgMonthNamesShort":["","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],"wgRequestId":"Xh0NaApAAEYAAEMYOjcAAAAY","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Music","wgTitle":"Music","wgCurRevisionId":932061087,"wgRevisionId":932061087,"wgArticleId":18839,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles needing more detailed references","CS1 maint: archived copy as title","CS1: Julian–Gregorian uncertainty","Webarchive template wayback links","Pages containing links to subscription-only content",
"Wikipedia indefi

### .strings and .stripped_strings

In [57]:
# All strings inside an element can be accessed separately by using the .strings iterator

In [58]:
for s in p.strings:
    print(repr(s))

'Music'
' is an art form and '
'cultural'
' activity whose medium is sound organized in time. General '
'definitions of music'
' include common elements such as '
'pitch'
' (which governs '
'melody'
' and '
'harmony'
'), '
'rhythm'
' (and its associated concepts '
'tempo'
', '
'meter'
', and '
'articulation'
'), '
'dynamics'
' (loudness and softness), and the sonic qualities of '
'timbre'
' and '
'texture'
' (which are sometimes termed the "color" of a musical sound). Different '
'styles or types'
' of music may emphasize, de-emphasize or omit some of these elements. Music is performed with a vast range of '
'instruments'
' and vocal techniques ranging from singing to '
'rapping'
'; there are solely '
'instrumental pieces'
', '
'solely vocal pieces'
' (such as songs without instrumental '
'accompaniment'
') and pieces that combine singing and instruments. The word derives from '
'Greek'
' '
'μουσική'
' ('
'mousike'
'; "art of the '
'Muses'
'").'
'[1]'
'\nSee '
'glossary of musical term

In [59]:
# The extra whitespace can be removed by using the .stripped_strings iterator instead
for s in p.stripped_strings:
    print(repr(s))

'Music'
'is an art form and'
'cultural'
'activity whose medium is sound organized in time. General'
'definitions of music'
'include common elements such as'
'pitch'
'(which governs'
'melody'
'and'
'harmony'
'),'
'rhythm'
'(and its associated concepts'
'tempo'
','
'meter'
', and'
'articulation'
'),'
'dynamics'
'(loudness and softness), and the sonic qualities of'
'timbre'
'and'
'texture'
'(which are sometimes termed the "color" of a musical sound). Different'
'styles or types'
'of music may emphasize, de-emphasize or omit some of these elements. Music is performed with a vast range of'
'instruments'
'and vocal techniques ranging from singing to'
'rapping'
'; there are solely'
'instrumental pieces'
','
'solely vocal pieces'
'(such as songs without instrumental'
'accompaniment'
') and pieces that combine singing and instruments. The word derives from'
'Greek'
'μουσική'
'('
'mousike'
'; "art of the'
'Muses'
'").'
'[1]'
'See'
'glossary of musical terminology'
'.'


# Practical examples

## Links - absolute path URL

In [60]:
# Let's use the variable links we defined a couple of lectures ago for this example
# It contains all the 'a' tags on this page
links

[<a id="top"></a>,
 <a href="/wiki/Wikipedia:Protection_policy#semi" title="This article is semi-protected."><img alt="Page semi-protected" data-file-height="512" data-file-width="512" decoding="async" height="20" src="//upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/20px-Semi-protection-shackle.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/30px-Semi-protection-shackle.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/40px-Semi-protection-shackle.svg.png 2x" width="20"/></a>,
 <a class="mw-jump-link" href="#mw-head">Jump to navigation</a>,
 <a class="mw-jump-link" href="#p-search">Jump to search</a>,
 <a class="mw-disambig" href="/wiki/Music_(disambiguation)" title="Music (disambiguation)">Music (disambiguation)</a>,
 <a class="image" href="/wiki/File:Music_lesson_Staatliche_Antikensammlungen_2421.jpg"><img alt="Music lesson Staatliche Antikensammlungen 2421.jpg" data-file-hei

In [61]:
# Let's choose one link to manipulate.
# The link is selected by its href rather than by a hard-coded position
# (links[26]): positions shift whenever links are added to or removed from
# the live page, whereas the href identifies the link we actually want.
# .get('href') is used because some <a> tags have no href attribute.
link = next(tag for tag in links if tag.get('href') == '/wiki/Culture')
link

<a href="/wiki/Culture" title="Culture">cultural</a>

In [62]:
# Get the link's text
link.string

'cultural'

In [63]:
# Extract the link's URL
link['href']

'/wiki/Culture'

In [64]:
# This is a relative URL
# To obtain the absolute URL address we will use urljoin

from urllib.parse import urljoin, urlparse

In [65]:
# To compute the full-path URL we need two pieces:
# the address of the current page (base_site, defined at the top of the notebook) and the relative URL
base_site

'https://en.wikipedia.org/wiki/Music'

In [66]:
# The second piece: the relative URL read from the link's 'href' attribute
relative_url = link['href']
relative_url

'/wiki/Culture'

In [67]:
# urljoin resolves the relative URL against the page address and returns the absolute URL
full_url = urljoin(base_site, relative_url)
full_url

'https://en.wikipedia.org/wiki/Culture'

## Processing multiple links at once

In [68]:
# We will work with the full list of 'a' tags again:
links

[<a id="top"></a>,
 <a href="/wiki/Wikipedia:Protection_policy#semi" title="This article is semi-protected."><img alt="Page semi-protected" data-file-height="512" data-file-width="512" decoding="async" height="20" src="//upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/20px-Semi-protection-shackle.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/30px-Semi-protection-shackle.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/40px-Semi-protection-shackle.svg.png 2x" width="20"/></a>,
 <a class="mw-jump-link" href="#mw-head">Jump to navigation</a>,
 <a class="mw-jump-link" href="#p-search">Jump to search</a>,
 <a class="mw-disambig" href="/wiki/Music_(disambiguation)" title="Music (disambiguation)">Music (disambiguation)</a>,
 <a class="image" href="/wiki/File:Music_lesson_Staatliche_Antikensammlungen_2421.jpg"><img alt="Music lesson Staatliche Antikensammlungen 2421.jpg" data-file-hei

In [69]:
# Examining the links' addresses.
# .get('href') returns None for a tag that has no href attribute,
# whereas indexing with ['href'] would raise an error for such a tag.
[anchor.get('href') for anchor in links]

[None,
 '/wiki/Wikipedia:Protection_policy#semi',
 '#mw-head',
 '#p-search',
 '/wiki/Music_(disambiguation)',
 '/wiki/File:Music_lesson_Staatliche_Antikensammlungen_2421.jpg',
 '/wiki/Paleolithic',
 '/wiki/Performing_arts',
 '/wiki/Acrobatics',
 '/wiki/Ballet',
 '/wiki/List_of_circus_skills',
 '/wiki/Clown',
 '/wiki/Dance',
 '/wiki/Gymnastics',
 '/wiki/Magic_(illusion)',
 '/wiki/Mime_artist',
 None,
 '/wiki/Opera',
 '/wiki/Professional_wrestling',
 '/wiki/Puppetry',
 '/wiki/Public_speaking',
 '/wiki/Theatre',
 '/wiki/Ventriloquism',
 '/wiki/Template:Performing_arts',
 '/wiki/Template_talk:Performing_arts',
 'https://en.wikipedia.org/w/index.php?title=Template:Performing_arts&action=edit',
 '/wiki/Culture',
 '/wiki/Definitions_of_music',
 '/wiki/Pitch_(music)',
 '/wiki/Melody',
 '/wiki/Harmony',
 '/wiki/Rhythm',
 '/wiki/Tempo',
 '/wiki/Meter_(music)',
 '/wiki/Articulation_(music)',
 '/wiki/Dynamics_(music)',
 '/wiki/Timbre',
 '/wiki/Texture_(music)',
 '/wiki/Music_genre',
 '/wiki/Musica

In [70]:
# Notice that some links don't have a URL (None appears in the list above)

# Dropping the links without an href attribute.
# `is not None` is the idiomatic identity test for None (rather than `!= None`).
clean_links = [anchor for anchor in links if anchor.get('href') is not None]

In [71]:
# Collecting the href of every remaining link.
# Most of them are relative URLs, but a few are already absolute.
relative_urls = [anchor.get('href') for anchor in clean_links]
relative_urls

['/wiki/Wikipedia:Protection_policy#semi',
 '#mw-head',
 '#p-search',
 '/wiki/Music_(disambiguation)',
 '/wiki/File:Music_lesson_Staatliche_Antikensammlungen_2421.jpg',
 '/wiki/Paleolithic',
 '/wiki/Performing_arts',
 '/wiki/Acrobatics',
 '/wiki/Ballet',
 '/wiki/List_of_circus_skills',
 '/wiki/Clown',
 '/wiki/Dance',
 '/wiki/Gymnastics',
 '/wiki/Magic_(illusion)',
 '/wiki/Mime_artist',
 '/wiki/Opera',
 '/wiki/Professional_wrestling',
 '/wiki/Puppetry',
 '/wiki/Public_speaking',
 '/wiki/Theatre',
 '/wiki/Ventriloquism',
 '/wiki/Template:Performing_arts',
 '/wiki/Template_talk:Performing_arts',
 'https://en.wikipedia.org/w/index.php?title=Template:Performing_arts&action=edit',
 '/wiki/Culture',
 '/wiki/Definitions_of_music',
 '/wiki/Pitch_(music)',
 '/wiki/Melody',
 '/wiki/Harmony',
 '/wiki/Rhythm',
 '/wiki/Tempo',
 '/wiki/Meter_(music)',
 '/wiki/Articulation_(music)',
 '/wiki/Dynamics_(music)',
 '/wiki/Timbre',
 '/wiki/Texture_(music)',
 '/wiki/Music_genre',
 '/wiki/Musical_instrument',

In [72]:
# Transforming to absolute path URLs:
# every href is resolved against the address of the page it was found on
full_urls = [urljoin(base_site, href) for href in relative_urls]
full_urls

['https://en.wikipedia.org/wiki/Wikipedia:Protection_policy#semi',
 'https://en.wikipedia.org/wiki/Music#mw-head',
 'https://en.wikipedia.org/wiki/Music#p-search',
 'https://en.wikipedia.org/wiki/Music_(disambiguation)',
 'https://en.wikipedia.org/wiki/File:Music_lesson_Staatliche_Antikensammlungen_2421.jpg',
 'https://en.wikipedia.org/wiki/Paleolithic',
 'https://en.wikipedia.org/wiki/Performing_arts',
 'https://en.wikipedia.org/wiki/Acrobatics',
 'https://en.wikipedia.org/wiki/Ballet',
 'https://en.wikipedia.org/wiki/List_of_circus_skills',
 'https://en.wikipedia.org/wiki/Clown',
 'https://en.wikipedia.org/wiki/Dance',
 'https://en.wikipedia.org/wiki/Gymnastics',
 'https://en.wikipedia.org/wiki/Magic_(illusion)',
 'https://en.wikipedia.org/wiki/Mime_artist',
 'https://en.wikipedia.org/wiki/Opera',
 'https://en.wikipedia.org/wiki/Professional_wrestling',
 'https://en.wikipedia.org/wiki/Puppetry',
 'https://en.wikipedia.org/wiki/Public_speaking',
 'https://en.wikipedia.org/wiki/Theatre

In [73]:
# Extracting only URLs pointing to Wikipedia (internal URLs).
# Only the host name is tested: a plain substring check ('wikipedia.org' in url) would also
# accept external URLs that merely contain 'wikipedia.org' somewhere in their path or query string.
def is_wikipedia_url(url):
    """Return True if the host of `url` is wikipedia.org or one of its subdomains."""
    host = urlparse(url).hostname or ''   # hostname is None for URLs without a network location
    return host == 'wikipedia.org' or host.endswith('.wikipedia.org')

internal_links = [url for url in full_urls if is_wikipedia_url(url)]
internal_links

['https://en.wikipedia.org/wiki/Wikipedia:Protection_policy#semi',
 'https://en.wikipedia.org/wiki/Music#mw-head',
 'https://en.wikipedia.org/wiki/Music#p-search',
 'https://en.wikipedia.org/wiki/Music_(disambiguation)',
 'https://en.wikipedia.org/wiki/File:Music_lesson_Staatliche_Antikensammlungen_2421.jpg',
 'https://en.wikipedia.org/wiki/Paleolithic',
 'https://en.wikipedia.org/wiki/Performing_arts',
 'https://en.wikipedia.org/wiki/Acrobatics',
 'https://en.wikipedia.org/wiki/Ballet',
 'https://en.wikipedia.org/wiki/List_of_circus_skills',
 'https://en.wikipedia.org/wiki/Clown',
 'https://en.wikipedia.org/wiki/Dance',
 'https://en.wikipedia.org/wiki/Gymnastics',
 'https://en.wikipedia.org/wiki/Magic_(illusion)',
 'https://en.wikipedia.org/wiki/Mime_artist',
 'https://en.wikipedia.org/wiki/Opera',
 'https://en.wikipedia.org/wiki/Professional_wrestling',
 'https://en.wikipedia.org/wiki/Puppetry',
 'https://en.wikipedia.org/wiki/Public_speaking',
 'https://en.wikipedia.org/wiki/Theatre

# Extracting data from nested tags

In [74]:
# Our objective now is to extract all links found in the notes under section headings,
# i.e. the ones introduced by 'Main article:' or 'See also:'.
# A quick inspection of the HTML shows that they sit in div tags whose 'role' attribute is 'note'.

div_notes = soup.find_all("div", attrs={"role": "note"})
div_notes

[<div class="hatnote navigation-not-searchable" role="note">For other uses, see <a class="mw-disambig" href="/wiki/Music_(disambiguation)" title="Music (disambiguation)">Music (disambiguation)</a>.</div>,
 <div class="hatnote navigation-not-searchable" role="note">Main article: <a href="/wiki/Musical_composition" title="Musical composition">Musical composition</a></div>,
 <div class="hatnote navigation-not-searchable" role="note">Main article: <a href="/wiki/Musical_notation" title="Musical notation">Musical notation</a></div>,
 <div class="hatnote navigation-not-searchable" role="note">Main article: <a href="/wiki/Musical_improvisation" title="Musical improvisation">Musical improvisation</a></div>,
 <div class="hatnote navigation-not-searchable" role="note">Main article: <a href="/wiki/Music_theory" title="Music theory">Music theory</a></div>,
 <div class="hatnote navigation-not-searchable" role="note">Main article: <a class="mw-redirect" href="/wiki/Aspect_of_music" title="Aspect of 

In [75]:
# Inspecting the first note div
div_notes[0]

<div class="hatnote navigation-not-searchable" role="note">For other uses, see <a class="mw-disambig" href="/wiki/Music_(disambiguation)" title="Music (disambiguation)">Music (disambiguation)</a>.</div>

In [76]:
# find() and find_all() can be applied to a single tag in the same way as to the whole document
div_notes[0].find('a')

<a class="mw-disambig" href="/wiki/Music_(disambiguation)" title="Music (disambiguation)">Music (disambiguation)</a>

In [77]:
# A naive approach to collecting the links: call find() on every div.
# find() returns only the first matching tag, so this gives a single link per note.
div_links = [note.find('a') for note in div_notes]
div_links

[<a class="mw-disambig" href="/wiki/Music_(disambiguation)" title="Music (disambiguation)">Music (disambiguation)</a>,
 <a href="/wiki/Musical_composition" title="Musical composition">Musical composition</a>,
 <a href="/wiki/Musical_notation" title="Musical notation">Musical notation</a>,
 <a href="/wiki/Musical_improvisation" title="Musical improvisation">Musical improvisation</a>,
 <a href="/wiki/Music_theory" title="Music theory">Music theory</a>,
 <a class="mw-redirect" href="/wiki/Aspect_of_music" title="Aspect of music">Aspect of music</a>,
 <a href="/wiki/Strophic_form" title="Strophic form">Strophic form</a>,
 <a href="/wiki/History_of_music" title="History of music">History of music</a>,
 <a href="/wiki/Music_of_Egypt" title="Music of Egypt">Music of Egypt</a>,
 <a href="/wiki/Music_of_Iran" title="Music of Iran">Music of Iran</a>,
 <a href="/wiki/History_of_music_in_the_biblical_period" title="History of music in the biblical period">History of music in the biblical period</a

In [78]:
# Number of links collected with the naive approach (one per div)
len(div_links)

32

In [79]:
# However, some divs contain more than one link, for example this 'See also:' note
div_notes[6]

<div class="hatnote navigation-not-searchable" role="note">See also: <a href="/wiki/Strophic_form" title="Strophic form">Strophic form</a>, <a href="/wiki/Binary_form" title="Binary form">Binary form</a>, <a href="/wiki/Ternary_form" title="Ternary form">Ternary form</a>, <a class="mw-redirect" href="/wiki/Rondo_form" title="Rondo form">Rondo form</a>, <a href="/wiki/Variation_(music)" title="Variation (music)">Variation (music)</a>, and <a href="/wiki/Musical_development" title="Musical development">Musical development</a></div>

In [80]:
# This div has 6 links in it; find_all() returns all of them
div_notes[6].find_all('a')

[<a href="/wiki/Strophic_form" title="Strophic form">Strophic form</a>,
 <a href="/wiki/Binary_form" title="Binary form">Binary form</a>,
 <a href="/wiki/Ternary_form" title="Ternary form">Ternary form</a>,
 <a class="mw-redirect" href="/wiki/Rondo_form" title="Rondo form">Rondo form</a>,
 <a href="/wiki/Variation_(music)" title="Variation (music)">Variation (music)</a>,
 <a href="/wiki/Musical_development" title="Musical development">Musical development</a>]

In [81]:
# Therefore we need find_all, which returns every 'a' tag inside a div

# Start from an empty list and grow it one div at a time
div_links = []

for div in div_notes:
    # extend() adds each anchor of this div as a separate element,
    # exactly what appending them one by one in an inner loop would do
    div_links.extend(div.find_all('a'))

In [82]:
# Inspecting the links gathered from all note divs
div_links

[<a class="mw-disambig" href="/wiki/Music_(disambiguation)" title="Music (disambiguation)">Music (disambiguation)</a>,
 <a href="/wiki/Musical_composition" title="Musical composition">Musical composition</a>,
 <a href="/wiki/Musical_notation" title="Musical notation">Musical notation</a>,
 <a href="/wiki/Musical_improvisation" title="Musical improvisation">Musical improvisation</a>,
 <a href="/wiki/Music_theory" title="Music theory">Music theory</a>,
 <a class="mw-redirect" href="/wiki/Aspect_of_music" title="Aspect of music">Aspect of music</a>,
 <a href="/wiki/Strophic_form" title="Strophic form">Strophic form</a>,
 <a href="/wiki/Binary_form" title="Binary form">Binary form</a>,
 <a href="/wiki/Ternary_form" title="Ternary form">Ternary form</a>,
 <a class="mw-redirect" href="/wiki/Rondo_form" title="Rondo form">Rondo form</a>,
 <a href="/wiki/Variation_(music)" title="Variation (music)">Variation (music)</a>,
 <a href="/wiki/Musical_development" title="Musical development">Musical 

In [83]:
# We now have a complete list: more links than with the naive one-per-div approach
len(div_links)

44

In [84]:
# Let's get the URLs: read each link's href and resolve it against the page address
note_urls = [urljoin(base_site, anchor.get('href')) for anchor in div_links]
note_urls

['https://en.wikipedia.org/wiki/Music_(disambiguation)',
 'https://en.wikipedia.org/wiki/Musical_composition',
 'https://en.wikipedia.org/wiki/Musical_notation',
 'https://en.wikipedia.org/wiki/Musical_improvisation',
 'https://en.wikipedia.org/wiki/Music_theory',
 'https://en.wikipedia.org/wiki/Aspect_of_music',
 'https://en.wikipedia.org/wiki/Strophic_form',
 'https://en.wikipedia.org/wiki/Binary_form',
 'https://en.wikipedia.org/wiki/Ternary_form',
 'https://en.wikipedia.org/wiki/Rondo_form',
 'https://en.wikipedia.org/wiki/Variation_(music)',
 'https://en.wikipedia.org/wiki/Musical_development',
 'https://en.wikipedia.org/wiki/History_of_music',
 'https://en.wikipedia.org/wiki/Music_of_Egypt',
 'https://en.wikipedia.org/wiki/Music_of_Iran',
 'https://en.wikipedia.org/wiki/Music_of_Afghanistan',
 'https://en.wikipedia.org/wiki/Music_of_Tajikistan',
 'https://en.wikipedia.org/wiki/Music_of_Sri_Lanka',
 'https://en.wikipedia.org/wiki/Music_of_Uzbekistan',
 'https://en.wikipedia.org/wi

In [85]:
# One URL per link, so the count equals len(div_links)
len(note_urls)

44

# Scraping multiple pages automatically - Extracting all the text from the note URLs

In [86]:
# We will use the absolute URLs of the note links obtained above
note_urls

['https://en.wikipedia.org/wiki/Music_(disambiguation)',
 'https://en.wikipedia.org/wiki/Musical_composition',
 'https://en.wikipedia.org/wiki/Musical_notation',
 'https://en.wikipedia.org/wiki/Musical_improvisation',
 'https://en.wikipedia.org/wiki/Music_theory',
 'https://en.wikipedia.org/wiki/Aspect_of_music',
 'https://en.wikipedia.org/wiki/Strophic_form',
 'https://en.wikipedia.org/wiki/Binary_form',
 'https://en.wikipedia.org/wiki/Ternary_form',
 'https://en.wikipedia.org/wiki/Rondo_form',
 'https://en.wikipedia.org/wiki/Variation_(music)',
 'https://en.wikipedia.org/wiki/Musical_development',
 'https://en.wikipedia.org/wiki/History_of_music',
 'https://en.wikipedia.org/wiki/Music_of_Egypt',
 'https://en.wikipedia.org/wiki/Music_of_Iran',
 'https://en.wikipedia.org/wiki/Music_of_Afghanistan',
 'https://en.wikipedia.org/wiki/Music_of_Tajikistan',
 'https://en.wikipedia.org/wiki/Music_of_Sri_Lanka',
 'https://en.wikipedia.org/wiki/Music_of_Uzbekistan',
 'https://en.wikipedia.org/wi

In [87]:
# The objective is to get all the useful text from those Wikipedia pages.

# We will do that by extracting the text contained in every paragraph ('p') element,
# for all paragraphs on a page,
# for all pages (in note_urls).

In [88]:
# Scrape the paragraph text of every page in note_urls.
#
# par_text gets exactly one entry per URL, in the same order as note_urls:
# the list of paragraph strings found on that page. A page that cannot be
# fetched is recorded as an empty list, so par_text[k] always belongs to
# note_urls[k] and later pairings of URLs with texts stay aligned.
par_text = []

# Seconds to wait for a server before giving up, so one stalled request cannot hang the loop
REQUEST_TIMEOUT_SECONDS = 30

# Loop through each URL in note_urls; i numbers the pages from 1 for the progress messages
for i, url in enumerate(note_urls, start=1):

    # connect to the webpage
    try:
        note_resp = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as error:  # connection problem, timeout, invalid URL, ...
        print('Request failed ({0}): Skipping URL #{1}: {2}'.format(error, i, url))
        par_text.append([])                     # placeholder keeps par_text aligned with note_urls
        continue

    # checking if the request is successful
    if note_resp.status_code != 200:            # Something is wrong!
        print('Status code {0}: Skipping URL #{1}: {2}'.format(note_resp.status_code, i, url))
        par_text.append([])                     # placeholder keeps par_text aligned with note_urls
        continue

    # Everything is OK! Print the iteration number and the URL to keep track of place in loop
    print('URL #{0}: {1}'.format(i, url))

    # convert the page's HTML to a BeautifulSoup object
    # NOTE: 'lxml' is a third-party parser that must be installed separately;
    # the main page at the top of the notebook was parsed with the built-in 'html.parser'
    note_soup = BeautifulSoup(note_resp.content, 'lxml')

    # find all "p" tags on the webpage and store the text of each one
    par_text.append([p.text for p in note_soup.find_all("p")])


URL #1: https://en.wikipedia.org/wiki/Music_(disambiguation)
URL #2: https://en.wikipedia.org/wiki/Musical_composition
URL #3: https://en.wikipedia.org/wiki/Musical_notation
URL #4: https://en.wikipedia.org/wiki/Musical_improvisation
URL #5: https://en.wikipedia.org/wiki/Music_theory
URL #6: https://en.wikipedia.org/wiki/Aspect_of_music
URL #7: https://en.wikipedia.org/wiki/Strophic_form
URL #8: https://en.wikipedia.org/wiki/Binary_form
URL #9: https://en.wikipedia.org/wiki/Ternary_form
URL #10: https://en.wikipedia.org/wiki/Rondo_form
URL #11: https://en.wikipedia.org/wiki/Variation_(music)
URL #12: https://en.wikipedia.org/wiki/Musical_development
URL #13: https://en.wikipedia.org/wiki/History_of_music
URL #14: https://en.wikipedia.org/wiki/Music_of_Egypt
URL #15: https://en.wikipedia.org/wiki/Music_of_Iran
URL #16: https://en.wikipedia.org/wiki/Music_of_Afghanistan
URL #17: https://en.wikipedia.org/wiki/Music_of_Tajikistan
URL #18: https://en.wikipedia.org/wiki/Music_of_Sri_Lanka
UR

In [89]:
# Inspecting the result for the first page: a list with one string per paragraph
par_text[0]

['Music is an art form consisting of sound and silence, expressed through time.\n',
 'Music may also refer to:\n']

In [90]:
# par_text[0] is a list with one string per paragraph.
# A single string per page is more convenient than a list of strings.

# Merging all paragraphs of the first page into one long string
first_page_paragraphs = par_text[0]
page_text = "".join(first_page_paragraphs)
page_text

'Music is an art form consisting of sound and silence, expressed through time.\nMusic may also refer to:\n'

In [91]:
# Let's do that for all pages

# Merging the paragraphs of every page: page_text becomes a list with one string per page
page_text = ["".join(paragraphs) for paragraphs in par_text]

# Inspect the result for the first webpage
page_text[0]

'Music is an art form consisting of sound and silence, expressed through time.\nMusic may also refer to:\n'

In [92]:
# Inspect the result for another page (index 4 is the 'Music theory' article)
print(page_text[4])


Music theory is the study of the practices and possibilities of music. The Oxford Companion to Music describes three interrelated uses of the term "music theory":
The first is what is otherwise called "rudiments", currently taught as the elements of notation, of key signatures, of time signatures, of rhythmic notation, and so on. [...] The second is the study of writings about music from ancient times onwards. [...] The third is an area of current musicological study that seeks to define processes and general principles in music—a sphere of research that can be distinguished from analysis in that it takes as its starting-point not the individual work or performance but the fundamental materials from which it is built.[1]Music theory is frequently concerned with describing how musicians and composers make music, including tuning systems and composition methods among other topics. Because of the ever-expanding conception of what constitutes music (see Definition of music), a more inclus

In [93]:
# Creating a dictionary with the (key, value) pairs being (url, text).
# zip pairs the k-th URL with the k-th text and silently stops at the shorter list,
# so first make sure there is exactly one text per URL (a skipped page would shift the pairing).
assert len(note_urls) == len(page_text), "note_urls and page_text must have the same length"
url_to_text = dict(zip(note_urls, page_text))

In [94]:
# Look up the text of a page by its URL
print(url_to_text['https://en.wikipedia.org/wiki/Music_theory'])


Music theory is the study of the practices and possibilities of music. The Oxford Companion to Music describes three interrelated uses of the term "music theory":
The first is what is otherwise called "rudiments", currently taught as the elements of notation, of key signatures, of time signatures, of rhythmic notation, and so on. [...] The second is the study of writings about music from ancient times onwards. [...] The third is an area of current musicological study that seeks to define processes and general principles in music—a sphere of research that can be distinguished from analysis in that it takes as its starting-point not the individual work or performance but the fundamental materials from which it is built.[1]Music theory is frequently concerned with describing how musicians and composers make music, including tuning systems and composition methods among other topics. Because of the ever-expanding conception of what constitutes music (see Definition of music), a more inclus

In [95]:
# A word of caution:
# We have not extracted all of the text of the main content,
# because some of it may sit in lists and tables, outside the paragraphs we scraped.