## Setup

In [1]:
# load the packages
import io

import requests
from bs4 import BeautifulSoup

In [2]:
# URL of the Wikipedia page whose tables are scraped throughout this notebook
base_site = "https://en.wikipedia.org/wiki/List_of_national_capitals_by_population"

In [3]:
# connect to webpage
# A timeout (in seconds) is passed explicitly: requests waits indefinitely by default,
# so a stalled connection would otherwise hang the whole notebook.
r = requests.get(base_site, timeout=10)
r.status_code  # 200 means the page was retrieved successfully

200

In [4]:
# get the HTML content from the webpage
# It is kept in its own variable because it is parsed with BeautifulSoup below
# and later handed directly to pandas.read_html
html = r.content

# Extracting tables with Beautiful Soup

In [5]:
# Let's see how extracting the tables would be done with conventional Beautiful Soup methods

In [6]:
# Parse the downloaded HTML into a searchable tree, using the lxml parser
soup = BeautifulSoup(markup=html, features='lxml')

In [7]:
# Tables are marked with the 'table' tag in HTML
# find_all returns every matching element on the page as a list (the output is long)
soup.find_all("table")

[<table class="navbox vertical-navbox" style="clear:right; float:right; margin:0 0 0.5em 1em; width:16.0em; text-align:left; font-size:95%; line-height:1.1em; border-collapse:collapse;">
 <tbody><tr>
 <th style="padding:4px;"><a href="/wiki/Lists_of_capitals" title="Lists of capitals">Lists of capitals</a>
 </th></tr>
 <tr>
 <td style="padding:4px;"><b>Of countries</b>
 <ul><li><a href="/wiki/List_of_national_capitals" title="List of national capitals">in alphabetical order</a></li>
 <li><a href="/wiki/List_of_national_capitals_by_latitude" title="List of national capitals by latitude">by latitude</a></li>
 <li><a class="mw-selflink selflink">by population</a></li>
 <li><a href="/wiki/List_of_former_national_capitals" title="List of former national capitals">Former</a></li>
 <li><a href="/wiki/List_of_purpose-built_national_capitals" title="List of purpose-built national capitals">Purpose-built</a></li>
 <li><a class="mw-redirect" href="/wiki/List_of_countries_by_national_capital,_larg

In [8]:
# The main table on the page
# It is selected by its CSS class rather than by its position in the list of tables,
# so the lookup does not silently pick a different table if the page layout changes.
# (The first table on the page is a navigation box; the data table is the first 'wikitable'.)
table = soup.find("table", class_="wikitable")
table

<table class="wikitable sortable" style="text-align:right">
<tbody><tr>
<th>Rank</th>
<th style="width:110pt;">Country/Territory</th>
<th>Capital</th>
<th>Population</th>
<th>Year
</th>
<th>% of<br/>country's<br/>population
</th></tr>
<tr>
<td>1</td>
<td align="left"><span class="flagicon"><img alt="" class="thumbborder" data-file-height="600" data-file-width="900" decoding="async" height="15" src="//upload.wikimedia.org/wikipedia/commons/thumb/f/fa/Flag_of_the_People%27s_Republic_of_China.svg/23px-Flag_of_the_People%27s_Republic_of_China.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/f/fa/Flag_of_the_People%27s_Republic_of_China.svg/35px-Flag_of_the_People%27s_Republic_of_China.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/f/fa/Flag_of_the_People%27s_Republic_of_China.svg/45px-Flag_of_the_People%27s_Republic_of_China.svg.png 2x" width="23"/> </span><a href="/wiki/China" title="China">China PR</a></td>
<td><b><a href="/wiki/Beijing" title="Beijing">Beiji

In [9]:
# Recall:
# 'th' marks a column heading
# 'tr' marks a table row
# 'td' marks a table cell (inside a row)

In [10]:
# Extracting all rows ('tr' elements) of the main table
table.find_all('tr') # Note that the first row contains the headings

[<tr>
 <th>Rank</th>
 <th style="width:110pt;">Country/Territory</th>
 <th>Capital</th>
 <th>Population</th>
 <th>Year
 </th>
 <th>% of<br/>country's<br/>population
 </th></tr>, <tr>
 <td>1</td>
 <td align="left"><span class="flagicon"><img alt="" class="thumbborder" data-file-height="600" data-file-width="900" decoding="async" height="15" src="//upload.wikimedia.org/wikipedia/commons/thumb/f/fa/Flag_of_the_People%27s_Republic_of_China.svg/23px-Flag_of_the_People%27s_Republic_of_China.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/f/fa/Flag_of_the_People%27s_Republic_of_China.svg/35px-Flag_of_the_People%27s_Republic_of_China.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/f/fa/Flag_of_the_People%27s_Republic_of_China.svg/45px-Flag_of_the_People%27s_Republic_of_China.svg.png 2x" width="23"/> </span><a href="/wiki/China" title="China">China PR</a></td>
 <td><b><a href="/wiki/Beijing" title="Beijing">Beijing</a></b></td>
 <td><span data-sort-value="7007215420

In [11]:
# Inspecting the contents of the first row
# .contents lists every child of the row, so the newline strings between
# the cells show up as separate elements alongside the 'th' tags
table.find_all('tr')[0].contents

['\n',
 <th>Rank</th>,
 '\n',
 <th style="width:110pt;">Country/Territory</th>,
 '\n',
 <th>Capital</th>,
 '\n',
 <th>Population</th>,
 '\n',
 <th>Year
 </th>,
 '\n',
 <th>% of<br/>country's<br/>population
 </th>]

In [12]:
# The 'Capital' heading is the 6th element (index 5),
# because the newline strings between the cells are counted as well
table.find_all('tr')[0].contents[5]

<th>Capital</th>

In [13]:
# Build the 'Capital' column by reading the 6th child (index 5) of every row;
# the heading row is included, so the first entry is the column name itself
capitals = [tr.contents[5].get_text() for tr in table.find_all('tr')]
capitals

['Capital',
 'Beijing',
 'Tokyo',
 'Moscow',
 'Kinshasa',
 'Jakarta',
 'Seoul',
 'Cairo',
 'Mexico City',
 'Dhaka',
 'London',
 'Lima',
 'Tehran',
 'Bangkok',
 'Hanoi',
 'Riyadh',
 'Hong Kong',
 'Bogotá',
 'Baghdad',
 'Santiago',
 'Singapore',
 'Ankara',
 'Berlin',
 'Damascus',
 'Algiers',
 'Madrid',
 'Pyongyang',
 'Kabul',
 'Nairobi',
 'Addis Ababa',
 'Buenos Aires',
 'Rome',
 'Kiev',
 'Yaoundé',
 'Taipei',
 'Brasília',
 'Amman',
 'Luanda',
 'Pretoria',
 'Paris',
 'Tashkent',
 'Baku',
 'Havana',
 'Phnom Penh',
 'Bucharest',
 'Caracas',
 'Brazzaville',
 'Rabat',
 'Manila',
 'Vienna',
 'Khartoum',
 'Budapest',
 'Warsaw',
 'Minsk',
 'Kampala',
 'Accra',
 'Antananarivo',
 'Beirut',
 'Quito (de iure)  Guayaquil (de facto) seat-of-government',
 'Harare',
 'Doha',
 "Sana'a",
 'Conakry',
 'Kuala Lumpur',
 'Montevideo',
 'Lusaka',
 'Bamako',
 'Sofia',
 'Prague',
 'Port-au-Prince',
 'Tripoli',
 'Dublin',
 'Kuwait City',
 'Belgrade',
 'Santo Domingo',
 'Mogadishu',
 'Yerevan',
 'Maputo',
 'Freet

In [14]:
# This is only one column; we have to do the same for the rest

In [15]:
# At this point you probably realize how tedious this process is:

# First, we have to manually inspect the elements, to be able to scrape them
# Second, we have to repeat the same commands for every column
# Third, the data has 'lost' its initial tabular form, we have to reconstruct it manually

In [16]:
# There should be a better way

# Using Pandas to extract tables

In [17]:
import pandas as pd

In [18]:
# Pandas provides an extremely easy-to-use method for table extraction

# By default it parses the page with lxml and falls back to Beautiful Soup (with html5lib),
# performing all the operations we executed above automatically

In [19]:
# To extract all tables on a page, use pandas.read_html()
# It accepts the page URL (as here) or HTML that has already been downloaded
# Note: given a URL, pandas fetches the page itself, independently of the request made earlier
tables = pd.read_html(base_site)

In [20]:
# read_html identifies all of the tables on the page and returns them as a list of DataFrames
type(tables)

list

In [21]:
# Each element of that list is a pandas DataFrame
type(tables[0])

pandas.core.frame.DataFrame

In [22]:
# Check how many tables pandas found on the webpage (four when this notebook was run)
len(tables)

4

In [23]:
# Getting the full main table is now straightforward
# tables[1] is the population table, the same position used with Beautiful Soup earlier
tables[1]

Unnamed: 0,Rank,Country/Territory,Capital,Population,Year,% ofcountry'spopulation
0,1,China PR,Beijing,"21,542,000[1]",2010,1.5%
1,2,Japan,Tokyo,"13,929,286[2]",2017,11.03%
2,3,Russia,Moscow,"12,506,468[3]",2011,8.52%
3,4,DR Congo,Kinshasa,"11,855,000[4]",2012,12.9%
4,5,Indonesia,Jakarta,"10,075,310[5]",2011,3.76%
5,6,South Korea,Seoul,"9,838,892[6]",2015,19.03%
6,7,Egypt,Cairo,9500000,2012,9.54%
7,8,Mexico,Mexico City,"8,918,653[7]",2015,7.05%
8,9,Bangladesh,Dhaka,"8,906,039 [8]",2011,5.52%
9,10,United Kingdom England,London,"8,908,081[9]",2015,13.19%


In [24]:
# Notice that Pandas not only extracts all columns and headings,
# but also deals with missing data (as can be seen in the 'Year' column for Iraq)

In [25]:
# Getting the column headings
# Note the last heading: the <br/> line breaks in the HTML were dropped,
# so its words are fused together
tables[1].columns

Index(['Rank', 'Country/Territory', 'Capital', 'Population', 'Year',
       '% ofcountry'spopulation'],
      dtype='object')

In [26]:
# Because of the way HTML is coded, there may be some messiness, which could require cleaning

In [27]:
# As with BeautifulSoup searches, tag attributes can be supplied (via `attrs`)
# to restrict which tables are returned
filtered_tables = pd.read_html(io=base_site, attrs={"class": "navbox"})
filtered_tables

[                                   Lists of capitals
 0  Of countries in alphabetical order by latitude...
 1  Of country subdivisions Capitals outside the t...
 2                                                vte]

In [28]:
# Still a list, even though only one such table exists
len(filtered_tables)

1

In [29]:
# As mentioned, we can also parse the HTML we already downloaded (instead of passing the URL).
# The downloaded content is wrapped in a file-like object because passing literal HTML
# directly to read_html is deprecated since pandas 2.1; file-like input works on older versions too.
pd.read_html(io.BytesIO(html), attrs={"class": "wikitable sortable"})

[    Rank                                  Country/Territory  \
 0      1                                           China PR   
 1      2                                              Japan   
 2      3                                             Russia   
 3      4                                           DR Congo   
 4      5                                          Indonesia   
 5      6                                        South Korea   
 6      7                                              Egypt   
 7      8                                             Mexico   
 8      9                                         Bangladesh   
 9     10                             United Kingdom England   
 10    11                                               Peru   
 11    12                                               Iran   
 12    13                                           Thailand   
 13    14                                            Vietnam   
 14    15                               