### Scrape data from HTML tables into a DataFrame using BeautifulSoup and Pandas

In [2]:
!pip3 install pandas
!pip3 install beautifulsoup4
!pip3 install html5lib==1.1

Collecting html5lib==1.1
  Using cached html5lib-1.1-py2.py3-none-any.whl (112 kB)
Installing collected packages: html5lib
Successfully installed html5lib-1.1


In [9]:
import pandas as pd
import requests
from bs4 import  BeautifulSoup


In [10]:
#The below url contains html tables with data about world population.
url = 'https://en.wikipedia.org/wiki/World_population'

In [6]:
# get the contents of the webpage in text format and store in a variable called data
data = requests.get(url).text

In [12]:
soup = BeautifulSoup(data, 'html5lib')

In [13]:
# find all html tables in web page
tables = soup.find_all('table') # in html table is represented by the tag <table>

In [15]:
# we can see how many tables were found by checking the length of the tables list
len(tables)

29

In [16]:
for index, table in enumerate(tables):
    if('10 most densely populated countries' in str(table)):
        table_index = index
print(table_index)

7


In [17]:
print(tables[table_index].prettify())

<table class="wikitable sortable" style="text-align:right">
 <caption>
  10 most densely populated countries
  <small>
   (with population above 5 million)
  </small>
  <sup class="reference" id="cite_ref-:10_106-0">
   <a href="#cite_note-:10-106">
    [102]
   </a>
  </sup>
 </caption>
 <tbody>
  <tr>
   <th scope="col">
    Rank
   </th>
   <th scope="col">
    Country
   </th>
   <th scope="col">
    Population
   </th>
   <th scope="col">
    Area
    <br/>
    <small>
     (km
     <sup>
      2
     </sup>
     )
    </small>
   </th>
   <th scope="col">
    Density
    <br/>
    <small>
     (pop/km
     <sup>
      2
     </sup>
     )
    </small>
   </th>
  </tr>
  <tr>
   <td>
    1
   </td>
   <td align="left">
    <span class="flagicon">
     <span class="mw-image-border" typeof="mw:File">
      <span>
       <img alt="" class="mw-file-element" data-file-height="600" data-file-width="900" decoding="async" height="15" src="//upload.wikimedia.org/wikipedia/commons/thumb/4/4

In [21]:
population_data = pd.DataFrame(columns=['Rank', 'Country', 'Population', 'Area', 'Density'])
for row in tables[table_index].tbody.find_all('tr'):
    col = row.find_all('td')
    if col:
        rank = col[0].text
        country = col[1].text
        population = col[2].text.strip()
        area = col[3].text.strip()
        density = col[4].text.strip()
        row_data = pd.DataFrame({'Rank': [rank], 'Country': [country], 'Population': [population], 'Area': [area], 'Density': [density]})
        population_data = pd.concat([population_data, row_data], ignore_index=True)
population_data

Unnamed: 0,Rank,Country,Population,Area,Density
0,1,Singapore,5921231,719,8235
1,2,Bangladesh,165650475,148460,1116
2,3,\n Palestine[103]\n\n,5223000,6025,867
3,4,Taiwan,23580712,35980,655
4,5,South Korea,51844834,99720,520
5,6,Lebanon,5296814,10400,509
6,7,Rwanda,13173730,26338,500
7,8,Burundi,12696478,27830,456
8,9,India,1389637446,3287263,423
9,10,Netherlands,17400824,41543,419


### Scrape data from HTML tables into a DataFrame using BeautifulSoup and read_html

In [22]:
pd.read_html(str(tables[5]), flavor='bs4')[0]


Unnamed: 0_level_0,#,Most populous countries,2000,2015,2030[A],Graphs are temporarily unavailable due to technical issues.
Unnamed: 0_level_1,Graphs are temporarily unavailable due to technical issues.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Unnamed: 0_level_2,Graphs are temporarily unavailable due to technical issues.,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Unnamed: 0_level_3,Graphs are temporarily unavailable due to technical issues.,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3
Unnamed: 0_level_4,Graphs are temporarily unavailable due to technical issues.,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4
Unnamed: 0_level_5,Graphs are temporarily unavailable due to technical issues.,Unnamed: 1_level_5,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5
Unnamed: 0_level_6,Graphs are temporarily unavailable due to technical issues.,Unnamed: 1_level_6,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6
Unnamed: 0_level_7,Graphs are temporarily unavailable due to technical issues.,Unnamed: 1_level_7,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7
Unnamed: 0_level_8,Graphs are temporarily unavailable due to technical issues.,Unnamed: 1_level_8,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8
Unnamed: 0_level_9,Graphs are temporarily unavailable due to technical issues.,Unnamed: 1_level_9,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9
Unnamed: 0_level_10,Graphs are temporarily unavailable due to technical issues.,Unnamed: 1_level_10,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10
Unnamed: 0_level_11,Graphs are temporarily unavailable due to technical issues.,Unnamed: 1_level_11,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11
0,,Graphs are temporarily unavailable due to tech...,,,,
1,1,China[B],1270,1376,1416,
2,2,India,1053,1311,1528,
3,3,United States,283,322,356,
4,4,Indonesia,212,258,295,
5,5,Pakistan,136,208,245,
6,6,Brazil,176,206,228,
7,7,Nigeria,123,182,263,
8,8,Bangladesh,131,161,186,
9,9,Russia,146,146,149,


In [23]:
population_data_read_html = pd.read_html(str(tables[5]),flavor='bs4')[0]
population_data

Unnamed: 0,Rank,Country,Population,Area,Density
0,1,Singapore,5921231,719,8235
1,2,Bangladesh,165650475,148460,1116
2,3,\n Palestine[103]\n\n,5223000,6025,867
3,4,Taiwan,23580712,35980,655
4,5,South Korea,51844834,99720,520
5,6,Lebanon,5296814,10400,509
6,7,Rwanda,13173730,26338,500
7,8,Burundi,12696478,27830,456
8,9,India,1389637446,3287263,423
9,10,Netherlands,17400824,41543,419


We can also use the match parameter to select the specific table we want. If the table contains a string matching the text it will be read.

In [24]:
pd.read_html(url,match='10 most densely populated countries', flavor='bs4')[0]

Unnamed: 0,Rank,Country,Population,Area (km2),Density (pop/km2)
0,1,Singapore,5921231,719,8235
1,2,Bangladesh,165650475,148460,1116
2,3,Palestine[103],5223000,6025,867
3,4,Taiwan,23580712,35980,655
4,5,South Korea,51844834,99720,520
5,6,Lebanon,5296814,10400,509
6,7,Rwanda,13173730,26338,500
7,8,Burundi,12696478,27830,456
8,9,India,1389637446,3287263,423
9,10,Netherlands,17400824,41543,419
