<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-the-pprint,-requests,-and-BeautifulSoup-libraries" data-toc-modified-id="Import-the-pprint,-requests,-and-BeautifulSoup-libraries-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import the pprint, requests, and BeautifulSoup libraries</a></span></li><li><span><a href="#Parse-the-web-page-and-get-the-header-row-of-the-table" data-toc-modified-id="Parse-the-web-page-and-get-the-header-row-of-the-table-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Parse the web page and get the header row of the table</a></span></li><li><span><a href="#Get-the-data-from-the-table-cells" data-toc-modified-id="Get-the-data-from-the-table-cells-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Get the data from the table cells</a></span></li><li><span><a href="#Load-the-data-into-pandas" data-toc-modified-id="Load-the-data-into-pandas-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Load the data into pandas</a></span></li><li><span><a href="#Fix-the-column-names-and-convert-the-data-to-numeric-values" data-toc-modified-id="Fix-the-column-names-and-convert-the-data-to-numeric-values-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Fix the column names and convert the data to numeric values</a></span></li></ul></div>

# Import the pprint, requests, and BeautifulSoup libraries

In [1]:
# import pandas, numpy, json, pprint, requests, and BeautifulSoup
import pandas as pd
import numpy as np
import json
import pprint
import requests
from bs4 import BeautifulSoup

In [2]:
# pd.set_option('display.width', 80)
# pd.set_option('display.max_columns',6)

# Parse the web page and get the header row of the table

In [3]:
# Download the page that contains the table to scrape.
# The timeout keeps this cell from hanging indefinitely on an unresponsive
# server, and raise_for_status() stops the notebook here on an HTTP error
# instead of letting an error page flow into the parsing cells below.
webpage = requests.get(
    'http://www.alrb.org/datacleaning/covidcaseoutliers.html',
    timeout=30)
webpage.raise_for_status()

In [4]:
# Parse the downloaded HTML with Python's built-in parser.
soup = BeautifulSoup(markup=webpage.text, features='html.parser')

In [5]:
# Find the table whose id is "tblDeaths" and collect every <th> cell
# inside its <thead>.
theadrows = (
    soup
    .find('table', attrs={'id': 'tblDeaths'})
    .thead
    .find_all('th')
)

In [6]:
# Check what kind of object find_all() returned before iterating over it.
type(theadrows)

bs4.element.ResultSet

In [8]:
# Pull the visible text out of each header cell to use as column labels.
labelcols = [header.get_text() for header in theadrows]

In [9]:
# Inspect the first column label before it is renamed.
labelcols[0]

'Country'

In [10]:
# Rename the first label: that column will hold each row's heading text,
# which is inserted at position 0 of every data row further down.
labelcols[0] = 'rowheadings'

In [11]:
# Display the complete list of column labels.
labelcols

['rowheadings',
 'Cases',
 'Deaths',
 'Cases per Million',
 'Deaths per Million',
 'population',
 'population_density',
 'median_age',
 'gdp_per_capita',
 'hospital_beds_per_100k']

# Get the data from the table cells

In [12]:
# Collect every <tr> inside the <tbody> of the "tblDeaths" table.
rows = (
    soup
    .find('table', attrs={'id': 'tblDeaths'})
    .tbody
    .find_all('tr')
)

In [13]:
# Accumulates one list of cell-text strings per table row.
datarows = []

In [14]:
# Accumulates the heading text (the <th> content) of each table row.
labelrows = []

In [15]:
# Walk the body rows, collecting each row's heading (its <th> text) and the
# text of its <td class="data"> cells.
# A row is kept only when it has BOTH a usable heading and at least one data
# cell, so labelrows[i] always describes datarows[i]. Appending the two lists
# under independent conditions would let them drift out of step if a row
# supplied only one of the two.
for row in rows:
    labelcell = row.find('th')
    if labelcell is None:
        # No heading cell at all, so there is nothing to label the values with.
        continue
    rowlabels = labelcell.get_text()
    cells = row.find_all('td', {'class': 'data'})
    # Headings of three characters or fewer are not treated as labels
    # (presumably to skip blank/placeholder headings -- TODO confirm).
    if len(rowlabels) > 3 and len(cells) > 0:
        labelrows.append(rowlabels)
        datarows.append([cell.get_text() for cell in cells])

In [16]:
# Pretty-print the first two data rows as a sanity check.
pprint.pprint(datarows[:2])

[['9,394', '653', '214', '15', '43,851,043', '17', '29', '13,914', '1.9'],
 ['16,642', '668', '1848', '74', '9,006,400', '107', '44', '45,437', '7.4']]


In [17]:
# Pretty-print the first two row headings as a sanity check.
pprint.pprint(labelrows[:2])

['Algeria', 'Austria']


In [18]:
# Prepend each row's heading to its data values so that every row has one
# entry per column label.
# Fail loudly if the two lists differ in length: pairing them anyway would
# attach headings to the wrong rows.
if len(labelrows) != len(datarows):
    raise ValueError(
        'found {} row labels for {} data rows'.format(
            len(labelrows), len(datarows)))
for label, values in zip(labelrows, datarows):
    # Skip rows that already start with their heading, so re-running this
    # cell does not insert the heading a second time.
    if values[:1] != [label]:
        values.insert(0, label)

# Load the data into pandas

In [19]:
# Build the DataFrame: one row per scraped table row, columns named after
# the header labels.
totaldeaths = pd.DataFrame(data=datarows, columns=labelcols)

In [20]:
# Preview the first rows of the freshly loaded frame.
totaldeaths.head()

Unnamed: 0,rowheadings,Cases,Deaths,Cases per Million,Deaths per Million,population,population_density,median_age,gdp_per_capita,hospital_beds_per_100k
0,Algeria,9394,653,214,15,43851043,17,29,13914,1.9
1,Austria,16642,668,1848,74,9006400,107,44,45437,7.4
2,Bangladesh,47153,650,286,4,164689383,1265,28,3524,0.8
3,Belgium,58381,9467,5037,817,11589616,376,42,42659,5.6
4,Brazil,514849,29314,2422,138,212559409,25,34,14103,2.2


In [21]:
# Check the column types before any conversion is applied.
totaldeaths.dtypes

rowheadings               object
Cases                     object
Deaths                    object
Cases per Million         object
Deaths per Million        object
population                object
population_density        object
median_age                object
gdp_per_capita            object
hospital_beds_per_100k    object
dtype: object

# Fix the column names and convert the data to numeric values

In [22]:
# Normalise the column names: spaces become underscores and all letters are
# lower-cased.
totaldeaths.columns = [
    name.replace(' ', '_').lower() for name in totaldeaths.columns
]

In [23]:
# Convert every column between the first one (row headings) and the last one
# (hospital_beds_per_100k, converted to float in a later cell) to int64.
# columns[1:-1] therefore spans 'cases' through 'gdp_per_capita'.
# regex=True is required: since pandas 2.0, Series.str.replace treats the
# pattern as a literal string by default, which would leave the thousands
# separators in place and make the int64 cast fail on values like '9,394'.
# astype(str) lets this cell be re-run after the columns are already numeric.
for col in totaldeaths.columns[1:-1]:
    totaldeaths[col] = (
        totaldeaths[col]
        .astype(str)
        .str.replace(r'[^0-9]', '', regex=True)
        .astype('int64')
    )

  


In [24]:
# The hospital-beds column holds decimal values, so cast it to float rather
# than int.
beds_col = 'hospital_beds_per_100k'
totaldeaths[beds_col] = totaldeaths[beds_col].astype('float')

In [25]:
# Preview the first rows again after the renaming and type conversions.
totaldeaths.head()

Unnamed: 0,rowheadings,cases,deaths,cases_per_million,deaths_per_million,population,population_density,median_age,gdp_per_capita,hospital_beds_per_100k
0,Algeria,9394,653,214,15,43851043,17,29,13914,1.9
1,Austria,16642,668,1848,74,9006400,107,44,45437,7.4
2,Bangladesh,47153,650,286,4,164689383,1265,28,3524,0.8
3,Belgium,58381,9467,5037,817,11589616,376,42,42659,5.6
4,Brazil,514849,29314,2422,138,212559409,25,34,14103,2.2


In [26]:
# Confirm the conversions took effect by checking the column types.
totaldeaths.dtypes

rowheadings                object
cases                       int64
deaths                      int64
cases_per_million           int64
deaths_per_million          int64
population                  int64
population_density          int64
median_age                  int64
gdp_per_capita              int64
hospital_beds_per_100k    float64
dtype: object