In [1]:
import os
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
import datetime as dt

# Raise pandas' display limits so wide/long frames are shown without truncation.
for option_name in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option_name, 999)

# Get data

In [2]:
# Address of the page that is downloaded with requests.get() and parsed below.
url = 'https://www.worldometers.info/coronavirus'

In [3]:
# get data from url
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from parsing an HTTP error page as if it were data.
req = requests.get(url, timeout = 30)
req.raise_for_status()
soup = BeautifulSoup(req.content, 'html.parser')
content = soup.find('table', id = 'main_table_countries_yesterday2')
if content is None:
    # soup.find() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the next line.
    raise ValueError("table 'main_table_countries_yesterday2' not found at " + url)

# put data into DataFrame
# Header cells become column names (non-breaking spaces -> spaces, newlines removed).
attributes = [item.text.replace('\xa0', ' ').replace('\n', '') for item in content.find_all('th')]
# One list of cell texts per table row; the first <tr> is skipped.
data = [[cell.text for cell in row.find_all('td')] for row in content.find_all('tr')[1:]]
df = pd.DataFrame(data, columns = attributes)
# Drop rows whose '#' cell is empty, then use '#' as the index.
df = df[df['#'] != ''].set_index('#')
df

Unnamed: 0_level_0,"Country,Other",TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,Population,Continent,1 Caseevery X ppl,1 Deathevery X ppl,1 Testevery X ppl,New Cases/1M pop,New Deaths/1M pop,Active Cases/1M pop
#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,China,203334,2680.0,4776.0,51.0,169380.0,2982.0,29178.0,274.0,141.0,3.0,160000000.0,111163.0,1439323776.0,Asia,7079.0,301366.0,9.0,2.0,0.04,20.0
2,USA,82747175,38858.0,1018718.0,186.0,80506860.0,41509.0,1221597.0,1415.0,247359.0,3045.0,1001729381.0,2994507.0,334522343.0,North America,4.0,328.0,0.0,116.0,0.6,3652.0
3,India,43062097,2011.0,522223.0,,42523311.0,1970.0,16563.0,698.0,30658.0,372.0,834717702.0,594272.0,1404606308.0,Asia,33.0,2690.0,2.0,1.0,,12.0
4,Brazil,30355919,6456.0,662777.0,76.0,29411813.0,27459.0,281329.0,8318.0,140994.0,3078.0,63776166.0,296221.0,215299307.0,South America,7.0,325.0,3.0,30.0,0.4,1307.0
5,France,28317915,13984.0,145257.0,197.0,26083461.0,202981.0,2089197.0,1677.0,432101.0,2216.0,266484045.0,4066260.0,65535419.0,Europe,2.0,451.0,0.0,213.0,3.0,31879.0
6,Germany,24227680,86980.0,134817.0,171.0,21425200.0,182200.0,2667663.0,1980.0,287502.0,1600.0,122332384.0,1451676.0,84269739.0,Europe,3.0,625.0,1.0,1032.0,2.0,31656.0
7,UK,21978198,10582.0,173693.0,114.0,20955075.0,172725.0,849430.0,339.0,320697.0,2534.0,514985782.0,7514457.0,68532664.0,Europe,3.0,395.0,0.0,154.0,2.0,12395.0
8,Russia,18144788,7651.0,375061.0,159.0,17483483.0,8855.0,286244.0,2300.0,124239.0,2568.0,273400000.0,1871988.0,146047929.0,Europe,8.0,389.0,1.0,52.0,1.0,1960.0
9,S. Korea,16929564,34370.0,22243.0,110.0,,,,668.0,329695.0,433.0,15804065.0,307776.0,51349193.0,Asia,3.0,2309.0,3.0,669.0,2.0,310381.0
10,Italy,16161748,25287.0,162781.0,93.0,14755958.0,26738.0,1243009.0,416.0,268020.0,2699.0,211830644.0,3512910.0,60300620.0,Europe,4.0,370.0,0.0,419.0,2.0,20614.0


In [4]:
# export csv file
# File name carries the date two days before today
# (presumably matching the 'yesterday2' table scraped above -- confirm).
date = (dt.date.today() + dt.timedelta(days = -2)).strftime('%Y%m%d')
# exist_ok=True creates the folder only when needed, without a separate
# exists()/mkdir() pair that can race or be skipped.
os.makedirs('output', exist_ok = True)
# index=False: the '#' index is not written to the file.
df.to_csv('output/Coronavirus_' + date + '.csv', index = False)

# Preprocessing

In [5]:
# check data type of each column
def open_object_dtype(s):
    """Return the set of Python types found among the elements of Series ``s``."""
    element_types = s.apply(type)
    return set(element_types)

# Print each column name followed by the set of element types it contains.
for col in df.columns:
    types_found = open_object_dtype(df[col])
    print(col, '\t', types_found)

Country,Other 	 {<class 'str'>}
TotalCases 	 {<class 'str'>}
NewCases 	 {<class 'str'>}
TotalDeaths 	 {<class 'str'>}
NewDeaths 	 {<class 'str'>}
TotalRecovered 	 {<class 'str'>}
NewRecovered 	 {<class 'str'>}
ActiveCases 	 {<class 'str'>}
Serious,Critical 	 {<class 'str'>}
Tot Cases/1M pop 	 {<class 'str'>}
Deaths/1M pop 	 {<class 'str'>}
TotalTests 	 {<class 'str'>}
Tests/1M pop 	 {<class 'str'>}
Population 	 {<class 'str'>}
Continent 	 {<class 'str'>}
1 Caseevery X ppl 	 {<class 'str'>}
1 Deathevery X ppl 	 {<class 'str'>}
1 Testevery X ppl 	 {<class 'str'>}
New Cases/1M pop 	 {<class 'str'>}
New Deaths/1M pop 	 {<class 'str'>}
Active Cases/1M pop 	 {<class 'str'>}


In [6]:
# preprocessing value to number
# Vectorised .str.replace is used instead of DataFrame.applymap, which is
# deprecated in recent pandas. regex=False makes '+' a literal character.
count_cols = ['NewCases', 'NewDeaths', 'NewRecovered']
numeric_cols = df.columns.drop(['Country,Other', 'Continent'])

# Remove the '+' characters from the daily-change columns.
df[count_cols] = df[count_cols].apply(lambda s: s.str.replace('+', '', regex = False))

# Remove thousands separators and spaces, then treat empty cells as '0'.
df[numeric_cols] = (
    df[numeric_cols]
    .apply(lambda s: s.str.replace(',', '', regex = False).str.replace(' ', '', regex = False))
    .replace('', '0')
)

In [7]:
# Mark missing values: empty country/continent cells become 'N/A',
# then every 'N/A' in the frame is turned into NaN.
df[['Country,Other', 'Continent']] = df[['Country,Other', 'Continent']].replace(
    to_replace = '', value = 'N/A'
)
df = df.replace(to_replace = 'N/A', value = np.nan)

In [8]:
# preprocessing change data type
def to_int(s):
    """Convert every element of ``s`` with ``int()``, keeping missing values as NaN.

    Parameters
    ----------
    s : iterable
        Values accepted by ``int()`` (e.g. digit strings or numbers), possibly
        mixed with missing values (NaN, None, pd.NA).

    Returns
    -------
    list
        ``int`` for each present value, ``np.nan`` for each missing one.

    Raises
    ------
    ValueError
        If a present value cannot be parsed by ``int()``.
    """
    # pd.isna() recognises any NaN object as well as None/pd.NA. An identity
    # test against np.nan only matches that one singleton, so other NaN
    # floats would reach int() and raise.
    return [np.nan if pd.isna(i) else int(i) for i in s]
def to_float(s):
    """Convert every element of ``s`` with ``float()``, keeping missing values as NaN.

    Parameters
    ----------
    s : iterable
        Values accepted by ``float()`` (e.g. numeric strings or numbers),
        possibly mixed with missing values (NaN, None, pd.NA).

    Returns
    -------
    list
        ``float`` for each present value, ``np.nan`` for each missing one.

    Raises
    ------
    ValueError
        If a present value cannot be parsed by ``float()``.
    """
    # pd.isna() also catches None and pd.NA, which float() would reject with
    # a TypeError; an identity test against np.nan lets them through.
    return [np.nan if pd.isna(i) else float(i) for i in s]

# Convert every numeric column: the three per-million rate columns listed
# here go through to_float, all remaining ones through to_int.
for col in df.columns.drop(['Country,Other', 'Continent']):
    is_rate_col = col in ['New Cases/1M pop', 'New Deaths/1M pop', 'Active Cases/1M pop']
    converter = to_float if is_rate_col else to_int
    df[col] = converter(df[col])

In [9]:
# fill missing values by mean
for col in df.columns.drop(['Country,Other', 'Continent']):
    col_mean = df[col].mean()
    # A column with no values at all has a NaN mean; int(NaN) would raise,
    # so such a column is left as it is.
    if pd.isna(col_mean):
        continue
    # Assign the filled column back. fillna(inplace=True) on df[col] acts on
    # a temporary selection and does not update df under pandas Copy-on-Write.
    # The mean is truncated to an integer before it is used as the fill value.
    df[col] = df[col].fillna(int(col_mean))

# Cast these three columns back to int after filling
# (presumably the ones that contained missing values -- confirm).
df['TotalRecovered'] = to_int(df['TotalRecovered'])
df['NewRecovered'] = to_int(df['NewRecovered'])
df['ActiveCases'] = to_int(df['ActiveCases'])

In [10]:
# Show the dtype of every column after the conversions above.
df.dtypes

Country,Other           object
TotalCases               int64
NewCases                 int64
TotalDeaths              int64
NewDeaths                int64
TotalRecovered           int64
NewRecovered             int64
ActiveCases              int64
Serious,Critical         int64
Tot Cases/1M pop         int64
Deaths/1M pop            int64
TotalTests               int64
Tests/1M pop             int64
Population               int64
Continent               object
1 Caseevery X ppl        int64
1 Deathevery X ppl       int64
1 Testevery X ppl        int64
New Cases/1M pop       float64
New Deaths/1M pop      float64
Active Cases/1M pop    float64
dtype: object

In [11]:
# Display the DataFrame after preprocessing.
df

Unnamed: 0_level_0,"Country,Other",TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,Population,Continent,1 Caseevery X ppl,1 Deathevery X ppl,1 Testevery X ppl,New Cases/1M pop,New Deaths/1M pop,Active Cases/1M pop
#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,China,203334,2680,4776,51,169380,2982,29178,274,141,3,160000000,111163,1439323776,Asia,7079,301366,9,2.0,0.04,20.0
2,USA,82747175,38858,1018718,186,80506860,41509,1221597,1415,247359,3045,1001729381,2994507,334522343,North America,4,328,0,116.0,0.6,3652.0
3,India,43062097,2011,522223,0,42523311,1970,16563,698,30658,372,834717702,594272,1404606308,Asia,33,2690,2,1.0,0.0,12.0
4,Brazil,30355919,6456,662777,76,29411813,27459,281329,8318,140994,3078,63776166,296221,215299307,South America,7,325,3,30.0,0.4,1307.0
5,France,28317915,13984,145257,197,26083461,202981,2089197,1677,432101,2216,266484045,4066260,65535419,Europe,2,451,0,213.0,3.0,31879.0
6,Germany,24227680,86980,134817,171,21425200,182200,2667663,1980,287502,1600,122332384,1451676,84269739,Europe,3,625,1,1032.0,2.0,31656.0
7,UK,21978198,10582,173693,114,20955075,172725,849430,339,320697,2534,514985782,7514457,68532664,Europe,3,395,0,154.0,2.0,12395.0
8,Russia,18144788,7651,375061,159,17483483,8855,286244,2300,124239,2568,273400000,1871988,146047929,Europe,8,389,1,52.0,1.0,1960.0
9,S. Korea,16929564,34370,22243,110,2121401,4925,86484,668,329695,433,15804065,307776,51349193,Asia,3,2309,3,669.0,2.0,310381.0
10,Italy,16161748,25287,162781,93,14755958,26738,1243009,416,268020,2699,211830644,3512910,60300620,Europe,4,370,0,419.0,2.0,20614.0


In [12]:
# export csv file
# Writes the preprocessed frame to output/data.csv; the 'output' folder must
# already exist (it is created by the earlier export cell).
# index=False: the '#' index is not written to the file.
df.to_csv('output/data.csv', index = False)