In [1]:
from bs4 import BeautifulSoup
import urllib.request
import re

In [29]:

# Download the worldometers Brazil page and parse its HTML.
url =  'https://www.worldometers.info/coronavirus/country/brazil/'
# The context manager closes the HTTP response once the parser has consumed it.
with urllib.request.urlopen(url) as page:
    soup = BeautifulSoup(page, 'html.parser')

**Filter scripts that have data**

In [30]:
# Collect every <script> tag, then keep those whose text mentions 'Highcharts.chart'.
allScripts = soup.findAll('script')

filteredScripts = [tag for tag in allScripts if 'Highcharts.chart' in tag.text]

In [31]:
# Write the text of each filtered script to script<i>.txt.
for i, script in enumerate(filteredScripts):
    # Explicit UTF-8: without it the platform default encoding is used and
    # non-ASCII characters in the page can make the write fail.
    with open('script{}.txt'.format(i), 'w', encoding='utf-8') as f:
        f.write(script.text)

**Filter table name**

In [5]:
def get_table_name(script):
    """Return the first argument of every ``Highcharts.chart(...)`` call in a script.

    Parameters
    ----------
    script : object with a ``text`` attribute
        The script whose source text is scanned.

    Returns
    -------
    list of str
        One name per call, in order of appearance, with surrounding
        whitespace and quote characters removed. Calls that are not followed
        by a comma are skipped.
    """
    text = script.text

    # Raw string so the regex escapes are not treated as (invalid) string escapes;
    # the dot is escaped so it only matches a literal '.'.
    call_pattern = re.compile(r'Highcharts\.chart\(')

    names = []
    for match in call_pattern.finditer(text):
        # match.end() is the first character after the opening parenthesis,
        # independent of how the pattern itself is spelled.
        arg_start = match.end()
        arg_end = text.find(',', arg_start)
        if arg_end == -1:
            # No comma follows the call, so no first argument can be delimited.
            continue

        # Drop optional padding, then the quotes around the string literal.
        names.append(text[arg_start:arg_end].strip().strip('\'"'))

    return names
    

In [6]:
# Print the chart names found in each filtered script, one list per script.
for script in filteredScripts:
    table_names = get_table_name(script)
    print(table_names)

['total-currently-infected-linear']
['deaths-cured-outcome-small']
['coronavirus-cases-linear', 'coronavirus-cases-log']
['graph-cases-daily']
['graph-active-cases-total']
['coronavirus-deaths-linear', 'coronavirus-deaths-log']
['graph-deaths-daily']
['cases-cured-daily']
['deaths-cured-outcome']


**Filter x data**

In [None]:
# Convert date strings such as 'Jul 21, 2022' into real datetime objects

In [26]:
def get_x_data(script):
    """Extract the quoted labels of the bracketed list that follows each ``xAxis``.

    Parameters
    ----------
    script : object with a ``text`` attribute
        The script whose source text is scanned.

    Returns
    -------
    list of list of str
        One list of labels per ``xAxis`` occurrence that is followed by a
        complete ``[...]`` pair, in order of appearance.
    """
    text = script.text

    keyword = 'xAxis'

    dates = []
    for match in re.finditer(keyword, text):
        open_idx = text.find('[', match.end())
        if open_idx == -1:
            # No list after this occurrence; nothing to extract.
            continue
        # Look for the closing bracket after the opening one so the pair
        # always delimits a real span.
        close_idx = text.find(']', open_idx + 1)
        if close_idx == -1:
            continue

        raw = text[open_idx + 1 : close_idx]
        # Splitting on '"' alternates labels with separators; discard the
        # separators (a comma, optionally padded with whitespace) and blanks.
        labels = [piece for piece in raw.split('"') if piece.strip() not in ('', ',')]
        dates.append(labels)

    return dates
    
dates = get_x_data(filteredScripts[0])
print('Len list: ', len(dates))
print('Sample: ')
# Slice rather than index 0..9 so a series with fewer than ten labels
# does not raise IndexError.
for label in dates[0][:10]:
    print(label)

Len list:  1
Sample: 
Feb 15, 2020
Feb 16, 2020
Feb 17, 2020
Feb 18, 2020
Feb 19, 2020
Feb 20, 2020
Feb 21, 2020
Feb 22, 2020
Feb 23, 2020
Feb 24, 2020


In [21]:
dates = get_x_data(filteredScripts[1])
print('Len list: ', len(dates))
print('Sample: ')
# Slice rather than index 0..9 so a series with fewer than ten labels
# does not raise IndexError.
for label in dates[0][:10]:
    print(label)

Len list:  1
Sample: 
Feb 02, 2020
Feb 03, 2020
Feb 04, 2020
Feb 05, 2020
Feb 06, 2020
Feb 07, 2020
Feb 08, 2020
Feb 09, 2020
Feb 10, 2020
Feb 11, 2020


In [28]:
def get_y_data(script):
    """Extract the numeric values of the bracketed list that follows each ``data``.

    Parameters
    ----------
    script : object with a ``text`` attribute
        The script whose source text is scanned.

    Returns
    -------
    list of list of float
        One list of values per ``data`` occurrence that is followed by a
        complete ``[...]`` pair, in order of appearance. ``null`` entries
        become ``nan``; blank entries (empty list, trailing comma) are skipped.

    Raises
    ------
    ValueError
        If an entry is neither blank, ``null``, nor parseable by ``float``.
    """
    text = script.text

    # NOTE(review): this matches every occurrence of the substring 'data',
    # not only a `data:` key -- confirm this is intended for all scripts.
    keyword = 'data'

    counts = []
    for match in re.finditer(keyword, text):
        open_idx = text.find('[', match.end())
        if open_idx == -1:
            # No list after this occurrence; nothing to extract.
            continue
        # Look for the closing bracket after the opening one so the pair
        # always delimits a real span.
        close_idx = text.find(']', open_idx + 1)
        if close_idx == -1:
            continue

        nums = []
        for token in text[open_idx + 1 : close_idx].split(','):
            token = token.strip()
            if not token:
                # float('') would raise; blanks carry no value.
                continue
            # A JavaScript null marks a missing point; keep its position as nan.
            nums.append(float('nan') if token == 'null' else float(token))
        counts.append(nums)

    return counts

counts = get_y_data(filteredScripts[0])
print('Len list: ', len(counts))
print('Sample: ')
# Slice rather than index 0..9 so a series with fewer than ten values
# does not raise IndexError.
for value in counts[0][:10]:
    print(value)

Len list:  2
Sample: 
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0


**Statistics**

In [1]:

import os

# Every entry directly under the data directory is counted as one country.
data_dir = 'assets/data'
country_entries = os.listdir(data_dir)
n_countries = len(country_entries)
print('Number of countries: ', n_countries)
    
    

Number of countries:  229


In [2]:
# Tally, per table name (file name minus its last four characters),
# how many country folders contain that table.
cnts = {}
for country in os.listdir(data_dir):
    country_dir = os.path.join(data_dir, country)
    for table in os.listdir(country_dir):
        name = table[:-4]
        cnts[name] = cnts.get(name, 0) + 1

# Report the tallies in first-seen order.
for table_name, n_files in cnts.items():
    print('{}: {}'.format(table_name, n_files))
    

coronavirus-cases-linear: 229
coronavirus-cases-log: 229
coronavirus-deaths-linear: 223
coronavirus-deaths-log: 223
graph-active-cases-total: 210
graph-cases-daily-1: 229
graph-cases-daily-2: 229
graph-cases-daily-3: 229
graph-deaths-daily-1: 221
graph-deaths-daily-2: 221
graph-deaths-daily-3: 221
cases-cured-daily-1: 25
cases-cured-daily-2: 25
deaths-cured-outcome-1: 25
deaths-cured-outcome-2: 25
deaths-cured-outcome-small-1: 26
deaths-cured-outcome-small-2: 26
total-currently-infected-linear: 16


**Preprocess data**

In [4]:
# Convert datetime
import datetime
# Parse a sample date string, then re-render it as day/month/year.
tbk = 'Feb 15, 2020'
parsed_tbk = datetime.datetime.strptime(tbk, '%b %d, %Y')
tbk = parsed_tbk.strftime('%d/%m/%Y')
tbk

'15/02/2020'

In [5]:
def format_time(t):
    """Reformat a date string such as 'Feb 15, 2020' as 'DD/MM/YYYY'."""
    parsed = datetime.datetime.strptime(t, '%b %d, %Y')
    return parsed.strftime('%d/%m/%Y')

In [25]:
def to_date(s):
    """Parse a string such as 'Feb 15, 2020' into a ``datetime.datetime``."""
    source_format = '%b %d, %Y'
    parsed = datetime.datetime.strptime(s, source_format)
    return parsed

def to_str(d):
    """Render a date/datetime object as a 'DD/MM/YYYY' string."""
    target_format = '%d/%m/%Y'
    return d.strftime(target_format)

In [10]:
import os

data_dir = 'assets/data'

countries = os.listdir(data_dir)

# Tables a country folder must contain to be kept.
required = [
    'coronavirus-cases-linear',
    'coronavirus-deaths-linear',
    'graph-active-cases-total',
    'graph-cases-daily-1',
    'graph-deaths-daily-1'
]

# Keep a country when every required name appears among its table names
# (file names minus their last four characters).
required_set = set(required)
filtered_countries = [
    country
    for country in countries
    if required_set.issubset(
        t[:-4] for t in os.listdir(os.path.join(data_dir, country))
    )
]

In [11]:
# Number of countries that contain every table listed in `required`.
len(filtered_countries)

202

In [15]:
import pandas as pd

# Gather the distinct 'date' strings from every required table of every kept country.
unique_date_strings = set()

for country in filtered_countries:
    for r in required:
        path = os.path.join(data_dir, country, '{}.csv'.format(r))
        df = pd.read_csv(path)

        unique_date_strings.update(df['date'].to_list())

# Parse the strings and order them chronologically.
date_list = sorted(map(to_date, unique_date_strings))
        
            

In [19]:
# Number of entries in the sorted list built from the distinct date strings.
len(date_list)

1112

In [21]:
# Preview the ten earliest dates.
date_list[:10]

[datetime.datetime(2020, 1, 22, 0, 0),
 datetime.datetime(2020, 1, 23, 0, 0),
 datetime.datetime(2020, 1, 24, 0, 0),
 datetime.datetime(2020, 1, 25, 0, 0),
 datetime.datetime(2020, 1, 26, 0, 0),
 datetime.datetime(2020, 1, 27, 0, 0),
 datetime.datetime(2020, 1, 28, 0, 0),
 datetime.datetime(2020, 1, 29, 0, 0),
 datetime.datetime(2020, 1, 30, 0, 0),
 datetime.datetime(2020, 1, 31, 0, 0)]

In [24]:
# Span between the earliest and latest dates (date_list is sorted ascending).
date_list[-1] - date_list[0]

datetime.timedelta(days=1111)