In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import datetime

# Take a single snapshot of "today" so fileDay and fileYear can never disagree
# (two separate date.today() calls could straddle midnight on New Year's Eve).
_today = datetime.date.today()
fileDay = _today.strftime('%Y%m%d')  # YYYYMMDD stamp used in the output file names
fileYear = _today.strftime('%Y')     # year appended to the last date label of each chart

# Module-level accumulators filled by scrapingWorldometers(); entries at the
# same position in each list belong to the same country.
country = []
totCase = []
continent = [] # continent index was appended on 2020-05-22

nationLink = [] # relative link: www.worldometers.info/coronavirus/ + 'country/name'


def _extractBracketList(script, marker, start=0):
    """Return the comma-split items of the first ``[...]`` that follows ``marker``.

    The search for ``marker`` begins at offset ``start``. An empty list is
    returned when the marker or its brackets are missing, or when the brackets
    enclose nothing, so callers can tell "no data" apart from real data.
    """
    markerIdx = script.find(marker, start)
    if markerIdx == -1:
        return []
    openIdx = script.find("[", markerIdx)
    closeIdx = script.find("]", openIdx + 1) if openIdx != -1 else -1
    if closeIdx == -1:
        return []
    inner = script[openIdx + 1:closeIdx]
    return inner.split(",") if inner.strip() else []


def _parseCountryCharts(html):
    """Pull the cumulative case/death series out of one country page.

    Returns ``(dateList, caseDataList, deathDataList)``. Each element is empty
    when the corresponding Highcharts script is not present on the page. The
    data values are returned as the raw string tokens found in the script.
    """
    soup = BeautifulSoup(html, 'html.parser')
    dateList, caseDataList, deathDataList = [], [], []

    for js in soup.find_all('script', attrs={'type': "text/javascript"}):
        script = js.text
        if "Highcharts" not in script:
            continue

        if "'coronavirus-cases-linear'" in script:
            dayLabels = _extractBracketList(script, "categories:")
            if not dayLabels:
                continue
            # Labels look like "Feb 15" and carry no year: the first label is
            # assumed to be in 2020 and the last one in the year of the scrape.
            firstLabel = dayLabels[0].strip().strip('"')
            lastLabel = dayLabels[-1].strip().strip('"')
            startDate = datetime.datetime.strptime(firstLabel + ' 2020', "%b %d %Y")
            endDate = datetime.datetime.strptime(lastLabel + ' ' + fileYear, "%b %d %Y")
            dateList = pd.date_range(startDate, endDate)
            # The case series is the first "data:" array after "categories:".
            caseDataList = _extractBracketList(script, "data:", script.find("categories:"))

        elif "coronavirus-deaths-linear" in script:
            deathDataList = _extractBracketList(script, "data:")

    return dateList, caseDataList, deathDataList


def scrapingWorldometers():
    """Scrape per-country cumulative case and death series from Worldometers.

    Reads the "today" table on the main page, keeps every row that has a
    country link and at least 1000 total cases, then visits each country page
    and extracts the daily series embedded in its Highcharts scripts. The
    result is concatenated with ./covidDataset/formerCoronaWorld.csv.

    Side effects: the module-level lists ``country``, ``totCase``,
    ``continent`` and ``nationLink`` are cleared and refilled on every call,
    so re-running the cell does not duplicate countries.

    Returns:
        pandas.DataFrame indexed by (Country, Date, Continent) with columns
        TotalCases and TotalDeaths, sorted by index. Scraped values are kept
        as the string tokens found in the page scripts.

    Raises:
        ValueError: if a country page yields series of different lengths
            (raised by pandas when the per-country frame is built).
    """
    url = "https://www.worldometers.info/coronavirus/"
    requestTimeout = 30  # seconds; without it a stalled connection hangs forever
    minCases = 1000
    # Fixed column positions in the main table. A '#' column was added at
    # index 0 on 2020-05-15; the continent index moved when the newRecov
    # column was added (05-29), removed (05-30) and added again (06-02).
    countryCol, casesCol, continentCol = 1, 2, 15

    # Reset the shared accumulators so a second call starts from scratch.
    for shared in (country, totCase, continent, nationLink):
        shared.clear()

    html = requests.get(url, timeout=requestTimeout).text
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.select_one('table#main_table_countries_today')

    # The first <tr> is the header row; every later row is a data row.
    for tr in table.select('tr')[1:]:
        cells = tr.select('td')
        if len(cells) <= continentCol:
            continue  # malformed / short row
        try:
            cases = int(cells[casesCol].text.strip().replace(',', ''))
        except ValueError:
            continue  # blank or non-numeric total, cannot reach the threshold
        if (cases >= minCases) and cells[countryCol].a:
            country.append(cells[countryCol].text.strip())
            nationLink.append(cells[countryCol].a["href"])
            totCase.append(cases)
            continent.append(cells[continentCol].text.strip())

    # Réunion, Curaçao => Reunion, Curacao
    for accented, plain in (("Réunion", "Reunion"), ("Curaçao", "Curacao")):
        if accented in country:
            country[country.index(accented)] = plain

    frames = []
    for name, region, link in zip(country, continent, nationLink):
        html = requests.get(url + link, timeout=requestTimeout).text
        # Series are parsed fresh for every page so a missing chart can never
        # inherit the previous country's numbers.
        dateList, caseDataList, deathDataList = _parseCountryCharts(html)

        if len(dateList) == 0:
            print(name, "skipped: no cumulative cases chart found")
            continue
        if len(deathDataList) == 0:
            # No deaths chart on the page: record the values as missing.
            deathDataList = [None] * len(dateList)

        frames.append(pd.DataFrame({
            "Country": name,
            "Date": dateList,
            "Continent": region,
            "TotalCases": caseDataList,
            "TotalDeaths": deathDataList,
        }))

    if frames:
        df = pd.concat(frames)
    else:
        df = pd.DataFrame(columns=["Country", "Date", "Continent", "TotalCases", "TotalDeaths"])

    df["Date"] = df["Date"].astype(str) # datetime64[ns] -> str
    df = df.set_index(["Country", "Date", "Continent"]) # continent index appended on 2020-05-22

    # Archived data (added 2020-06-06) shares the same three-level index.
    formerdf = pd.read_csv("./covidDataset/formerCoronaWorld.csv")
    formerdf = formerdf.set_index(["Country", "Date", "Continent"])

    return pd.concat([df, formerdf]).sort_index()

In [2]:
def toCSV(df):
    """Write ``df`` to a CSV file named after today's date and print a confirmation.

    The file is ./covidDataset/<fileDay>CoronaWorld.csv, where ``fileDay`` is
    the module-level YYYYMMDD stamp.
    """
    targetPath = ''.join(('./covidDataset/', fileDay, 'CoronaWorld.csv'))  # set directory properly
    df.to_csv(targetPath)
    print(fileDay, "today csv complete")
    
def toJSON(df, orient='split'):
    """Write ``df`` to a JSON file named after today's date and print a confirmation.

    Args:
        df: DataFrame to serialise.
        orient: JSON layout forwarded to ``DataFrame.to_json`` (default 'split').

    The file is ./covidDataset/<fileDay>CoronaWorld.json, where ``fileDay`` is
    the module-level YYYYMMDD stamp.
    """
    jsonFile = './covidDataset/' + fileDay + 'CoronaWorld.json'
    # Pass orient by keyword: positional arguments after the path are
    # deprecated in recent pandas releases and will become keyword-only.
    df.to_json(jsonFile, orient=orient)
    print(fileDay, "today json complete")

In [3]:
# Run the scrape (needs network access and ./covidDataset/formerCoronaWorld.csv)
# and print a truncated view of the combined frame.
covidDataFrame = scrapingWorldometers()
print(covidDataFrame) # only China's series starts from 2020-01-22

                                 TotalCases TotalDeaths
Country     Date       Continent                       
Afghanistan 2020-02-15 Asia               0           0
            2020-02-16 Asia               0           0
            2020-02-17 Asia               0           0
            2020-02-18 Asia               0           0
            2020-02-19 Asia               0           0
...                                     ...         ...
Zambia      2020-06-01 Africa          1089           7
            2020-06-02 Africa          1089           7
            2020-06-03 Africa          1089           7
            2020-06-04 Africa          1089           7
            2020-06-05 Africa          1089           7

[13840 rows x 2 columns]


In [4]:
# Save the combined dataset as date-stamped CSV and JSON files under ./covidDataset/.
toCSV(covidDataFrame)
toJSON(covidDataFrame)

20200606 today csv complete
20200606 today json complete
