In [1]:
import pandas as pd
import numpy as np
import requests
import datetime
import re
from bs4 import BeautifulSoup

In [2]:
# Wikipedia template page holding the U.S. medical-cases chart that the
# following cells scrape.
url = "https://en.wikipedia.org/wiki/Template:2019%E2%80%9320_coronavirus_outbreak_data/United_States_medical_cases_chart"

# requests has no default timeout, so without one a stalled connection would
# hang this cell indefinitely.
page = requests.get(url, timeout=30)
# Fail loudly on HTTP errors (404, 5xx, ...) instead of parsing an error page
# and silently producing an empty table further down.
page.raise_for_status()
soup = BeautifulSoup(page.content, "html.parser")

In [3]:
# Output column names. The position of each entry matters: the scraping loop
# refers to them by index (columns[0], columns[1:4], columns[-2:]).
columns = [
    "date",        # row date
    "deaths",      # taken from the bar-chart cell
    "recoveries",  # taken from the bar-chart cell
    "actives",     # taken from the bar-chart cell
    "#cases",      # taken from the plain text cells
    "#deaths",     # taken from the plain text cells
]

In [4]:
# Scrape the chart table into `rows`: one dict per calendar day, keyed by the
# names in `columns`. Days missing from the table get a NaN placeholder row so
# that every date between the first and last table row is represented.
rows = []

paren_pattern = re.compile(r"\(.*\)")
comma_pattern = re.compile(r",")

ONE_DAY = datetime.timedelta(days=1)


def _parse_count(text):
    """Convert a scraped count such as "1,234 (+5%)" to an int.

    Parenthesised notes and thousands separators are removed first. Text that
    is empty once cleaned (including a cell holding only a parenthesised note)
    counts as 0.

    Raises:
        ValueError: if the cleaned text is non-empty but not an integer.
    """
    cleaned = comma_pattern.sub("", paren_pattern.sub("", text)).strip()
    return int(cleaned) if cleaned else 0


# Last date already stored in `rows`; stays None until the first data row has
# been seen, so a leading "⋮" placeholder row cannot leave it undefined.
date = None

# td with class 'bb-lr' has deaths (background-color: #A50026),
# recoveries (SkyBlue), and actives (Tomato) in divs
for trow in soup.find_all("tr", class_="mw-collapsed"):
    cells = trow.find_all("td")
    if not cells:
        # No data cells in this row (e.g. a header-only row): nothing to read.
        continue

    date_text = cells[0].text.strip()
    if date_text == "⋮":
        # Placeholder row marking collapsed dates; it carries no data.
        continue

    row_date = datetime.datetime.strptime(date_text, "%Y-%m-%d")

    # Ensure all dates are represented: emit a NaN row for every day strictly
    # between the previous data row and this one.
    if date is not None:
        date += ONE_DAY
        while date < row_date:
            temp_row = [date] + [np.nan] * (len(columns) - 1)
            rows.append(dict(zip(columns, temp_row)))
            date += ONE_DAY

    # Pull out the data for this row.
    row = {}
    row_data = []

    # Bar colour -> column it feeds. Checked in this order; first match wins.
    bar_columns = {
        "#A50026": columns[1],
        "SkyBlue": columns[2],
        "Tomato": columns[3],
    }

    for tag in cells:
        if "bb-lr" in tag.get("class", []):
            # Bar-chart cell: each coloured div carries its value in `title`.
            for div in tag.find_all("div"):
                style = div.get("style", "")
                title = div.get("title")
                if title is None:
                    # A div without a title has no value to record.
                    continue
                for colour, column in bar_columns.items():
                    if colour in style:
                        row[column] = title
                        break
        else:
            # Plain cell: keep its text in document order.
            row_data.append(tag.text)

    # Match the ordered plain-cell text with date/#cases/#deaths, then merge in
    # the bar-chart values.
    row = {**dict(zip(columns[0:1] + columns[-2:], row_data)), **row}
    # Replace the date string with the parsed datetime object.
    row[columns[0]] = row_date

    # Convert count strings to integers; missing or empty values become 0.
    for column in columns:
        if "date" in column:
            continue
        row[column] = _parse_count(row.get(column, ""))

    rows.append(row)

    date = row_date

In [5]:
# Build the table in one go from the list of per-day dicts collected in `rows`;
# each dict becomes one DataFrame row and its keys become the columns.
df = pd.DataFrame(data=rows)

In [6]:
# Forward-fill: days with no update (NaN values) inherit the values of the
# most recent earlier day. `DataFrame.ffill()` replaces the deprecated
# `fillna(method="ffill")` spelling and gives the same result.
df = df.ffill()

In [7]:
# Inspect the column dtypes of the forward-filled frame. In the recorded output
# the count columns are float64 rather than int, presumably because the
# NaN gap-filler rows forced a float dtype when the frame was built.
df.dtypes

date          datetime64[ns]
#cases               float64
#deaths              float64
deaths               float64
recoveries           float64
actives              float64
dtype: object