In [1]:
# Inspired by: Python Web Scraping with Beautiful Soup and Regex
# https://www.youtube.com/watch?v=F1kZ39SvuGE

# Practice web scraping using BS4
# Website: CDC Coronavirus Website

import requests
from bs4 import BeautifulSoup

In [2]:
# Fetch the CDC "cases in the US" page.
# The timeout keeps this cell from hanging forever on a stalled connection,
# and raise_for_status() stops here on an HTTP error (4xx/5xx) instead of
# letting an error page be parsed as if it were the real data.
data = requests.get(
    "https://www.cdc.gov/coronavirus/2019-ncov/cases-in-us.html",
    timeout=30,
)
data.raise_for_status()

# Load the returned HTML into BeautifulSoup using the built-in parser
soup = BeautifulSoup(data.text, "html.parser")

In [3]:
# Look for the <div> with class "2019coronavirus" that wraps the table
div = soup.find('div', {'class': '2019coronavirus'})
if div is None:
    # soup.find returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the find_all call below.
    raise ValueError("No div with class '2019coronavirus' found; the page layout may have changed")

category_list = []
count_list = []

for td in div.find_all('td'):
    # Text of the current cell with surrounding whitespace removed;
    # it is either a category label or a count
    new_data = td.text.strip()

    # Remove thousands separators so a value like "1,234" is still
    # recognised as a count rather than filed as a category
    digits = new_data.replace(',', '')

    # isdecimal() only accepts characters that int() can convert
    if digits.isdecimal():
        print("Number:".ljust(10), new_data)
        count_list.append(int(digits))
    else:
        print("String:".ljust(10), new_data)
        category_list.append(new_data)

# Review the lists
print()
print(category_list)
print(count_list)

String:    Travel-related
Number:    24
String:    Person-to-person spread
Number:    16
String:    Under Investigation
Number:    40
String:    Total cases
Number:    80

['Travel-related', 'Person-to-person spread', 'Under Investigation', 'Total cases']
[24, 16, 40, 80]


In [4]:
import pandas as pd

headers = ['category', 'count']

# The two lists are filled independently while scraping, so confirm they
# pair up one-to-one before combining them; otherwise categories and counts
# would be misaligned or rows silently lost.
if len(category_list) != len(count_list):
    raise ValueError(
        "Scraped %d categories but %d counts; cannot pair them up"
        % (len(category_list), len(count_list))
    )

# Build the list of [category, count] rows
data_list = [[category, count] for category, count in zip(category_list, count_list)]

cov19_df = pd.DataFrame(data_list, columns=headers)

In [5]:
# Review the dataframe we created: display cov19_df to check that each
# category sits next to its count
cov19_df

Unnamed: 0,category,count
0,Travel-related,24
1,Person-to-person spread,16
2,Under Investigation,40
3,Total cases,80


In [6]:
# Simple bar graph of today's data: one bar per category, bar height = count
ax = cov19_df.plot.bar(x='category', y='count', rot=0, figsize=(10,5))

# Title and axis labels so the figure can be read on its own;
# the trailing semicolon hides the repr of ax.set's return value.
ax.set(title='CDC COVID-19 cases in the US by category',
       xlabel='Category',
       ylabel='Number of cases');

In [7]:
# Preview the [category, count] rows that will be written to a CSV file
# (data is collected every day); this cell only displays them
data_list

[['Travel-related', 24],
 ['Person-to-person spread', 16],
 ['Under Investigation', 40],
 ['Total cases', 80]]

In [8]:
# Get today's date as a string, e.g. 2020/03/04
import csv
from datetime import date

# %Y/%m/%d produces a slash-separated date string such as 2020/03/04
today = date.today().strftime("%Y/%m/%d")

# Append mode: every run adds rows to the end of the file, so running this
# cell more than once on the same day writes that day's rows again.
# newline='' is what the csv module asks for on files handed to csv.writer.
with open("db_coronavirus.csv", 'a', newline='') as out:
    # csv.writer quotes any field that contains a comma or a quote, so such
    # a category cannot break the three-column layout. lineterminator='\n'
    # keeps '\n' as the row ending instead of the csv default '\r\n'.
    writer = csv.writer(out, lineterminator='\n')

    # Unpack into cat/count rather than reusing the name `data`, which an
    # earlier cell uses for the HTTP response.
    for cat, count in data_list:
        # One row per category, e.g. 2020/03/04,Travel-related,24
        row = [today, cat, count]
        writer.writerow(row)

        # Echo the row in plain comma-joined form
        print(','.join(str(field) for field in row))

2020/03/04,Travel-related,24
2020/03/04,Person-to-person spread,16
2020/03/04,Under Investigation,40
2020/03/04,Total cases,80


In [9]:
# Once George has collected a week's or a month's worth of data,
# add more code here to do the analysis