# Data Loader

It'll make things look a bit cleaner to load the data in a separate script from the report generator.


In [12]:
# Step 0: Import all the things!

import requests
from bs4 import BeautifulSoup
import pandas as pd
import datetime as dt
import geopy
from geopy.extra.rate_limiter import RateLimiter
from IPython.display import HTML, Markdown, clear_output
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import urllib3

%run controls.ipynb

# Step 1: Download a copy of the web page that contains the data I want.

# The state's SSL certificate is expired; I'm going to ignore that particular problem.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

response = requests.get("https://dshs.texas.gov/news/updates.shtm", verify=False)
source = BeautifulSoup(response.text, "html.parser")


In [13]:
# Step 2: Grab the <table> out of the page and turn it into a DataFrame.

tables = source.find_all("table")
county_table = [t for t in tables if t.has_attr("summary") and t.attrs["summary"] == "COVID-19 Cases in Texas Counties"][0]

row_groups = [tr.find_all("td") for tr in county_table.find_all("tr")][1:]

today = dt.datetime.today()
today_text = f"{today.month}/{today.day}/{today.year}"

num_cases = [{
    "county": td[0].text,
    "date": today_text,
    "num_cases": td[1].text
} for td in row_groups]

df_num_cases = pd.DataFrame(num_cases)


IndexError: list index out of range

In [8]:
# Step 3: Add the latitude and longitude to each row.

locator = geopy.Nominatim(user_agent="myGeocoder")
geocode = RateLimiter(locator.geocode, min_delay_seconds=0.1)

df_num_cases["point"] = (df_num_cases["county"] + ", TX").apply(geocode)

# I am substituting (28.082612, -94.936773) for the *official* coordinates of the Gulf of Mexico to better position it on the map.
df_num_cases[['latitude', 'longitude']] = pd.DataFrame([
    (p.latitude, p.longitude) if p != None else (28.082612, -94.936773) # list(locator.geocode("Gulf of Mexico"))[1]
    for p in df_num_cases["point"].tolist()])
df_num_cases = df_num_cases.drop(["point"], axis=1)



In [9]:
# Step 4: Concatenate today's data with the full dataset.
# TODO: Running this more than once will add duplicates to the data.  I can make this nicer by performing some kind of distinct() function on the data.

df_num_cases = pd.concat([pd.read_csv("data.csv"), df_num_cases])


In [10]:
# Step 5: Save the new dataset.

df_num_cases.columns = ["unnamed", "county", "date", "num_cases", "latitude", "longitude"]
df_num_cases = df_num_cases.drop(columns=["unnamed"], axis=1)

# TODO: Always make a backup before running this.
df_num_cases.to_csv("data.csv")

# Now we are ready to play with it.