# Data Loader

It'll make things look a bit cleaner to load the data in a separate script from the report generator.


In [20]:
# Step 0: Import all the things!

import requests
from bs4 import BeautifulSoup
import pandas as pd
import datetime as dt
import geopy
from geopy.extra.rate_limiter import RateLimiter
from IPython.display import HTML, Markdown, clear_output
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import urllib3

%run controls.ipynb

def datestamp(date):
    """Format a date as an unpadded ``M/D/YYYY`` string, e.g. ``3/25/2020``.

    Args:
        date: An object with ``month``, ``day`` and ``year`` attributes,
            such as ``datetime.date`` or ``datetime.datetime``.

    Returns:
        str: The month, day and year joined by slashes, with no zero padding.
    """
    parts = (date.month, date.day, date.year)
    return "/".join(str(part) for part in parts)


# Step 1: Download a copy of the web page that contains the data I want.

The state has moved to an ArcGIS data source.  It looks a lot nicer than an HTML table, but it's also harder to scrape.  I'll try to figure out how to connect to that data later; for now, I copied and pasted this text from the dashboard here: https://txdshs.maps.arcgis.com/apps/opsdashboard/index.html#/ed483ecd702b4298ab01e8b9cafc8b83

In [11]:
text_copy = """Dallas County

169

Harris County

134

Travis County

98

Tarrant County

71

Bexar County

69

Fort Bend County

46

Collin County

45

Denton County

30

McLennan County

23

Montgomery County

23

Galveston County

21

Brazoria County

19

Williamson County

19

Bell County

18

Brazos County

16

Smith County

14

Lubbock County

12

El Paso County

11

Nueces County

10

Webb County

8

Hays County

7

Cameron County

6

Ellis County

6

Matagorda County

6

Comal County

5

Wichita County

5

Guadalupe County

4

Castro County

3

Hockley County

3

Jefferson County

3

Johnson County

3

Midland County

3

Wharton County

3

Atascosa County

2

Brown County

2

Deaf Smith County

2

Eastland County

2

Grayson County

2

Grimes County

2

Hardin County

2

Hidalgo County

2

Kaufman County

2

Kendall County

2

Milam County

2

Potter County

2

Austin County

1

Bastrop County

1

Blanco County

1

Bowie County

1

Burnet County

1

Cass County

1

Chambers County

1

Coryell County

1

Crane County

1

De Witt County

1

Erath County

1

Falls County

1

Fannin County

1

Fayette County

1

Gaines County

1

Gregg County

1

Hopkins County

1

Hunt County

1

Lamar County

1

Lavaca County

1

Liberty County

1

Limestone County

1

Llano County

1

Maverick County

1

Medina County

1

Morris County

1

Oldham County

1

Orange County

1

Parker County

1

Robertson County

1

Rusk County

1

Terry County

1

Tom Green County

1

Upshur County

1

Van Zandt County

1

Walker County

1

Wilson County

1"""

# Step 2: Clean up the text I copied and convert it into a proper DataFrame object.

In [22]:
# Every row parsed below is stamped with the date of this run, formatted by
# datestamp().
today = dt.datetime.today()
today_text = datestamp(today)

# The pasted dashboard text alternates "<Name> County" and a case count, with
# blank lines in between.  Dropping blank lines first (instead of stepping
# through the raw lines four at a time) keeps names and counts paired up even
# when the paste has extra, missing, or trailing blank lines.
text_list = [line.strip() for line in text_copy.splitlines() if line.strip()]
if len(text_list) % 2 != 0:
    raise ValueError(
        "text_copy should alternate county names and case counts, "
        f"but it has {len(text_list)} non-blank lines"
    )

county_suffix = " County"
num_cases = []
for county_line, count_line in zip(text_list[0::2], text_list[1::2]):
    # Strip the suffix only from the end of the name, never from the middle.
    if county_line.endswith(county_suffix):
        county_line = county_line[:-len(county_suffix)]
    num_cases.append({
        "county": county_line,
        "date": today_text,
        # Store the count as an integer (tolerating thousands separators) so
        # the column is numeric rather than text; a non-numeric count raises
        # ValueError instead of silently ending up in the dataset.
        "num_cases": int(count_line.replace(",", "")),
    })

df_num_cases = pd.DataFrame(num_cases)
df_num_cases

Unnamed: 0,county,date,num_cases
0,Dallas,3/25/2020,169
1,Harris,3/25/2020,134
2,Travis,3/25/2020,98
3,Tarrant,3/25/2020,71
4,Bexar,3/25/2020,69
...,...,...,...
77,Tom Green,3/25/2020,1
78,Upshur,3/25/2020,1
79,Van Zandt,3/25/2020,1
80,Walker,3/25/2020,1


In [23]:
# Step 3: Add the latitude and longitude to each row.

# The public Nominatim service allows at most one request per second, so the
# geocoder is throttled to that rate.  Expect roughly one second per county.
locator = geopy.Nominatim(user_agent="myGeocoder")
geocode = RateLimiter(locator.geocode, min_delay_seconds=1)

# Look up "<county>, TX" for every row; rows with no result hold None.
df_num_cases["point"] = (df_num_cases["county"] + ", TX").apply(geocode)

# I am substituting (28.082612, -94.936773) for the *official* coordinates of the Gulf of Mexico to better position it on the map.
# These coordinates are used for every row whose lookup returned None.
coordinates = pd.DataFrame(
    [
        (p.latitude, p.longitude) if p is not None else (28.082612, -94.936773)  # list(locator.geocode("Gulf of Mexico"))[1]
        for p in df_num_cases["point"]
    ],
    columns=["latitude", "longitude"],
    # Share the frame's index so the assignment below lines up row for row
    # even if df_num_cases does not carry a default 0..n-1 index.
    index=df_num_cases.index,
)
df_num_cases[["latitude", "longitude"]] = coordinates
df_num_cases = df_num_cases.drop(columns=["point"])



In [24]:
# Step 4: Concatenate today's data with the full dataset.
# Running this more than once used to add duplicate rows to the data.  Only the
# last row for each (county, date) pair is kept now; because today's rows are
# concatenated after the saved ones, the most recent scrape wins.

df_num_cases = (
    pd.concat([pd.read_csv("data.csv"), df_num_cases])
    .drop_duplicates(subset=["county", "date"], keep="last")
)


In [25]:
# Step 5: Save the new dataset.

# Select the data columns by name rather than renaming every column by
# position.  This discards the leftover index column that comes back when the
# saved file is re-read, and it also works when that column is absent (for
# example when this cell is run a second time).
df_num_cases = df_num_cases[["county", "date", "num_cases", "latitude", "longitude"]]

# Renumber the rows so the index written to the file has no repeated labels
# left over from concatenating two frames.
df_num_cases = df_num_cases.reset_index(drop=True)

# TODO: Always make a backup before running this.
df_num_cases.to_csv("data.csv")

# Now we are ready to play with it.