# Data Loader

It'll make things look a bit cleaner to load the data in a separate script from the report generator.


In [1]:
# Step 0: Import all the things!

import requests
from bs4 import BeautifulSoup
import pandas as pd
import datetime as dt
import geopy
from geopy.extra.rate_limiter import RateLimiter
from IPython.display import HTML, Markdown, clear_output
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import urllib3

%run controls.ipynb

def datestamp(date):
    """Render a date-like object as an ``M/D/YYYY`` string.

    The month and day are NOT zero-padded, so March 30th 2020 becomes
    "3/30/2020".  Anything exposing ``month``, ``day`` and ``year``
    attributes (``datetime.date``, ``datetime.datetime``, ...) is accepted.
    """
    parts = (date.month, date.day, date.year)
    return "/".join(str(part) for part in parts)


# Step 1: Download a copy of the web page that contains the data I want.

The state has moved to an ArcGIS data source.  It looks a lot nicer than an HTML table, but it's also harder to scrape.  I'll try to figure out how to connect to that data later; for now, I was able to copy and paste this text from here: https://txdshs.maps.arcgis.com/apps/opsdashboard/index.html#/ed483ecd702b4298ab01e8b9cafc8b83

In [10]:
# Raw text pasted by hand from the state dashboard.  Layout: a county name
# line ("<Name> County"), a blank line, that county's case count, and another
# blank line, repeated for every county.  The parsing cell below relies on
# this layout.
text_copy = """
Harris County

526

Dallas County

488

Travis County

200

Denton County

165

Bexar County

157

Tarrant County

139

Collin County

134

Fort Bend County

119

Galveston County

70

Montgomery County

66

Brazoria County

61

Lubbock County

51

Brazos County

44

El Paso County

40

Williamson County

37

McLennan County

36

Smith County

32

Webb County

32

Bell County

28

Hidalgo County

28

Wichita County

28

Nueces County

22

Cameron County

20

Jefferson County

18

Hays County

16

Ellis County

15

Guadalupe County

13

Matagorda County

13

Midland County

13

Comal County

9

Hardin County

9

Castro County

8

Johnson County

8

Washington County

8

Hockley County

7

Kendall County

7

Taylor County

7

Bastrop County

6

Grayson County

6

Randall County

6

Victoria County

6

Wharton County

6

Chambers County

5

Gregg County

5

Orange County

5

Potter County

5

Tom Green County

5

Val Verde County

5

Bowie County

4

Parker County

4

Rockwall County

4

Angelina County

3

Brown County

3

Calhoun County

3

De Witt County

3

Eastland County

3

Ector County

3

Hood County

3

Hunt County

3

Kaufman County

3

Lamar County

3

Llano County

3

Medina County

3

Navarro County

3

Rusk County

3

Terry County

3

Walker County

3

Waller County

3

Aransas County

2

Atascosa County

2

Austin County

2

Burnet County

2

Cass County

2

Deaf Smith County

2

Fayette County

2

Grimes County

2

Hopkins County

2

Karnes County

2

Liberty County

2

Limestone County

2

Lynn County

2

Milam County

2

Nacogdoches County

2

Oldham County

2

Robertson County

2

Shelby County

2

Starr County

2

Upshur County

2

Uvalde County

2

Van Zandt County

2

Willacy County

2

Wilson County

2

Blanco County

1

Burleson County

1

Caldwell County

1

Cherokee County

1

Coryell County

1

Crane County

1

Dawson County

1

Erath County

1

Falls County

1

Fannin County

1

Franklin County

1

Gaines County

1

Hale County

1

Harrison County

1

Hill County

1

Jackson County

1

Jim Wells County

1

Kleberg County

1

Lamb County

1

Lavaca County

1

Leon County

1

Martin County

1

Maverick County

1

Montague County

1

Moore County

1

Morris County

1

Polk County

1

San Jacinto County

1

San Patricio County

1

Swisher County

1

Wise County

1

Young County

1"""

# Step 2: Clean up the text I copied and convert it into a proper DataFrame object.

In [12]:
# Every row of this snapshot is stamped with the date the notebook was run.
today = dt.datetime.today()
today_text = datestamp(today)

# The pasted text alternates "<Name> County" lines and case-count lines,
# separated by blank lines.  Dropping the blank lines first means the parse
# does not depend on exactly how many blank lines the copy/paste produced.
text_list = [line.strip() for line in text_copy.splitlines() if line.strip()]
if len(text_list) % 2 != 0:
    raise ValueError(
        "Expected alternating county-name / case-count lines, "
        f"but found {len(text_list)} non-empty lines."
    )

num_cases = [
    {
        "county": name.replace(" County", ""),
        "date": today_text,
        # Store the count as an integer (tolerating thousands separators) so
        # it matches the numeric column pandas reads back from data.csv.
        "num_cases": int(count.replace(",", "")),
    }
    # Even positions hold county names, odd positions hold their counts.
    for name, count in zip(text_list[0::2], text_list[1::2])
]

# Naming the columns explicitly keeps the frame well-formed even if the
# pasted text was empty.
df_num_cases = pd.DataFrame(num_cases, columns=["county", "date", "num_cases"])
df_num_cases

Unnamed: 0,county,date,num_cases
0,Harris,3/30/2020,526
1,Dallas,3/30/2020,488
2,Travis,3/30/2020,200
3,Denton,3/30/2020,165
4,Bexar,3/30/2020,157
...,...,...,...
119,San Jacinto,3/30/2020,1
120,San Patricio,3/30/2020,1
121,Swisher,3/30/2020,1
122,Wise,3/30/2020,1


# Step 3: Add the latitude and longitude to each row.

In [14]:
# Look each county up through Nominatim, waiting at least half a second
# between requests.
locator = geopy.Nominatim(user_agent="myGeocoder")
geocode = RateLimiter(locator.geocode, min_delay_seconds=0.5)

# Used whenever a lookup returns nothing.  I am substituting these coordinates
# for the *official* coordinates of the Gulf of Mexico
# (list(locator.geocode("Gulf of Mexico"))[1]) to better position it on the map.
fallback_coordinates = (28.082612, -94.936773)

# One geocoder result (or None) per row, in row order.
points = (df_num_cases["county"] + " County, TX").apply(geocode)

coordinates = [
    (p.latitude, p.longitude) if p is not None else fallback_coordinates
    for p in points
]

# Plain lists are assigned positionally, so this does not rely on the frame
# having a default 0..n-1 index and also works when the frame is empty.
df_num_cases["latitude"] = [lat for lat, _ in coordinates]
df_num_cases["longitude"] = [lon for _, lon in coordinates]


# Step 4: Concatenate today's data with the full dataset.

In [15]:
# Append today's rows to the history stored in data.csv.
#
# Each county should have a single row per date.  Today's rows come last in
# the concatenation, so keep="last" lets them replace any rows already saved
# for the same county and date.  Running this cell (or the whole notebook)
# more than once on the same day therefore no longer piles up duplicates.
df_num_cases = pd.concat([pd.read_csv("data.csv"), df_num_cases]).drop_duplicates(
    subset=["county", "date"], keep="last"
)

# Step 5: Save the new dataset.

In [16]:
# Keep only the real data columns, selected by NAME.  This discards the
# leftover index column that read_csv loads from data.csv (when present)
# without assuming how many columns there are or what order they are in, so
# the cell can be re-run safely and can never mislabel a column.
output_columns = ["county", "date", "num_cases", "latitude", "longitude"]
missing_columns = [c for c in output_columns if c not in df_num_cases.columns]
if missing_columns:
    raise KeyError(f"Cannot save data.csv; missing columns: {missing_columns}")
df_num_cases = df_num_cases[output_columns]

# TODO: Always make a backup before running this.
df_num_cases.to_csv("data.csv")

# Now we are ready to play with it.