# Data Loader

It'll make things look a bit cleaner to load the data in a separate script from the report generator.


In [8]:
# Step 0: Import all the things!

import requests
from bs4 import BeautifulSoup
import pandas as pd
import datetime as dt
import geopy
from geopy.extra.rate_limiter import RateLimiter
from IPython.display import HTML, Markdown, clear_output
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import urllib3

%run controls.ipynb

def datestamp(date):
    """Format a date-like object as an unpadded ``M/D/YYYY`` string, e.g. ``3/27/2020``.

    Only the ``month``, ``day`` and ``year`` attributes of ``date`` are read.
    """
    parts = (date.month, date.day, date.year)
    return "/".join(str(part) for part in parts)


# Step 1: Download a copy of the web page that contains the data I want.

The state has moved to an ArcGIS data source.  It looks a lot nicer than an HTML table, but it's also harder to scrape.  I'll try to figure out how to connect to that data later, but for now I was able to copy and paste this text from here: https://txdshs.maps.arcgis.com/apps/opsdashboard/index.html#/ed483ecd702b4298ab01e8b9cafc8b83

In [10]:
# Manual snapshot of the per-county case counts, pasted as plain text.
# Layout: a "<Name> County" line, then that county's case count on its own line,
# with a single blank line between consecutive lines of data.
text_copy = """Dallas County

367

Harris County

203

Travis County

137

Bexar County

113

Tarrant County

100

Collin County

88

Fort Bend County

86

Denton County

83

Montgomery County

41

Galveston County

40

Brazoria County

39

McLennan County

33

Brazos County

30

Williamson County

27

Lubbock County

23

El Paso County

21

Smith County

21

Bell County

20

Nueces County

18

Hays County

13

Webb County

13

Wichita County

12

Hidalgo County

11

Cameron County

10

Ellis County

10

Comal County

9

Matagorda County

9

Guadalupe County

8

Jefferson County

8

Castro County

7

Johnson County

6

Midland County

6

Hockley County

5

Gregg County

4

Hardin County

4

Kendall County

4

Brown County

3

De Witt County

3

Medina County

3

Rockwall County

3

Victoria County

3

Wharton County

3

Angelina County

2

Atascosa County

2

Bastrop County

2

Bowie County

2

Calhoun County

2

Chambers County

2

Deaf Smith County

2

Eastland County

2

Grayson County

2

Grimes County

2

Hood County

2

Kaufman County

2

Liberty County

2

Llano County

2

Milam County

2

Parker County

2

Rusk County

2

Starr County

2

Tom Green County

2

Val Verde County

2

Walker County

2

Austin County

1

Blanco County

1

Burnet County

1

Cass County

1

Cherokee County

1

Coryell County

1

Crane County

1

Erath County

1

Falls County

1

Fannin County

1

Fayette County

1

Gaines County

1

Hale County

1

Harrison County

1

Hopkins County

1

Hunt County

1

Jackson County

1

Karnes County

1

Lamar County

1

Lavaca County

1

Limestone County

1

Martin County

1

Maverick County

1

Montague County

1

Morris County

1

Nacogdoches County

1

Navarro County

1

Oldham County

1

Orange County

1

Potter County

1

Robertson County

1

San Patricio County

1

Shelby County

1

Taylor County

1

Terry County

1

Upshur County

1

Uvalde County

1

Van Zandt County

1

Willacy County

1

Wilson County

1

Yoakum County

1

Young County

1"""

# Step 2: Clean up the text I copied and convert it into a proper DataFrame object.

In [11]:
# Every row parsed from the pasted snapshot is stamped with the date the cell is run.
today = dt.datetime.today()
today_text = datestamp(today)

# Drop blank separator lines and surrounding whitespace (including any "\r" left by a
# Windows-style paste) so the parse does not depend on the exact number of blank lines.
# What remains alternates: county name, case count, county name, case count, ...
entries = [line.strip() for line in text_copy.splitlines() if line.strip()]
if len(entries) % 2 != 0:
    raise ValueError(
        f"Expected alternating county/count lines but found {len(entries)} non-blank lines."
    )

num_cases = [
    {
        "county": name.replace(" County", ""),
        "date": today_text,
        # Store counts as integers (tolerating thousands separators such as "1,234") so
        # they match the integer column that `pd.read_csv` produces for the saved history.
        "num_cases": int(count.replace(",", "")),
    }
    for name, count in zip(entries[0::2], entries[1::2])
]

df_num_cases = pd.DataFrame(num_cases)
df_num_cases

Unnamed: 0,county,date,num_cases
0,Dallas,3/27/2020,367
1,Harris,3/27/2020,203
2,Travis,3/27/2020,137
3,Bexar,3/27/2020,113
4,Tarrant,3/27/2020,100
...,...,...,...
100,Van Zandt,3/27/2020,1
101,Willacy,3/27/2020,1
102,Wilson,3/27/2020,1
103,Yoakum,3/27/2020,1


# Step 3: Add the latitude and longitude to each row.

In [12]:
# The public Nominatim service permits at most one request per second, so throttle the
# geocoder accordingly instead of sending ~10 requests per second.
locator = geopy.Nominatim(user_agent="myGeocoder")
geocode = RateLimiter(locator.geocode, min_delay_seconds=1)

# Fallback position for any place the geocoder cannot resolve.  This point is substituted
# for the *official* coordinates of the Gulf of Mexico
# (list(locator.geocode("Gulf of Mexico"))[1]) to better position it on the map.
GULF_OF_MEXICO_COORDS = (28.082612, -94.936773)

# One lookup per row, querying "<county>, TX"; the geocoder returns None when nothing matches.
points = (df_num_cases["county"] + ", TX").apply(geocode)

coords = [
    (p.latitude, p.longitude) if p is not None else GULF_OF_MEXICO_COORDS
    for p in points
]

# Build the coordinate frame on the same index as df_num_cases so the column assignment
# lines up row-for-row even if the index is not a default 0..n-1 range.
df_num_cases[["latitude", "longitude"]] = pd.DataFrame(
    coords, columns=["latitude", "longitude"], index=df_num_cases.index
)



# Step 4: Concatenate today's data with the full dataset.

In [13]:
# Append today's rows to the history stored in data.csv.
# Running the notebook more than once on the same day would otherwise add a second copy
# of today's rows, so keep only the most recently added row for each (county, date) pair.
df_history = pd.read_csv("data.csv")

df_num_cases = pd.concat([df_history, df_num_cases]).drop_duplicates(
    subset=["county", "date"], keep="last"
)

# Step 5: Save the new dataset.

In [14]:
# `to_csv` below writes the index as an unnamed first column, which `read_csv` brings back
# as "Unnamed: 0".  Drop any such column by name rather than renaming columns by position,
# so this cell also works when it is re-run or when data.csv has no index column.
index_columns = [col for col in df_num_cases.columns if str(col).startswith("Unnamed")]
df_num_cases = df_num_cases.drop(columns=index_columns)

# Select the expected columns explicitly: a missing column raises a clear KeyError instead
# of data being silently mislabelled.
df_num_cases = df_num_cases[["county", "date", "num_cases", "latitude", "longitude"]]

# TODO: Always make a backup before running this.
df_num_cases.to_csv("data.csv")

# Now we are ready to play with it.