# Data Loader

It'll make things look a bit cleaner to load the data in a separate script from the report generator.


In [42]:
# Step 0: Import all the things!

import requests
from bs4 import BeautifulSoup
import pandas as pd
import datetime as dt
import geopy
from geopy.extra.rate_limiter import RateLimiter
from IPython.display import HTML, Markdown, clear_output
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import urllib3

%run controls.ipynb

def datestamp(date):
    """Format a date as an unpadded ``M/D/YYYY`` string, e.g. ``4/3/2020``.

    Args:
        date: Any object exposing ``month``, ``day`` and ``year`` attributes
            (``datetime.date`` or ``datetime.datetime``).

    Returns:
        The month, day and year joined by "/", without zero padding.
    """
    parts = (date.month, date.day, date.year)
    return "/".join(str(part) for part in parts)


# Step 1: Download a copy of the web page that contains the data I want.

The state has moved to an ArcGIS data source. It looks a lot nicer than an HTML table, but it's also harder to scrape. I'll try to figure out how to connect to that data directly later; for now, I copied and pasted the text below from this dashboard: https://txdshs.maps.arcgis.com/apps/opsdashboard/index.html#/ed483ecd702b4298ab01e8b9cafc8b83

In [43]:
# TODO: Automate this with Selenium.

text_copy = """
Harris County

955

Dallas County

831

Travis County

351

Tarrant County

325

Bexar County

254

Denton County

254

Collin County

227

Fort Bend County

221

Galveston County

130

Brazoria County

123

Lubbock County

117

Montgomery County

110

Hidalgo County

79

El Paso County

78

Brazos County

68

Webb County

65

Williamson County

63

Cameron County

55

Smith County

53

Bell County

51

McLennan County

49

Hays County

45

Wichita County

44

Nueces County

41

Jefferson County

40

Matagorda County

32

Ellis County

27

Victoria County

27

Guadalupe County

23

Potter County

21

Randall County

21

Midland County

19

Comal County

18

Johnson County

18

Taylor County

18

Washington County

17

Ector County

15

Hardin County

15

Grayson County

12

Nacogdoches County

12

Chambers County

11

Bastrop County

10

Bowie County

10

Fayette County

10

Rockwall County

10

Shelby County

10

Tom Green County

10

Castro County

9

Kaufman County

9

Wharton County

9

Angelina County

8

Calhoun County

8

Gregg County

8

Hood County

8

Hunt County

8

Kendall County

8

Hockley County

7

Orange County

7

Polk County

7

Rusk County

7

Waller County

7

De Witt County

6

Hale County

6

Moore County

6

Wilson County

6

Cherokee County

5

Donley County

5

Harrison County

5

Liberty County

5

Navarro County

5

Parker County

5

Starr County

5

Val Verde County

5

Walker County

5

Willacy County

5

Austin County

4

Brown County

4

Burnet County

4

Cass County

4

Colorado County

4

Maverick County

4

Panola County

4

San Augustine County

4

Terry County

4

Uvalde County

4

Dawson County

3

Deaf Smith County

3

Eastland County

3

Erath County

3

Grimes County

3

Hill County

3

Hopkins County

3

Jackson County

3

Lamar County

3

Limestone County

3

Live Oak County

3

Llano County

3

Lynn County

3

Medina County

3

San Patricio County

3

Upshur County

3

Van Zandt County

3

Wise County

3

Young County

3

Atascosa County

2

Caldwell County

2

Coryell County

2

Crane County

2

Fannin County

2

Jasper County

2

Karnes County

2

Kerr County

2

Lee County

2

Leon County

2

Milam County

2

Oldham County

2

Robertson County

2

Anderson County

1

Blanco County

1

Burleson County

1

Camp County

1

Clay County

1

Comanche County

1

Falls County

1

Franklin County

1

Gaines County

1

Gillespie County

1

Goliad County

1

Gonzales County

1

Gray County

1

Henderson County

1

Jim Wells County

1

Kleberg County

1

Lamb County

1

Lampasas County

1

Lavaca County

1

Martin County

1

Montague County

1

Morris County

1

Newton County

1

Palo Pinto County

1

San Jacinto County

1

Swisher County

1

Titus County

1

Wood County

1
"""

# Step 2: Clean up the text I copied and convert it into a proper DataFrame object.

In [44]:
# Every row loaded in this run is stamped with today's date.
today = dt.datetime.today()
today_text = datestamp(today)

# The pasted text alternates "<Name> County" lines and "<count>" lines, with
# blank lines in between.  Dropping blank / whitespace-only lines first (rather
# than stepping through the raw lines four at a time) means a missing or extra
# blank line in the paste cannot silently shift every following record.
text_list = [line.strip() for line in text_copy.splitlines() if line.strip()]
if len(text_list) % 2 != 0:
    raise ValueError(
        f"Expected alternating county/count lines but found {len(text_list)} "
        "non-blank lines; check the pasted text."
    )

num_cases = []
for county_line, count_line in zip(text_list[0::2], text_list[1::2]):
    num_cases.append({
        "county": county_line.replace(" County", ""),
        "date": today_text,
        # Store a real integer instead of the raw string; thousands separators
        # (e.g. "1,234") are removed first.  int() raises ValueError on anything
        # that is not a number, so a misaligned paste fails loudly here.
        "num_cases": int(count_line.replace(",", "")),
    })

df_num_cases = pd.DataFrame(num_cases)
df_num_cases

Unnamed: 0,county,date,num_cases
0,Harris,4/3/2020,955
1,Dallas,4/3/2020,831
2,Travis,4/3/2020,351
3,Tarrant,4/3/2020,325
4,Bexar,4/3/2020,254
...,...,...,...
140,Palo Pinto,4/3/2020,1
141,San Jacinto,4/3/2020,1
142,Swisher,4/3/2020,1
143,Titus,4/3/2020,1


# Step 3: Add the latitude and longitude to each row.

In [45]:
# Nominatim geocoder; get_coordinates() only calls it for counties that are
# missing from the local coordinate cache loaded below.
locator = geopy.Nominatim(user_agent="myGeocoder")
# The public Nominatim service allows at most one request per second, so leave
# at least a full second between calls.
geocode = RateLimiter(locator.geocode, min_delay_seconds=1)

# Local cache of county coordinates ("county", "latitude", "longitude" columns)
# so that already-known counties never hit the geocoding service.
df_counties = pd.read_csv("county_coords.csv")

def get_coordinates(county_name):
    """Return the coordinates of a Texas county, geocoding it on a cache miss.

    The county is first looked up in the module-level ``df_counties`` frame.
    When it is not there, ``geocode`` is queried for
    ``"<county_name> County, TX"`` and a successful result is added to
    ``df_counties`` so later lookups are served from the cache.

    Args:
        county_name: County name without the " County" suffix, e.g. "Harris".

    Returns:
        A dict with "latitude" and "longitude" keys, or ``None`` when the
        county is not cached and the geocoder finds no match.
    """
    global df_counties

    # Filter the cache once and reuse the result for both the existence check
    # and the value extraction.
    cached = df_counties.loc[
        df_counties["county"] == county_name, ["latitude", "longitude"]
    ]
    if not cached.empty:
        latitude, longitude = cached.iloc[0]
        return {"latitude": latitude, "longitude": longitude}

    point = geocode(county_name + " County, TX")
    if point is None:
        # The geocoder returns None when nothing matches.  Pass that through
        # (and cache nothing) so the caller can apply its own fallback instead
        # of crashing on ``point.latitude``.
        return None

    new_row = pd.DataFrame(
        [[county_name, point.latitude, point.longitude]],
        columns=["county", "latitude", "longitude"],
    )
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    df_counties = pd.concat([df_counties, new_row])
    return {"latitude": point.latitude, "longitude": point.longitude}

# Fallback for rows whose lookup returned None.  (28.082612, -94.936773) is
# substituted for the *official* coordinates of the Gulf of Mexico to better
# position it on the map.
GULF_OF_MEXICO = (28.082612, -94.936773)

# One lookup per row: a {"latitude", "longitude"} dict, or None when unknown.
county_points = df_num_cases["county"].apply(get_coordinates)
coordinates = [
    (p["latitude"], p["longitude"]) if p is not None else GULF_OF_MEXICO
    for p in county_points
]

# Plain lists are assigned positionally, so the values line up with the rows
# regardless of what df_num_cases' index looks like.
df_num_cases["latitude"] = [latitude for latitude, _ in coordinates]
df_num_cases["longitude"] = [longitude for _, longitude in coordinates]

# Save the county coordinates to preserve any new data.
df_counties = df_counties.set_index("county")
df_counties.to_csv("county_coords.csv")

# Step 4: Concatenate today's data with the full dataset.

In [46]:
# Running this cell more than once would otherwise append the same county/date
# rows again.  Keeping only the last occurrence of each (county, date) pair
# makes the step repeatable: a re-run replaces that day's earlier figures with
# the freshly loaded ones instead of duplicating them.
df_num_cases = pd.concat([pd.read_csv("data.csv"), df_num_cases]).drop_duplicates(
    subset=["county", "date"], keep="last"
)

# Step 5: Save the new dataset.

In [47]:
# Keep only the dataset columns, selected by name.  This discards any extra
# column picked up when data.csv was read back (presumably the saved index),
# and unlike renaming columns by position it does not depend on the column
# count or order, so the cell can be re-run safely.
df_num_cases = df_num_cases[["county", "date", "num_cases", "latitude", "longitude"]]

# TODO: Always make a backup before running this.
df_num_cases.to_csv("data.csv")

# Now we are ready to play with it.