# Data Loader

It'll make things look a bit cleaner to load the data in a separate script from the report generator.


In [1]:
# Step 0: Import all the things!

import requests
from bs4 import BeautifulSoup
import pandas as pd
import datetime as dt
import geopy
from geopy.extra.rate_limiter import RateLimiter
from IPython.display import HTML, Markdown, clear_output
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import urllib3

%run controls.ipynb

def datestamp(date):
    """Format a date/datetime as an un-padded ``M/D/YYYY`` string.

    Only the ``month``, ``day`` and ``year`` attributes of ``date`` are read.
    No zero padding is applied, so April 4th 2020 becomes ``"4/4/2020"``.
    """
    parts = (date.month, date.day, date.year)
    return "/".join(str(part) for part in parts)


# Step 1: Download a copy of the web page that contains the data I want.

The state has moved to an ArcGIS data source.  It looks a lot nicer than an HTML table, but it's also harder to scrape.  I'll try to figure out how to connect to that data directly later; for now, I copied and pasted the text below from the dashboard: https://txdshs.maps.arcgis.com/apps/opsdashboard/index.html#/ed483ecd702b4298ab01e8b9cafc8b83

In [9]:
# TODO: Automate this with Selenium.
#
# Raw text pasted by hand from the dashboard.  Layout of the literal below:
#   * a "<Name> County" line, followed by a line holding that county's count;
#   * name and count lines are separated from each other by single blank lines;
#   * counts may contain a thousands separator (e.g. "1,106").
# The parsing cell that follows depends on this name/count alternation, so keep
# the layout intact when pasting in a fresh copy.

text_copy = """
Harris County

1,106

Dallas County

921

Travis County

430

Tarrant County

383

Bexar County

342

Denton County

273

Collin County

256

Fort Bend County

255

Lubbock County

138

Brazoria County

136

Galveston County

136

Montgomery County

130

El Paso County

96

Hidalgo County

86

Webb County

83

Brazos County

72

Williamson County

71

Cameron County

62

Smith County

57

Bell County

51

McLennan County

50

Hays County

47

Nueces County

47

Wichita County

46

Matagorda County

33

Victoria County

32

Ellis County

29

Jefferson County

29

Guadalupe County

26

Potter County

24

Randall County

24

Taylor County

23

Ector County

22

Midland County

22

Johnson County

21

Comal County

18

Bowie County

17

Tom Green County

17

Washington County

17

Nacogdoches County

15

Hardin County

14

Gregg County

13

Grayson County

12

Orange County

12

Chambers County

11

Rockwall County

11

Rusk County

11

Shelby County

11

Angelina County

10

Castro County

10

Fayette County

10

Kaufman County

10

Hood County

9

Kendall County

9

Wharton County

9

Calhoun County

8

Hunt County

8

Walker County

8

Waller County

8

Bastrop County

7

Erath County

7

Hockley County

7

Polk County

7

De Witt County

6

Hale County

6

Moore County

6

Parker County

6

Wilson County

6

Cherokee County

5

Deaf Smith County

5

Donley County

5

Gray County

5

Harrison County

5

Hill County

5

Lamar County

5

Liberty County

5

Navarro County

5

Starr County

5

Val Verde County

5

Willacy County

5

Austin County

4

Brown County

4

Burnet County

4

Cass County

4

Coryell County

4

Maverick County

4

Panola County

4

San Augustine County

4

San Patricio County

4

Terry County

4

Uvalde County

4

Van Zandt County

4

Caldwell County

3

Dawson County

3

Eastland County

3

Grimes County

3

Hopkins County

3

Jackson County

3

Limestone County

3

Live Oak County

3

Llano County

3

Lynn County

3

Medina County

3

Upshur County

3

Wise County

3

Young County

3

Atascosa County

2

Blanco County

2

Burleson County

2

Colorado County

2

Crane County

2

Fannin County

2

Gillespie County

2

Hutchinson County

2

Jasper County

2

Jim Wells County

2

Karnes County

2

Kerr County

2

Kleberg County

2

Lavaca County

2

Lee County

2

Leon County

2

Milam County

2

Oldham County

2

Robertson County

2

Trinity County

2

Anderson County

1

Andrews County

1

Camp County

1

Clay County

1

Comanche County

1

Falls County

1

Franklin County

1

Gaines County

1

Goliad County

1

Gonzales County

1

Hemphill County

1

Henderson County

1

Lamb County

1

Lampasas County

1

Martin County

1

Mason County

1

Montague County

1

Morris County

1

Newton County

1

Palo Pinto County

1

San Jacinto County

1

Swisher County

1

Titus County

1

Tyler County

1

Wood County

1
"""

# Step 2: Clean up the text I copied and convert it into a proper DataFrame object.

In [10]:
# Every row parsed below is stamped with the date on which the notebook is run.
today = dt.datetime.today()
today_text = datestamp(today)

text_list = text_copy.strip().split('\n')

# The pasted text alternates "<Name> County" lines and count lines, with blank
# lines in between.  Dropping the blank lines before pairing keeps the parse
# from depending on exactly how many blank lines separate the entries.
entry_lines = [line.strip() for line in text_list if line.strip()]
if len(entry_lines) % 2 != 0:
    raise ValueError(
        "text_copy should contain a case count for every county, but it has "
        f"{len(entry_lines)} non-blank lines (an odd number)."
    )

num_cases = [
    {
        "county": name.replace(" County", ""),
        "date": today_text,
        # Counts may carry a thousands separator ("1,106").  Store real
        # integers so the column has the same dtype as the values that
        # pandas reads back from data.csv.
        "num_cases": int(count.replace(",", "")),
    }
    for name, count in zip(entry_lines[0::2], entry_lines[1::2])
]

df_num_cases = pd.DataFrame(num_cases)
df_num_cases

Unnamed: 0,county,date,num_cases
0,Harris,4/4/2020,1106
1,Dallas,4/4/2020,921
2,Travis,4/4/2020,430
3,Tarrant,4/4/2020,383
4,Bexar,4/4/2020,342
...,...,...,...
146,San Jacinto,4/4/2020,1
147,Swisher,4/4/2020,1
148,Titus,4/4/2020,1
149,Tyler,4/4/2020,1


# Step 3: Add the latitude and longitude to each row.

In [6]:
# Geocoder backed by Nominatim (OpenStreetMap).
# NOTE(review): Nominatim asks clients to send an application-specific user
# agent; "myGeocoder" is generic -- consider replacing it with this project's name.
locator = geopy.Nominatim(user_agent="myGeocoder")

# Nominatim's usage policy allows at most one request per second, so wait a
# full second between calls (the previous 0.5 s delay allowed two per second).
geocode = RateLimiter(locator.geocode, min_delay_seconds=1)

# Local cache of county coordinates; get_coordinates reads its "county",
# "latitude" and "longitude" columns and only calls the geocoder on a miss.
df_counties = pd.read_csv("county_coords.csv")

def get_coordinates(county_name):
    """Return the coordinates of a Texas county, geocoding only on a cache miss.

    The module-level ``df_counties`` frame acts as a cache.  When it already
    holds a row for ``county_name`` that row's coordinates are returned.
    Otherwise ``"<county_name> County, TX"`` is sent to ``geocode`` and, on
    success, the result is added to ``df_counties`` (the global is rebound to
    the extended frame).

    Parameters
    ----------
    county_name : str
        County name without the " County" suffix, e.g. ``"Harris"``.

    Returns
    -------
    dict or None
        ``{"latitude": ..., "longitude": ...}``, or ``None`` when the geocoder
        finds no match, so the caller can apply its own fallback.
    """
    global df_counties

    cached = df_counties.loc[
        df_counties["county"] == county_name, ["latitude", "longitude"]
    ]
    if not cached.empty:
        # If the cache somehow holds several rows for a county, use the first.
        return cached.iloc[0].to_dict()

    point = geocode(county_name + " County, TX")
    if point is None:
        # No geocoding result: report the miss instead of failing on
        # ``point.latitude``, and leave the cache untouched.
        return None

    df_county = pd.DataFrame(
        [[county_name, point.latitude, point.longitude]],
        columns=["county", "latitude", "longitude"],
    )
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    df_counties = pd.concat([df_counties, df_county], ignore_index=True)
    return {"latitude": point.latitude, "longitude": point.longitude}

# Look up the coordinates of every county in today's data.
df_num_cases["point"] = df_num_cases["county"].apply(get_coordinates)

# Rows without a usable lookup result fall back to a point in the Gulf of Mexico.
# I am substituting (28.082612, -94.936773) for the *official* coordinates of the
# Gulf of Mexico to better position it on the map.
GULF_OF_MEXICO_COORDS = (28.082612, -94.936773)

# Build the two coordinate columns on df_num_cases' own index so the assignment
# lines up row-for-row even when that index is not a fresh 0..n-1 range.
df_num_cases[["latitude", "longitude"]] = pd.DataFrame(
    [
        (p["latitude"], p["longitude"]) if isinstance(p, dict) else GULF_OF_MEXICO_COORDS
        for p in df_num_cases["point"].tolist()
    ],
    index=df_num_cases.index,
    columns=["latitude", "longitude"],
)
df_num_cases = df_num_cases.drop(columns=["point"])

# Save the county coordinates to preserve any new data.  The frame is indexed
# by county only for writing; df_counties itself keeps its "county" column so
# get_coordinates can still be called after this cell has run.
df_counties.set_index("county").to_csv("county_coords.csv")

# Step 4: Concatenate today's data with the full dataset.

In [7]:
# Append today's rows to the historical dataset.  Running this more than once on
# the same day would otherwise add a second copy of today's rows, so keep only
# the most recent row for each (county, date) pair.
df_num_cases = pd.concat([pd.read_csv("data.csv"), df_num_cases]).drop_duplicates(
    subset=["county", "date"], keep="last"
)

# Step 5: Save the new dataset.

In [8]:
# Keep exactly the dataset columns, selected by name.  This discards the index
# column that read_csv picked up from the previous save, and raises a KeyError
# if an expected column is missing instead of silently mislabelling columns.
# Resetting the index gives the saved file a unique 0..n-1 row index instead of
# the repeated labels left behind by the concatenation.
df_num_cases = df_num_cases[
    ["county", "date", "num_cases", "latitude", "longitude"]
].reset_index(drop=True)

# TODO: Always make a backup before running this.
df_num_cases.to_csv("data.csv")

# Now we are ready to play with it.