# Data Loader

It'll make things look a bit cleaner to load the data in a separate script from the report generator.


In [1]:
# Step 0: Import all the things!

import requests
from bs4 import BeautifulSoup
import pandas as pd
import datetime as dt
import geopy
from geopy.extra.rate_limiter import RateLimiter
from IPython.display import HTML, Markdown, clear_output
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import urllib3

%run controls.ipynb

def datestamp(date):
    """Format a date-like object as a ``M/D/YYYY`` stamp.

    The month and day are NOT zero-padded (April 6th 2020 becomes
    ``"4/6/2020"``).

    Args:
        date: Any object exposing integer ``month``, ``day`` and ``year``
            attributes, e.g. ``datetime.datetime`` or ``datetime.date``.

    Returns:
        The stamp as a string, with the pieces separated by ``/``.
    """
    pieces = (date.month, date.day, date.year)
    return "/".join(str(piece) for piece in pieces)


# Step 1: Download a copy of the web page that contains the data I want.

The state has moved to an ArcGIS data source.  It looks a lot nicer than an HTML table, but it's also harder to scrape.  I'll try to figure out how to connect to that data later; for now, I copied and pasted the text below from this dashboard: https://txdshs.maps.arcgis.com/apps/opsdashboard/index.html#/ed483ecd702b4298ab01e8b9cafc8b83

In [2]:
# TODO: Automate this with Selenium.

text_copy = """
Harris County

4,097

Dallas County

1,986

Tarrant County

990

Travis County

977

Bexar County

890

Fort Bend County

627

Denton County

547

Collin County

494

Galveston County

401

El Paso County

393

Lubbock County

367

Montgomery County

331

Brazoria County

272

Cameron County

254

Webb County

239

Hidalgo County

225

Brazos County

151

Jefferson County

143

Williamson County

128

Bell County

115

Hays County

109

Smith County

108

Potter County

92

Victoria County

89

Nueces County

83

Ellis County

81

Randall County

80

Taylor County

79

McLennan County

74

Coryell County

70

Nacogdoches County

69

Hardin County

66

Moore County

59

Walker County

58

Wichita County

57

Guadalupe County

53

Shelby County

51

Matagorda County

50

Ector County

48

Gregg County

48

Orange County

47

Washington County

46

Bowie County

45

Comal County

43

Tom Green County

40

Bastrop County

37

Johnson County

37

Midland County

37

Harrison County

35

Chambers County

33

Kaufman County

33

Panola County

32

Wharton County

32

Rockwall County

30

Hunt County

25

Liberty County

25

Rusk County

24

Donley County

23

Waller County

22

Angelina County

20

Andrews County

19

Calhoun County

19

Grayson County

19

Navarro County

18

Parker County

17

Gray County

16

Hockley County

15

Hood County

15

Medina County

15

Kendall County

14

Polk County

14

Fayette County

13

Wilson County

13

Austin County

12

Brown County

12

Erath County

12

Henderson County

12

Val Verde County

12

Castro County

11

Deaf Smith County

11

Hale County

11

Limestone County

11

San Augustine County

11

Van Zandt County

11

De Witt County

10

Maverick County

10

San Jacinto County

10

Terry County

10

Atascosa County

9

Grimes County

9

Hill County

9

Upshur County

9

Burleson County

8

Cherokee County

8

Dawson County

8

Milam County

8

San Patricio County

8

Titus County

8

Burnet County

7

Caldwell County

7

Colorado County

7

Lamar County

7

Starr County

7

Trinity County

7

Willacy County

7

Wise County

7

Anderson County

6

Camp County

6

Hutchinson County

6

Jasper County

6

Palo Pinto County

6

Uvalde County

6

Wood County

6

Cass County

5

Fannin County

5

Goliad County

5

Lynn County

5

Montague County

5

Pecos County

5

Blanco County

4

Hopkins County

4

Jackson County

4

Kerr County

4

Lavaca County

4

Leon County

4

Sherman County

4

Swisher County

4

Tyler County

4

Young County

4

Zapata County

4

Bandera County

3

Bee County

3

Clay County

3

Comanche County

3

Eastland County

3

Hamilton County

3

Jack County

3

Jones County

3

Karnes County

3

Kleberg County

3

Live Oak County

3

Llano County

3

McCulloch County

3

Marion County

3

Morris County

3

Oldham County

3

Aransas County

2

Cooke County

2

Crane County

2

Dallam County

2

Floyd County

2

Gaines County

2

Gonzales County

2

Jim Wells County

2

Lamb County

2

Lampasas County

2

Lee County

2

Martin County

2

Newton County

2

Rains County

2

Robertson County

2

Winkler County

2

Armstrong County

1

Bosque County

1

Callahan County

1

Concho County

1

Crosby County

1

Delta County

1

Dickens County

1

Dimmit County

1

Duval County

1

Falls County

1

Franklin County

1

Freestone County

1

Frio County

1

Gillespie County

1

Hansford County

1

Hemphill County

1

Howard County

1

Knox County

1

La Salle County

1

Madison County

1

Mason County

1

Mitchell County

1

Motley County

1

Red River County

1

Roberts County

1

Sabine County

1

Scurry County

1

Stephens County

1

Wilbarger County

1
"""

# Step 2: Clean up the text I copied and convert it into a proper DataFrame object.

In [3]:
# Every row parsed below is stamped with the date the notebook is run.
today = dt.datetime.today()
today_text = datestamp(today)

# The pasted dashboard text alternates "<name> County" lines and case-count
# lines, separated by blank lines.  Blank lines are dropped up front so the
# parser does not depend on the exact number of blank lines between entries
# (a fixed 4-line stride silently misaligns names and counts after a sloppy paste).
text_list = [line.strip() for line in text_copy.splitlines() if line.strip()]
if len(text_list) % 2 != 0:
    raise ValueError(
        f"Expected alternating county/count lines but found {len(text_list)} "
        "non-blank lines; check the pasted text."
    )

# One record per county.  int() both normalises "4,097" -> 4097 and fails
# loudly if a name and a count ever get swapped in the pasted text.
num_cases = [
    {
        "county": county_line.replace(" County", ""),
        "date": today_text,
        "num_cases": int(count_line.replace(",", "")),
    }
    for county_line, count_line in zip(text_list[0::2], text_list[1::2])
]

df_num_cases = pd.DataFrame(num_cases)
df_num_cases

Unnamed: 0,county,date,num_cases
0,Harris,4/16/2020,4097
1,Dallas,4/16/2020,1986
2,Tarrant,4/16/2020,990
3,Travis,4/16/2020,977
4,Bexar,4/16/2020,890
...,...,...,...
186,Roberts,4/16/2020,1
187,Sabine,4/16/2020,1
188,Scurry,4/16/2020,1
189,Stephens,4/16/2020,1


# Step 3: Add the latitude and longitude to each row.

In [4]:
# Geocoder used for counties that are not in the local cache yet.  The public
# Nominatim service allows at most one request per second, so requests are
# throttled to one call per second (the previous 0.5 s delay allowed two per second).
locator = geopy.Nominatim(user_agent="myGeocoder")
geocode = RateLimiter(locator.geocode, min_delay_seconds=1)

# Cache of known county coordinates; it is read and extended by
# get_coordinates() through the "county", "latitude" and "longitude" columns.
df_counties = pd.read_csv("county_coords.csv")

def get_coordinates(county_name):
    """Return the coordinates of a Texas county, geocoding it only if needed.

    The module-level ``df_counties`` frame acts as a cache.  A county already
    present there is answered from the cache; otherwise the county is looked
    up with the module-level ``geocode`` callable and the result is added to
    ``df_counties`` so it can be reused (and saved) later.

    Args:
        county_name: County name without the " County" suffix, e.g. "Harris".

    Returns:
        A dict with "latitude" and "longitude" keys, or ``None`` when the
        geocoder finds no match (nothing is cached in that case).
    """
    global df_counties

    # Filter the cache once and reuse the result for both the test and the read.
    cached = df_counties.loc[
        df_counties["county"] == county_name, ["latitude", "longitude"]
    ]
    if not cached.empty:
        first_match = cached.iloc[0]
        return {
            "latitude": first_match["latitude"],
            "longitude": first_match["longitude"],
        }

    point = geocode(county_name + " County, TX")
    if point is None:
        # The geocoder returned no match; report that instead of raising
        # AttributeError on point.latitude.
        return None

    df_county = pd.DataFrame(
        [[county_name, point.latitude, point.longitude]],
        columns=["county", "latitude", "longitude"],
    )
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    df_counties = pd.concat([df_counties, df_county], ignore_index=True)
    return {"latitude": point.latitude, "longitude": point.longitude}

# Look up (or geocode and cache) the coordinates of every county in today's data.
df_num_cases["point"] = df_num_cases["county"].apply(get_coordinates)

# Rows without coordinates are placed in the Gulf of Mexico.  This hand-picked
# point is used instead of the *official* coordinates of the Gulf of Mexico to
# better position it on the map.
GULF_OF_MEXICO_COORDS = (28.082612, -94.936773)

coordinate_pairs = [
    (p["latitude"], p["longitude"]) if p is not None else GULF_OF_MEXICO_COORDS
    for p in df_num_cases["point"].tolist()
]
# Build the coordinate frame on df_num_cases' own index so the column
# assignment lines up row-for-row even if the index is not 0..n-1.
df_num_cases[["latitude", "longitude"]] = pd.DataFrame(
    coordinate_pairs,
    columns=["latitude", "longitude"],
    index=df_num_cases.index,
)
df_num_cases = df_num_cases.drop(columns=["point"])

# Save the county coordinates to preserve any new data.
df_counties = df_counties.set_index("county")
df_counties.to_csv("county_coords.csv")

# Step 4: Concatenate today's data with the full dataset.

In [5]:
# Merge today's rows into the full dataset.  Running the loader more than once
# for the same day used to append a second copy of every row, so only the most
# recent row for each (county, date) pair is kept.
df_history = pd.read_csv("data.csv")
df_combined = pd.concat([df_history, df_num_cases])

# Dates already stored in data.csv may be zero-padded ("04/16/2020") while
# today's rows are not ("4/16/2020").  Duplicates are detected on a zero-padded
# copy of the date so both spellings compare equal; the stored values are untouched.
date_key = df_combined["date"].astype(str).map(
    lambda text: "/".join(piece.zfill(2) for piece in text.split("/"))
)
dedup_keys = pd.DataFrame({
    "county": df_combined["county"].to_numpy(),
    "date": date_key.to_numpy(),
})

# The concatenated frame has repeated index labels, so filter with a positional
# boolean mask rather than by label.  keep="last" prefers the newest rows.
is_stale = dedup_keys.duplicated(keep="last").to_numpy()
df_num_cases = df_combined[~is_stale].copy()

# Step 5: Save the new dataset.

In [6]:
# Keep only the real data columns, selected by name.  This discards the index
# column that read_csv picks up from data.csv without relying on column
# positions, and it stays valid if this cell is run a second time.
df_num_cases = df_num_cases[["county", "date", "num_cases", "latitude", "longitude"]]

# The concatenated frame carries repeated index labels; renumber the rows so
# the index column written to data.csv is a clean 0..n-1 sequence.
df_num_cases = df_num_cases.reset_index(drop=True)

# TODO: Always make a backup before running this.
df_num_cases.to_csv("data.csv")

# Now we are ready to play with it.

In [7]:
# This chunk of code will fix the date column to make it text-sortable, by zero-padding the pieces of the date.
# If this virus continues into 2021 I will need to amend the date column to be YYYYMMDD instead of MM/DD/YYYY.

import pandas as pd

def fix_date(date_text):
    """Zero-pad the pieces of a slash-separated date so it sorts as text.

    Args:
        date_text: A date string with at least three "/"-separated pieces,
            e.g. "4/6/2020".

    Returns:
        The first three pieces, each left-padded with zeros to at least two
        characters and re-joined with "/" (e.g. "04/06/2020").  Any pieces
        beyond the third are dropped.

    Raises:
        IndexError: If ``date_text`` has fewer than three pieces.
    """
    parts = date_text.split("/")
    return "/".join(parts[position].zfill(2) for position in range(3))

# Reload the saved dataset.  The first column read back is the index that
# to_csv wrote out, so it is given a throwaway name and dropped.
df_data = pd.read_csv("data.csv")
df_data.columns = ["unnamed", "county", "date", "num_cases", "latitude", "longitude"]
df_data = df_data.drop("unnamed", axis=1)

# Zero-pad every date so the column sorts correctly as plain text.
df_data["date"] = df_data["date"].map(fix_date)

# Write the normalised dataset back over the original file.
df_data.to_csv("data.csv")