# Data Loader

It'll make things look a bit cleaner to load the data in a separate script from the report generator.


In [8]:
# Step 0: Import all the things!

import requests
from bs4 import BeautifulSoup
import pandas as pd
import datetime as dt
import geopy
from geopy.extra.rate_limiter import RateLimiter
from IPython.display import HTML, Markdown, clear_output
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import urllib3

%run controls.ipynb

def datestamp(date):
    """Render a date/datetime as a slash-separated stamp: month/day/year.

    The month and day are NOT zero-padded (April 5th 2020 becomes
    ``4/5/2020``), because each component is simply converted with ``str``.
    """
    components = (date.month, date.day, date.year)
    return "/".join(str(component) for component in components)


# Step 1: Download a copy of the web page that contains the data I want.

The state has moved to an ArcGIS data source.  It looks a lot nicer than an HTML table, but it's also harder to scrape.  I'll try to figure out how to connect to that data later; for now, I copied and pasted the text below from this dashboard: https://txdshs.maps.arcgis.com/apps/opsdashboard/index.html#/ed483ecd702b4298ab01e8b9cafc8b83

In [9]:
# TODO: Automate this with Selenium.

text_copy = """
Harris County

6,161

Dallas County

3,352

Tarrant County

2,149

Travis County

1,591

Bexar County

1,326

Fort Bend County

1,018

El Paso County

887

Denton County

748

Collin County

710

Montgomery County

582

Galveston County

572

Lubbock County

516

Potter County

499

Brazoria County

485

Cameron County

401

Webb County

358

Moore County

327

Hidalgo County

324

Jefferson County

305

Williamson County

293

Taylor County

291

Walker County

245

Randall County

213

Brazos County

193

Bell County

189

Hays County

165

Nacogdoches County

157

Coryell County

155

Ellis County

148

Smith County

142

Washington County

142

Victoria County

132

Panola County

107

Shelby County

106

Harrison County

102

Hardin County

99

Nueces County

99

McLennan County

88

Kaufman County

86

Ector County

81

Guadalupe County

80

Midland County

76

Rockwall County

75

Bastrop County

73

Gregg County

73

Bowie County

70

Orange County

70

Johnson County

69

Wichita County

65

Matagorda County

60

Comal County

54

Angelina County

53

Lamar County

51

Tom Green County

49

Gray County

47

Hunt County

44

Liberty County

42

Chambers County

41

Jones County

37

Wharton County

37

Brown County

36

Rusk County

35

Waller County

32

Wilson County

32

Deaf Smith County

31

Grayson County

31

Calhoun County

30

Navarro County

30

Parker County

30

Anderson County

29

Hale County

25

Henderson County

25

Ochiltree County

25

Dawson County

24

Donley County

24

Gonzales County

20

Hockley County

20

Medina County

20

Andrews County

19

Atascosa County

19

Burnet County

19

Grimes County

19

Polk County

19

Fannin County

18

Hood County

18

Wise County

17

Hill County

16

Jasper County

16

Kendall County

16

Maverick County

16

Milam County

16

San Augustine County

16

Titus County

16

De Witt County

15

Mason County

15

Sherman County

15

Caldwell County

14

Cherokee County

14

Fayette County

14

Hutchinson County

14

Upshur County

14

Van Zandt County

14

Austin County

13

Cass County

13

Castro County

13

Marion County

13

Willacy County

13

Burleson County

12

Erath County

12

Limestone County

12

San Patricio County

12

Val Verde County

12

Terry County

11

Colorado County

10

Dallam County

10

Pecos County

9

San Jacinto County

9

Starr County

9

Swisher County

9

Kleberg County

8

Trinity County

8

Wood County

8

Cooke County

7

Frio County

7

Goliad County

7

Palo Pinto County

7

Wheeler County

7

Zapata County

7

Bandera County

6

Bee County

6

Blanco County

6

Camp County

6

Hansford County

6

Jackson County

6

Lavaca County

6

Montague County

6

Parmer County

6

Tyler County

6

Uvalde County

6

Hamilton County

5

Houston County

5

Kerr County

5

Leon County

5

Live Oak County

5

Lynn County

5

Morris County

5

Bosque County

4

Falls County

4

Hartley County

4

Hopkins County

4

Howard County

4

Jack County

4

Young County

4

Clay County

3

Comanche County

3

Eastland County

3

Floyd County

3

Freestone County

3

Jim Hogg County

3

Jim Wells County

3

Karnes County

3

Lamb County

3

Lampasas County

3

Llano County

3

McCulloch County

3

Oldham County

3

Robertson County

3

Winkler County

3

Aransas County

2

Armstrong County

2

Callahan County

2

Carson County

2

Cottle County

2

Crane County

2

Crosby County

2

Duval County

2

Gaines County

2

Lee County

2

Lipscomb County

2

Martin County

2

Newton County

2

Rains County

2

Roberts County

2

Scurry County

2

Brooks County

1

Childress County

1

Cochran County

1

Concho County

1

Delta County

1

Dickens County

1

Dimmit County

1

Franklin County

1

Garza County

1

Gillespie County

1

Glasscock County

1

Hemphill County

1

Knox County

1

La Salle County

1

Madison County

1

Mitchell County

1

Motley County

1

Nolan County

1

Red River County

1

Runnels County

1

Sabine County

1

Stephens County

1

Wilbarger County

1

Yoakum County

1

Zavala County

1
"""

# Step 2: Clean up the text I copied and convert it into a proper DataFrame object.

In [10]:
today = dt.datetime.today()
today_text = datestamp(today)

# The pasted text alternates "<Name> County" and "<count>" lines separated by
# blank lines.  Drop every blank/whitespace-only line first so the parser does
# not depend on the exact number of blank lines between entries.
text_list = [line.strip() for line in text_copy.splitlines() if line.strip()]

if not text_list:
    raise ValueError("text_copy is empty; paste the county/case-count text first.")
if len(text_list) % 2 != 0:
    raise ValueError(
        "text_copy should contain alternating county and case-count lines, "
        f"but {len(text_list)} non-blank lines were found."
    )

# Pair each county line (even positions) with the count line that follows it.
# int() raises ValueError if a count line is not numeric, which flags a
# misaligned paste instead of silently storing a county name as a count.
num_cases = [
    {
        "county": county_line.replace(" County", ""),
        "date": today_text,
        "num_cases": int(count_line.replace(",", "")),
    }
    for county_line, count_line in zip(text_list[0::2], text_list[1::2])
]

df_num_cases = pd.DataFrame(num_cases)
df_num_cases

Unnamed: 0,county,date,num_cases
0,Harris,4/30/2020,6161
1,Dallas,4/30/2020,3352
2,Tarrant,4/30/2020,2149
3,Travis,4/30/2020,1591
4,Bexar,4/30/2020,1326
...,...,...,...
204,Sabine,4/30/2020,1
205,Stephens,4/30/2020,1
206,Wilbarger,4/30/2020,1
207,Yoakum,4/30/2020,1


# Step 3: Add the latitude and longitude to each row.

In [11]:
# Previously resolved county coordinates, loaded from the local CSV file.
df_counties = pd.read_csv("county_coords.csv")

# Nominatim geocoder; its geocode method is wrapped in a RateLimiter configured
# with a 0.5 second minimum delay.
locator = geopy.Nominatim(user_agent="myGeocoder")
geocode = RateLimiter(
    locator.geocode,
    min_delay_seconds=0.5,
)

def get_coordinates(county_name):
    """Return the coordinates of a county, using the local cache when possible.

    The module-level ``df_counties`` frame is searched first.  If the county is
    not there, ``geocode`` is called with ``"<county_name> County, TX"`` and a
    successful result is added to ``df_counties`` so it is only looked up once.

    Parameters
    ----------
    county_name : str
        County name without the " County" suffix.

    Returns
    -------
    dict or None
        ``{"latitude": ..., "longitude": ...}``, or ``None`` when the geocoder
        finds no match (nothing is cached in that case).
    """
    global df_counties

    # Single filtered lookup instead of filtering the frame twice.
    cached = df_counties.loc[
        df_counties["county"] == county_name, ["latitude", "longitude"]
    ]
    if len(cached) != 0:
        latitude, longitude = cached.iloc[0]
        return {"latitude": latitude, "longitude": longitude}

    point = geocode(county_name + " County, TX")
    if point is None:
        # No match from the geocoder: let the caller decide on a fallback
        # rather than failing on point.latitude.
        return None

    df_county = pd.DataFrame(
        [[county_name, point.latitude, point.longitude]],
        columns=["county", "latitude", "longitude"],
    )
    # DataFrame.append was removed in pandas 2.0; pd.concat works everywhere.
    df_counties = pd.concat([df_counties, df_county], ignore_index=True)
    return {"latitude": point.latitude, "longitude": point.longitude}

# Resolve every county to a {"latitude", "longitude"} dict (or None).
df_num_cases["point"] = df_num_cases["county"].apply(get_coordinates)

# I am substituting (28.082612, -94.936773) for the *official* coordinates of the Gulf of Mexico to better position it on the map.
# Rows whose point is None receive those substitute coordinates.
# The helper frame reuses df_num_cases' index so the column assignment lines up
# row-for-row even if the index is not a default 0..n-1 range.
df_num_cases[["latitude", "longitude"]] = pd.DataFrame(
    [
        (p["latitude"], p["longitude"]) if p is not None else (28.082612, -94.936773)
        for p in df_num_cases["point"].tolist()
    ],
    index=df_num_cases.index,
)
df_num_cases = df_num_cases.drop(["point"], axis=1)

# Save the county coordinates to preserve any new data.
# set_index returns a copy here, so df_counties keeps its "county" column and
# get_coordinates can still be called after this cell has run.
df_counties.set_index("county").to_csv("county_coords.csv")

# Step 4: Concatenate today's data with the full dataset.

In [12]:
# TODO: Running this more than once will add duplicates to the data.  I can make this nicer by performing some kind of distinct() function on the data.

# Stack the rows already stored in data.csv on top of today's rows.
df_num_cases = pd.concat(
    [
        pd.read_csv("data.csv"),  # previously saved rows
        df_num_cases,             # rows built earlier in this notebook
    ]
)

# Step 5: Save the new dataset.

In [13]:
# Relabel the six columns by position, then discard the first one.
# NOTE(review): the first column is presumably the index written by an earlier
# to_csv call - confirm against data.csv.
df_num_cases.columns = [
    "unnamed",
    "county",
    "date",
    "num_cases",
    "latitude",
    "longitude",
]
df_num_cases = df_num_cases.drop("unnamed", axis=1)

# TODO: Always make a backup before running this.
df_num_cases.to_csv("data.csv")

# Now we are ready to play with it.

# Step 6: Make the dates text-sortable.

In [14]:
# This chunk of code will fix the date column to make it text-sortable, by zero-padding the pieces of the date.
# If this virus continues into 2021 I will need to amend the date column to be YYYYMMDD instead of MM/DD/YYYY.

import pandas as pd

def fix_date(date_text):
    """Zero-pad the first three "/"-separated parts of a date string.

    Each of the first three parts is left-padded with zeros to at least two
    characters and the three are re-joined with "/" (``"4/5/2020"`` becomes
    ``"04/05/2020"``).  Any parts beyond the third are dropped; fewer than
    three parts raises ``IndexError``.
    """
    parts = date_text.split("/")
    first, second, third = (parts[position].zfill(2) for position in range(3))
    return f"{first}/{second}/{third}"

# Reload the saved dataset, relabel its six columns by position and discard
# the first one.
df_data = pd.read_csv("data.csv")
df_data.columns = [
    "unnamed",
    "county",
    "date",
    "num_cases",
    "latitude",
    "longitude",
]
df_data = df_data.drop("unnamed", axis=1)

# Zero-pad every date value, then write the frame back to the same file.
df_data["date"] = [fix_date(date_text) for date_text in df_data["date"]]

df_data.to_csv("data.csv")

In [9]:
# This bit will swap out 2 days; important if I load the data the morning after.
#
# It is a manual, occasional correction, so it is switched off by default.
# (A bare top-level `return` is a SyntaxError, so it cannot be used to skip a
# cell.)  Set RUN_DATE_SWAP = True and run the cell to apply the swap.

import pandas as pd

RUN_DATE_SWAP = False


def swap_dates(old_date, new_date, path="data.csv"):
    """Rewrite every row dated ``old_date`` in the CSV at ``path`` to ``new_date``.

    The file is read, its six columns are relabelled by position, the first
    column is dropped, matching dates are replaced, and the result is written
    back to the same path.

    Parameters
    ----------
    old_date : str
        Date text to look for (exact string match).
    new_date : str
        Date text that replaces ``old_date``.
    path : str
        CSV file to update in place; defaults to "data.csv".
    """
    df_data = pd.read_csv(path)
    df_data.columns = ["unnamed", "county", "date", "num_cases", "latitude", "longitude"]
    df_data = df_data.drop(columns=["unnamed"])

    # Only rows whose date equals old_date exactly are changed.
    df_data.loc[df_data["date"] == old_date, "date"] = new_date

    df_data.to_csv(path)


if RUN_DATE_SWAP:
    swap_dates(old_date="04/19/2020", new_date="04/17/2020")

In [9]:
from sqlalchemy import inspect
import sqlalchemy as db
import pyodbc
import pandas as pd

%reload_ext sql

%sql mssql+pyodbc://Trey:elliott0!@47.222.182.190:1433/Covid19={sqlsrv}

ModuleNotFoundError: No module named 'pyodbc'