## ETL Project: USA HOSPITALS
#### Dependencies and Setup

In [2]:
import requests
import psycopg2
import pandas as pd
import config as creds
import sqlalchemy as sqlalchemy_package

#### Extracting Data via API interaction 
* ESRI Dataset: Definitive Healthcare: USA Hospital Beds

In [3]:
# Query URL for layer 0 of the Definitive Healthcare "USA Hospital Beds" feature service.
# Query string: no row filter (where=1=1), all fields, output spatial reference 4326, JSON format.
url = (
    "https://services7.arcgis.com/LXCny1HyhQCUSueu/arcgis/rest/services/"
    "Definitive_Healthcare_USA_Hospital_Beds/FeatureServer/0/query"
    "?where=1%3D1&outFields=*&outSR=4326&f=json"
)

In [4]:
# Get Hospitals JSON data
# A timeout stops the cell from hanging on a stalled connection, and
# raise_for_status() turns an HTTP error into an immediate, readable exception.
http_response = requests.get(url, timeout=60)
http_response.raise_for_status()
response = http_response.json()
# ArcGIS can report a failed query inside the JSON body instead of via the HTTP
# status; fail here rather than with a KeyError on 'features' below.
if 'error' in response:
    raise RuntimeError(f"ArcGIS query failed: {response['error']}")
# The service caps the number of records returned per request and sets this flag
# when more rows matched than were returned, i.e. the dataset below is incomplete.
if response.get('exceededTransferLimit'):
    print('Warning: the service truncated the result (exceededTransferLimit); '
          'page through the remaining records with resultOffset.')
hospitals_data = response['features']
# Preview the attributes of the second record
hospitals_data[1]['attributes']

{'OBJECTID': 2,
 'HOSPITAL_NAME': 'Southern Arizona VA Health Care System',
 'HOSPITAL_TYPE': 'VA Hospital',
 'HQ_ADDRESS': '3601 S 6th Ave',
 'HQ_ADDRESS1': None,
 'HQ_CITY': 'Tucson',
 'HQ_STATE': 'AZ',
 'HQ_ZIP_CODE': '85723',
 'COUNTY_NAME': 'Pima',
 'STATE_NAME': 'Arizona',
 'STATE_FIPS': '04',
 'CNTY_FIPS': '019',
 'FIPS': '04019',
 'NUM_LICENSED_BEDS': 295,
 'NUM_STAFFED_BEDS': 295,
 'NUM_ICU_BEDS': 2,
 'ADULT_ICU_BEDS': 2,
 'PEDI_ICU_BEDS': None,
 'BED_UTILIZATION': None,
 'Potential_Increase_In_Bed_Capac': 0,
 'AVG_VENTILATOR_USAGE': 2}

In [5]:
# Create Lists of Data Dictionaries
# One list per target table; the rows built from a single record share one id.
hospitals = []
hospitals_beds = []
hospitals_location = []
hospitals_geometry = []
# enumerate(start=1) supplies the 1-based sequence number used as the id prefix
for number, record in enumerate(hospitals_data, start=1):
    # Read both sub-dictionaries before appending anything, so a record that is
    # missing one of them fails before the four lists can get out of step.
    attributes = record['attributes']
    geometry = record['geometry']
    # id format: "<sequence number>-<FIPS code>", e.g. "2-04019"
    hospital_id = str(number) + "-" + attributes['FIPS']
    # create hospitals dataset
    hospitals.append({
        'id': hospital_id,
        'hospital_name': attributes['HOSPITAL_NAME'],
    })
    # create hospitals beds dataset
    hospitals_beds.append({
        'id': hospital_id,
        'licensed_beds': attributes['NUM_LICENSED_BEDS'],
        'icu_beds': attributes['NUM_ICU_BEDS'],
        'ventilator_usage': attributes['AVG_VENTILATOR_USAGE'],
    })
    # create hospitals location dataset
    hospitals_location.append({
        'id': hospital_id,
        'hospital_type': attributes['HOSPITAL_TYPE'],
        'hospital_hq_address': attributes['HQ_ADDRESS'],
        'hospital_hq_city': attributes['HQ_CITY'],
        'hospital_hq_state': attributes['HQ_STATE'],
        'zip_code': attributes['HQ_ZIP_CODE'],
        'county_name': attributes['COUNTY_NAME'],
        'state_name': attributes['STATE_NAME'],
    })
    # create hospitals geometry (the feature's y is stored as lat, x as lng)
    hospitals_geometry.append({
        'id': hospital_id,
        'lat': geometry['y'],
        'lng': geometry['x'],
    })

#### Transforming Data
* Converting lists of dictionaries into DataFrames

In [6]:
# Build the hospitals geometry (lat/lng) DataFrame, indexed by hospital id
hospitals_geometry_pd = pd.DataFrame(hospitals_geometry).set_index("id")
hospitals_geometry_pd.head()

Unnamed: 0_level_0,lat,lng
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1-04013,33.495498,-112.066157
2-04019,32.181263,-110.965885
3-06019,36.773324,-119.779742
4-09009,41.2844,-72.95761
5-10003,39.740206,-75.606532


In [7]:
# Build the hospitals location DataFrame, indexed by hospital id
hospitals_location_pd = pd.DataFrame(hospitals_location).set_index("id")
hospitals_location_pd.head()

Unnamed: 0_level_0,hospital_type,hospital_hq_address,hospital_hq_city,hospital_hq_state,zip_code,county_name,state_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1-04013,VA Hospital,650 E Indian School Rd,Phoenix,AZ,85012,Maricopa,Arizona
2-04019,VA Hospital,3601 S 6th Ave,Tucson,AZ,85723,Pima,Arizona
3-06019,VA Hospital,2615 E Clinton Ave,Fresno,CA,93703,Fresno,California
4-09009,VA Hospital,950 Campbell Ave,West Haven,CT,6516,New Haven,Connecticut
5-10003,VA Hospital,1601 Kirkwood Hwy,Wilmington,DE,19805,New Castle,Delaware


In [8]:
# Build the hospitals beds DataFrame, indexed by hospital id
hospitals_beds_pd = pd.DataFrame(hospitals_beds).set_index("id")
hospitals_beds_pd.head()

Unnamed: 0_level_0,licensed_beds,icu_beds,ventilator_usage
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1-04013,62.0,0,0
2-04019,295.0,2,2
3-06019,54.0,2,2
4-09009,216.0,1,2
5-10003,62.0,0,1


In [9]:
# Build the hospitals (id -> name) DataFrame, indexed by hospital id
hospitals_pd = pd.DataFrame(hospitals).set_index("id")
hospitals_pd

Unnamed: 0_level_0,hospital_name
id,Unnamed: 1_level_1
1-04013,Phoenix VA Health Care System (AKA Carl T Hayd...
2-04019,Southern Arizona VA Health Care System
3-06019,VA Central California Health Care System
4-09009,VA Connecticut Healthcare System - West Haven ...
5-10003,Wilmington VA Medical Center
...,...
1996-27047,Mayo Clinic Hospital - Albert Lea
1997-27049,Mayo Clinic Hospital - Red Wing (FKA Fairview ...
1998-27049,Mayo Clinic Hospital - Cannon Falls
1999-27051,Prairie Ridge Hospital and Health Services


In [10]:
# Check Number of Unique Hospitals
# Count distinct hospital names; a count below the row count means some names repeat
hospital_names = hospitals_pd['hospital_name']
hospitals_pd_list = hospital_names.unique()
len(hospitals_pd_list)

1960

### Create database connection

In [11]:
# Create PostgreSQL database connection using credentials from the config module
# The scheme must be "postgresql://": SQLAlchemy 1.4+ rejects the old "postgres://" alias.
database_url = f"postgresql://{creds.PGUSER}:{creds.PGPASSWORD}@{creds.PGHOST}:5432/{creds.PGDATABASE}"
# The "-csearch_path=..." option makes the configured schema the default for unqualified table names
engine = sqlalchemy_package.create_engine(
    database_url,
    connect_args={'options': '-csearch_path={}'.format(creds.DBSCHEMA)},
)

In [26]:
# Confirm tables
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the inspector returns the same list of table names for the default schema.
sqlalchemy_package.inspect(engine).get_table_names()

['staffing',
 'ca_hospitals',
 'hospitals_geometry',
 'us_hospitals',
 'hospitals_beds',
 'hospitals_location',
 'hospitals_info']

### Load DataFrames into database

In [None]:
# Load US hospitals list
# if_exists='append' adds rows to the existing table, so re-running this cell inserts them again
hospitals_pd.to_sql(
    name='us_hospitals',
    con=engine,
    if_exists='append',
    index=True,
)

In [None]:
# Load US hospitals beds
# if_exists='append' adds rows to the existing table, so re-running this cell inserts them again
hospitals_beds_pd.to_sql(
    name='hospitals_beds',
    con=engine,
    if_exists='append',
    index=True,
)

In [None]:
# Load US hospitals location
# if_exists='append' adds rows to the existing table, so re-running this cell inserts them again
hospitals_location_pd.to_sql(
    name='hospitals_location',
    con=engine,
    if_exists='append',
    index=True,
)

In [13]:
# Load US hospitals geometry
# if_exists='append' adds rows to the existing table, so re-running this cell inserts them again
hospitals_geometry_pd.to_sql(
    name='hospitals_geometry',
    con=engine,
    if_exists='append',
    index=True,
)

### Initial Query Analysis

In [14]:
def load_data(schema='public', table='us_hospitals', query=None, connection=None):
    """Run a SQL query and return the result as a DataFrame.

    Prints the SQL text before running it and the shape of the result afterwards.

    Parameters
    ----------
    schema, table : str
        Used only when ``query`` is None, to build ``SELECT * FROM schema.table;``.
        Both are interpolated into the SQL text as-is, so pass trusted identifiers only.
    query : str, optional
        Complete SQL statement to run instead of the default ``SELECT *``.
    connection : optional
        DB connection handed to ``pd.read_sql``. When omitted, the notebook-level
        ``conn`` is used; it is looked up at call time, so it must exist by then.

    Returns
    -------
    pandas.DataFrame
        The rows returned by the query.
    """
    if query is None:
        sql_command = "SELECT * FROM {}.{};".format(str(schema), str(table))
    else:
        sql_command = query
    print('Query:', sql_command)
    # Fall back to the global connection only when the caller did not supply one
    db_connection = conn if connection is None else connection
    # Load the data
    data = pd.read_sql(sql_command, db_connection)
    print('Data shape:', data.shape)
    return data

In [15]:
# Set up a connection to the postgres server via database driver
# make_dsn() builds the "key=value" connection string and quotes each value, so a
# credential containing spaces, quotes or backslashes no longer corrupts the DSN
# the way plain string concatenation would.
conn_string = psycopg2.extensions.make_dsn(
    host=creds.PGHOST,
    port="5432",
    dbname=creds.PGDATABASE,
    user=creds.GROUPUSER,
    password=creds.GROUPPASSWORD,
)
conn = psycopg2.connect(conn_string)

In [17]:
# Create hospitals request for California
# Joins the four hospital tables on id and keeps only rows whose state is California
hospitals_request = """
    SELECT h.id, h.hospital_name, l.state_name, l.county_name, l.zip_code, l.hospital_type, b.licensed_beds, g.lat, g.lng
    FROM us_hospitals AS h
    INNER JOIN hospitals_beds AS b USING (id)
    INNER JOIN hospitals_location AS l USING (id)
    INNER JOIN hospitals_geometry AS g USING (id)
    WHERE l.state_name = 'California'
    ORDER BY b.licensed_beds DESC;
    """
# Run the query, index the result by hospital id and order it alphabetically by name
hosp_request_result = (
    load_data(query=hospitals_request)
    .set_index("id")
    .sort_values('hospital_name')
)
hosp_request_result

Query: 
    SELECT h.id, h.hospital_name, l.state_name, l.county_name, l.zip_code, l.hospital_type, b.licensed_beds, g.lat, g.lng
    FROM us_hospitals AS h
    INNER JOIN hospitals_beds AS b USING (id)
    INNER JOIN hospitals_location AS l USING (id)
    INNER JOIN hospitals_geometry AS g USING (id)
    WHERE l.state_name = 'California'
    ORDER BY b.licensed_beds DESC;
    
Data shape: (322, 9)


Unnamed: 0_level_0,hospital_name,state_name,county_name,zip_code,hospital_type,licensed_beds,lat,lng
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
417-06059,AHMC Anaheim Regional Medical Center,California,Orange,92801,Short Term Acute Care Hospital,223.0,33.848175,-117.934526
313-06029,Adventist Health Bakersfield (FKA San Joaquin ...,California,Kern,93301,Short Term Acute Care Hospital,254.0,35.383374,-119.020654
316-06033,Adventist Health Clear Lake (FKA St Helena Hos...,California,Lake,95422,Critical Access Hospital,32.0,38.935500,-122.620100
349-06037,Adventist Health Glendale (FKA Glendale Advent...,California,Los Angeles,91206,Short Term Acute Care Hospital,415.0,34.150214,-118.230459
315-06031,Adventist Health Hanford,California,Kings,93230,Short Term Acute Care Hospital,173.0,36.323787,-119.666447
...,...,...,...,...,...,...,...,...
370-06037,West Los Angeles Medical Center,California,Los Angeles,90034,Short Term Acute Care Hospital,265.0,34.038347,-118.375900
372-06037,Whittier Hospital Medical Center,California,Los Angeles,90605,Short Term Acute Care Hospital,156.0,33.949822,-118.003330
391-06037,Woodland Hills Medical Center,California,Los Angeles,91367,Short Term Acute Care Hospital,274.0,34.170385,-118.589936
584-06113,Woodland Memorial Hospital,California,Yolo,95695,Short Term Acute Care Hospital,74.0,38.664090,-121.792091


### Adding New Datasets for Analysis (Secondary ETL Process)
* Tables: ca_hospitals, staffing

In [18]:
# Create staffing request for California hospitals
# Joins ca_hospitals to staffing on hospital_id and keeps only the 2012 staffing rows,
# so each hospital appears once per matching staffing row.
staff_request = """
    SELECT c.hospital_name, c.county, c.zip_code, c.lat, c.lng, s.year, s.staff_group
    FROM ca_hospitals AS c
    INNER JOIN staffing AS s USING (hospital_id)
    WHERE s.year = 2012
    ORDER BY c.hospital_name ASC;
    """
# Run the query through load_data, which prints the SQL and the result shape
staff_request_result = load_data(query=staff_request)
staff_request_result

Query: 
    SELECT c.hospital_name, c.county, c.zip_code, c.lat, c.lng, s.year, s.staff_group
    FROM ca_hospitals AS c
    INNER JOIN staffing AS s USING (hospital_id)
    WHERE s.year = 2012
    ORDER BY c.hospital_name ASC;
    
Data shape: (7531, 7)


Unnamed: 0,hospital_name,county,zip_code,lat,lng,year,staff_group
0,ADVENTIST HEALTH AND RIDEOUT,YUBA,95901,39.138805,-121.593602,2012,Fiscal Services Cost Centers
1,ADVENTIST HEALTH AND RIDEOUT,YUBA,95901,39.138805,-121.593602,2012,Administrative Services Cost Centers
2,ADVENTIST HEALTH AND RIDEOUT,YUBA,95901,39.138805,-121.593602,2012,Technician & Specialist
3,ADVENTIST HEALTH AND RIDEOUT,YUBA,95901,39.138805,-121.593602,2012,Registered Nurse
4,ADVENTIST HEALTH AND RIDEOUT,YUBA,95901,39.138805,-121.593602,2012,Licensed Vocational Nurse
...,...,...,...,...,...,...,...
7526,WOODLAND MEMORIAL HOSPITAL,YOLO,95695,38.662840,-121.793300,2012,Daily Cost Centers
7527,WOODLAND MEMORIAL HOSPITAL,YOLO,95695,38.662840,-121.793300,2012,Ambulatory Cost Centers
7528,WOODLAND MEMORIAL HOSPITAL,YOLO,95695,38.662840,-121.793300,2012,Ancillary Cost Centers
7529,WOODLAND MEMORIAL HOSPITAL,YOLO,95695,38.662840,-121.793300,2012,Education Cost Centers


#### Normalization
* NPI (National Provider Identifier) - unique identification number for health care providers
* Hospital Name
* Composite key: Zip code + address

In [19]:
# Compare Hospitals Names Lists for California
esri_list = hosp_request_result['hospital_name'].unique().tolist()
staff_list = staff_request_result['hospital_name'].unique().tolist()
# The staffing names appear upper-case in the query output, so upper-case the ESRI names before matching
esri_upper_list = [name.upper() for name in esri_list]
# A set gives constant-time membership tests instead of scanning the list for every name
esri_upper_names = set(esri_upper_list)
# Staffing hospital names that also occur (case-insensitively) in the ESRI data
hopsitals_list = [name for name in staff_list if name in esri_upper_names]
len(hopsitals_list)

138

In [20]:
# Compare Zip Code Lists for California
esri_zip_list = hosp_request_result['zip_code'].unique().tolist()
staff_zip_list = staff_request_result['zip_code'].unique().tolist()
# A set gives constant-time membership tests instead of scanning the list for every zip code
esri_zip_codes = set(esri_zip_list)
# Staffing zip codes that also occur in the ESRI data
zip_code_list = [zip_code for zip_code in staff_zip_list if zip_code in esri_zip_codes]
len(zip_code_list)

266

In [21]:
# Compare Latitude Lists for California
esri_lat_list = hosp_request_result['lat'].unique().tolist()
staff_lat_list = staff_request_result['lat'].unique().tolist()
# A set gives constant-time membership tests instead of scanning the list for every value
esri_latitudes = set(esri_lat_list)
# This is an exact floating-point comparison: values that differ only in precision
# between the two sources do not count as a match.
lat_list = [lat for lat in staff_lat_list if lat in esri_latitudes]
len(lat_list)

0

In [25]:
# Preview the first five staffing latitudes to compare their precision with the ESRI values
staff_lat_list[:5]

[39.138805, 35.383064, 38.93619, 35.76143, 34.14951]

In [24]:
# Preview the first five ESRI latitudes to compare their precision with the staffing values
esri_lat_list[:5]

[33.84817510000005,
 35.38337440000004,
 38.93550000000005,
 34.15021390000004,
 36.32378720000003]

In [None]:
# Read Hospitals General Information
# Keep the five columns of interest, rename them to snake_case,
# index by facility id and sort alphabetically by hospital name.
hospitals_info = (
    pd.read_csv('Resources/Hospital_General_Information.csv')
    .filter(['Facility ID', 'Facility Name', 'State', 'ZIP Code', 'Hospital Ownership'])
    .rename(columns={
        'Facility ID': 'facility_id',
        'Facility Name': 'hospital_name',
        'State': 'state_name',
        'ZIP Code': 'zip_code',
        'Hospital Ownership': 'ownership',
    })
    .set_index("facility_id")
    .sort_values('hospital_name')
)
# hospitals_info.head()

In [None]:
# Read Hospitals Profitability Information
# Keep the six columns of interest, rename them to snake_case,
# index by facility number and sort alphabetically by hospital name.
hospitals_finance = (
    pd.read_csv('Resources/hospital-profitability-2009-2013-.csv')
    .filter(['Facility Number', 'Facility Name', 'Year', 'County Name',
             'Income Statement Amount', 'Amount per Adjusted Patient Day'])
    .rename(columns={
        'Facility Number': 'facility_num',
        'Facility Name': 'hospital_name',
        'Year': 'year',
        'County Name': 'county_name',
        'Income Statement Amount': 'total_income',
        'Amount per Adjusted Patient Day': 'adj_patient_income',
    })
    .set_index("facility_num")
    .sort_values('hospital_name')
)
# hospitals_finance

In [None]:
# Read Hospitals Insurance Info
# Keep the five columns of interest, rename them to snake_case,
# index by provider id and sort alphabetically by hospital name.
hospitals_insuarance = (
    pd.read_csv('Resources/healthcare_census.csv')
    .filter(['Provider Id', 'Provider Name', 'Provider Zip Code',
             'Average Covered Charges', 'Average Total Payments'])
    .rename(columns={
        'Provider Id': 'provider_id',
        'Provider Name': 'hospital_name',
        'Provider Zip Code': 'zip_code',
        'Average Covered Charges': 'avg_cov_charges',
        'Average Total Payments': 'avg_tot_payments',
    })
    .set_index("provider_id")
    .sort_values('hospital_name')
)
# hospitals_insuarance.head()

* Overall, normalization requires more time and resources
* Keep looking for alternative solutions

#### Database Updating Group Work

In [None]:
# Load US hospitals info
# hospitals_info.to_sql(name='hospitals_info', con=engine, if_exists='append', index=True)

In [None]:
# Load US hospitals insurance info
# hospitals_insuarance.to_sql(name='hospitals_insuarance', con=engine, if_exists='append', index=True)