In [1]:
import pandas as pd
from sqlalchemy import create_engine

## Staffing dataset

The source has contained 2 files for data. We quickly compare the files and therewere no differences identified. The cleaning of the data was performed therefore only on staffing_file.  

In [2]:
# Extract cvs into DataFrame
staffing_file = "Resources/hospital-staffing-2009-2013-.csv"
staffing_file2 = "Resources/hospital-staffing-2009-2013-2.csv"
staffing_df = pd.read_csv(staffing_file)
staffing_df2 = pd.read_csv(staffing_file2)

In [3]:
staffing_df.head()

Unnamed: 0,Year,Facility Number,Facility Name,Begin Date,End Date,County Name,Type of Control,Hours Type,Productive Hours,Productive Hours per Adjusted Patient Day
0,2009,106010735.0,ALAMEDA HOSPITAL,07/01/2008,06/30/2009,Alameda,District,Management & Supervision,63558,1.17
1,2009,106010735.0,ALAMEDA HOSPITAL,07/01/2008,06/30/2009,Alameda,District,Technician & Specialist,163706,3.02
2,2009,106010735.0,ALAMEDA HOSPITAL,07/01/2008,06/30/2009,Alameda,District,Registered Nurse,180034,3.32
3,2009,106010735.0,ALAMEDA HOSPITAL,07/01/2008,06/30/2009,Alameda,District,Licensed Vocational Nurse,22323,0.41
4,2009,106010735.0,ALAMEDA HOSPITAL,07/01/2008,06/30/2009,Alameda,District,Aides & Orderlies,97205,1.79


In [4]:
staffing_df2.head()

Unnamed: 0,Year,Facility Number,Facility Name,Begin Date,End Date,County Name,Type of Control,Hours Type,Productive Hours,Productive Hours per Adjusted Patient Day
0,2009,106010735.0,ALAMEDA HOSPITAL,07/01/2008,06/30/2009,Alameda,District,Management & Supervision,63558,1.17
1,2009,106010735.0,ALAMEDA HOSPITAL,07/01/2008,06/30/2009,Alameda,District,Technician & Specialist,163706,3.02
2,2009,106010735.0,ALAMEDA HOSPITAL,07/01/2008,06/30/2009,Alameda,District,Registered Nurse,180034,3.32
3,2009,106010735.0,ALAMEDA HOSPITAL,07/01/2008,06/30/2009,Alameda,District,Licensed Vocational Nurse,22323,0.41
4,2009,106010735.0,ALAMEDA HOSPITAL,07/01/2008,06/30/2009,Alameda,District,Aides & Orderlies,97205,1.79


In [5]:
# COMPARE BOTH DATASETS TO IDENTIFY IF THEY ARE SAME OR DIFFERENT:
# resource: https://stackoverflow.com/questions/20225110/comparing-two-dataframes-and-getting-the-differences/52047609#52047609
#identify if the columns heading are equal
if staffing_df.columns.all() == staffing_df2.columns.all():
    print('columns in dfs are same')
else:
    print('columns in dfs differ')
#Compare dfs: concat dfs and drop all duplicates to get the df of differences
df_diff = pd.concat([staffing_df,staffing_df2]).drop_duplicates(keep=False)
print(f'number of different rows in dfs: {len(df_diff)}')
df_diff
#As the datasest are identical, we use further the staffing_df to work with. We can drop staffing_df2

columns in dfs are same
number of different rows in dfs: 0


Unnamed: 0,Year,Facility Number,Facility Name,Begin Date,End Date,County Name,Type of Control,Hours Type,Productive Hours,Productive Hours per Adjusted Patient Day


### Cleaning of staffing datas

In [6]:
#1. CHECK FOR DUPLICATED ROWS/DATA IN DF: No identified
print(f"Number if duplicated rows in df: {len(staffing_df[staffing_df.duplicated(keep='first') == True])}")

staffing_df[staffing_df.duplicated(keep='first') == True]

Number if duplicated rows in df: 0


Unnamed: 0,Year,Facility Number,Facility Name,Begin Date,End Date,County Name,Type of Control,Hours Type,Productive Hours,Productive Hours per Adjusted Patient Day


In [7]:
# 2. NaN/NA VALUES 
#check the df for datatype and NaN values
staffing_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37604 entries, 0 to 37603
Data columns (total 10 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Year                                       37604 non-null  int64  
 1   Facility Number                            37519 non-null  float64
 2   Facility Name                              37519 non-null  object 
 3   Begin Date                                 37519 non-null  object 
 4   End Date                                   37519 non-null  object 
 5   County Name                                37604 non-null  object 
 6   Type of Control                            37519 non-null  object 
 7   Hours Type                                 37604 non-null  object 
 8   Productive Hours                           37604 non-null  int64  
 9   Productive Hours per Adjusted Patient Day  37417 non-null  float64
dtypes: float64(2), int64(2

In [8]:
# 3. DROP ALL ROWS WHERE WITH NULL VALUE FOR 'FACILITY NUMBER':
# Based on the review of rows that includes NaN values we identified that all rows with NaN Facility Number are marked as county = statewide
# and therefore not required.

# - identify all rows with N/A data and review it
staffing_df[staffing_df.isnull().any(axis=1)]
# - drop all rows with Facility nUmber as NaN
staffing_df.dropna(subset=['Facility Number'], inplace=True)
staffing_df

Unnamed: 0,Year,Facility Number,Facility Name,Begin Date,End Date,County Name,Type of Control,Hours Type,Productive Hours,Productive Hours per Adjusted Patient Day
0,2009,106010735.0,ALAMEDA HOSPITAL,07/01/2008,06/30/2009,Alameda,District,Management & Supervision,63558,1.17
1,2009,106010735.0,ALAMEDA HOSPITAL,07/01/2008,06/30/2009,Alameda,District,Technician & Specialist,163706,3.02
2,2009,106010735.0,ALAMEDA HOSPITAL,07/01/2008,06/30/2009,Alameda,District,Registered Nurse,180034,3.32
3,2009,106010735.0,ALAMEDA HOSPITAL,07/01/2008,06/30/2009,Alameda,District,Licensed Vocational Nurse,22323,0.41
4,2009,106010735.0,ALAMEDA HOSPITAL,07/01/2008,06/30/2009,Alameda,District,Aides & Orderlies,97205,1.79
...,...,...,...,...,...,...,...,...,...,...
37582,2013,106580996.0,RIDEOUT MEMORIAL HOSPITAL,07/01/2012,06/30/2013,Yuba,Non-Profit,Ancillary Cost Centers,769308,9.46
37583,2013,106580996.0,RIDEOUT MEMORIAL HOSPITAL,07/01/2012,06/30/2013,Yuba,Non-Profit,Education Cost Centers,0,0.00
37584,2013,106580996.0,RIDEOUT MEMORIAL HOSPITAL,07/01/2012,06/30/2013,Yuba,Non-Profit,General Services Cost Centers,250731,3.08
37585,2013,106580996.0,RIDEOUT MEMORIAL HOSPITAL,07/01/2012,06/30/2013,Yuba,Non-Profit,Fiscal Services Cost Centers,152364,1.87


In [9]:
staffing_df.count()

Year                                         37519
Facility Number                              37519
Facility Name                                37519
Begin Date                                   37519
End Date                                     37519
County Name                                  37519
Type of Control                              37519
Hours Type                                   37519
Productive Hours                             37519
Productive Hours per Adjusted Patient Day    37332
dtype: int64

In [10]:
#4 REPLACE NaN VALUES IN 'Productive Hours per Adjusted Patient Day' with zero:
staffing_df[staffing_df.isnull().any(axis=1)].count()

Year                                         187
Facility Number                              187
Facility Name                                187
Begin Date                                   187
End Date                                     187
County Name                                  187
Type of Control                              187
Hours Type                                   187
Productive Hours                             187
Productive Hours per Adjusted Patient Day      0
dtype: int64

In [11]:
staffing_df.fillna(0, inplace=True)
staffing_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37519 entries, 0 to 37586
Data columns (total 10 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Year                                       37519 non-null  int64  
 1   Facility Number                            37519 non-null  float64
 2   Facility Name                              37519 non-null  object 
 3   Begin Date                                 37519 non-null  object 
 4   End Date                                   37519 non-null  object 
 5   County Name                                37519 non-null  object 
 6   Type of Control                            37519 non-null  object 
 7   Hours Type                                 37519 non-null  object 
 8   Productive Hours                           37519 non-null  int64  
 9   Productive Hours per Adjusted Patient Day  37519 non-null  float64
dtypes: float64(2), int64(2

In [12]:
# 5 CHECK ON DATA CONSISTENCY/UNIQUE VALUES 
# check the counts of unique values per columns - facility number and facility name have different number of unique values
# the further analysis is needed
staffing_df.nunique()

Year                                             5
Facility Number                                464
Facility Name                                  484
Begin Date                                      60
End Date                                        64
County Name                                     56
Type of Control                                  5
Hours Type                                      17
Productive Hours                             29085
Productive Hours per Adjusted Patient Day     1970
dtype: int64

In [13]:
num_name_check = staffing_df.copy()
num_name_check.drop_duplicates(subset=['Facility Number', 'Facility Name'], inplace=True)
num_name_issues = num_name_check[num_name_check['Facility Number'].duplicated(keep=False)].sort_values(by='Facility Number')
num_name_issues
# The name column will be dropped, as we unified naming from 'hospital_bridge' df can be used in database

Unnamed: 0,Year,Facility Number,Facility Name,Begin Date,End Date,County Name,Type of Control,Hours Type,Productive Hours,Productive Hours per Adjusted Patient Day
136,2009,106010887.0,KINDRED HOSPITAL - SAN FRANCISCO BAY AREA,01/01/2009,12/31/2009,Alameda,Investor,Management & Supervision,37185,1.9
22814,2012,106010887.0,KINDRED HOSPITAL SAN FRANCISCO BAY AREA,01/01/2012,12/31/2012,Alameda,Investor,Management & Supervision,51909,3.18
734,2009,106100797.0,SIERRA KINGS DISTRICT HOSPITAL,07/01/2008,06/30/2009,Fresno,District,Management & Supervision,66727,4.51
30957,2013,106100797.0,ADVENTIST MEDICAL CENTER - REEDLEY,01/01/2013,12/31/2013,Fresno,Non-Profit,Management & Supervision,23891,1.38
1453,2009,106190053.0,ST. MARY MEDICAL CENTER,07/01/2008,06/30/2009,Los Angeles,Non-Profit,Management & Supervision,155698,1.92
24140,2012,106190053.0,ST. MARY MEDICAL CENTER - LONG BEACH,07/01/2011,06/30/2012,Los Angeles,Non-Profit,Management & Supervision,133792,1.87
9089,2010,106190053.0,ST. MARY MEDICAL CENTER - LOS ANGELES,07/01/2009,06/30/2010,Los Angeles,Non-Profit,Management & Supervision,142415,1.8
2272,2009,106190517.0,TARZANA MEDICAL CENTER,01/01/2009,12/31/2009,Los Angeles,Investor,Management & Supervision,136159,1.75
9888,2010,106190517.0,PROVIDENCE TARZANA MEDICAL CENTER,01/01/2010,12/31/2010,Los Angeles,Non-Profit,Management & Supervision,128766,1.63
10228,2010,106190687.0,SANTA MONICA-UCLA MEDICAL CENTER & ORTHOPAEDIC...,07/01/2009,06/30/2010,Los Angeles,Non-Profit,Management & Supervision,251646,2.61


In [14]:
#6. DATA TYPES CORRECTION:
# - facility number should be int64
staffing_df['Facility Number']=staffing_df['Facility Number'].astype('int64')
staffing_df.dtypes

Year                                           int64
Facility Number                                int64
Facility Name                                 object
Begin Date                                    object
End Date                                      object
County Name                                   object
Type of Control                               object
Hours Type                                    object
Productive Hours                               int64
Productive Hours per Adjusted Patient Day    float64
dtype: object

In [15]:
#7. RESET THE INDEX
staffing_df.reset_index(drop=True, inplace=True)
staffing_df

Unnamed: 0,Year,Facility Number,Facility Name,Begin Date,End Date,County Name,Type of Control,Hours Type,Productive Hours,Productive Hours per Adjusted Patient Day
0,2009,106010735,ALAMEDA HOSPITAL,07/01/2008,06/30/2009,Alameda,District,Management & Supervision,63558,1.17
1,2009,106010735,ALAMEDA HOSPITAL,07/01/2008,06/30/2009,Alameda,District,Technician & Specialist,163706,3.02
2,2009,106010735,ALAMEDA HOSPITAL,07/01/2008,06/30/2009,Alameda,District,Registered Nurse,180034,3.32
3,2009,106010735,ALAMEDA HOSPITAL,07/01/2008,06/30/2009,Alameda,District,Licensed Vocational Nurse,22323,0.41
4,2009,106010735,ALAMEDA HOSPITAL,07/01/2008,06/30/2009,Alameda,District,Aides & Orderlies,97205,1.79
...,...,...,...,...,...,...,...,...,...,...
37514,2013,106580996,RIDEOUT MEMORIAL HOSPITAL,07/01/2012,06/30/2013,Yuba,Non-Profit,Ancillary Cost Centers,769308,9.46
37515,2013,106580996,RIDEOUT MEMORIAL HOSPITAL,07/01/2012,06/30/2013,Yuba,Non-Profit,Education Cost Centers,0,0.00
37516,2013,106580996,RIDEOUT MEMORIAL HOSPITAL,07/01/2012,06/30/2013,Yuba,Non-Profit,General Services Cost Centers,250731,3.08
37517,2013,106580996,RIDEOUT MEMORIAL HOSPITAL,07/01/2012,06/30/2013,Yuba,Non-Profit,Fiscal Services Cost Centers,152364,1.87


In [16]:
#8. select columns for final upload file
staffing_df_for_import = staffing_df[['Facility Number', 'Year','Hours Type', 'Productive Hours', 
                           'Productive Hours per Adjusted Patient Day' ]]
staffing_df_for_import.head()

Unnamed: 0,Facility Number,Year,Hours Type,Productive Hours,Productive Hours per Adjusted Patient Day
0,106010735,2009,Management & Supervision,63558,1.17
1,106010735,2009,Technician & Specialist,163706,3.02
2,106010735,2009,Registered Nurse,180034,3.32
3,106010735,2009,Licensed Vocational Nurse,22323,0.41
4,106010735,2009,Aides & Orderlies,97205,1.79


In [17]:
#9.Rename columns:
staffing_df_for_import = staffing_df_for_import.rename(columns={'Facility Number':'hospital_id',
                                          'Year': 'year',
                                          'Hours Type': 'staff_group',
                                          'Productive Hours': 'prod_hrs',
                                          'Productive Hours per Adjusted Patient Day': 'prodhrs_adjptday'
                                           })
# reset index to hospital_id
staffing_df_for_import.head()

Unnamed: 0,hospital_id,year,staff_group,prod_hrs,prodhrs_adjptday
0,106010735,2009,Management & Supervision,63558,1.17
1,106010735,2009,Technician & Specialist,163706,3.02
2,106010735,2009,Registered Nurse,180034,3.32
3,106010735,2009,Licensed Vocational Nurse,22323,0.41
4,106010735,2009,Aides & Orderlies,97205,1.79


In [18]:
staffing_df_for_import.nunique()

hospital_id           464
year                    5
staff_group            17
prod_hrs            29085
prodhrs_adjptday     1970
dtype: int64

## California facilities listing

In order to get additional data for hospitals, such as location we used Current Healthcare Facility Listing from CHHS California that includes all healthcare facilities. After filtering only hospitals we performed cleaning of the data:
File source: 

https://data.chhs.ca.gov/dataset/licensed-healthcare-facility-listing/resource/641c5557-7d65-4379-8fea-6b7dedbda40b

In [19]:
# upload data
listing_file = "Resources/current-healthcare-facility-listing.csv"
ca_listings_df = pd.read_csv(listing_file)

In [20]:
ca_listings_df.head()

Unnamed: 0,OSHPD_ID,FACILITY_NAME,LICENSE_NUM,FACILITY_LEVEL_DESC,DBA_ADDRESS1,DBA_CITY,DBA_ZIP_CODE,COUNTY_CODE,COUNTY_NAME,ER_SERVICE_LEVEL_DESC,TOTAL_NUMBER_BEDS,FACILITY_STATUS_DESC,FACILITY_STATUS_DATE,LICENSE_TYPE_DESC,LICENSE_CATEGORY_DESC,LATITUDE,LONGITUDE
0,106010735,ALAMEDA HOSPITAL,140000002,Parent Facility,2070 Clinton Ave,Alameda,94501,1,Alameda,Emergency - Basic,101,Open,1946-01-01,Hospital,General Acute Care Hospital,37.76266,-122.253991
1,106010739,ALTA BATES SUMMIT MEDICAL CENTER-ALTA BATES CA...,140000004,Parent Facility,2450 Ashby Ave,Berkeley,94705,1,Alameda,Emergency - Basic,339,Open,1946-01-02,Hospital,General Acute Care Hospital,37.85645,-122.25743
2,106010776,UCSF BENIOFF CHILDREN'S HOSPITAL OAKLAND,140000015,Parent Facility,747 52ND ST,OAKLAND,94609,1,Alameda,Emergency - Basic,223,Open,1946-01-01,Hospital,General Acute Care Hospital,37.83722,-122.26747
3,106010811,FAIRMONT HOSPITAL,140000046,Consolidated Facility,15400 Foothill Blvd,San Leandro,94578,1,Alameda,,109,Open,1953-07-01,Hospital,General Acute Care Hospital,37.70648,-122.11819
4,106010844,ALTA BATES SUMMIT MEDICAL CENTER-HERRICK CAMPUS,140000004,Consolidated Facility,2001 DWIGHT WAY,BERKELEY,94704,1,Alameda,,68,Open,1946-01-01,Hospital,General Acute Care Hospital,37.86373,-122.26984


In [21]:
# select only hospitals from the dataset:
ca_listings_df=ca_listings_df[ca_listings_df.LICENSE_TYPE_DESC =='Hospital']
ca_listings_df.nunique()

OSHPD_ID                 527
FACILITY_NAME            525
LICENSE_NUM              440
FACILITY_LEVEL_DESC        3
DBA_ADDRESS1             523
DBA_CITY                 323
DBA_ZIP_CODE             374
COUNTY_CODE               57
COUNTY_NAME               57
ER_SERVICE_LEVEL_DESC      5
TOTAL_NUMBER_BEDS        276
FACILITY_STATUS_DESC       2
FACILITY_STATUS_DATE     440
LICENSE_TYPE_DESC          1
LICENSE_CATEGORY_DESC      4
LATITUDE                 522
LONGITUDE                523
dtype: int64

In [22]:
# select required columns
ca_listings_df = ca_listings_df[['OSHPD_ID', 'FACILITY_NAME', 'FACILITY_STATUS_DESC', 'FACILITY_LEVEL_DESC','DBA_ADDRESS1', 'DBA_CITY', 'DBA_ZIP_CODE',
                                   'COUNTY_NAME', 'LICENSE_CATEGORY_DESC','LONGITUDE', 'LATITUDE']]
ca_listings_df.head()

Unnamed: 0,OSHPD_ID,FACILITY_NAME,FACILITY_STATUS_DESC,FACILITY_LEVEL_DESC,DBA_ADDRESS1,DBA_CITY,DBA_ZIP_CODE,COUNTY_NAME,LICENSE_CATEGORY_DESC,LONGITUDE,LATITUDE
0,106010735,ALAMEDA HOSPITAL,Open,Parent Facility,2070 Clinton Ave,Alameda,94501,Alameda,General Acute Care Hospital,-122.253991,37.76266
1,106010739,ALTA BATES SUMMIT MEDICAL CENTER-ALTA BATES CA...,Open,Parent Facility,2450 Ashby Ave,Berkeley,94705,Alameda,General Acute Care Hospital,-122.25743,37.85645
2,106010776,UCSF BENIOFF CHILDREN'S HOSPITAL OAKLAND,Open,Parent Facility,747 52ND ST,OAKLAND,94609,Alameda,General Acute Care Hospital,-122.26747,37.83722
3,106010811,FAIRMONT HOSPITAL,Open,Consolidated Facility,15400 Foothill Blvd,San Leandro,94578,Alameda,General Acute Care Hospital,-122.11819,37.70648
4,106010844,ALTA BATES SUMMIT MEDICAL CENTER-HERRICK CAMPUS,Open,Consolidated Facility,2001 DWIGHT WAY,BERKELEY,94704,Alameda,General Acute Care Hospital,-122.26984,37.86373


In [23]:
# drop duplicated rows if any
ca_listings_df.drop_duplicates(inplace=True)

In [24]:
# call info on df
ca_listings_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 527 entries, 0 to 1875
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   OSHPD_ID               527 non-null    int64  
 1   FACILITY_NAME          527 non-null    object 
 2   FACILITY_STATUS_DESC   527 non-null    object 
 3   FACILITY_LEVEL_DESC    527 non-null    object 
 4   DBA_ADDRESS1           527 non-null    object 
 5   DBA_CITY               527 non-null    object 
 6   DBA_ZIP_CODE           527 non-null    int64  
 7   COUNTY_NAME            527 non-null    object 
 8   LICENSE_CATEGORY_DESC  527 non-null    object 
 9   LONGITUDE              527 non-null    float64
 10  LATITUDE               527 non-null    float64
dtypes: float64(2), int64(2), object(7)
memory usage: 49.4+ KB


In [25]:
# inspect N/A values
ca_listings_df[ca_listings_df.isnull().any(axis=1)]

Unnamed: 0,OSHPD_ID,FACILITY_NAME,FACILITY_STATUS_DESC,FACILITY_LEVEL_DESC,DBA_ADDRESS1,DBA_CITY,DBA_ZIP_CODE,COUNTY_NAME,LICENSE_CATEGORY_DESC,LONGITUDE,LATITUDE


In [26]:
ca_listings_df.columns

Index(['OSHPD_ID', 'FACILITY_NAME', 'FACILITY_STATUS_DESC',
       'FACILITY_LEVEL_DESC', 'DBA_ADDRESS1', 'DBA_CITY', 'DBA_ZIP_CODE',
       'COUNTY_NAME', 'LICENSE_CATEGORY_DESC', 'LONGITUDE', 'LATITUDE'],
      dtype='object')

In [27]:
# rename columns:
ca_listings_df = ca_listings_df.rename(columns={'OSHPD_ID':'hospital_id',
                                                    'FACILITY_NAME': 'hospital_name',
                                                    'FACILITY_STATUS_DESC': 'status',
                                                    'FACILITY_LEVEL_DESC': 'fac_level',
                                                    'DBA_ADDRESS1': 'address',
                                                    'DBA_CITY': 'city',
                                                    'DBA_ZIP_CODE': 'zip_code',
                                                    'COUNTY_NAME': 'county',
                                                    'LICENSE_CATEGORY_DESC': 'category',
                                                    'LONGITUDE': 'lng',
                                                    'LATITUDE': 'lat'})
ca_listings_df.head()


Unnamed: 0,hospital_id,hospital_name,status,fac_level,address,city,zip_code,county,category,lng,lat
0,106010735,ALAMEDA HOSPITAL,Open,Parent Facility,2070 Clinton Ave,Alameda,94501,Alameda,General Acute Care Hospital,-122.253991,37.76266
1,106010739,ALTA BATES SUMMIT MEDICAL CENTER-ALTA BATES CA...,Open,Parent Facility,2450 Ashby Ave,Berkeley,94705,Alameda,General Acute Care Hospital,-122.25743,37.85645
2,106010776,UCSF BENIOFF CHILDREN'S HOSPITAL OAKLAND,Open,Parent Facility,747 52ND ST,OAKLAND,94609,Alameda,General Acute Care Hospital,-122.26747,37.83722
3,106010811,FAIRMONT HOSPITAL,Open,Consolidated Facility,15400 Foothill Blvd,San Leandro,94578,Alameda,General Acute Care Hospital,-122.11819,37.70648
4,106010844,ALTA BATES SUMMIT MEDICAL CENTER-HERRICK CAMPUS,Open,Consolidated Facility,2001 DWIGHT WAY,BERKELEY,94704,Alameda,General Acute Care Hospital,-122.26984,37.86373


In [28]:
# check on duplicated addresses - different hospitals on the same address, no issue identify
ca_listings_df[ca_listings_df.duplicated(subset=['address'], keep = False)]

Unnamed: 0,hospital_id,hospital_name,status,fac_level,address,city,zip_code,county,category,lng,lat
55,106104089,EXODUS PSYCHIATRIC HEALTH FACILITY FRESNO,Open,Parent Facility,4411 E Kings Canyon Rd,Fresno,93702,Fresno,Psychiatric Health Facility,-119.75162,36.73596
58,106105125,CENTRAL STAR PSYCHIATRIC HEALTH FACILITY,Open,Parent Facility,4411 E Kings Canyon Rd,Fresno,93702,Fresno,Psychiatric Health Facility,-119.75162,36.73596
246,106301262,MISSION HOSPITAL REGIONAL MEDICAL CENTER,Open,Parent Facility,27700 MEDICAL CENTER RD,MISSION VIEJO,92691,Orange,General Acute Care Hospital,-117.66768,33.56279
263,106304113,CHILDREN'S HOSPITAL AT MISSION,Open,Parent Facility,27700 MEDICAL CENTER RD,MISSION VIEJO,92691,Orange,General Acute Care Hospital,-117.66768,33.56279
276,106330120,THE BETTY FORD CENTER,Open,Parent Facility,39000 Bob Hope Dr,Rancho Mirage,92270,Riverside,Chemical Dep. Recovery Hospital,-116.40808,33.76418
280,106331168,EISENHOWER MEDICAL CENTER,Open,Parent Facility,39000 Bob Hope Dr,Rancho Mirage,92270,Riverside,General Acute Care Hospital,-116.40808,33.76418
500,106560501,OJAI VALLEY COMMUNITY HOSPITAL,Open,Parent Facility,1306 MARICOPA HWY,OJAI,93023,Ventura,General Acute Care Hospital,-119.26311,34.44221
1875,206560500,OJAI VALLEY COMMUNITY SKILLED NURSING FACILITY,Open,Distinct Part Facility,1306 MARICOPA HWY,OJAI,93023,Ventura,General Acute Care Hospital,-119.26311,34.44221


# Comparison of Hospital_id between the files

The Current Healthcare Facility Listing is updated as of August 3, 2020. The Staffing data are for years 2009-2013. To be sure that the facility listing includes all ids that were active in 2009-2013 period (changes can be due to the closing of the hospital or relocatipn) we compared the list of ids and included all the missing one to the ca_listing with all the data we have currently available. For missing data we decided to use 'missing data' or zero values. 

In [29]:
# compare the list of ids and identify the list of missing ones in the ca_listings:
staffing_ids = list(staffing_df_for_import['hospital_id'].unique())
listing_ids = list(ca_listings_df['hospital_id'].unique())

diff_ids = list(set(staffing_ids)-set(listing_ids))
diff_ids
print(f'There are {len(diff_ids)} hospital_ids in staffing_df that are not included in hospital_bridge df')

There are 42 hospital_ids in staffing_df that are not included in hospital_bridge df


In [30]:
diff_ids

[106380929,
 106491267,
 106190468,
 106190854,
 106100745,
 106361105,
 106370705,
 106160787,
 106190712,
 106551061,
 106015000,
 106150808,
 106010782,
 106154147,
 106380964,
 106431013,
 106191014,
 106490919,
 106230949,
 106304426,
 106040875,
 106190762,
 106410804,
 106010805,
 106484028,
 106160702,
 106190784,
 106191300,
 106374084,
 106560838,
 106301132,
 106281297,
 106301781,
 106160725,
 106190430,
 106190307,
 106370787,
 106010856,
 106010858,
 106514037,
 106190455,
 106070904]

In [31]:
# create a temporary df for the missing id with data available from staffing file that can be used in ca_listings df
columns_name = staffing_df.columns
missing_ids_df = pd.DataFrame(columns = columns_name)
for i in diff_ids:
    missing_ids_df = missing_ids_df.append(staffing_df.loc[staffing_df['Facility Number'] == i])
print(missing_ids_df.nunique())
missing_ids_df.drop(columns=['Year', 'Begin Date', 'End Date', 'Type of Control', 'Hours Type', 'Productive Hours', 
                     'Productive Hours per Adjusted Patient Day'], inplace=True)
missing_ids_df.drop_duplicates(subset=['Facility Number'], keep='last', inplace = True)
missing_ids_df = missing_ids_df.rename(columns={'Facility Number':'hospital_id',
                                                'Facility Name': 'hospital_name',
                                                'County Name': 'county'})
missing_ids_df['hospital_name'] = missing_ids_df['hospital_name'].str.upper()
missing_ids_df['county'] = missing_ids_df['county'].str.upper()
missing_ids_df.reset_index(drop=True, inplace=True)
missing_ids_df

Year                                            5
Facility Number                                42
Facility Name                                  42
Begin Date                                     14
End Date                                       19
County Name                                    20
Type of Control                                 5
Hours Type                                     17
Productive Hours                             1884
Productive Hours per Adjusted Patient Day     808
dtype: int64


Unnamed: 0,hospital_id,hospital_name,county
0,106380929,CALIFORNIA PACIFIC MEDICAL CENTER,SAN FRANCISCO
1,106491267,SONOMA DEVELOPMENTAL CENTER,SONOMA
2,106190468,PROMISE HOSPITAL OF EAST LOS ANGELES,LOS ANGELES
3,106190854,LOS ANGELES METROPOLITAN MEDICAL CENTER,LOS ANGELES
4,106100745,KINGSBURG MEDICAL HOSPITAL,FRESNO
5,106361105,BARSTOW COMMUNITY HOSPITAL,SAN BERNARDINO
6,106370705,FALLBROOK HOSPITAL DISTRICT,SAN DIEGO
7,106160787,CENTRAL VALLEY GENERAL HOSPITAL,KINGS
8,106190712,SHRINERS HOSPITAL FOR CHILDREN- LOS ANGELES,LOS ANGELES
9,106551061,TUOLUMNE GENERAL MEDICAL FACILITY,TUOLUMNE


In [32]:
# merge missing ids data to final ca_listing df:
ca_listings_final=pd.concat([ca_listings_df, missing_ids_df])
print(ca_listings_final.info())
# check for duplicates
print(ca_listings_final.nunique())
# replace Na values:
values = {'status':'inactive',
          'fac_level':'missing data', 
          'address':'missing data', 
          'city': 'missing data',
          'zip_code':0,
          'category':'missing data',
          'lng':0,
          'lat':0}
ca_listings_final.fillna(value=values, inplace=True)
# # apply uppercase to columns: 'hospital_name', 'address', 'city', 'county':
# source: https://www.geeksforgeeks.org/apply-uppercase-to-a-column-in-pandas-dataframe/
ca_listings_final['hospital_name'] = ca_listings_final['hospital_name'].str.upper()
ca_listings_final['address'] = ca_listings_final['address'].str.upper()
ca_listings_final['city'] = ca_listings_final['city'].str.upper()
ca_listings_final['county'] = ca_listings_final['county'].str.upper()

# add facility num as new column and extract it from "hospital_id":
ca_listings_final['facility_num'] = ca_listings_final['hospital_id'].astype(str).str[-6:].astype('int64')

# correct the type of data in columns:
ca_listings_final['hospital_id']=ca_listings_final['hospital_id'].astype('int64')
ca_listings_final['zip_code']=ca_listings_final['zip_code'].astype('int64')

print(ca_listings_final.info())
print(ca_listings_final.nunique())
ca_listings_final

<class 'pandas.core.frame.DataFrame'>
Int64Index: 569 entries, 0 to 41
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   hospital_id    569 non-null    object 
 1   hospital_name  569 non-null    object 
 2   status         527 non-null    object 
 3   fac_level      527 non-null    object 
 4   address        527 non-null    object 
 5   city           527 non-null    object 
 6   zip_code       527 non-null    float64
 7   county         569 non-null    object 
 8   category       527 non-null    object 
 9   lng            527 non-null    float64
 10  lat            527 non-null    float64
dtypes: float64(3), object(8)
memory usage: 53.3+ KB
None
hospital_id      569
hospital_name    564
status             2
fac_level          3
address          523
city             323
zip_code         374
county            77
category           4
lng              523
lat              522
dtype: int64
<class 'pandas.core.fr

Unnamed: 0,hospital_id,hospital_name,status,fac_level,address,city,zip_code,county,category,lng,lat,facility_num
0,106010735,ALAMEDA HOSPITAL,Open,Parent Facility,2070 CLINTON AVE,ALAMEDA,94501,ALAMEDA,General Acute Care Hospital,-122.253991,37.76266,10735
1,106010739,ALTA BATES SUMMIT MEDICAL CENTER-ALTA BATES CA...,Open,Parent Facility,2450 ASHBY AVE,BERKELEY,94705,ALAMEDA,General Acute Care Hospital,-122.257430,37.85645,10739
2,106010776,UCSF BENIOFF CHILDREN'S HOSPITAL OAKLAND,Open,Parent Facility,747 52ND ST,OAKLAND,94609,ALAMEDA,General Acute Care Hospital,-122.267470,37.83722,10776
3,106010811,FAIRMONT HOSPITAL,Open,Consolidated Facility,15400 FOOTHILL BLVD,SAN LEANDRO,94578,ALAMEDA,General Acute Care Hospital,-122.118190,37.70648,10811
4,106010844,ALTA BATES SUMMIT MEDICAL CENTER-HERRICK CAMPUS,Open,Consolidated Facility,2001 DWIGHT WAY,BERKELEY,94704,ALAMEDA,General Acute Care Hospital,-122.269840,37.86373,10844
...,...,...,...,...,...,...,...,...,...,...,...,...
37,106010856,KAISER FOUNDATION HOSPITAL - OAKLAND CAMPUS,inactive,missing data,MISSING DATA,MISSING DATA,0,ALAMEDA,missing data,0.000000,0.00000,10856
38,106010858,KAISER FOUNDATION HOSPITAL - HAYWARD,inactive,missing data,MISSING DATA,MISSING DATA,0,ALAMEDA,missing data,0.000000,0.00000,10858
39,106514037,SEQUOIA PSYCHIATRIC CENTER,inactive,missing data,MISSING DATA,MISSING DATA,0,SUTTER,missing data,0.000000,0.00000,514037
40,106190455,LANCASTER COMMUNITY HOSPITAL,inactive,missing data,MISSING DATA,MISSING DATA,0,LOS ANGELES,missing data,0.000000,0.00000,190455


In [33]:
# duplicates on hospital name:
ca_listings_final[ca_listings_final.duplicated(subset=['hospital_name'], keep = False)]
#conclusion: seems some have 2 different facility locations, some changed location, no changes to be done

Unnamed: 0,hospital_id,hospital_name,status,fac_level,address,city,zip_code,county,category,lng,lat,facility_num
19,106014233,EDEN MEDICAL CENTER,Open,Parent Facility,20103 LAKE CHABOT RD,CASTRO VALLEY,94546,ALAMEDA,General Acute Care Hospital,-122.087406,37.698377,14233
343,106364430,BARSTOW COMMUNITY HOSPITAL,Open,Parent Facility,820 E MOUNTAIN VIEW ST,BARSTOW,92311,SAN BERNARDINO,General Acute Care Hospital,-117.016884,34.893526,364430
347,106370652,ALVARADO HOSPITAL MEDICAL CENTER,Open,Parent Facility,6655 ALVARADO RD,SAN DIEGO,92120,SAN DIEGO,General Acute Care Hospital,-117.057319,32.776641,370652
372,106374063,ALVARADO HOSPITAL MEDICAL CENTER,Open,Consolidated Facility,6645 ALVARADO RD,SAN DIEGO,92120,SAN DIEGO,General Acute Care Hospital,-117.05696,32.7771,374063
415,106414139,KAISER FOUNDATION HOSPITAL - REDWOOD CITY,Open,Parent Facility,1100 VETERANS BLVD,REDWOOD CITY,94063,SAN MATEO,General Acute Care Hospital,-122.22463,37.48967,414139
424,106430035,STANFORD HEALTH CARE,Open,Consolidated Facility,500 PASTEUR DR,PALO ALTO,94304,SANTA CLARA,General Acute Care Hospital,-122.17657,37.43384,430035
431,106430905,STANFORD HEALTH CARE,Open,Parent Facility,300 PASTEUR DR,PALO ALTO,94305,SANTA CLARA,General Acute Care Hospital,-122.175881,37.433,430905
5,106361105,BARSTOW COMMUNITY HOSPITAL,inactive,missing data,MISSING DATA,MISSING DATA,0,SAN BERNARDINO,missing data,0.0,0.0,361105
22,106410804,KAISER FOUNDATION HOSPITAL - REDWOOD CITY,inactive,missing data,MISSING DATA,MISSING DATA,0,SAN MATEO,missing data,0.0,0.0,410804
23,106010805,EDEN MEDICAL CENTER,inactive,missing data,MISSING DATA,MISSING DATA,0,ALAMEDA,missing data,0.0,0.0,10805


In [34]:
ca_listings_final.columns

Index(['hospital_id', 'hospital_name', 'status', 'fac_level', 'address',
       'city', 'zip_code', 'county', 'category', 'lng', 'lat', 'facility_num'],
      dtype='object')

In [35]:
ca_listings_final.dtypes

hospital_id        int64
hospital_name     object
status            object
fac_level         object
address           object
city              object
zip_code           int64
county            object
category          object
lng              float64
lat              float64
facility_num       int64
dtype: object

In [36]:
ca_listings_final.set_index('hospital_id',inplace=True)

In [37]:
ca_listings_final.head()

Unnamed: 0_level_0,hospital_name,status,fac_level,address,city,zip_code,county,category,lng,lat,facility_num
hospital_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
106010735,ALAMEDA HOSPITAL,Open,Parent Facility,2070 CLINTON AVE,ALAMEDA,94501,ALAMEDA,General Acute Care Hospital,-122.253991,37.76266,10735
106010739,ALTA BATES SUMMIT MEDICAL CENTER-ALTA BATES CA...,Open,Parent Facility,2450 ASHBY AVE,BERKELEY,94705,ALAMEDA,General Acute Care Hospital,-122.25743,37.85645,10739
106010776,UCSF BENIOFF CHILDREN'S HOSPITAL OAKLAND,Open,Parent Facility,747 52ND ST,OAKLAND,94609,ALAMEDA,General Acute Care Hospital,-122.26747,37.83722,10776
106010811,FAIRMONT HOSPITAL,Open,Consolidated Facility,15400 FOOTHILL BLVD,SAN LEANDRO,94578,ALAMEDA,General Acute Care Hospital,-122.11819,37.70648,10811
106010844,ALTA BATES SUMMIT MEDICAL CENTER-HERRICK CAMPUS,Open,Consolidated Facility,2001 DWIGHT WAY,BERKELEY,94704,ALAMEDA,General Acute Care Hospital,-122.26984,37.86373,10844


## IMPORT DATABASES

In [38]:
import config as creds

# create database connection:
rds_connection_string = f"{creds.PGUSER}:{creds.PGPASSWORD}@{creds.PGHOST}:5432/{creds.PGDATABASE}"
engine = create_engine(f'postgresql://{rds_connection_string}')

In [40]:
# Confirm tables
engine.table_names()

['ca_hospitals', 'staffing']

In [41]:
# Load DataFrames into databases
ca_listings_final.to_sql(name='ca_hospitals', con=engine, if_exists='append', index=True)

In [42]:
staffing_df_for_import.to_sql(name='staffing', con=engine, if_exists='append', index=False)

In [44]:
pd.read_sql_query('select * from ca_hospitals', con=engine).head()

Unnamed: 0,hospital_id,hospital_name,status,fac_level,address,city,zip_code,county,category,lng,lat,facility_num
0,106010735,ALAMEDA HOSPITAL,Open,Parent Facility,2070 CLINTON AVE,ALAMEDA,94501,ALAMEDA,General Acute Care Hospital,-122.253991,37.76266,10735
1,106010739,ALTA BATES SUMMIT MEDICAL CENTER-ALTA BATES CA...,Open,Parent Facility,2450 ASHBY AVE,BERKELEY,94705,ALAMEDA,General Acute Care Hospital,-122.25743,37.85645,10739
2,106010776,UCSF BENIOFF CHILDREN'S HOSPITAL OAKLAND,Open,Parent Facility,747 52ND ST,OAKLAND,94609,ALAMEDA,General Acute Care Hospital,-122.26747,37.83722,10776
3,106010811,FAIRMONT HOSPITAL,Open,Consolidated Facility,15400 FOOTHILL BLVD,SAN LEANDRO,94578,ALAMEDA,General Acute Care Hospital,-122.11819,37.70648,10811
4,106010844,ALTA BATES SUMMIT MEDICAL CENTER-HERRICK CAMPUS,Open,Consolidated Facility,2001 DWIGHT WAY,BERKELEY,94704,ALAMEDA,General Acute Care Hospital,-122.26984,37.86373,10844


In [45]:
pd.read_sql_query('select * from staffing', con=engine).head()

Unnamed: 0,hospital_id,year,staff_group,prod_hrs,prodhrs_adjptday
0,106010735,2009,Management & Supervision,63558,1.17
1,106010735,2009,Technician & Specialist,163706,3.02
2,106010735,2009,Registered Nurse,180034,3.32
3,106010735,2009,Licensed Vocational Nurse,22323,0.41
4,106010735,2009,Aides & Orderlies,97205,1.79
