# CMS Hospital Quality of Care - Data Cleaning
---

In [1]:
import pandas as pd
import addfips as af
from uszipcode import SearchEngine
from get_ms_la_al import shpsa

In [2]:
# Paths to the yearly raw "Hospital General Information" extracts, one file per year
files = [
    'raw_data/2016_Hospital_General_Information.csv',
    'raw_data/2017_Hospital_General_Information.csv',
    'raw_data/2018_Hospital_General_Information.csv',
    'raw_data/2019_Hospital_General_Information.csv',
    'raw_data/2020_Hospital_General_Information.csv',
    'raw_data/2021_Hospital_General_Information.csv',
    'raw_data/2022_Hospital_General_Information.csv',
    'raw_data/2023_Hospital_General_Information.csv',
]

# Read each .csv (decoded as ISO-8859-1) into its own dataframe, in the same order as files
dfs = [pd.read_csv(path, encoding = 'iso-8859-1') for path in files]

# Stack the yearly dataframes into one df, renumbering the index from 0
df = pd.concat(dfs, ignore_index = True)

# Keep df as the untouched union; every transformation below is applied to the copy, cms
cms = df.copy()

#### Drop unnecessary columns

In [3]:
# Columns to remove from cms: address/contact fields and the footnote columns
cols_to_drop = [
    'Address',
    'County Name',
    'Phone Number',
    'Hospital overall rating footnote',
    'Mortality national comparison footnote',
    'Safety of care national comparison footnote',
    'Readmission national comparison footnote',
    'Patient experience national comparison footnote',
    'Effectiveness of care national comparison footnote',
    'Timeliness of care national comparison footnote',
    'Efficient use of medical imaging national comparison footnote',
    'MORT Group Footnote',
    'Safety Group Footnote',
    'READM Group Footnote',
    'Pt Exp Group Footnote',
    'TE Group Footnote',
]

# drop() returns a new dataframe without the listed columns; rebind cms to it
cms = cms.drop(cols_to_drop, axis = 'columns')

#### I'm only interested in rows from Mississippi, Louisiana, & Alabama.

In [4]:
# List the distinct values of the State column to see which states/territories
# are present in the dataframe
cms['State'].unique()

array(['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA',
       'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA',
       'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY',
       'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'PR', 'RI', 'SC', 'SD', 'TN',
       'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'VI', 'WY', 'AS', 'GU',
       'MP'], dtype=object)

In [5]:
# Drop all rows except those from MS, LA, & AL
states = ['MS', 'LA', 'AL']

# Take an explicit copy of the filtered rows: cms gets new/overwritten columns in
# later cells, and assigning into a boolean-filtered slice of another dataframe
# makes pandas emit SettingWithCopyWarning
cms = cms[cms['State'].isin(states)].copy()

# Check to see that the subset by states worked
cms['State'].unique()

array(['AL', 'LA', 'MS'], dtype=object)

In [6]:
# Two-letter state abbreviation -> full state name
state_mapping = dict(AL = 'Alabama', LA = 'Louisiana', MS = 'Mississippi')

# Swap each abbreviation in the State column for the corresponding full name
cms['State'] = cms['State'].replace(to_replace = state_mapping)

# Confirm that only the full state names remain
cms['State'].unique()

array(['Alabama', 'Louisiana', 'Mississippi'], dtype=object)

#### I want to add a column named `Common State County FIPS Code` to the `cms` dataframe. First, I use the *uszipcode* library to get the county name, latitude, & longitude for each ZIP code. Then, I use the *addfips* library to get the county FIPS code from the state and county name.

In [7]:
# Collect the distinct ZIP codes in cms (in order of first appearance) as strings,
# because the uszipcode library searches based on a string
zips = [str(zip_value) for zip_value in cms['ZIP Code'].unique()]

#### Get state, county name, latitude, and longitude for each ZIP code.

In [8]:
# Create the uszipcode search engine used for the ZIP code lookups
search = SearchEngine()

# zip_map holds one entry per ZIP code that was found: its state, county, latitude, & longitude
zip_map = {}

for zip_code in zips:
    info = search.by_zipcode(zip_code)
    # A falsy result is treated as "ZIP code not found", so that ZIP gets no entry in zip_map
    if not info:
        continue
    zip_map[zip_code] = dict(
        State = info.state,
        County = info.county,
        Latitude = info.lat,
        Longitude = info.lng,
    )

#### Get the county FIPS code for each ZIP code.

In [9]:
# Create the addfips helper, fips_tool, used to resolve county FIPS codes
fips_tool = af.AddFIPS()

# Store a county FIPS code in every zip_map entry, looked up from that entry's county & state
for zip_code, entry in zip_map.items():
    entry['Common State County FIPS Code'] = fips_tool.get_county_fips(entry['County'], entry['State'])

#### Add county name, county FIPS code, latitude, & longitude into `cms` dataframe.

In [10]:
# Define a function called lookup that retrieves a value associated with a specific key for each ZIP code in zip_map
def lookup(zip_code, key):
    """Return the value stored under `key` for `zip_code` in zip_map.

    Parameters:
        zip_code: ZIP code as a string (zip_map is keyed by string ZIP codes).
        key: name of the field to read from that ZIP code's entry.

    Returns:
        The stored value, or None when `zip_code` has no entry in zip_map.
        zip_map only contains the ZIP codes that the uszipcode search found,
        so a missing ZIP code yields a missing value instead of a KeyError.

    Raises:
        KeyError: if the ZIP code has an entry but `key` is not one of its fields.
    """
    entry = zip_map.get(zip_code)
    if entry is None:
        return None
    return entry[key]

# Fields of each zip_map entry that become new columns in cms
keys = ['County', 'Common State County FIPS Code', 'Latitude', 'Longitude']

# zip_map is keyed by string ZIP codes, so convert the ZIP Code column to strings once
zip_strings = cms['ZIP Code'].astype(str)

# For each key, call lookup(zip_string, key) on every row and store the results as a new column
for key in keys:
    cms[key] = zip_strings.apply(lookup, args = (key,))

In [11]:
# Print a summary of the transformed cms dataframe: index range, columns,
# non-null counts, & dtypes
cms.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2704 entries, 0 to 38433
Data columns (total 43 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                                 --------------  -----  
 0   Year                                                   2704 non-null   int64  
 1   Quarter                                                2704 non-null   object 
 2   Facility ID                                            2704 non-null   object 
 3   Facility Name                                          2704 non-null   object 
 4   City                                                   2704 non-null   object 
 5   State                                                  2704 non-null   object 
 6   ZIP Code                                               2704 non-null   int64  
 7   Hospital Type                                          2704 non-null   object 
 8   Hospital Ownership                             

#### Clean the `Hospital overall rating` column.

In [12]:
# Check the sample size for each rating, i.e. how many rows carry each distinct
# value of Hospital overall rating (value_counts() leaves out missing values by default)
cms['Hospital overall rating'].value_counts()

Not Available    1056
3                 676
2                 455
4                 352
1                  95
5                  70
Name: Hospital overall rating, dtype: int64

In [13]:
# Turn the 'Not Available' placeholder into a missing value (pd.NA)
cms['Hospital overall rating'] = cms['Hospital overall rating'].replace({'Not Available': pd.NA})

In [14]:
# Check the column to see that the replacement worked: value_counts() excludes
# missing values by default, so 'Not Available' should no longer be listed
cms['Hospital overall rating'].value_counts()

3    676
2    455
4    352
1     95
5     70
Name: Hospital overall rating, dtype: int64

In [15]:
# Check the current datatype of the Hospital overall rating column
cms['Hospital overall rating'].dtype

dtype('O')

In [16]:
# Convert the ratings to pandas' nullable integer type (Int64); a plain int dtype
# cannot hold the missing values in this column
cms['Hospital overall rating'] = cms['Hospital overall rating'].astype(pd.Int64Dtype())

# Confirm the new datatype
cms['Hospital overall rating'].dtype

Int64Dtype()

#### Create a column called `County HPSA Status`, where a hospital in a county with an HPSA (Health Professional Shortage Area) designation has the value `Shortage Area`, and a hospital in a county without an HPSA designation has the value `Non-Shortage Area`.

In [17]:
# Print a summary of the shpsa dataframe (imported from get_ms_la_al) to see its
# columns, non-null counts, & dtypes
shpsa.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 130 entries, 7823 to 63646
Data columns (total 22 columns):
 #   Column                                 Non-Null Count  Dtype         
---  ------                                 --------------  -----         
 0   HPSA Name                              130 non-null    object        
 1   Designation Type                       130 non-null    object        
 2   HPSA Score                             130 non-null    int64         
 3   HPSA Status                            130 non-null    object        
 4   HPSA Designation Date                  130 non-null    datetime64[ns]
 5   HPSA Designation Last Update Date      130 non-null    datetime64[ns]
 6   HPSA FTE                               130 non-null    float64       
 7   HPSA Designation Population            130 non-null    float64       
 8   % of Population Below 100% Poverty     130 non-null    float64       
 9   HPSA Formal Ratio                      127 non-null    objec

In [18]:
# Distinct county FIPS codes that appear in shpsa, wrapped in a Series
hpsa_counties = pd.Series(pd.unique(shpsa['Common State County FIPS Code']))

In [19]:
# Create a column County HPSA Status in the cms dataframe, with every row set to
# the default value Non-Shortage Area
cms['County HPSA Status'] = 'Non-Shortage Area'

# Check that the column was added and that every row holds the default value
cms['County HPSA Status'].value_counts()

Non-Shortage Area    2704
Name: County HPSA Status, dtype: int64

In [20]:
# Boolean mask: True for hospitals whose county FIPS code is one of the HPSA counties
in_hpsa_county = cms['Common State County FIPS Code'].isin(hpsa_counties)

# Overwrite the default status with Shortage Area for those hospitals only
cms.loc[in_hpsa_county, 'County HPSA Status'] = 'Shortage Area'

In [21]:
# Check the counts of Shortage Area vs. Non-Shortage Area in the new column
# after the update
cms['County HPSA Status'].value_counts()

Non-Shortage Area    1646
Shortage Area        1058
Name: County HPSA Status, dtype: int64

#### Left join `cms` with `shpsa` based on county FIPS code, and only join relevant columns from `shpsa`.

In [22]:
# Identify columns of interest from shpsa: the join key (county FIPS code) plus the
# HPSA attributes to bring into cms
join_cols = ['Common State County FIPS Code', 'Designation Type', 'HPSA Score', 'HPSA FTE',
             'HPSA Designation Population', '% of Population Below 100% Poverty', 'HPSA Formal Ratio',
             'Rural Status', 'HPSA Provider Ratio Goal', 'HPSA Shortage']

# Remember the row count so the join can be checked afterwards
rows_before_join = len(cms)

# Left join cms with shpsa based on county FIPS code: every cms row is kept, and rows
# whose county has no match in shpsa get missing values in the joined columns
cms = cms.merge(shpsa[join_cols], on = 'Common State County FIPS Code', how = 'left')

# A left join repeats a cms row once per matching shpsa row, so more than one shpsa row
# for the same county would silently duplicate hospital records; fail loudly instead
assert len(cms) == rows_before_join, (
    'Left join with shpsa changed the number of rows in cms; '
    'shpsa has more than one row for at least one county FIPS code'
)

In [23]:
# Check the first few rows of cms to see the columns joined from shpsa
cms.head()

Unnamed: 0,Year,Quarter,Facility ID,Facility Name,City,State,ZIP Code,Hospital Type,Hospital Ownership,Emergency Services,...,County HPSA Status,Designation Type,HPSA Score,HPSA FTE,HPSA Designation Population,% of Population Below 100% Poverty,HPSA Formal Ratio,Rural Status,HPSA Provider Ratio Goal,HPSA Shortage
0,2016,Q4,10001,SOUTHEAST ALABAMA MEDICAL CENTER,DOTHAN,Alabama,36301,Acute Care Hospitals,Government - Hospital District or Authority,Yes,...,Non-Shortage Area,,,,,,,,,
1,2016,Q4,10005,MARSHALL MEDICAL CENTER SOUTH,BOAZ,Alabama,35957,Acute Care Hospitals,Government - Hospital District or Authority,Yes,...,Shortage Area,High Needs Geographic HPSA,10.0,31.48,95375.0,20.5,3030:1,Rural,3000:1,0.31
2,2016,Q4,10006,ELIZA COFFEE MEMORIAL HOSPITAL,FLORENCE,Alabama,35631,Acute Care Hospitals,Government - Hospital District or Authority,Yes,...,Non-Shortage Area,,,,,,,,,
3,2016,Q4,10007,MIZELL MEMORIAL HOSPITAL,OPP,Alabama,36467,Acute Care Hospitals,Voluntary non-profit - Private,Yes,...,Non-Shortage Area,,,,,,,,,
4,2016,Q4,10008,CRENSHAW COMMUNITY HOSPITAL,LUVERNE,Alabama,36049,Acute Care Hospitals,Proprietary,Yes,...,Shortage Area,Geographic HPSA,16.0,1.65,13623.0,15.1,8256:1,Rural,3500:1,2.24


In [24]:
# Get info for the final cleaned cms dataframe: index range, columns,
# non-null counts, & dtypes
cms.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2704 entries, 0 to 2703
Data columns (total 53 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                                 --------------  -----  
 0   Year                                                   2704 non-null   int64  
 1   Quarter                                                2704 non-null   object 
 2   Facility ID                                            2704 non-null   object 
 3   Facility Name                                          2704 non-null   object 
 4   City                                                   2704 non-null   object 
 5   State                                                  2704 non-null   object 
 6   ZIP Code                                               2704 non-null   int64  
 7   Hospital Type                                          2704 non-null   object 
 8   Hospital Ownership                              