# CMS Hospital Quality of Care - Data Cleaning
---

In [1]:
import pandas as pd
import addfips as af
from uszipcode import SearchEngine

In [2]:
# Paths to the raw "Hospital General Information" extracts, one .csv per year (2016-2023)
files = [
    'raw_data/2016_Hospital_General_Information.csv',
    'raw_data/2017_Hospital_General_Information.csv',
    'raw_data/2018_Hospital_General_Information.csv',
    'raw_data/2019_Hospital_General_Information.csv',
    'raw_data/2020_Hospital_General_Information.csv',
    'raw_data/2021_Hospital_General_Information.csv',
    'raw_data/2022_Hospital_General_Information.csv',
    'raw_data/2023_Hospital_General_Information.csv',
]

# Read every yearly .csv (decoded as ISO-8859-1) into its own dataframe and collect them in dfs
dfs = [pd.read_csv(path, encoding = 'iso-8859-1') for path in files]

# Stack the yearly dataframes into a single df with a fresh, continuous index
df = pd.concat(dfs, ignore_index = True)

# Leave df untouched as the raw reference; every transformation is applied to the copy, cms
cms = df.copy()

#### Drop unnecessary columns

In [3]:
# Columns to remove from cms: address/county/phone details plus all of the footnote columns
cols_to_drop = [
    'Address',
    'County Name',
    'Phone Number',
    'Hospital overall rating footnote',
    'Mortality national comparison footnote',
    'Safety of care national comparison footnote',
    'Readmission national comparison footnote',
    'Patient experience national comparison footnote',
    'Effectiveness of care national comparison footnote',
    'Timeliness of care national comparison footnote',
    'Efficient use of medical imaging national comparison footnote',
    'MORT Group Footnote',
    'Safety Group Footnote',
    'READM Group Footnote',
    'Pt Exp Group Footnote',
    'TE Group Footnote',
]

# Drop the listed labels along the column axis
cms = cms.drop(cols_to_drop, axis = 'columns')

#### I'm only interested in rows from Mississippi, Louisiana, & Alabama.

In [4]:
# List each distinct value found in the State column
pd.unique(cms['State'])

array(['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA',
       'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA',
       'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY',
       'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'PR', 'RI', 'SC', 'SD', 'TN',
       'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'VI', 'WY', 'AS', 'GU',
       'MP'], dtype=object)

In [5]:
# Drop all rows except those from MS, LA, & AL
states = ['MS', 'LA', 'AL']

# .copy() makes cms an independent dataframe instead of a filtered selection of the
# previous one, so assigning new columns to cms does not raise SettingWithCopyWarning
cms = cms[cms['State'].isin(states)].copy()

# Check to see that the subset by states worked
cms['State'].unique()

array(['AL', 'LA', 'MS'], dtype=object)

#### I want to add a column named `Common State County FIPS Code` to the `cms` dataframe. First, I'll use the *uszipcode* library to get the state, county name, latitude, & longitude for each ZIP code. Then I'll use the *addfips* library to get the county FIPS code based on the state and county name.

In [6]:
# Build a list holding each distinct ZIP code in cms as a string
# (the uszipcode library searches based on a string)
zips = [str(zip_value) for zip_value in cms['ZIP Code'].unique()]

#### Get state, county name, latitude, and longitude for each ZIP code.

In [7]:
# Search engine used to query the uszipcode database
search = SearchEngine()

# zip_map: ZIP code string -> dictionary with that ZIP's state, county, latitude, & longitude
zip_map = {}

# Query each ZIP code; a ZIP whose search result is falsy is left out of zip_map
for zip_code in zips:
    info = search.by_zipcode(zip_code)
    if not info:
        continue
    record = {}
    record['State'] = info.state
    record['County'] = info.county
    record['Latitude'] = info.lat
    record['Longitude'] = info.lng
    zip_map[zip_code] = record

#### Get the county FIPS code for each ZIP code.

In [8]:
# AddFIPS instance used to resolve county FIPS codes
fips_tool = af.AddFIPS()

# Walk through each ZIP code's dictionary in zip_map and store, next to the other fields,
# the county FIPS code returned for that ZIP's county name and state
for zip_code, record in zip_map.items():
    record['Common State County FIPS Code'] = fips_tool.get_county_fips(record['County'], record['State'])

#### Add county name, county FIPS code, latitude, & longitude into `cms` dataframe.

In [9]:
# Define a function called lookup that retrieves a value associated with a specific key for each ZIP code in zip_map
def lookup(zip_code, key, mapping = None):
    """Return the value stored under `key` for `zip_code`, or None if the ZIP code is unknown.

    Parameters
    ----------
    zip_code : str
        ZIP code to look up (the mapping is keyed by ZIP code strings).
    key : str
        Name of the field to retrieve from that ZIP code's dictionary.
    mapping : dict, optional
        Dictionary of the form {zip_code: {field: value}}. Defaults to the
        notebook-level zip_map.

    Returns
    -------
    The stored value, or None when `zip_code` has no entry in the mapping.

    Raises
    ------
    KeyError
        If the ZIP code has an entry but `key` is not one of its fields.
    """
    if mapping is None:
        mapping = zip_map

    # ZIP codes that were not found when zip_map was built have no entry at all;
    # return None for them rather than raising and aborting the whole column build
    record = mapping.get(zip_code)
    if record is None:
        return None

    # A missing field on an existing entry is left to raise, so a misspelled key is not hidden
    return record[key]

# Fields of zip_map that become new columns in cms
keys = ['County', 'Common State County FIPS Code', 'Latitude', 'Longitude']

# Build one column per field: turn each row's ZIP code into a string and
# let lookup() return that field's value for it
for key in keys:
    cms[key] = cms['ZIP Code'].map(lambda zip_value: lookup(str(zip_value), key))

In [10]:
# Summarize the transformed cms dataframe: index range, column names, non-null counts, & dtypes
cms.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2704 entries, 0 to 38433
Data columns (total 43 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                                 --------------  -----  
 0   Year                                                   2704 non-null   int64  
 1   Quarter                                                2704 non-null   object 
 2   Facility ID                                            2704 non-null   object 
 3   Facility Name                                          2704 non-null   object 
 4   City                                                   2704 non-null   object 
 5   State                                                  2704 non-null   object 
 6   ZIP Code                                               2704 non-null   int64  
 7   Hospital Type                                          2704 non-null   object 
 8   Hospital Ownership                             

#### Clean the `Hospital overall rating` column.

In [11]:
# Count how many rows carry each distinct value of the rating column
cms.loc[:, 'Hospital overall rating'].value_counts()

Not Available    1056
3                 676
2                 455
4                 352
1                  95
5                  70
Name: Hospital overall rating, dtype: int64

In [12]:
# Turn the 'Not Available' placeholder into a missing value (pd.NA)
cms['Hospital overall rating'] = cms['Hospital overall rating'].replace({'Not Available': pd.NA})

In [13]:
# Recount the rating values to confirm 'Not Available' is no longer among them
cms.loc[:, 'Hospital overall rating'].value_counts()

3    676
2    455
4    352
1     95
5     70
Name: Hospital overall rating, dtype: int64

In [14]:
# Look up the rating column's datatype in the dataframe's dtypes listing
cms.dtypes['Hospital overall rating']

dtype('O')

In [15]:
# Cast the ratings to pandas' nullable integer type (Int64 rather than int, because the
# column now contains missing values)
cms['Hospital overall rating'] = cms['Hospital overall rating'].astype(pd.Int64Dtype())

# Confirm the new datatype
cms['Hospital overall rating'].dtype

Int64Dtype()