# CMS Hospital Quality of Care - Data Cleaning
---

In [1]:
import pandas as pd
from uszipcode import SearchEngine

In [2]:
# Paths to the raw yearly "Hospital General Information" extracts (2016-2023)
files = [
    'raw_data/2016_Hospital_General_Information.csv',
    'raw_data/2017_Hospital_General_Information.csv',
    'raw_data/2018_Hospital_General_Information.csv',
    'raw_data/2019_Hospital_General_Information.csv',
    'raw_data/2020_Hospital_General_Information.csv',
    'raw_data/2021_Hospital_General_Information.csv',
    'raw_data/2022_Hospital_General_Information.csv',
    'raw_data/2023_Hospital_General_Information.csv',
]

# Read every .csv into its own dataframe; dfs keeps them in the same order as files
dfs = [pd.read_csv(path, encoding='iso-8859-1') for path in files]

# Stack the yearly dataframes into a single df with a fresh 0..n-1 index
df = pd.concat(dfs, ignore_index=True)

# df stays untouched as the raw union; cms is the working copy that gets transformed
cms = df.copy()

#### Drop unnecessary columns

In [3]:
# Columns to remove: address, phone number, and all of the footnote columns
cols_to_drop = [
    'Address',
    'Phone Number',
    'Hospital overall rating footnote',
    'Mortality national comparison footnote',
    'Safety of care national comparison footnote',
    'Readmission national comparison footnote',
    'Patient experience national comparison footnote',
    'Effectiveness of care national comparison footnote',
    'Timeliness of care national comparison footnote',
    'Efficient use of medical imaging national comparison footnote',
    'MORT Group Footnote',
    'Safety Group Footnote',
    'READM Group Footnote',
    'Pt Exp Group Footnote',
    'TE Group Footnote',
]

# Rebind cms to a new dataframe without those columns
cms = cms.drop(cols_to_drop, axis='columns')

#### I'm only interested in rows from Mississippi, Louisiana, & Alabama.

In [4]:
# List the distinct values of the State column, in order of first appearance
pd.unique(cms['State'])

array(['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA',
       'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA',
       'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY',
       'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'PR', 'RI', 'SC', 'SD', 'TN',
       'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'VI', 'WY', 'AS', 'GU',
       'MP'], dtype=object)

In [5]:
# Drop all rows except those from MS, LA, & AL
states = ['MS', 'LA', 'AL']

# .copy() makes cms an independent dataframe instead of a selection derived from
# the unfiltered one, so columns can be assigned to it afterwards without
# pandas emitting SettingWithCopyWarning.
cms = cms[cms['State'].isin(states)].copy()

# Check to see that the subset by states worked
cms['State'].unique()

array(['AL', 'LA', 'MS'], dtype=object)

#### I want to add a column named `Common State County FIPS Code` to the `cms` dataframe.  Use the *uszipcode* library to get county name, latitude, & longitude.

In [6]:
# Build a plain Python list holding each distinct ZIP code in cms as a string
zips = (
    cms['ZIP Code']
    .unique()       # distinct ZIP codes as an array
    .astype(str)    # cast every value to its string form
    .tolist()       # array -> list of str
)

In [7]:
# Create the uszipcode search engine used for the lookups
search = SearchEngine()

# Empty dictionaries, keyed by ZIP code string, for the county, latitude, & longitude mappings
county_mapping = {}
lat_mapping = {}
lng_mapping = {}

# Look up each ZIP code; only ZIPs with a truthy lookup result get an entry in the dictionaries
for zip_code in zips:
    lookup = search.by_zipcode(zip_code)
    if not lookup:
        # Nothing usable came back for this ZIP, so leave it out of all three mappings
        continue
    county_mapping[zip_code] = lookup.county
    lat_mapping[zip_code] = lookup.lat
    lng_mapping[zip_code] = lookup.lng

#### Export the `county_mapping` dictionary to a .csv, then manually enter FIPS county code data.

In [8]:
# Export county_mapping dictionary to a .csv
# NOTE(review): the export below is commented out, presumably so that re-running the
# notebook does not overwrite the manually entered FIPS codes in the file - confirm.
# pd.Series(county_mapping).to_csv('raw_data/fips_mapping.csv')