# CMS General Hospital Information - Subdomain Scores - Exploratory Data Analysis
---
### Are there differences in CMS Hospital Quality subdomain scores between hospitals in shortage areas versus those in non-shortage areas?
---

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from get_cleaned_cms import cms

In [2]:
# Define custom color palette for all plots by state
custom_palette = {'Mississippi': '#5899c6', 'Louisiana': '#ff9f4b', 'Alabama': '#61b862'}
shortage_palette = {'Non-Shortage Area': '#acac91', 'Shortage Area': '#f64000'}

# Define custom parameters for all plots
custom_params = {
    "axes.spines.right": False,
    "axes.spines.top": False,
    "font.size": 14,
    "axes.titleweight": "bold",
    "axes.titlesize": 16  
}

sns.set_theme(style = 'ticks', font = 'Arial', rc = custom_params)

In [12]:
# Get the shape of cms
cms.shape

(2704, 53)

In [9]:
# Define the hospital type of interest
acute_types = ['Acute Care Hospitals', 'Acute Care - Veterans Administration', 'Acute Care - Department of Defense']

# Subset the cms dataframe, keeping only rows from acute care hospitals, and dropping rows that have nulls in star rating
acms = cms[cms['Hospital Type'].isin(acute_types)].dropna(subset = ['Hospital overall rating'])

In [14]:
# Get the shape of acms
acms.shape

(1514, 53)

In [15]:
# Get the acms column names
acms.columns 

Index(['Year', 'Quarter', 'Facility ID', 'Facility Name', 'City', 'State',
       'ZIP Code', 'Hospital Type', 'Hospital Ownership', 'Emergency Services',
       'Meets criteria for meaningful use of EHRs', 'Hospital overall rating',
       'Mortality national comparison', 'Safety of care national comparison',
       'Readmission national comparison',
       'Patient experience national comparison',
       'Effectiveness of care national comparison',
       'Timeliness of care national comparison',
       'Efficient use of medical imaging national comparison',
       'Meets criteria for promoting interoperability of EHRs',
       'MORT Group Measure Count', 'Count of Facility MORT Measures',
       'Count of MORT Measures Better', 'Count of MORT Measures No Different',
       'Count of MORT Measures Worse', 'Safety Group Measure Count',
       'Count of Facility Safety Measures', 'Count of Safety Measures Better',
       'Count of Safety Measures No Different',
       'Count of Safety 

In [20]:
# Clean subdomain measures columns
cols_to_edit = ['Mortality national comparison', 'Safety of care national comparison', 'Readmission national comparison',
                'Patient experience national comparison', 'Effectiveness of care national comparison', 'Timeliness of care national comparison',
                'Efficient use of medical imaging national comparison']

# Title-case all values in subdomain measure columns
acms[cols_to_edit] = acms[cols_to_edit].astype(str).applymap(lambda x: x.title())

In [32]:
# Check the values of one of the subdomains for clarity
acms['Efficient use of medical imaging national comparison'].value_counts()

Nan                             519
Same As The National Average    417
Below The National Average      299
Not Available                   235
Above The National Average       44
Name: Efficient use of medical imaging national comparison, dtype: int64

In [27]:
# Export acms dataframe to csv
acms.to_csv('cleaned_data/acms_subdomains_2016-2020.csv', index = False)