# CMS General Hospital Information - Subdomain Scores - Exploratory Data Analysis
---
### Are there differences in CMS Hospital Quality subdomain scores between hospitals in shortage areas and those in non-shortage areas?
---

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from get_cleaned_cms import cms
from get_cleaned_toc import toc

In [2]:
# Color lookups shared by every plot: one keyed by state, one keyed by shortage status
custom_palette = {
    'Mississippi': '#5899c6',
    'Louisiana': '#ff9f4b',
    'Alabama': '#61b862',
}
shortage_palette = {
    'Non-Shortage Area': '#acac91',
    'Shortage Area': '#f64000',
}

# Matplotlib rc overrides shared by every plot:
# no top/right spines, 14 pt base font, bold 16 pt axes titles
custom_params = {
    "axes.spines.right": False,
    "axes.spines.top": False,
    "font.size": 14,
    "axes.titleweight": "bold",
    "axes.titlesize": 16,
}

# Apply the 'ticks' style, Arial font, and the rc overrides in custom_params as the seaborn theme
sns.set_theme(style='ticks', font='Arial', rc=custom_params)

#### Create `acms` dataframe for acute care hospital data.

In [3]:
# Hospital types treated as acute care in this analysis
acute_types = [
    'Acute Care Hospitals',
    'Acute Care - Veterans Administration',
    'Acute Care - Department of Defense',
]

# Keep only acute care rows of cms, then discard rows with no overall star rating
acms = (
    cms[cms['Hospital Type'].isin(acute_types)]
    .dropna(subset=['Hospital overall rating'])
)

#### Title-case each of the subdomain measure names.

In [4]:
# Subdomain "national comparison" columns whose category labels are normalized below
cols_to_edit = [
    'Mortality national comparison',
    'Safety of care national comparison',
    'Readmission national comparison',
    'Patient experience national comparison',
    'Effectiveness of care national comparison',
    'Timeliness of care national comparison',
    'Efficient use of medical imaging national comparison',
]

# Title-case every label, one column at a time, using the vectorized .str accessor.
# This replaces DataFrame.applymap (deprecated as of pandas 2.1) and yields the same values.
# NOTE: astype(str) runs first, so missing values become the text 'nan' and are
# title-cased to 'Nan'; isna()/dropna() will no longer treat them as missing.
acms[cols_to_edit] = acms[cols_to_edit].astype(str).apply(lambda col: col.str.title())

In [5]:
# Tally the labels in one subdomain column to confirm how the values look after title-casing
acms.loc[:, 'Efficient use of medical imaging national comparison'].value_counts()

Nan                             519
Same As The National Average    417
Below The National Average      299
Not Available                   235
Above The National Average       44
Name: Efficient use of medical imaging national comparison, dtype: int64

In [6]:
# Export acms dataframe to csv
# The export call below is commented out, so running this cell writes nothing;
# uncomment it to save the file.
# acms.to_csv('cleaned_data/acms_subdomains_2016-2020.csv', index = False)

### Exploring the `toc` (Timeliness of Care) dataframe.

In [7]:
# Get info about toc: index range, column names, non-null counts, and dtypes
toc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5376 entries, 0 to 5375
Data columns (total 27 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Year                                5376 non-null   int64  
 1   Quarter                             5376 non-null   object 
 2   Facility ID                         5376 non-null   object 
 3   Facility Name                       5376 non-null   object 
 4   City                                5376 non-null   object 
 5   State                               5376 non-null   object 
 6   ZIP Code                            5376 non-null   int64  
 7   County Name                         5376 non-null   object 
 8   Condition                           5376 non-null   object 
 9   Measure ID                          5376 non-null   object 
 10  Measure Name                        5376 non-null   object 
 11  Score                               5376 no