# CMS General Hospital Information - Subdomain Scores - Exploratory Data Analysis
---
### Are there differences in CMS Hospital Quality subdomain scores between hospitals in shortage areas versus those in non-shortage areas?
---

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from get_cleaned_cms import cms
from get_cleaned_toc import toc

In [2]:
# Define custom color palette for all plots by state
custom_palette = {'Mississippi': '#5899c6', 'Louisiana': '#ff9f4b', 'Alabama': '#61b862'}
shortage_palette = {'Non-Shortage Area': '#acac91', 'Shortage Area': '#f64000'}

# Define custom parameters for all plots
custom_params = {
    "axes.spines.right": False,
    "axes.spines.top": False,
    "font.size": 14,
    "axes.titleweight": "bold",
    "axes.titlesize": 16  
}

sns.set_theme(style = 'ticks', font = 'Arial', rc = custom_params)

#### Create `acms` dataframe for acute care hospital data.

In [3]:
# Define the hospital type of interest
acute_types = ['Acute Care Hospitals', 'Acute Care - Veterans Administration', 'Acute Care - Department of Defense']

# Subset the cms dataframe, keeping only rows from acute care hospitals, and dropping rows that have nulls in star rating
acms = cms[cms['Hospital Type'].isin(acute_types)].dropna(subset = ['Hospital overall rating'])

#### Title-case each of the subdomain measure names.

In [4]:
# Clean subdomain measures columns
cols_to_edit = ['Mortality national comparison', 'Safety of care national comparison', 'Readmission national comparison',
                'Patient experience national comparison', 'Effectiveness of care national comparison', 'Timeliness of care national comparison',
                'Efficient use of medical imaging national comparison']

# Title-case all values in subdomain measure columns
acms[cols_to_edit] = acms[cols_to_edit].astype(str).applymap(lambda x: x.title())

In [5]:
# Check the values of one of the subdomains for clarity
acms['Efficient use of medical imaging national comparison'].value_counts()

Nan                             519
Same As The National Average    417
Below The National Average      299
Not Available                   235
Above The National Average       44
Name: Efficient use of medical imaging national comparison, dtype: int64

In [6]:
# Export acms dataframe to csv
# acms.to_csv('cleaned_data/acms_subdomains_2016-2020.csv', index = False)

### Exploring the `toc` (Timeliness of Care) dataframe.

In [7]:
# Get info about toc
toc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5781 entries, 0 to 5780
Data columns (total 27 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Year                                5781 non-null   int64  
 1   Quarter                             5781 non-null   object 
 2   Facility ID                         5781 non-null   object 
 3   Facility Name                       5781 non-null   object 
 4   City                                5781 non-null   object 
 5   State                               5781 non-null   object 
 6   ZIP Code                            5781 non-null   int64  
 7   County Name                         5781 non-null   object 
 8   Condition                           5781 non-null   object 
 9   Measure ID                          5781 non-null   object 
 10  Measure Name                        5781 non-null   object 
 11  Score                               5781 no

In [8]:
# Get names of TOC measures
toc['Measure Name'].unique()

array(['Median Admit Decision Time to ED Departure Time As Inpatient',
       'Median Time Spent in ED Before Leaving',
       'Median Time Spent in ED Before Seen by Health Professional',
       'Median Time to Pain Medicine',
       '% of Patients Left Before Being Seen'], dtype=object)

#### Get sample size & descriptive statistics for `Median Time Spent in ED Before Leaving`.

In [9]:
# Get sample size
measure = toc[toc['Measure Name'] == 'Median Time Spent in ED Before Leaving']

facility_groups = measure.groupby(['State', 'County HPSA Status', 'Facility ID'])

facility_groups['Score'].describe().reset_index().groupby(['State', 'County HPSA Status'])['Facility ID'].count()

State        County HPSA Status
Alabama      Non-Shortage Area     45
             Shortage Area         41
Louisiana    Non-Shortage Area     59
             Shortage Area         18
Mississippi  Non-Shortage Area     32
             Shortage Area         22
Name: Facility ID, dtype: int64

In [10]:
# Get descriptive statistics
measure.groupby(['State', 'County HPSA Status'])['Score'].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
State,County HPSA Status,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama,Non-Shortage Area,354.0,157.878531,39.004023,73.0,125.0,156.5,183.75,293.0
Alabama,Shortage Area,303.0,117.673267,25.870708,50.0,102.0,115.0,132.0,204.0
Louisiana,Non-Shortage Area,427.0,144.093677,38.808736,61.0,119.0,140.0,163.5,344.0
Louisiana,Shortage Area,137.0,112.211679,30.076234,62.0,90.0,112.0,126.0,216.0
Mississippi,Non-Shortage Area,252.0,140.646825,36.844865,71.0,114.0,135.0,161.0,323.0
Mississippi,Shortage Area,162.0,118.012346,46.617512,70.0,98.25,114.5,130.0,630.0


#### Get sample size & descriptive statistics for `Median Time Spent in ED Before Seen by Health Professional`.

In [11]:
# Get sample size
measure = toc[toc['Measure Name'] == 'Median Time Spent in ED Before Seen by Health Professional']

facility_groups = measure.groupby(['State', 'County HPSA Status', 'Facility ID'])

facility_groups['Score'].describe().reset_index().groupby(['State', 'County HPSA Status'])['Facility ID'].count()

State        County HPSA Status
Alabama      Non-Shortage Area     45
             Shortage Area         39
Louisiana    Non-Shortage Area     57
             Shortage Area         17
Mississippi  Non-Shortage Area     31
             Shortage Area         20
Name: Facility ID, dtype: int64

In [12]:
# Get descriptive statistics
measure.groupby(['State', 'County HPSA Status'])['Score'].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
State,County HPSA Status,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama,Non-Shortage Area,135.0,29.6,13.469811,3.0,18.5,29.0,38.0,70.0
Alabama,Shortage Area,115.0,26.513043,10.579163,4.0,19.0,26.0,33.5,54.0
Louisiana,Non-Shortage Area,167.0,23.401198,12.95035,3.0,13.0,22.0,32.0,78.0
Louisiana,Shortage Area,51.0,18.117647,8.728452,4.0,12.0,18.0,22.0,39.0
Mississippi,Non-Shortage Area,93.0,26.924731,13.818101,3.0,18.0,23.0,32.0,72.0
Mississippi,Shortage Area,60.0,24.533333,13.725476,6.0,12.0,21.0,34.5,55.0


#### Get sample size & descriptive statistics for `Median Time to Pain Medicine`.

In [13]:
# Get sample size
measure = toc[toc['Measure Name'] == 'Median Time to Pain Medicine']

facility_groups = measure.groupby(['State', 'County HPSA Status', 'Facility ID'])

facility_groups['Score'].describe().reset_index().groupby(['State', 'County HPSA Status'])['Facility ID'].count()

State        County HPSA Status
Alabama      Non-Shortage Area     44
             Shortage Area         36
Louisiana    Non-Shortage Area     56
             Shortage Area         16
Mississippi  Non-Shortage Area     31
             Shortage Area         18
Name: Facility ID, dtype: int64

In [14]:
# Get descriptive statistics
measure.groupby(['State', 'County HPSA Status'])['Score'].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
State,County HPSA Status,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama,Non-Shortage Area,201.0,74.58209,21.553062,19.0,60.0,78.0,94.0,104.0
Alabama,Shortage Area,201.0,73.940299,22.358363,13.0,56.0,78.0,95.0,100.0
Louisiana,Non-Shortage Area,231.0,64.709957,24.753706,17.0,45.5,62.0,91.0,125.0
Louisiana,Shortage Area,86.0,70.837209,25.786846,21.0,49.25,78.0,95.0,100.0
Mississippi,Non-Shortage Area,149.0,73.409396,20.521911,25.0,58.0,73.0,94.0,100.0
Mississippi,Shortage Area,102.0,74.666667,22.187024,24.0,57.0,80.0,93.0,111.0


#### Get sample size & descriptive statistics for `% of Patients Left Before Being Seen`.

In [15]:
# Get sample size
measure = toc[toc['Measure Name'] == '% of Patients Left Before Being Seen']

facility_groups = measure.groupby(['State', 'County HPSA Status', 'Facility ID'])

facility_groups['Score'].describe().reset_index().groupby(['State', 'County HPSA Status'])['Facility ID'].count()

State        County HPSA Status
Alabama      Non-Shortage Area     45
             Shortage Area         41
Louisiana    Non-Shortage Area     58
             Shortage Area         18
Mississippi  Non-Shortage Area     32
             Shortage Area         21
Name: Facility ID, dtype: int64

In [16]:
# Get descriptive statistics
measure.groupby(['State', 'County HPSA Status'])['Score'].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
State,County HPSA Status,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama,Non-Shortage Area,337.0,2.362018,1.774349,0.0,1.0,2.0,3.0,14.0
Alabama,Shortage Area,283.0,2.219081,1.768937,0.0,1.0,2.0,3.0,19.0
Louisiana,Non-Shortage Area,401.0,2.194514,1.785237,0.0,1.0,2.0,3.0,13.0
Louisiana,Shortage Area,127.0,1.771654,1.310452,0.0,1.0,2.0,2.0,7.0
Mississippi,Non-Shortage Area,243.0,2.320988,1.565331,0.0,1.0,2.0,3.0,8.0
Mississippi,Shortage Area,144.0,2.597222,2.330459,0.0,1.0,2.0,3.0,12.0


#### Get sample size & descriptive statistics for `Median Admit Decision Time to ED Departure Time As Inpatient`.

In [17]:
# Get sample size
measure = toc[toc['Measure Name'] == 'Median Admit Decision Time to ED Departure Time As Inpatient']

facility_groups = measure.groupby(['State', 'County HPSA Status', 'Facility ID'])

facility_groups['Score'].describe().reset_index().groupby(['State', 'County HPSA Status'])['Facility ID'].count()

State        County HPSA Status
Alabama      Non-Shortage Area     45
             Shortage Area         39
Louisiana    Non-Shortage Area     57
             Shortage Area         17
Mississippi  Non-Shortage Area     32
             Shortage Area         21
Name: Facility ID, dtype: int64

In [18]:
# Get descriptive statistics
measure.groupby(['State', 'County HPSA Status'])['Score'].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
State,County HPSA Status,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama,Non-Shortage Area,221.0,102.239819,53.917036,15.0,67.0,89.0,120.0,318.0
Alabama,Shortage Area,187.0,62.930481,22.117732,14.0,49.0,61.0,73.0,161.0
Louisiana,Non-Shortage Area,271.0,112.542435,52.015142,11.0,75.0,105.0,150.5,284.0
Louisiana,Shortage Area,81.0,62.716049,25.175501,9.0,48.0,60.0,74.0,188.0
Mississippi,Non-Shortage Area,156.0,82.416667,36.656374,27.0,58.75,79.0,94.25,242.0
Mississippi,Shortage Area,104.0,55.384615,22.670415,10.0,44.0,53.0,66.25,136.0
