In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

  from pandas.core import (


In [None]:
# Location of the Hospital Readmissions Reduction Program CSV extract.
CSV_PATH = "FY_2025_Hospital_Readmissions_Reduction_Program_Hospital.csv"  # <- change if needed

# Cell values that should be read as missing rather than kept as text.
na_like = ["N/A", "Too Few to Report", ""]
df = pd.read_csv(filepath_or_buffer=CSV_PATH, na_values=na_like)

# Preview the first three rows, followed by the (rows, columns) shape.
print(df.head(n=3), df.shape, sep="\n")

                     Facility Name  Facility ID State        Measure Name  \
0  SOUTHEAST HEALTH MEDICAL CENTER        10001    AL   READM-30-AMI-HRRP   
1  SOUTHEAST HEALTH MEDICAL CENTER        10001    AL  READM-30-CABG-HRRP   
2  SOUTHEAST HEALTH MEDICAL CENTER        10001    AL    READM-30-HF-HRRP   

   Number of Discharges  Footnote  Excess Readmission Ratio  \
0                 296.0       NaN                    0.9483   
1                 151.0       NaN                    0.9509   
2                 681.0       NaN                    1.0597   

   Predicted Readmission Rate  Expected Readmission Rate  \
0                     13.0146                    13.7235   
1                      9.6899                    10.1898   
2                     21.5645                    20.3495   

   Number of Readmissions  Start Date    End Date  
0                    36.0  07/01/2020  06/30/2023  
1                    13.0  07/01/2020  06/30/2023  
2                   151.0  07/01/2020  06

In [3]:
# Original CSV header -> short snake_case column name, in file order.
rename_map = dict([
    ("Facility Name", "facility_name"),
    ("Facility ID", "facility_id"),
    ("State", "state"),
    ("Measure Name", "measure_name"),
    ("Number of Discharges", "discharges"),
    ("Footnote", "footnote"),
    ("Excess Readmission Ratio", "excess_readm_ratio"),
    ("Predicted Readmission Rate", "pred_readm_rate"),
    ("Expected Readmission Rate", "exp_readm_rate"),
    ("Number of Readmissions", "num_readmissions"),
    ("Start Date", "start_date"),
    ("End Date", "end_date"),
])
# Headers missing from the frame are ignored by rename (its default behaviour).
df = df.rename(mapper=rename_map, axis="columns")

In [None]:
# Coerce the measure columns to numeric in one pass; values that cannot be
# parsed become NaN instead of raising.
num_cols = ["discharges", "excess_readm_ratio", "pred_readm_rate",
            "exp_readm_rate", "num_readmissions"]
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors="coerce")

# Parse whichever date columns are present (MM/DD/YYYY); bad values become NaT.
date_cols = [name for name in ("start_date", "end_date") if name in df.columns]
for name in date_cols:
    df[name] = pd.to_datetime(df[name], errors="coerce", format="%m/%d/%Y")

# A date column holding at most one distinct value (NaT counted) carries no
# information, so it is dropped.
constant_dates = [name for name in date_cols if df[name].nunique(dropna=False) <= 1]
df = df.drop(columns=constant_dates)

In [None]:
# Per-column missing-value counts, largest first.
missing_counts = df.isna().sum().sort_values(ascending=False)
print("\nMissing by column:")
print(missing_counts)

# Column dtypes.
print("\nData types:")
print(df.dtypes)

# Descriptive statistics for the numeric measure columns.
numeric_summary = df[num_cols].describe()
print("\nQuick numeric summaries:")
print(numeric_summary)

# Share of rows where the excess readmission ratio (ERR) is observed.
n_total = len(df)
err_observed = df["excess_readm_ratio"].notna()
n_err = err_observed.sum()
print(f"\nRows with ERR present: {n_err} / {n_total} ({n_err/n_total:.1%})")

# (facility_id, measure_name) should generally be unique; keep=False flags
# every row that belongs to a repeated pair, not just the later occurrences.
key_cols = ["facility_id", "measure_name"]
dup_mask = df.duplicated(subset=key_cols, keep=False)
print(f"Potential duplicates on (facility_id, measure_name): {dup_mask.sum()}")



Missing by column:
footnote              11927
num_readmissions      10389
discharges            10170
excess_readm_ratio     6583
pred_readm_rate        6583
exp_readm_rate         6583
facility_name             0
facility_id               0
state                     0
measure_name              0
dtype: int64

Data types
facility_name          object
facility_id             int64
state                  object
measure_name           object
discharges            float64
footnote              float64
excess_readm_ratio    float64
pred_readm_rate       float64
exp_readm_rate        float64
num_readmissions      float64
dtype: object

Quick numeric summaries:
        discharges  excess_readm_ratio  pred_readm_rate  exp_readm_rate  \
count  8340.000000        11927.000000     11927.000000    11927.000000   
mean    279.269904            1.001719        14.995386       14.961234   
std     266.018069            0.080547         5.017854        4.871997   
min       0.000000            0.477

In [None]:
# Analysis frame: an independent copy of the rows whose ERR is observed.
has_err = df["excess_readm_ratio"].notna()
df_clean = df.loc[has_err].copy()

In [None]:
# Make sure the output folder for saved figures exists.
os.makedirs("figs", exist_ok=True)

# Figure 1: Histogram of Excess Readmission Ratio (overall)
fig1, ax1 = plt.subplots()
df_clean["excess_readm_ratio"].plot.hist(bins=40, ax=ax1)
ax1.set_title("Distribution of Excess Readmission Ratio (ERR)")
ax1.set_xlabel("ERR")
ax1.set_ylabel("Count")  # replaces the y-label set by the pandas hist call
fig1.tight_layout()
fig1.savefig("figs/fig1_err_hist.png")
plt.close(fig1)  # release the figure once it has been written to disk

# Figure 2: Boxplot of ERR by measure
fig2, ax2 = plt.subplots()
# Sort measures by median ERR so the boxes read in a consistent order.
order = (df_clean.groupby("measure_name")["excess_readm_ratio"]
         .median().sort_values().index.tolist())
data_by_measure = [df_clean.loc[df_clean["measure_name"] == m, "excess_readm_ratio"] for m in order]
# One box per measure at positions 1..N. The tick labels are set on the axis
# rather than through boxplot's `labels=` keyword, which Matplotlib 3.9
# deprecated in favour of `tick_labels`; this form works on both sides of
# that rename and avoids the deprecation warning.
positions = list(range(1, len(order) + 1))
ax2.boxplot(data_by_measure, vert=False, positions=positions, showfliers=False)
ax2.set_yticks(positions)
ax2.set_yticklabels(order)
ax2.set_title("ERR by Measure")
ax2.set_xlabel("ERR")
fig2.tight_layout()
fig2.savefig("figs/fig2_err_by_measure.png")
plt.close(fig2)  # release the figure once it has been written to disk

# Figure 3: Volume vs ERR (log x-axis)
# NOTE(review): `df_vol` was used here without being defined in any visible
# cell, so a fresh-kernel run raised NameError. It is rebuilt from `df_clean`
# keeping only rows with a positive discharge count: NaN compares False and is
# dropped, and zero cannot be placed on a log axis. Confirm this matches the
# filter originally intended.
df_vol = df_clean.loc[df_clean["discharges"] > 0]

fig3, ax3 = plt.subplots()
ax3.scatter(df_vol["discharges"], df_vol["excess_readm_ratio"], alpha=0.4)
ax3.set_xscale("log")
ax3.set_title("Hospital Volume vs ERR")
ax3.set_xlabel("Number of Discharges (log scale)")
ax3.set_ylabel("ERR")
fig3.tight_layout()
fig3.savefig("figs/fig3_volume_vs_err.png")
plt.close(fig3)  # release the figure once it has been written to disk

# Figure 4: Average ERR by state (bar chart)
# Mean ERR per state, ascending; dropna=False keeps a group for missing states.
err_by_state = df_clean.groupby("state", dropna=False)["excess_readm_ratio"]
state_mean = err_by_state.mean().sort_values()

# Grow the figure height with the number of bars, with a floor of 4 inches.
fig_height = max(4, len(state_mean) * 0.18)
fig4, ax4 = plt.subplots(figsize=(6, fig_height))
ax4.barh(state_mean.index.astype(str), state_mean.values)
ax4.set_title("Average ERR by State")
ax4.set_xlabel("Mean ERR")
ax4.set_ylabel("State")
fig4.tight_layout()
fig4.savefig("figs/fig4_state_mean_err.png")
plt.close(fig4)  # release the figure once it has been written to disk


  plt.boxplot(data_by_measure, vert=False, labels=order, showfliers=False)
