# Part 5: Exploratory Analysis

In [309]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
from dotenv import load_dotenv
from datetime import date, datetime, timedelta
import glob
from ast import literal_eval
from collections import Counter
from tqdm.auto import tqdm
import pyarrow as pa

In [310]:
drg = pd.read_parquet('D://Vignesh/Capstone/combined/drg_parquet/drg.parquet', engine='pyarrow')
cms = pd.read_csv('Hospital_Metrics/Medicare_Inpatient_Hospital_by_Provider_and_Service_2020.csv', encoding='windows-1252')

In [311]:
drg.head()

Unnamed: 0,billing_type,billing_code,negotiated_rates,ccn
0,MS-DRG,1,135000.0,50327
1,MS-DRG,2,135000.0,50327
2,MS-DRG,3,9000.0,50327
3,MS-DRG,4,9000.0,50327
4,MS-DRG,5,135095.0,50327


In [312]:
drg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9777 entries, 0 to 9776
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   billing_type      9777 non-null   object 
 1   billing_code      9777 non-null   object 
 2   negotiated_rates  9777 non-null   float64
 3   ccn               9777 non-null   object 
dtypes: float64(1), object(3)
memory usage: 305.7+ KB


In [313]:
cms.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165281 entries, 0 to 165280
Data columns (total 15 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Rndrng_Prvdr_CCN           165281 non-null  int64  
 1   Rndrng_Prvdr_Org_Name      165281 non-null  object 
 2   Rndrng_Prvdr_St            165281 non-null  object 
 3   Rndrng_Prvdr_City          165281 non-null  object 
 4   Rndrng_Prvdr_State_Abrvtn  165281 non-null  object 
 5   Rndrng_Prvdr_State_FIPS    165281 non-null  int64  
 6   Rndrng_Prvdr_Zip5          165281 non-null  int64  
 7   Rndrng_Prvdr_RUCA          165281 non-null  float64
 8   Rndrng_Prvdr_RUCA_Desc     165281 non-null  object 
 9   DRG_Cd                     165281 non-null  int64  
 10  DRG_Desc                   165281 non-null  object 
 11  Tot_Dschrgs                165281 non-null  int64  
 12  Avg_Submtd_Cvrd_Chrg       165281 non-null  float64
 13  Avg_Tot_Pymt_Amt           16

In [314]:
drg = drg.astype({'billing_code': 'str','ccn':'int64'})
drg['billing_code'] = drg.apply(lambda row: row['billing_code'] if row['billing_code'].isdigit() else None, axis=1)

drg.head()

Unnamed: 0,billing_type,billing_code,negotiated_rates,ccn
0,MS-DRG,1,135000.0,50327
1,MS-DRG,2,135000.0,50327
2,MS-DRG,3,9000.0,50327
3,MS-DRG,4,9000.0,50327
4,MS-DRG,5,135095.0,50327


In [315]:
drg.dropna(subset=['billing_code'],inplace=True)
drg = drg.astype({'billing_code': 'int64'})

In [316]:
merged = drg.merge(cms, how='inner', left_on=['ccn','billing_code'], right_on=['Rndrng_Prvdr_CCN','DRG_Cd'])

In [317]:
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 799 entries, 0 to 798
Data columns (total 19 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   billing_type               799 non-null    object 
 1   billing_code               799 non-null    int64  
 2   negotiated_rates           799 non-null    float64
 3   ccn                        799 non-null    int64  
 4   Rndrng_Prvdr_CCN           799 non-null    int64  
 5   Rndrng_Prvdr_Org_Name      799 non-null    object 
 6   Rndrng_Prvdr_St            799 non-null    object 
 7   Rndrng_Prvdr_City          799 non-null    object 
 8   Rndrng_Prvdr_State_Abrvtn  799 non-null    object 
 9   Rndrng_Prvdr_State_FIPS    799 non-null    int64  
 10  Rndrng_Prvdr_Zip5          799 non-null    int64  
 11  Rndrng_Prvdr_RUCA          799 non-null    float64
 12  Rndrng_Prvdr_RUCA_Desc     799 non-null    object 
 13  DRG_Cd                     799 non-null    int64  

In [318]:
merged.head(10)

Unnamed: 0,billing_type,billing_code,negotiated_rates,ccn,Rndrng_Prvdr_CCN,Rndrng_Prvdr_Org_Name,Rndrng_Prvdr_St,Rndrng_Prvdr_City,Rndrng_Prvdr_State_Abrvtn,Rndrng_Prvdr_State_FIPS,Rndrng_Prvdr_Zip5,Rndrng_Prvdr_RUCA,Rndrng_Prvdr_RUCA_Desc,DRG_Cd,DRG_Desc,Tot_Dschrgs,Avg_Submtd_Cvrd_Chrg,Avg_Tot_Pymt_Amt,Avg_Mdcr_Pymt_Amt
0,MS-DRG,3,9000.0,50327,50327,Loma Linda University Medical Center,11234 Anderson St,Loma Linda,CA,6,92354,1.0,Metropolitan area core: primary flow within an...,3,"""ECMO OR TRACHEOSTOMY WITH MV >96 HOURS OR PRI...",23,1090380.0,211395.52174,179499.56522
1,MS-DRG,4,9000.0,50327,50327,Loma Linda University Medical Center,11234 Anderson St,Loma Linda,CA,6,92354,1.0,Metropolitan area core: primary flow within an...,4,"TRACH W MV >96 HRS OR PDX EXC FACE, MOUTH & NE...",18,946924.1,153933.11111,138166.72222
2,MS-DRG,5,135095.0,50327,50327,Loma Linda University Medical Center,11234 Anderson St,Loma Linda,CA,6,92354,1.0,Metropolitan area core: primary flow within an...,5,LIVER TRANSPLANT WITH MCC OR INTESTINAL TRANSP...,20,1083387.0,143484.3,129179.25
3,MS-DRG,11,9000.0,50327,50327,Loma Linda University Medical Center,11234 Anderson St,Loma Linda,CA,6,92354,1.0,Metropolitan area core: primary flow within an...,11,"TRACHEOSTOMY FOR FACE, MOUTH & NECK DIAGNOSES ...",12,394920.2,68277.75,59581.916667
4,MS-DRG,23,9000.0,50327,50327,Loma Linda University Medical Center,11234 Anderson St,Loma Linda,CA,6,92354,1.0,Metropolitan area core: primary flow within an...,23,CRANIOTOMY W MAJOR DEVICE IMPLANT OR ACUTE COM...,21,382940.8,66471.142857,59667.809524
5,MS-DRG,25,9000.0,50327,50327,Loma Linda University Medical Center,11234 Anderson St,Loma Linda,CA,6,92354,1.0,Metropolitan area core: primary flow within an...,25,CRANIOTOMY & ENDOVASCULAR INTRACRANIAL PROCEDU...,26,361781.9,50147.307692,45107.153846
6,MS-DRG,27,9000.0,50327,50327,Loma Linda University Medical Center,11234 Anderson St,Loma Linda,CA,6,92354,1.0,Metropolitan area core: primary flow within an...,27,CRANIOTOMY & ENDOVASCULAR INTRACRANIAL PROCEDU...,21,163565.6,26104.333333,23466.095238
7,MS-DRG,64,9000.0,50327,50327,Loma Linda University Medical Center,11234 Anderson St,Loma Linda,CA,6,92354,1.0,Metropolitan area core: primary flow within an...,64,INTRACRANIAL HEMORRHAGE OR CEREBRAL INFARCTION...,43,198208.9,26364.906977,21787.767442
8,MS-DRG,65,9000.0,50327,50327,Loma Linda University Medical Center,11234 Anderson St,Loma Linda,CA,6,92354,1.0,Metropolitan area core: primary flow within an...,65,INTRACRANIAL HEMORRHAGE OR CEREBRAL INFARCTION...,30,81484.13,11613.433333,9173.8
9,MS-DRG,70,9000.0,50327,50327,Loma Linda University Medical Center,11234 Anderson St,Loma Linda,CA,6,92354,1.0,Metropolitan area core: primary flow within an...,70,NONSPECIFIC CEREBROVASCULAR DISORDERS W MCC,19,136101.9,20190.473684,15949.315789


In [319]:
merged['ccn'].nunique()

15

| Code Type | Code Value | Description | Associated CMS Metric | Metric Details
| --- | --- | --- | --- | --- |
| CPT | 36556 | Under Insertion of Central Venous Access Device | HAI_1 | Central Line Associated Bloodstream Infection
| CPT | 51701 | Under Introduction Procedures on the Bladder | HAI_2 | Catheter Associated Urinary Tract Infections
| CPT | 51702 | Under Introduction Procedures on the Bladder | HAI_2 | Catheter Associated Urinary Tract Infections
| HCPCS | A4314 | Insertion tray with drainage bag with indwelling catheter, Foley type, 2-way latex with coating (Teflon, silicone, silicone elastomer or hydrophilic, etc.) | HAI_2 | Catheter Associated Urinary Tract Infections
| HCPCS | A4315 | Insertion tray with drainage bag with indwelling catheter, Foley type, 2-way, all silicone | HAI_2 | Catheter Associated Urinary Tract Infections
| HCPCS | G9312 | Surgical site infection | HAI_3 | Surgical Site Infection - Colon Surgery
| CPT | 58150 | Under Hysterectomy Procedures | HAI_4 | Surgical Site Infection - Abdominal Hysterectomy
| CPT | 15920 | Under Pressure Ulcers (Decubitus Ulcers) Procedures | PSI-3 | Pressure Ulcer Rate
| CPT | 35800 | Under Repair, Excision, Exploration, Revision Procedures on Arteries and Veins | PSI-9 | Postoperative hemorrhage or hematoma rate
| HCPCS | J1650 | Injection, enoxaparin sodium, 10 mg | PSI-12 | Perioperative pulmonary embolism or deep vein thrombosis rate
| HCPCS | C9604 | Percutaneous transluminal revascularization of or through coronary artery bypass graft (internal mammary, free arterial, venous), any combination of drug-eluting intracoronary stent, atherectomy and angioplasty, including distal protection when performed | MORT_30_CABG | Death rate for CABG surgery patients


PSI 90 is a composite metric of all PSI scores (patient safety scores). Since it should be highly correlated with all surgeries, I think it would be appropriate to add for CCN availible.

In [320]:
code_values = [['CPT','36556'],['CPT','51701'],['CPT','51702'],['HCPCS','A4314'],
        ['HCPCS','A4315'],['HCPCS','G9312'],['CPT','58150'],['CPT','15920'],
        ['CPT','35800'],['HCPCS','J1650'],['HCPCS','C9604']]

cpt = pd.read_parquet('D:\Vignesh\Capstone\combined\cpt_parquet\cpt.parquet')
hcpcs = pd.read_parquet('D:\Vignesh\Capstone\combined\hcpcs_parquet\hcpcs.parquet')

In [321]:
cpt.head()

Unnamed: 0,billing_type,billing_code,negotiated_rates,ccn
0,CPT,0001A,16.94,290003
1,CPT,0001U,576.0,290003
2,CPT,0002A,28.39,290003
3,CPT,0002M,402.72,290003
4,CPT,0002U,20.0,290003


In [322]:
cpt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14771907 entries, 0 to 14771906
Data columns (total 4 columns):
 #   Column            Dtype  
---  ------            -----  
 0   billing_type      object 
 1   billing_code      object 
 2   negotiated_rates  float64
 3   ccn               object 
dtypes: float64(1), object(3)
memory usage: 450.8+ MB


In [323]:
df = pd.DataFrame(columns=['billing_type','billing_code','negotiated_rates','ccn'])

for [code_type, value] in code_values:
    if code_type == 'CPT':
        temp = cpt[cpt['billing_code'] == value]
        df = pd.concat([df,temp], axis=0, ignore_index=True)
    else:
        temp = hcpcs[hcpcs['billing_code'] == value]
        df = pd.concat([df,temp], axis=0, ignore_index=True)

In [324]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12530 entries, 0 to 12529
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   billing_type      12530 non-null  object 
 1   billing_code      12530 non-null  object 
 2   negotiated_rates  12530 non-null  float64
 3   ccn               12530 non-null  object 
dtypes: float64(1), object(3)
memory usage: 391.7+ KB


In [325]:
df.head()

Unnamed: 0,billing_type,billing_code,negotiated_rates,ccn
0,CPT,36556,221.3,290003
1,CPT,36556,239.31,290039
2,CPT,36556,92.87,290039
3,CPT,36556,309.82,290003
4,CPT,36556,221.3,290007


In [326]:
infections = pd.read_csv('Hospital_Metrics\Healthcare_Associated_Infections-Hospital.csv', 
                         usecols=['Facility ID','Measure ID','Score'])

In [327]:
infections.head()

Unnamed: 0,Facility ID,Measure ID,Score
0,10001,HAI_1_CILOWER,0.289
1,10001,HAI_1_CIUPPER,1.307
2,10001,HAI_1_DOPC,10024.0
3,10001,HAI_1_ELIGCASES,10.597
4,10001,HAI_1_NUMERATOR,7.0


In [328]:
inf_piv = infections.pivot(index='Facility ID',columns='Measure ID', values='Score').reset_index()
inf_piv.head()

Measure ID,Facility ID,HAI_1_CILOWER,HAI_1_CIUPPER,HAI_1_DOPC,HAI_1_ELIGCASES,HAI_1_NUMERATOR,HAI_1_SIR,HAI_2_CILOWER,HAI_2_CIUPPER,HAI_2_DOPC,...,HAI_5_DOPC,HAI_5_ELIGCASES,HAI_5_NUMERATOR,HAI_5_SIR,HAI_6_CILOWER,HAI_6_CIUPPER,HAI_6_DOPC,HAI_6_ELIGCASES,HAI_6_NUMERATOR,HAI_6_SIR
0,10001,0.289,1.307,10024,10.597,7,0.661,0.140,0.570,17731,...,101908,9.412,8,0.850,0.492,0.868,101451,72.686,48,0.660
1,10005,1.791,6.741,3713,2.45,9,3.673,0.487,2.498,8670,...,38413,1.999,0,0.000,0.419,1.575,35686,10.484,9,0.858
2,10006,0.307,1.575,7318,7.924,6,0.757,0.050,0.534,11755,...,62709,4.164,6,1.441,0.015,0.292,54159,22.618,2,0.088
3,10007,Not Available,Not Available,268,0.165,0,Not Available,Not Available,Not Available,1417,...,5484,0.132,0,Not Available,0.023,2.296,5413,2.148,1,0.466
4,10008,Not Available,Not Available,14,0.008,0,Not Available,Not Available,Not Available,488,...,2171,0.051,0,Not Available,Not Available,Not Available,2171,0.398,0,Not Available


In [329]:
col = inf_piv.columns.to_list()
col

['Facility ID',
 'HAI_1_CILOWER',
 'HAI_1_CIUPPER',
 'HAI_1_DOPC',
 'HAI_1_ELIGCASES',
 'HAI_1_NUMERATOR',
 'HAI_1_SIR',
 'HAI_2_CILOWER',
 'HAI_2_CIUPPER',
 'HAI_2_DOPC',
 'HAI_2_ELIGCASES',
 'HAI_2_NUMERATOR',
 'HAI_2_SIR',
 'HAI_3_CILOWER',
 'HAI_3_CIUPPER',
 'HAI_3_DOPC',
 'HAI_3_ELIGCASES',
 'HAI_3_NUMERATOR',
 'HAI_3_SIR',
 'HAI_4_CILOWER',
 'HAI_4_CIUPPER',
 'HAI_4_DOPC',
 'HAI_4_ELIGCASES',
 'HAI_4_NUMERATOR',
 'HAI_4_SIR',
 'HAI_5_CILOWER',
 'HAI_5_CIUPPER',
 'HAI_5_DOPC',
 'HAI_5_ELIGCASES',
 'HAI_5_NUMERATOR',
 'HAI_5_SIR',
 'HAI_6_CILOWER',
 'HAI_6_CIUPPER',
 'HAI_6_DOPC',
 'HAI_6_ELIGCASES',
 'HAI_6_NUMERATOR',
 'HAI_6_SIR']

In [330]:
inf_piv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4848 entries, 0 to 4847
Data columns (total 37 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Facility ID      4848 non-null   object
 1   HAI_1_CILOWER    4848 non-null   object
 2   HAI_1_CIUPPER    4848 non-null   object
 3   HAI_1_DOPC       4848 non-null   object
 4   HAI_1_ELIGCASES  4848 non-null   object
 5   HAI_1_NUMERATOR  4848 non-null   object
 6   HAI_1_SIR        4848 non-null   object
 7   HAI_2_CILOWER    4848 non-null   object
 8   HAI_2_CIUPPER    4848 non-null   object
 9   HAI_2_DOPC       4848 non-null   object
 10  HAI_2_ELIGCASES  4848 non-null   object
 11  HAI_2_NUMERATOR  4848 non-null   object
 12  HAI_2_SIR        4848 non-null   object
 13  HAI_3_CILOWER    4848 non-null   object
 14  HAI_3_CIUPPER    4848 non-null   object
 15  HAI_3_DOPC       4848 non-null   object
 16  HAI_3_ELIGCASES  4848 non-null   object
 17  HAI_3_NUMERATOR  4848 non-null   

In [331]:
safety = pd.read_csv('Hospital_Metrics\CMS_PSI_6_decimal_file.csv', 
                         usecols=['Facility ID','Measure ID','Rate'])

In [332]:
safety.head()

Unnamed: 0,Facility ID,Measure ID,Rate
0,10001,PSI_03,0.231881
1,10001,PSI_06,0.167117
2,10001,PSI_08,0.095461
3,10001,PSI_09,2.330697
4,10001,PSI_10,0.60954


In [333]:
saf_piv = safety.pivot(index='Facility ID',columns='Measure ID',values='Rate').reset_index()
saf_piv.head()

Measure ID,Facility ID,PSI_03,PSI_06,PSI_08,PSI_09,PSI_10,PSI_11,PSI_12,PSI_13,PSI_14,PSI_15,PSI_90
0,10001,0.231881,0.167117,0.095461,2.330697,0.609540,8.923957,3.328297,5.980974,0.649012,1.209750,1.005236
1,10005,0.859076,0.173883,0.057663,2.075276,0.756613,6.869380,2.542284,3.440264,0.762277,0.866989,0.908322
2,10006,1.829317,0.261801,0.046622,3.457041,0.651895,3.893786,2.798522,3.721339,0.675144,1.334372,1.099465
3,10007,0.319664,0.183425,0.072035,2.366388,0.914303,6.005962,3.799432,4.048369,Not Available,1.018783,0.993102
4,10008,0.498143,0.185437,0.073861,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available


In [334]:
measure = inf_piv.merge(saf_piv, how='inner', on='Facility ID')
measure.head()

Measure ID,Facility ID,HAI_1_CILOWER,HAI_1_CIUPPER,HAI_1_DOPC,HAI_1_ELIGCASES,HAI_1_NUMERATOR,HAI_1_SIR,HAI_2_CILOWER,HAI_2_CIUPPER,HAI_2_DOPC,...,PSI_06,PSI_08,PSI_09,PSI_10,PSI_11,PSI_12,PSI_13,PSI_14,PSI_15,PSI_90
0,10001,0.289,1.307,10024,10.597,7,0.661,0.140,0.570,17731,...,0.167117,0.095461,2.330697,0.609540,8.923957,3.328297,5.980974,0.649012,1.209750,1.005236
1,10005,1.791,6.741,3713,2.45,9,3.673,0.487,2.498,8670,...,0.173883,0.057663,2.075276,0.756613,6.869380,2.542284,3.440264,0.762277,0.866989,0.908322
2,10006,0.307,1.575,7318,7.924,6,0.757,0.050,0.534,11755,...,0.261801,0.046622,3.457041,0.651895,3.893786,2.798522,3.721339,0.675144,1.334372,1.099465
3,10007,Not Available,Not Available,268,0.165,0,Not Available,Not Available,Not Available,1417,...,0.183425,0.072035,2.366388,0.914303,6.005962,3.799432,4.048369,Not Available,1.018783,0.993102
4,10008,Not Available,Not Available,14,0.008,0,Not Available,Not Available,Not Available,488,...,0.185437,0.073861,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available


In [335]:
measure.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4813 entries, 0 to 4812
Data columns (total 48 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Facility ID      4813 non-null   object
 1   HAI_1_CILOWER    4813 non-null   object
 2   HAI_1_CIUPPER    4813 non-null   object
 3   HAI_1_DOPC       4813 non-null   object
 4   HAI_1_ELIGCASES  4813 non-null   object
 5   HAI_1_NUMERATOR  4813 non-null   object
 6   HAI_1_SIR        4813 non-null   object
 7   HAI_2_CILOWER    4813 non-null   object
 8   HAI_2_CIUPPER    4813 non-null   object
 9   HAI_2_DOPC       4813 non-null   object
 10  HAI_2_ELIGCASES  4813 non-null   object
 11  HAI_2_NUMERATOR  4813 non-null   object
 12  HAI_2_SIR        4813 non-null   object
 13  HAI_3_CILOWER    4813 non-null   object
 14  HAI_3_CIUPPER    4813 non-null   object
 15  HAI_3_DOPC       4813 non-null   object
 16  HAI_3_ELIGCASES  4813 non-null   object
 17  HAI_3_NUMERATOR  4813 non-null   

In [336]:
measure.dtypes

Measure ID
Facility ID        object
HAI_1_CILOWER      object
HAI_1_CIUPPER      object
HAI_1_DOPC         object
HAI_1_ELIGCASES    object
HAI_1_NUMERATOR    object
HAI_1_SIR          object
HAI_2_CILOWER      object
HAI_2_CIUPPER      object
HAI_2_DOPC         object
HAI_2_ELIGCASES    object
HAI_2_NUMERATOR    object
HAI_2_SIR          object
HAI_3_CILOWER      object
HAI_3_CIUPPER      object
HAI_3_DOPC         object
HAI_3_ELIGCASES    object
HAI_3_NUMERATOR    object
HAI_3_SIR          object
HAI_4_CILOWER      object
HAI_4_CIUPPER      object
HAI_4_DOPC         object
HAI_4_ELIGCASES    object
HAI_4_NUMERATOR    object
HAI_4_SIR          object
HAI_5_CILOWER      object
HAI_5_CIUPPER      object
HAI_5_DOPC         object
HAI_5_ELIGCASES    object
HAI_5_NUMERATOR    object
HAI_5_SIR          object
HAI_6_CILOWER      object
HAI_6_CIUPPER      object
HAI_6_DOPC         object
HAI_6_ELIGCASES    object
HAI_6_NUMERATOR    object
HAI_6_SIR          object
PSI_03             object
P

In [337]:
col = measure.columns.to_list()
col.remove('Facility ID')
for c in col:
    measure[c] = pd.to_numeric(measure[c],errors='coerce')
measure['Facility ID'] = pd.to_numeric(measure['Facility ID'],errors='coerce')
measure.dropna(subset=['Facility ID'],inplace=True)
measure.drop_duplicates(subset=['Facility ID'],inplace=True)
measure = measure.astype({'Facility ID':'int64'})
types = {c:'float64' for c in col}
measure = measure.astype(types)


In [338]:
cost = pd.read_csv('Hospital_Metrics\Hospital_Cost_Report_2019.csv', 
                         usecols=['Provider CCN','City', 'State Code', 'Zip Code',
                                'Rural Versus Urban','CCN Facility Type',
                                'Fiscal Year Begin Date', 'Fiscal Year End Date',
                                'Number of Interns and Residents (FTE)', 'Number of Beds',
                                'Total Days (V + XVIII + XIX + Unknown) + Total for all Subproviders',
                                'Total Discharges (V + XVIII + XIX + Unknown)',
                                'Hospital Total Days (V + XVIII + XIX + Unknown) For Adults &amp; Peds',
                                'Hospital Number of Beds For Adults &amp; Peds',
                                'Hospital Total Bed Days Available For Adults &amp; Peds',
                                'Hospital Total Discharges (V + XVIII + XIX + Unknown) For Adults &amp; Peds'
                                ])
cost.head()

Unnamed: 0,Provider CCN,City,State Code,Zip Code,Rural Versus Urban,CCN Facility Type,Fiscal Year Begin Date,Fiscal Year End Date,Number of Interns and Residents (FTE),Number of Beds,Total Discharges (V + XVIII + XIX + Unknown),Total Days (V + XVIII + XIX + Unknown) + Total for all Subproviders,Hospital Total Days (V + XVIII + XIX + Unknown) For Adults &amp; Peds,Hospital Number of Beds For Adults &amp; Peds,Hospital Total Bed Days Available For Adults &amp; Peds,Hospital Total Discharges (V + XVIII + XIX + Unknown) For Adults &amp; Peds
0,40018,VAN BUREN,AR,72956,U,STH,11/01/2018,12/31/2018,,39.0,168.0,481.0,444.0,31.0,1891.0,168.0
1,100044,STUART,FL,34995,U,STH,10/01/2018,12/31/2018,,521.0,26952.0,31535.0,26412.0,447.0,41124.0,26952.0
2,450090,GAINSVILLE,TX,76240-,R,STH,10/01/2018,11/30/2018,,36.0,175.0,5205.0,534.0,31.0,1708.0,175.0
3,40055,FORT SMITH,AR,72901,U,STH,11/01/2018,12/31/2018,4.26,320.0,2077.0,10556.0,9015.0,289.0,17629.0,2077.0
4,201302,BOOTHBAY HARBOR,ME,04538-,R,CAH,10/01/2018,12/31/2018,,25.0,382.0,19312.0,1377.0,21.0,7665.0,382.0


In [339]:
cost.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6118 entries, 0 to 6117
Data columns (total 16 columns):
 #   Column                                                                       Non-Null Count  Dtype  
---  ------                                                                       --------------  -----  
 0   Provider CCN                                                                 6118 non-null   int64  
 1   City                                                                         6118 non-null   object 
 2   State Code                                                                   6118 non-null   object 
 3   Zip Code                                                                     6118 non-null   object 
 4   Rural Versus Urban                                                           6049 non-null   object 
 5   CCN Facility Type                                                            6118 non-null   object 
 6   Fiscal Year Begin Date                  

In [340]:
cost[['Fiscal Year Begin Date', 'Fiscal Year End Date']]= cost[['Fiscal Year Begin Date', 'Fiscal Year End Date']].apply(pd.to_datetime)
cost['Zip Code'] = cost['Zip Code'].str[:5]
cat = ['City','State Code','Zip Code', 'Rural Versus Urban', 'CCN Facility Type']
types = {c:'category' for c in cat}
cost = cost.astype(types)
cost.head()

Unnamed: 0,Provider CCN,City,State Code,Zip Code,Rural Versus Urban,CCN Facility Type,Fiscal Year Begin Date,Fiscal Year End Date,Number of Interns and Residents (FTE),Number of Beds,Total Discharges (V + XVIII + XIX + Unknown),Total Days (V + XVIII + XIX + Unknown) + Total for all Subproviders,Hospital Total Days (V + XVIII + XIX + Unknown) For Adults &amp; Peds,Hospital Number of Beds For Adults &amp; Peds,Hospital Total Bed Days Available For Adults &amp; Peds,Hospital Total Discharges (V + XVIII + XIX + Unknown) For Adults &amp; Peds
0,40018,VAN BUREN,AR,72956,U,STH,2018-11-01,2018-12-31,,39.0,168.0,481.0,444.0,31.0,1891.0,168.0
1,100044,STUART,FL,34995,U,STH,2018-10-01,2018-12-31,,521.0,26952.0,31535.0,26412.0,447.0,41124.0,26952.0
2,450090,GAINSVILLE,TX,76240,R,STH,2018-10-01,2018-11-30,,36.0,175.0,5205.0,534.0,31.0,1708.0,175.0
3,40055,FORT SMITH,AR,72901,U,STH,2018-11-01,2018-12-31,4.26,320.0,2077.0,10556.0,9015.0,289.0,17629.0,2077.0
4,201302,BOOTHBAY HARBOR,ME,4538,R,CAH,2018-10-01,2018-12-31,,25.0,382.0,19312.0,1377.0,21.0,7665.0,382.0


In [341]:
cost['Time Period'] = cost['Fiscal Year End Date'] - cost['Fiscal Year Begin Date']
cost.head()

Unnamed: 0,Provider CCN,City,State Code,Zip Code,Rural Versus Urban,CCN Facility Type,Fiscal Year Begin Date,Fiscal Year End Date,Number of Interns and Residents (FTE),Number of Beds,Total Discharges (V + XVIII + XIX + Unknown),Total Days (V + XVIII + XIX + Unknown) + Total for all Subproviders,Hospital Total Days (V + XVIII + XIX + Unknown) For Adults &amp; Peds,Hospital Number of Beds For Adults &amp; Peds,Hospital Total Bed Days Available For Adults &amp; Peds,Hospital Total Discharges (V + XVIII + XIX + Unknown) For Adults &amp; Peds,Time Period
0,40018,VAN BUREN,AR,72956,U,STH,2018-11-01,2018-12-31,,39.0,168.0,481.0,444.0,31.0,1891.0,168.0,60 days
1,100044,STUART,FL,34995,U,STH,2018-10-01,2018-12-31,,521.0,26952.0,31535.0,26412.0,447.0,41124.0,26952.0,91 days
2,450090,GAINSVILLE,TX,76240,R,STH,2018-10-01,2018-11-30,,36.0,175.0,5205.0,534.0,31.0,1708.0,175.0,60 days
3,40055,FORT SMITH,AR,72901,U,STH,2018-11-01,2018-12-31,4.26,320.0,2077.0,10556.0,9015.0,289.0,17629.0,2077.0,60 days
4,201302,BOOTHBAY HARBOR,ME,4538,R,CAH,2018-10-01,2018-12-31,,25.0,382.0,19312.0,1377.0,21.0,7665.0,382.0,91 days


In [342]:
columns_to_convert = ['Total Discharges (V + XVIII + XIX + Unknown)', 'Total Days (V + XVIII + XIX + Unknown) + Total for all Subproviders',
                      'Hospital Total Bed Days Available For Adults &amp; Peds',
                      'Hospital Total Days (V + XVIII + XIX + Unknown) For Adults &amp; Peds',
                      'Hospital Total Discharges (V + XVIII + XIX + Unknown) For Adults &amp; Peds']

for column in columns_to_convert:
    cost[column] = cost[column] * (pd.Timedelta('365 days') / cost['Time Period'])

In [343]:
cost.head()

Unnamed: 0,Provider CCN,City,State Code,Zip Code,Rural Versus Urban,CCN Facility Type,Fiscal Year Begin Date,Fiscal Year End Date,Number of Interns and Residents (FTE),Number of Beds,Total Discharges (V + XVIII + XIX + Unknown),Total Days (V + XVIII + XIX + Unknown) + Total for all Subproviders,Hospital Total Days (V + XVIII + XIX + Unknown) For Adults &amp; Peds,Hospital Number of Beds For Adults &amp; Peds,Hospital Total Bed Days Available For Adults &amp; Peds,Hospital Total Discharges (V + XVIII + XIX + Unknown) For Adults &amp; Peds,Time Period
0,40018,VAN BUREN,AR,72956,U,STH,2018-11-01,2018-12-31,,39.0,1022.0,2926.083333,2701.0,31.0,11503.583333,1022.0,60 days
1,100044,STUART,FL,34995,U,STH,2018-10-01,2018-12-31,,521.0,108104.175824,126486.538462,105938.241758,447.0,164947.912088,108104.175824,91 days
2,450090,GAINSVILLE,TX,76240,R,STH,2018-10-01,2018-11-30,,36.0,1064.583333,31663.75,3248.5,31.0,10390.333333,1064.583333,60 days
3,40055,FORT SMITH,AR,72901,U,STH,2018-11-01,2018-12-31,4.26,320.0,12635.083333,64215.666667,54841.25,289.0,107243.083333,12635.083333,60 days
4,201302,BOOTHBAY HARBOR,ME,4538,R,CAH,2018-10-01,2018-12-31,,25.0,1532.197802,77460.21978,5523.131868,21.0,30744.230769,1532.197802,91 days


In [344]:
cost_measure = measure.merge(cost,how='inner',left_on='Facility ID',right_on='Provider CCN')

In [345]:
cost_measure.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4668 entries, 0 to 4667
Data columns (total 65 columns):
 #   Column                                                                       Non-Null Count  Dtype          
---  ------                                                                       --------------  -----          
 0   Facility ID                                                                  4668 non-null   int64          
 1   HAI_1_CILOWER                                                                1896 non-null   float64        
 2   HAI_1_CIUPPER                                                                2098 non-null   float64        
 3   HAI_1_DOPC                                                                   3736 non-null   float64        
 4   HAI_1_ELIGCASES                                                              3736 non-null   float64        
 5   HAI_1_NUMERATOR                                                              3736 non-null

In [346]:
cost_measure['Facility ID'].nunique() 

4595

In [347]:
df = df.astype({'ccn':'int64'})
final = df.merge(right=cost_measure,how='inner',left_on='ccn',right_on='Facility ID')
final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12266 entries, 0 to 12265
Data columns (total 69 columns):
 #   Column                                                                       Non-Null Count  Dtype          
---  ------                                                                       --------------  -----          
 0   billing_type                                                                 12266 non-null  object         
 1   billing_code                                                                 12266 non-null  object         
 2   negotiated_rates                                                             12266 non-null  float64        
 3   ccn                                                                          12266 non-null  int64          
 4   Facility ID                                                                  12266 non-null  int64          
 5   HAI_1_CILOWER                                                                9280 non-nu

In [348]:
final.describe()

Unnamed: 0,negotiated_rates,ccn,Facility ID,HAI_1_CILOWER,HAI_1_CIUPPER,HAI_1_DOPC,HAI_1_ELIGCASES,HAI_1_NUMERATOR,HAI_1_SIR,HAI_2_CILOWER,...,Provider CCN,Number of Interns and Residents (FTE),Number of Beds,Total Discharges (V + XVIII + XIX + Unknown),Total Days (V + XVIII + XIX + Unknown) + Total for all Subproviders,Hospital Total Days (V + XVIII + XIX + Unknown) For Adults &amp; Peds,Hospital Number of Beds For Adults &amp; Peds,Hospital Total Bed Days Available For Adults &amp; Peds,Hospital Total Discharges (V + XVIII + XIX + Unknown) For Adults &amp; Peds,Time Period
count,12266.0,12266.0,12266.0,9280.0,9842.0,12035.0,12035.0,12035.0,9842.0,9745.0,...,12266.0,6064.0,12266.0,12266.0,12266.0,12254.0,12254.0,12254.0,12266.0,12266
mean,3244.517782,185144.097913,185144.097913,0.521524,1.993169,12107.750478,12.556499,12.206647,0.977725,0.401496,...,185144.097913,123.892549,336.744415,17740.207698,102317.564451,66250.351717,272.228252,98853.110898,17740.207698,361 days 01:21:14.351866948
std,48042.677932,149006.174367,149006.174367,0.445988,1.167791,15704.893894,17.370191,19.722546,0.687075,0.331791,...,149006.174367,180.711458,405.22049,21808.537927,120720.762499,85128.275756,327.849085,119622.999221,21808.537927,29 days 05:10:05.954698610
min,0.49,10001.0,10001.0,0.004,0.403,4.0,0.001,0.0,0.0,0.003,...,10001.0,0.08,8.0,73.0,449.0,203.557692,8.0,2928.0,73.0,61 days 00:00:00
25%,34.44,60075.0,60075.0,0.214,1.271,2705.0,2.152,1.0,0.548,0.155,...,60075.0,16.28,109.0,5055.851648,32414.807692,16079.0,94.0,32574.244505,5055.851648,364 days 00:00:00
50%,118.635,100157.0,100157.0,0.42,1.633,6763.0,6.356,5.0,0.851,0.315,...,100157.0,34.48,224.0,12257.0,64402.445055,42248.75,192.0,69500.412088,12257.0,364 days 00:00:00
75%,737.2,290041.0,290041.0,0.768,2.378,14447.0,15.166,14.0,1.257,0.596,...,290041.0,166.05,396.0,22766.0,118028.0,82277.417582,320.0,117972.786885,22766.0,365 days 00:00:00
max,999999.99,670076.0,670076.0,4.081,11.146,89310.0,95.144,118.0,5.869,1.794,...,670076.0,1233.83,2735.0,131197.445055,815529.326923,538474.271978,2245.0,821676.167582,131197.445055,365 days 00:00:00


In [349]:
final.drop(columns=['Facility ID', 'Provider CCN', 'Time Period'], inplace=True)
final['Procedure'] = final['billing_type'] + ' ' + final['billing_code']


In [350]:
col = final.columns.to_list()
col

['billing_type',
 'billing_code',
 'negotiated_rates',
 'ccn',
 'HAI_1_CILOWER',
 'HAI_1_CIUPPER',
 'HAI_1_DOPC',
 'HAI_1_ELIGCASES',
 'HAI_1_NUMERATOR',
 'HAI_1_SIR',
 'HAI_2_CILOWER',
 'HAI_2_CIUPPER',
 'HAI_2_DOPC',
 'HAI_2_ELIGCASES',
 'HAI_2_NUMERATOR',
 'HAI_2_SIR',
 'HAI_3_CILOWER',
 'HAI_3_CIUPPER',
 'HAI_3_DOPC',
 'HAI_3_ELIGCASES',
 'HAI_3_NUMERATOR',
 'HAI_3_SIR',
 'HAI_4_CILOWER',
 'HAI_4_CIUPPER',
 'HAI_4_DOPC',
 'HAI_4_ELIGCASES',
 'HAI_4_NUMERATOR',
 'HAI_4_SIR',
 'HAI_5_CILOWER',
 'HAI_5_CIUPPER',
 'HAI_5_DOPC',
 'HAI_5_ELIGCASES',
 'HAI_5_NUMERATOR',
 'HAI_5_SIR',
 'HAI_6_CILOWER',
 'HAI_6_CIUPPER',
 'HAI_6_DOPC',
 'HAI_6_ELIGCASES',
 'HAI_6_NUMERATOR',
 'HAI_6_SIR',
 'PSI_03',
 'PSI_06',
 'PSI_08',
 'PSI_09',
 'PSI_10',
 'PSI_11',
 'PSI_12',
 'PSI_13',
 'PSI_14',
 'PSI_15',
 'PSI_90',
 'City',
 'State Code',
 'Zip Code',
 'Rural Versus Urban',
 'CCN Facility Type',
 'Fiscal Year Begin Date',
 'Fiscal Year End Date',
 'Number of Interns and Residents (FTE)',
 'Number o

In [351]:
col_work = ['ccn', 
            'Procedure',
            'State Code',
            'Rural Versus Urban',
            'PSI_03',
            'PSI_06',
            'PSI_08',
            'PSI_09',
            'PSI_10',
            'PSI_11',
            'PSI_12',
            'PSI_13',
            'PSI_14',
            'PSI_15',
            'PSI_90',
            'HAI_1_SIR',
            'HAI_2_SIR',
            'HAI_3_SIR',
            'HAI_4_SIR',
            'HAI_5_SIR',
            'HAI_6_SIR',
            'Number of Interns and Residents (FTE)',
            'Number of Beds',
            'Total Discharges (V + XVIII + XIX + Unknown)',
            'Total Days (V + XVIII + XIX + Unknown) + Total for all Subproviders',
            'Hospital Total Days (V + XVIII + XIX + Unknown) For Adults &amp; Peds',
            'Hospital Number of Beds For Adults &amp; Peds',
            'Hospital Total Bed Days Available For Adults &amp; Peds',
            'Hospital Total Discharges (V + XVIII + XIX + Unknown) For Adults &amp; Peds',
            'negotiated_rates']

df = final[col_work]
df.head()

Unnamed: 0,ccn,Procedure,State Code,Rural Versus Urban,PSI_03,PSI_06,PSI_08,PSI_09,PSI_10,PSI_11,...,HAI_6_SIR,Number of Interns and Residents (FTE),Number of Beds,Total Discharges (V + XVIII + XIX + Unknown),Total Days (V + XVIII + XIX + Unknown) + Total for all Subproviders,Hospital Total Days (V + XVIII + XIX + Unknown) For Adults &amp; Peds,Hospital Number of Beds For Adults &amp; Peds,Hospital Total Bed Days Available For Adults &amp; Peds,Hospital Total Discharges (V + XVIII + XIX + Unknown) For Adults &amp; Peds,negotiated_rates
0,290003,CPT 36556,NV,U,0.13266,0.184709,0.114236,1.56662,0.650623,9.818181,...,0.109,31.44,604.0,32823.928571,207129.478022,126462.472527,424.0,140216.153846,32823.928571,221.3
1,290003,CPT 36556,NV,U,0.13266,0.184709,0.114236,1.56662,0.650623,9.818181,...,0.109,31.44,604.0,32823.928571,207129.478022,126462.472527,424.0,140216.153846,32823.928571,309.82
2,290003,CPT 36556,NV,U,0.13266,0.184709,0.114236,1.56662,0.650623,9.818181,...,0.109,31.44,604.0,32823.928571,207129.478022,126462.472527,424.0,140216.153846,32823.928571,273.5
3,290003,CPT 36556,NV,U,0.13266,0.184709,0.114236,1.56662,0.650623,9.818181,...,0.109,31.44,604.0,32823.928571,207129.478022,126462.472527,424.0,140216.153846,32823.928571,106.14
4,290003,CPT 36556,NV,U,0.13266,0.184709,0.114236,1.56662,0.650623,9.818181,...,0.109,31.44,604.0,32823.928571,207129.478022,126462.472527,424.0,140216.153846,32823.928571,171.56


In [355]:
df.info()
df = df.astype({'ccn':'category', 'Procedure':'category'})

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12266 entries, 0 to 12265
Data columns (total 30 columns):
 #   Column                                                                       Non-Null Count  Dtype   
---  ------                                                                       --------------  -----   
 0   ccn                                                                          12266 non-null  int64   
 1   Procedure                                                                    12266 non-null  object  
 2   State Code                                                                   12266 non-null  category
 3   Rural Versus Urban                                                           12266 non-null  category
 4   PSI_03                                                                       11476 non-null  float64 
 5   PSI_06                                                                       11476 non-null  float64 
 6   PSI_08                        

In [357]:
procedure = df.groupby(['Procedure'])
sns.boxplot

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000027D1AA087F0>