# Part 5: Exploratory Analysis

In [1]:
import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv
from datetime import date, datetime, timedelta
import glob
from ast import literal_eval
from collections import Counter
from tqdm.auto import tqdm
import pyarrow as pa

In [2]:
# Source files: negotiated MS-DRG rates (parquet) and the 2020 CMS
# "Medicare Inpatient Hospital by Provider and Service" extract (CSV).
drg_path = 'D://Vignesh/Capstone/combined/drg_parquet/drg.parquet'
cms_path = 'Hospital_Metrics/Medicare_Inpatient_Hospital_by_Provider_and_Service_2020.csv'

drg = pd.read_parquet(drg_path, engine='pyarrow')
# The CMS export is not UTF-8; it is read with the Windows-1252 code page.
cms = pd.read_csv(cms_path, encoding='windows-1252')

In [3]:
# Preview the first rows of the negotiated-rate DRG table.
drg.head()

Unnamed: 0,billing_type,billing_code,negotiated_rates,ccn
0,MS-DRG,1,135000.0,50327
1,MS-DRG,2,135000.0,50327
2,MS-DRG,3,9000.0,50327
3,MS-DRG,4,9000.0,50327
4,MS-DRG,5,135095.0,50327


In [4]:
# Column dtypes and non-null counts of the DRG table.
drg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9777 entries, 0 to 9776
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   billing_type      9777 non-null   object 
 1   billing_code      9777 non-null   object 
 2   negotiated_rates  9777 non-null   float64
 3   ccn               9777 non-null   object 
dtypes: float64(1), object(3)
memory usage: 305.7+ KB


In [5]:
# Column dtypes and non-null counts of the CMS inpatient table.
cms.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165281 entries, 0 to 165280
Data columns (total 15 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Rndrng_Prvdr_CCN           165281 non-null  int64  
 1   Rndrng_Prvdr_Org_Name      165281 non-null  object 
 2   Rndrng_Prvdr_St            165281 non-null  object 
 3   Rndrng_Prvdr_City          165281 non-null  object 
 4   Rndrng_Prvdr_State_Abrvtn  165281 non-null  object 
 5   Rndrng_Prvdr_State_FIPS    165281 non-null  int64  
 6   Rndrng_Prvdr_Zip5          165281 non-null  int64  
 7   Rndrng_Prvdr_RUCA          165281 non-null  float64
 8   Rndrng_Prvdr_RUCA_Desc     165281 non-null  object 
 9   DRG_Cd                     165281 non-null  int64  
 10  DRG_Desc                   165281 non-null  object 
 11  Tot_Dschrgs                165281 non-null  int64  
 12  Avg_Submtd_Cvrd_Chrg       165281 non-null  float64
 13  Avg_Tot_Pymt_Amt           16

In [6]:
# Normalise the key columns: billing codes as strings (needed for the digit
# check below) and CCNs as integers.
drg = drg.astype({'billing_code': 'str', 'ccn': 'int64'})

# Keep only billing codes made up entirely of digits; every other code is
# replaced by a missing value (NaN). The vectorised `.str.isdigit()` mask
# replaces a row-wise `DataFrame.apply(..., axis=1)`, which calls a Python
# lambda once per row.
is_numeric_code = drg['billing_code'].str.isdigit()
drg['billing_code'] = drg['billing_code'].where(is_numeric_code)

drg.head()

Unnamed: 0,billing_type,billing_code,negotiated_rates,ccn
0,MS-DRG,1,135000.0,50327
1,MS-DRG,2,135000.0,50327
2,MS-DRG,3,9000.0,50327
3,MS-DRG,4,9000.0,50327
4,MS-DRG,5,135095.0,50327


In [7]:
# Drop the rows whose billing code is missing, then store the remaining codes
# as int64. Written as one chained expression instead of `inplace=True`
# followed by a rebind, so `drg` is only ever replaced, never mutated in place.
drg = (
    drg
    .dropna(subset=['billing_code'])
    .astype({'billing_code': 'int64'})
)

In [8]:
# Inner-join negotiated rates to the CMS rows on (provider CCN, DRG code);
# only pairs present in both tables are kept.
merged = drg.merge(
    right=cms,
    left_on=['ccn', 'billing_code'],
    right_on=['Rndrng_Prvdr_CCN', 'DRG_Cd'],
    how='inner',
)

In [9]:
# Row count and columns produced by the inner join.
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 799 entries, 0 to 798
Data columns (total 19 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   billing_type               799 non-null    object 
 1   billing_code               799 non-null    int64  
 2   negotiated_rates           799 non-null    float64
 3   ccn                        799 non-null    int64  
 4   Rndrng_Prvdr_CCN           799 non-null    int64  
 5   Rndrng_Prvdr_Org_Name      799 non-null    object 
 6   Rndrng_Prvdr_St            799 non-null    object 
 7   Rndrng_Prvdr_City          799 non-null    object 
 8   Rndrng_Prvdr_State_Abrvtn  799 non-null    object 
 9   Rndrng_Prvdr_State_FIPS    799 non-null    int64  
 10  Rndrng_Prvdr_Zip5          799 non-null    int64  
 11  Rndrng_Prvdr_RUCA          799 non-null    float64
 12  Rndrng_Prvdr_RUCA_Desc     799 non-null    object 
 13  DRG_Cd                     799 non-null    int64  

In [10]:
# Spot-check the first ten joined rows.
merged.head(10)

Unnamed: 0,billing_type,billing_code,negotiated_rates,ccn,Rndrng_Prvdr_CCN,Rndrng_Prvdr_Org_Name,Rndrng_Prvdr_St,Rndrng_Prvdr_City,Rndrng_Prvdr_State_Abrvtn,Rndrng_Prvdr_State_FIPS,Rndrng_Prvdr_Zip5,Rndrng_Prvdr_RUCA,Rndrng_Prvdr_RUCA_Desc,DRG_Cd,DRG_Desc,Tot_Dschrgs,Avg_Submtd_Cvrd_Chrg,Avg_Tot_Pymt_Amt,Avg_Mdcr_Pymt_Amt
0,MS-DRG,3,9000.0,50327,50327,Loma Linda University Medical Center,11234 Anderson St,Loma Linda,CA,6,92354,1.0,Metropolitan area core: primary flow within an...,3,"""ECMO OR TRACHEOSTOMY WITH MV >96 HOURS OR PRI...",23,1090380.0,211395.52174,179499.56522
1,MS-DRG,4,9000.0,50327,50327,Loma Linda University Medical Center,11234 Anderson St,Loma Linda,CA,6,92354,1.0,Metropolitan area core: primary flow within an...,4,"TRACH W MV >96 HRS OR PDX EXC FACE, MOUTH & NE...",18,946924.1,153933.11111,138166.72222
2,MS-DRG,5,135095.0,50327,50327,Loma Linda University Medical Center,11234 Anderson St,Loma Linda,CA,6,92354,1.0,Metropolitan area core: primary flow within an...,5,LIVER TRANSPLANT WITH MCC OR INTESTINAL TRANSP...,20,1083387.0,143484.3,129179.25
3,MS-DRG,11,9000.0,50327,50327,Loma Linda University Medical Center,11234 Anderson St,Loma Linda,CA,6,92354,1.0,Metropolitan area core: primary flow within an...,11,"TRACHEOSTOMY FOR FACE, MOUTH & NECK DIAGNOSES ...",12,394920.2,68277.75,59581.916667
4,MS-DRG,23,9000.0,50327,50327,Loma Linda University Medical Center,11234 Anderson St,Loma Linda,CA,6,92354,1.0,Metropolitan area core: primary flow within an...,23,CRANIOTOMY W MAJOR DEVICE IMPLANT OR ACUTE COM...,21,382940.8,66471.142857,59667.809524
5,MS-DRG,25,9000.0,50327,50327,Loma Linda University Medical Center,11234 Anderson St,Loma Linda,CA,6,92354,1.0,Metropolitan area core: primary flow within an...,25,CRANIOTOMY & ENDOVASCULAR INTRACRANIAL PROCEDU...,26,361781.9,50147.307692,45107.153846
6,MS-DRG,27,9000.0,50327,50327,Loma Linda University Medical Center,11234 Anderson St,Loma Linda,CA,6,92354,1.0,Metropolitan area core: primary flow within an...,27,CRANIOTOMY & ENDOVASCULAR INTRACRANIAL PROCEDU...,21,163565.6,26104.333333,23466.095238
7,MS-DRG,64,9000.0,50327,50327,Loma Linda University Medical Center,11234 Anderson St,Loma Linda,CA,6,92354,1.0,Metropolitan area core: primary flow within an...,64,INTRACRANIAL HEMORRHAGE OR CEREBRAL INFARCTION...,43,198208.9,26364.906977,21787.767442
8,MS-DRG,65,9000.0,50327,50327,Loma Linda University Medical Center,11234 Anderson St,Loma Linda,CA,6,92354,1.0,Metropolitan area core: primary flow within an...,65,INTRACRANIAL HEMORRHAGE OR CEREBRAL INFARCTION...,30,81484.13,11613.433333,9173.8
9,MS-DRG,70,9000.0,50327,50327,Loma Linda University Medical Center,11234 Anderson St,Loma Linda,CA,6,92354,1.0,Metropolitan area core: primary flow within an...,70,NONSPECIFIC CEREBROVASCULAR DISORDERS W MCC,19,136101.9,20190.473684,15949.315789


In [11]:
# Number of distinct CCN values that survived the join.
merged['ccn'].nunique()

15

| Code Type | Code Value | Description | Associated CMS Metric | Metric Details
| --- | --- | --- | --- | --- |
| CPT | 36556 | Under Insertion of Central Venous Access Device | HAI_1 | Central Line Associated Bloodstream Infection
| CPT | 51701 | Under Introduction Procedures on the Bladder | HAI_2 | Catheter Associated Urinary Tract Infections
| CPT | 51702 | Under Introduction Procedures on the Bladder | HAI_2 | Catheter Associated Urinary Tract Infections
| HCPCS | A4314 | Insertion tray with drainage bag with indwelling catheter, Foley type, 2-way latex with coating (Teflon, silicone, silicone elastomer or hydrophilic, etc.) | HAI_2 | Catheter Associated Urinary Tract Infections
| HCPCS | A4315 | Insertion tray with drainage bag with indwelling catheter, Foley type, 2-way, all silicone | HAI_2 | Catheter Associated Urinary Tract Infections
| HCPCS | G9312 | Surgical site infection | HAI_3 | Surgical Site Infection - Colon Surgery
| CPT | 58150 | Under Hysterectomy Procedures | HAI_4 | Surgical Site Infection - Abdominal Hysterectomy
| CPT | 15920 | Under Pressure Ulcers (Decubitus Ulcers) Procedures | PSI-3 | Pressure Ulcer Rate
| CPT | 35800 | Under Repair, Excision, Exploration, Revision Procedures on Arteries and Veins | PSI-9 | Postoperative hemorrhage or hematoma rate
| HCPCS | J1650 | Injection, enoxaparin sodium, 10 mg | PSI-12 | Perioperative pulmonary embolism or deep vein thrombosis rate
| HCPCS | C9604 | Percutaneous transluminal revascularization of or through coronary artery bypass graft (internal mammary, free arterial, venous), any combination of drug-eluting intracoronary stent, atherectomy and angioplasty, including distal protection when performed | MORT_30_CABG | Death rate for CABG surgery patients


PSI 90 is a composite metric of all PSI scores (patient safety scores). Since it should be highly correlated with all surgeries, I think it would be appropriate to add it for each CCN where it is available.

In [12]:
# [code type, code value] pairs to look up in the negotiated-rate tables.
code_values = [['CPT','36556'],['CPT','51701'],['CPT','51702'],['HCPCS','A4314'],
        ['HCPCS','A4315'],['HCPCS','G9312'],['CPT','58150'],['CPT','15920'],
        ['CPT','35800'],['HCPCS','J1650'],['HCPCS','C9604']]

# Forward slashes are used because backslashes in a normal string literal form
# invalid escape sequences (`\V`, `\C`, `\c`, `\h`), which Python warns about.
# Forward slashes are accepted on Windows as well.
cpt = pd.read_parquet('D:/Vignesh/Capstone/combined/cpt_parquet/cpt.parquet')
hcpcs = pd.read_parquet('D:/Vignesh/Capstone/combined/hcpcs_parquet/hcpcs.parquet')

In [13]:
# Preview the first rows of the CPT negotiated-rate table.
cpt.head()

Unnamed: 0,billing_type,billing_code,negotiated_rates,ccn
0,CPT,0001A,16.94,290003
1,CPT,0001U,576.0,290003
2,CPT,0002A,28.39,290003
3,CPT,0002M,402.72,290003
4,CPT,0002U,20.0,290003


In [14]:
# Column dtypes and memory usage of the CPT table.
cpt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14771907 entries, 0 to 14771906
Data columns (total 4 columns):
 #   Column            Dtype  
---  ------            -----  
 0   billing_type      object 
 1   billing_code      object 
 2   negotiated_rates  float64
 3   ccn               object 
dtypes: float64(1), object(3)
memory usage: 450.8+ MB


In [15]:
# Collect the negotiated rates for every code of interest.
# Each [code type, value] pair is looked up in the table matching its type:
# 'CPT' codes in `cpt`, every other type in `hcpcs`.
subset_columns = ['billing_type', 'billing_code', 'negotiated_rates', 'ccn']

matches = []
for code_type, value in code_values:
    source = cpt if code_type == 'CPT' else hcpcs
    hits = source[source['billing_code'] == value]
    # Codes with no rows contribute nothing, so empty frames are left out of
    # the concat rather than passed through it.
    if not hits.empty:
        matches.append(hits)

# Concatenate once, in `code_values` order, instead of re-copying a growing
# frame on every loop iteration.
if matches:
    df = pd.concat(matches, axis=0, ignore_index=True)
else:
    # Nothing matched: fall back to an empty frame with the expected columns.
    df = pd.DataFrame(columns=subset_columns)

In [16]:
# Size and dtypes of the filtered CPT/HCPCS subset.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12530 entries, 0 to 12529
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   billing_type      12530 non-null  object 
 1   billing_code      12530 non-null  object 
 2   negotiated_rates  12530 non-null  float64
 3   ccn               12530 non-null  object 
dtypes: float64(1), object(3)
memory usage: 391.7+ KB


In [17]:
# Load the healthcare-associated-infection measures, keeping only the three
# columns used below.
# The path uses a forward slash: a backslash in a normal string literal forms
# the invalid escape sequence `\H` (Python warns about it) and only resolves
# as a directory separator on Windows.
infections = pd.read_csv(
    'Hospital_Metrics/Healthcare_Associated_Infections-Hospital.csv',
    usecols=['Facility ID', 'Measure ID', 'Score'],
)

In [18]:
# Preview the infection measures as loaded (Facility ID, Measure ID, Score).
infections.head()

Unnamed: 0,Facility ID,Measure ID,Score
0,10001,HAI_1_CILOWER,0.289
1,10001,HAI_1_CIUPPER,1.307
2,10001,HAI_1_DOPC,10024.0
3,10001,HAI_1_ELIGCASES,10.597
4,10001,HAI_1_NUMERATOR,7.0


In [19]:
# Reshape the infection measures from long to wide: one row per 'Facility ID'
# and one column per 'Measure ID', holding the 'Score' values.
inf_piv = (
    infections
    .pivot(index='Facility ID', columns='Measure ID', values='Score')
    .reset_index()
    # Clear the 'Measure ID' label left on the column axis. The axis is chosen
    # with the `columns=` keyword because pandas 2.x rejects a positional
    # `axis` argument to `rename_axis` with a TypeError.
    .rename_axis(columns=None)
)
inf_piv.head()

Unnamed: 0_level_0,Score,Score,Score,Score,Score,Score,Score,Score,Score,Score,Score,Score,Score,Score,Score,Score,Score,Score,Score,Score,Score
Measure ID,HAI_1_CILOWER,HAI_1_CIUPPER,HAI_1_DOPC,HAI_1_ELIGCASES,HAI_1_NUMERATOR,HAI_1_SIR,HAI_2_CILOWER,HAI_2_CIUPPER,HAI_2_DOPC,HAI_2_ELIGCASES,...,HAI_5_DOPC,HAI_5_ELIGCASES,HAI_5_NUMERATOR,HAI_5_SIR,HAI_6_CILOWER,HAI_6_CIUPPER,HAI_6_DOPC,HAI_6_ELIGCASES,HAI_6_NUMERATOR,HAI_6_SIR
Facility ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
10001,0.289,1.307,10024,10.597,7,0.661,0.140,0.570,17731,26.63,...,101908,9.412,8,0.850,0.492,0.868,101451,72.686,48,0.660
10005,1.791,6.741,3713,2.45,9,3.673,0.487,2.498,8670,4.995,...,38413,1.999,0,0.000,0.419,1.575,35686,10.484,9,0.858
10006,0.307,1.575,7318,7.924,6,0.757,0.050,0.534,11755,15.296,...,62709,4.164,6,1.441,0.015,0.292,54159,22.618,2,0.088
10007,Not Available,Not Available,268,0.165,0,Not Available,Not Available,Not Available,1417,0.774,...,5484,0.132,0,Not Available,0.023,2.296,5413,2.148,1,0.466
10008,Not Available,Not Available,14,0.008,0,Not Available,Not Available,Not Available,488,0.265,...,2171,0.051,0,Not Available,Not Available,Not Available,2171,0.398,0,Not Available


In [20]:
# Column dtypes and non-null counts of the pivoted infection table.
# NOTE(review): the saved output shows two-level ('Score', ...) columns and
# Facility ID as the index, which does not match the pivot cell as written;
# the output looks stale. Re-run from a fresh kernel to confirm.
inf_piv.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4848 entries, 010001 to 670314
Data columns (total 36 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   (Score, HAI_1_CILOWER)    4848 non-null   object
 1   (Score, HAI_1_CIUPPER)    4848 non-null   object
 2   (Score, HAI_1_DOPC)       4848 non-null   object
 3   (Score, HAI_1_ELIGCASES)  4848 non-null   object
 4   (Score, HAI_1_NUMERATOR)  4848 non-null   object
 5   (Score, HAI_1_SIR)        4848 non-null   object
 6   (Score, HAI_2_CILOWER)    4848 non-null   object
 7   (Score, HAI_2_CIUPPER)    4848 non-null   object
 8   (Score, HAI_2_DOPC)       4848 non-null   object
 9   (Score, HAI_2_ELIGCASES)  4848 non-null   object
 10  (Score, HAI_2_NUMERATOR)  4848 non-null   object
 11  (Score, HAI_2_SIR)        4848 non-null   object
 12  (Score, HAI_3_CILOWER)    4848 non-null   object
 13  (Score, HAI_3_CIUPPER)    4848 non-null   object
 14  (Score, HAI_3_DOPC)   

In [24]:
# Keep a reference to the pivoted table's column index and display it.
# NOTE(review): the saved output is a MultiIndex, which presumably came from an
# earlier version of the pivot cell; confirm after a fresh run.
column = inf_piv.columns
column

MultiIndex([('Score',   'HAI_1_CILOWER'),
            ('Score',   'HAI_1_CIUPPER'),
            ('Score',      'HAI_1_DOPC'),
            ('Score', 'HAI_1_ELIGCASES'),
            ('Score', 'HAI_1_NUMERATOR'),
            ('Score',       'HAI_1_SIR'),
            ('Score',   'HAI_2_CILOWER'),
            ('Score',   'HAI_2_CIUPPER'),
            ('Score',      'HAI_2_DOPC'),
            ('Score', 'HAI_2_ELIGCASES'),
            ('Score', 'HAI_2_NUMERATOR'),
            ('Score',       'HAI_2_SIR'),
            ('Score',   'HAI_3_CILOWER'),
            ('Score',   'HAI_3_CIUPPER'),
            ('Score',      'HAI_3_DOPC'),
            ('Score', 'HAI_3_ELIGCASES'),
            ('Score', 'HAI_3_NUMERATOR'),
            ('Score',       'HAI_3_SIR'),
            ('Score',   'HAI_4_CILOWER'),
            ('Score',   'HAI_4_CIUPPER'),
            ('Score',      'HAI_4_DOPC'),
            ('Score', 'HAI_4_ELIGCASES'),
            ('Score', 'HAI_4_NUMERATOR'),
            ('Score',       'HAI_4

In [21]:
# Load the CMS patient-safety-indicator (PSI) rates, keeping only the three
# columns used below.
# The path uses a forward slash: a backslash in a normal string literal forms
# the invalid escape sequence `\C` (Python warns about it) and only resolves
# as a directory separator on Windows.
safety = pd.read_csv(
    'Hospital_Metrics/CMS_PSI_6_decimal_file.csv',
    usecols=['Facility ID', 'Measure ID', 'Rate'],
)

In [22]:
# Preview the PSI rates as loaded (Facility ID, Measure ID, Rate).
safety.head()

Unnamed: 0,Facility ID,Measure ID,Rate
0,10001,PSI_03,0.231881
1,10001,PSI_06,0.167117
2,10001,PSI_08,0.095461
3,10001,PSI_09,2.330697
4,10001,PSI_10,0.60954


In [47]:
# Long-to-wide reshape of the PSI rates: rows are facilities, columns are
# measures. No `values=` is given, so the result keeps two-level columns
# with the value column name ('Rate') as the outer level.
saf_piv = safety.pivot(
    columns='Measure ID',
    index='Facility ID',
)
saf_piv.head(n=5)

Unnamed: 0_level_0,Rate,Rate,Rate,Rate,Rate,Rate,Rate,Rate,Rate,Rate,Rate
Measure ID,PSI_03,PSI_06,PSI_08,PSI_09,PSI_10,PSI_11,PSI_12,PSI_13,PSI_14,PSI_15,PSI_90
Facility ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
10001,0.231881,0.167117,0.095461,2.330697,0.609540,8.923957,3.328297,5.980974,0.649012,1.209750,1.005236
10005,0.859076,0.173883,0.057663,2.075276,0.756613,6.869380,2.542284,3.440264,0.762277,0.866989,0.908322
10006,1.829317,0.261801,0.046622,3.457041,0.651895,3.893786,2.798522,3.721339,0.675144,1.334372,1.099465
10007,0.319664,0.183425,0.072035,2.366388,0.914303,6.005962,3.799432,4.048369,Not Available,1.018783,0.993102
10008,0.498143,0.185437,0.073861,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available
