## Purpose of this notebook
Merge the SERVICES and CPTEVENTS tables, which contain, respectively, the services and the procedures for which the patients were billed.

In [20]:
import pandas as pd
# Read the raw SERVICES table from disk.
services_df = pd.read_csv(filepath_or_buffer='../../data/raw/SERVICES.csv')
# Print a per-column summary (non-null counts and dtypes) of what was loaded.
services_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73343 entries, 0 to 73342
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   ROW_ID        73343 non-null  int64 
 1   SUBJECT_ID    73343 non-null  int64 
 2   HADM_ID       73343 non-null  int64 
 3   TRANSFERTIME  73343 non-null  object
 4   PREV_SERVICE  14668 non-null  object
 5   CURR_SERVICE  73343 non-null  object
dtypes: int64(3), object(3)
memory usage: 3.4+ MB


In [21]:
# CURR_SERVICE is the column needed from this table. Count the rows recorded
# for each service and order the services from most to least frequent.
service_count = (
    services_df
    .groupby('CURR_SERVICE')['ROW_ID']
    .count()
    .sort_values(ascending=False)
)
service_count

CURR_SERVICE
MED      24866
CMED      9135
CSURG     7911
NB        7806
SURG      5514
NSURG     4100
TRAUM     3137
NMED      2957
OMED      2286
VSURG     1455
TSURG     1338
ORTHO     1101
GU         436
NBB        346
GYN        306
PSURG      283
ENT        247
OBS        113
DENT         5
PSYCH        1
Name: ROW_ID, dtype: int64

In [22]:
# Note: a few services are very infrequent and will most likely have no effect.
# They are kept for now; columns with few entries will be eliminated once the
# whole dataset has been compiled.
# The SERVICES table has more rows than there are admissions, so some admissions
# correspond to multiple services. Cross-tabulate admission ids (rows) against
# services (columns) to get one row of per-service counts for every HADM_ID.
crosstab_serv = pd.crosstab(
    index=services_df['HADM_ID'],
    columns=services_df['CURR_SERVICE'],
)
# number of distinct admissions present in the SERVICES table
crosstab_serv.shape[0]

58926

In [23]:
# Inspect the column labels of the crosstab: one column per distinct CURR_SERVICE value.
crosstab_serv.columns

Index(['CMED', 'CSURG', 'DENT', 'ENT', 'GU', 'GYN', 'MED', 'NB', 'NBB', 'NMED',
       'NSURG', 'OBS', 'OMED', 'ORTHO', 'PSURG', 'PSYCH', 'SURG', 'TRAUM',
       'TSURG', 'VSURG'],
      dtype='object', name='CURR_SERVICE')

In [24]:
# Prefix each service column with the name of the variable it came from, so
# that columns belonging to the same categorical variable can be recognised
# as a group later on.
serv_new_colnames = [f'CURR_SERVICE_{service}' for service in crosstab_serv.columns]
serv_new_colnames

['CURR_SERVICE_CMED',
 'CURR_SERVICE_CSURG',
 'CURR_SERVICE_DENT',
 'CURR_SERVICE_ENT',
 'CURR_SERVICE_GU',
 'CURR_SERVICE_GYN',
 'CURR_SERVICE_MED',
 'CURR_SERVICE_NB',
 'CURR_SERVICE_NBB',
 'CURR_SERVICE_NMED',
 'CURR_SERVICE_NSURG',
 'CURR_SERVICE_OBS',
 'CURR_SERVICE_OMED',
 'CURR_SERVICE_ORTHO',
 'CURR_SERVICE_PSURG',
 'CURR_SERVICE_PSYCH',
 'CURR_SERVICE_SURG',
 'CURR_SERVICE_TRAUM',
 'CURR_SERVICE_TSURG',
 'CURR_SERVICE_VSURG']

In [25]:
# now rename the columns
# Assigning the list replaces the crosstab's column labels in place with the
# prefixed names held in serv_new_colnames (same order as the original columns).
crosstab_serv.columns = serv_new_colnames
# Display the labels to confirm the rename took effect.
crosstab_serv.columns

Index(['CURR_SERVICE_CMED', 'CURR_SERVICE_CSURG', 'CURR_SERVICE_DENT',
       'CURR_SERVICE_ENT', 'CURR_SERVICE_GU', 'CURR_SERVICE_GYN',
       'CURR_SERVICE_MED', 'CURR_SERVICE_NB', 'CURR_SERVICE_NBB',
       'CURR_SERVICE_NMED', 'CURR_SERVICE_NSURG', 'CURR_SERVICE_OBS',
       'CURR_SERVICE_OMED', 'CURR_SERVICE_ORTHO', 'CURR_SERVICE_PSURG',
       'CURR_SERVICE_PSYCH', 'CURR_SERVICE_SURG', 'CURR_SERVICE_TRAUM',
       'CURR_SERVICE_TSURG', 'CURR_SERVICE_VSURG'],
      dtype='object')

In [28]:
# Load the procedures (CPTEVENTS) table.
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids mixed-type inference within a column.
cpt_df = pd.read_csv(filepath_or_buffer='../../data/raw/CPTEVENTS.csv', low_memory=False)
# Print a per-column summary (non-null counts and dtypes) of what was loaded.
cpt_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 573146 entries, 0 to 573145
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   ROW_ID            573146 non-null  int64  
 1   SUBJECT_ID        573146 non-null  int64  
 2   HADM_ID           573146 non-null  int64  
 3   COSTCENTER        573146 non-null  object 
 4   CHARTDATE         101545 non-null  object 
 5   CPT_CD            573146 non-null  object 
 6   CPT_NUMBER        573128 non-null  float64
 7   CPT_SUFFIX        22 non-null      object 
 8   TICKET_ID_SEQ     471601 non-null  float64
 9   SECTIONHEADER     573125 non-null  object 
 10  SUBSECTIONHEADER  573125 non-null  object 
 11  DESCRIPTION       101545 non-null  object 
dtypes: float64(2), int64(3), object(7)
memory usage: 52.5+ MB


In [29]:
# Two columns are needed from CPTEVENTS: SECTIONHEADER (the higher-level
# procedure category) and SUBSECTIONHEADER (the more detailed procedure
# category). The two are redundant, but since it is not yet known which one
# will be useful, both are extracted.
# Cross-tabulate admissions against section headers, as was done for services.
crosstab_cpt_sect = pd.crosstab(
    index=cpt_df['HADM_ID'],
    columns=cpt_df['SECTIONHEADER'],
)
# Prefix the labels with the source variable name to keep related columns together.
crosstab_cpt_sect.columns = [f'SECTIONHEADER_{header}' for header in crosstab_cpt_sect.columns]
crosstab_cpt_sect.columns

Index(['SECTIONHEADER_Anesthesia', 'SECTIONHEADER_Emerging technology',
       'SECTIONHEADER_Evaluation and management', 'SECTIONHEADER_Medicine',
       'SECTIONHEADER_Pathology and laboratory', 'SECTIONHEADER_Radiology',
       'SECTIONHEADER_Surgery'],
      dtype='object')

In [30]:
# Same cross-tabulation for the more detailed SUBSECTIONHEADER categories:
# one row per HADM_ID, one count column per subsection header.
crosstab_cpt_subsect = pd.crosstab(
    index=cpt_df['HADM_ID'],
    columns=cpt_df['SUBSECTIONHEADER'],
)
# Prefix the labels with the source variable name to keep related columns together.
crosstab_cpt_subsect.columns = [
    f'SUBSECTIONHEADER_{subheader}' for subheader in crosstab_cpt_subsect.columns
]
crosstab_cpt_subsect.columns

Index(['SUBSECTIONHEADER_Bone/joint studies',
       'SUBSECTIONHEADER_Cardiovascular',
       'SUBSECTIONHEADER_Cardiovascular system',
       'SUBSECTIONHEADER_Case management services',
       'SUBSECTIONHEADER_Central nervous system assessments/tests (neuro-cognitive, mental status, speech testing)',
       'SUBSECTIONHEADER_Chemistry',
       'SUBSECTIONHEADER_Confirmatory consultation for a new or established patient (deleted codes)',
       'SUBSECTIONHEADER_Conscious sedation (deleted codes)',
       'SUBSECTIONHEADER_Consultations',
       'SUBSECTIONHEADER_Critical care services',
       'SUBSECTIONHEADER_Diagnostic imaging',
       'SUBSECTIONHEADER_Diagnostic ultrasound', 'SUBSECTIONHEADER_Dialysis',
       'SUBSECTIONHEADER_Digestive system',
       'SUBSECTIONHEADER_End-Stage Renal Disease Services (deleted codes)',
       'SUBSECTIONHEADER_Endocrine system',
       'SUBSECTIONHEADER_Eye and ocular adnexa',
       'SUBSECTIONHEADER_Female genital system',
       'SUBSECTI

In [31]:
import numpy as np
# Before merging, compare the dimensions (rows, columns) of the three crosstabs.
# DataFrame.shape is read directly; no conversion to a NumPy array is needed.
print("The shape of crosstabulated services is ", crosstab_serv.shape)
print("The shape of crosstabulated procedure sectionheaders is ", crosstab_cpt_sect.shape)
print("The shape of crosstabulated procedure subsectionheaders is ", crosstab_cpt_subsect.shape)

The shape of crosstabulated services is  (58926, 20)
The shape of crosstabulated procedure sectionheaders is  (44148, 7)
The shape of crosstabulated procedure subsectionheaders is  (44148, 54)


In [33]:
# Some admissions have no procedure information, so both procedure crosstabs
# are left-joined onto the services crosstab (aligned on the HADM_ID index):
# every admission in crosstab_serv is kept whether or not it has CPT rows.
# NOTE(review): this cell reassigns crosstab_serv, so running it a second time
# merges the procedure columns again and the shape assertion below will fail.
crosstab_serv = (
    crosstab_serv
    .merge(crosstab_cpt_sect, how='left', left_index=True, right_index=True)
    .merge(crosstab_cpt_subsect, how='left', left_index=True, right_index=True)
)
# Sanity check: same 58926 admissions, and 20 + 7 + 54 = 81 columns.
assert crosstab_serv.shape == (58926, 81), "something went wrong"

In [19]:
# Move HADM_ID out of the index into an ordinary column so that it is written
# to the CSV as a named column. The guard keeps the cell safe to re-run: a
# second reset_index would otherwise add a spurious 'index' column.
if 'HADM_ID' not in crosstab_serv.columns:
    crosstab_serv = crosstab_serv.reset_index()
# save the intermediate dataset
# NOTE(review): the original path literal was cut off after the directory
# (unterminated string -> SyntaxError). The file name below is a placeholder;
# confirm it against whatever the downstream notebooks read.
crosstab_serv.to_csv('../../data/intermediate/services_procedures.csv')

(58926, 81)