# Initial description of the following variables:

 time_in_hospital, payer_code, medical_specialty, num_lab_procedures, num_procedures, num_medications, number_outpatient, number_emergency, number_inpatient

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the readmission dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='readmission.csv')

##**time_in_hospital**: Number of days between admission and discharge (int64)


---

*   min = 1; max = 14
*   No missing values
*   Ready to use!

In [4]:
# Dtype, non-null count and memory footprint of the column.
df.loc[:, 'time_in_hospital'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 94564 entries, 0 to 94563
Series name: time_in_hospital
Non-Null Count  Dtype
--------------  -----
94564 non-null  int64
dtypes: int64(1)
memory usage: 738.9 KB


In [5]:
# Distinct stay lengths, in order of first appearance.
pd.unique(df['time_in_hospital'])

array([ 1,  3,  2,  4,  5, 13, 12,  9,  7, 10,  6, 11,  8, 14])

In [6]:
# (minimum, maximum, number of missing entries) for the column.
stay_days = df['time_in_hospital']
stay_days.min(), stay_days.max(), stay_days.isna().sum()

(1, 14, 0)

##**payer_code**: Unique identifier (key) that indicates who is paying (object). Includes self-pay


---


*   Missing values are noted as '?'
*   There are 39,263 missing values (41.5%).
*   With so many missing values we might want to drop this column.











In [7]:
# Dtype, non-null count and memory footprint of the column.
# Note: '?' placeholders are ordinary strings, so they count as non-null here.
df.loc[:, 'payer_code'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 94564 entries, 0 to 94563
Series name: payer_code
Non-Null Count  Dtype 
--------------  ----- 
94564 non-null  object
dtypes: object(1)
memory usage: 738.9+ KB


In [8]:
# Distinct payer codes, in order of first appearance.
pd.unique(df['payer_code'])

array(['?', 'MC', 'MD', 'HM', 'UN', 'BC', 'SP', 'CP', 'SI', 'DM', 'CM',
       'CH', 'PO', 'WC', 'OT', 'OG', 'MP', 'FR'], dtype=object)

In [9]:
# Missing values: number of '?' placeholders and their share of all rows.
payer_unknown = int((df['payer_code'] == '?').sum())
payer_unknown, payer_unknown / len(df)

(39263, 0.4152002876358868)

##medical_specialty: specialty of the admitting doc (object)


---

*   Missing values are noted as '?'
*   There are 45,037 missing values (47.6%).
*   With so many missing values we might want to drop this column.


In [10]:
# Distinct specialty labels, in order of first appearance.
pd.unique(df['medical_specialty'])

array(['Pediatrics-Endocrinology', '?', 'InternalMedicine',
       'Family/GeneralPractice', 'Cardiology', 'Surgery-General',
       'Orthopedics', 'Gastroenterology',
       'Surgery-Cardiovascular/Thoracic', 'Nephrology',
       'Orthopedics-Reconstructive', 'Psychiatry', 'Emergency/Trauma',
       'Pulmonology', 'Surgery-Neuro',
       'Obsterics&Gynecology-GynecologicOnco', 'ObstetricsandGynecology',
       'Pediatrics', 'Hematology/Oncology', 'Otolaryngology',
       'Surgery-Colon&Rectal', 'Pediatrics-CriticalCare', 'Endocrinology',
       'Urology', 'Psychiatry-Child/Adolescent', 'Pediatrics-Pulmonology',
       'Neurology', 'Anesthesiology-Pediatric', 'Radiology',
       'Pediatrics-Hematology-Oncology', 'Psychology', 'Podiatry',
       'Gynecology', 'Oncology', 'Pediatrics-Neurology',
       'Surgery-Plastic', 'Surgery-Thoracic',
       'Surgery-PlasticwithinHeadandNeck', 'Ophthalmology',
       'Surgery-Pediatric', 'Pediatrics-EmergencyMedicine',
       'PhysicalMedicineandRe

In [11]:
# Missing values: number of '?' placeholders and their share of all rows.
specialty_unknown = int((df['medical_specialty'] == '?').sum())
specialty_unknown, specialty_unknown / len(df)

(45037, 0.4762594644896578)

## num_lab_procedures: Number of lab tests performed during stay (float64 as loaded, because the column contains a NaN)


---


*   min = 1; max = 132
*   1 missing value (NaN)
*   Handle the NaN (drop or impute) before use; the column can then be cast to an integer dtype




In [12]:
# Dtype, non-null count and memory footprint of the column.
df.loc[:, 'num_lab_procedures'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 94564 entries, 0 to 94563
Series name: num_lab_procedures
Non-Null Count  Dtype  
--------------  -----  
94563 non-null  float64
dtypes: float64(1)
memory usage: 738.9 KB


In [13]:
# Distinct lab-test counts, in order of first appearance (NaN included).
pd.unique(df['num_lab_procedures'])

array([ 41.,  59.,  11.,  44.,  51.,  31.,  70.,  73.,  68.,  33.,  47.,
        62.,  60.,  55.,  49.,  75.,  45.,  29.,  35.,  42.,  66.,  36.,
        19.,  64.,  25.,  53.,  52.,  87.,  27.,  37.,  46.,  28.,  48.,
        72.,  10.,   2.,  65.,  67.,  40.,  54.,  58.,  57.,  43.,  32.,
        83.,  34.,  39.,  69.,  38.,  56.,  22.,  96.,  78.,  61.,  88.,
        50.,   1.,  18.,  82.,   9.,  63.,  24.,  71.,  77.,  81.,  76.,
        90.,  93.,   3., 103.,  13.,  80.,  85.,  16.,  15.,  12.,  30.,
        23.,  17.,  21.,  79.,  26.,   5.,  95.,  97.,  84.,  14.,  74.,
       105.,  86.,  98.,  20.,   6.,  94.,   8., 102., 100.,   7.,  89.,
        91.,  92.,   4., 101.,  99., 114., 113., 111., 129., 107., 108.,
       106., 104., 109., 120., 132., 121., 126.,  nan])

In [14]:
# (minimum, maximum, number of missing entries) for the column.
lab_counts = df['num_lab_procedures']
lab_counts.min(), lab_counts.max(), lab_counts.isna().sum()

(1.0, 132.0, 1)

## num_procedures: Number of procedures (besides lab tests) performed during the stay (float64 as loaded, because the column contains a NaN)


---



*   min = 0; max = 6
*   1 missing value (NaN)
*   Handle the NaN (drop or impute) before use; the column can then be cast to an integer dtype



In [15]:
# Distinct procedure counts, in order of first appearance (NaN included).
pd.unique(df['num_procedures'])

array([ 0.,  5.,  1.,  6.,  2.,  3.,  4., nan])

In [16]:
# (minimum, maximum, number of missing entries) for the column.
procedure_counts = df['num_procedures']
procedure_counts.min(), procedure_counts.max(), procedure_counts.isna().sum()

(0.0, 6.0, 1)

## num_medications: Number of distinct generic meds given during stay (float64 as loaded, because the column contains a NaN)


---



*   min = 1; max = 81
*   1 missing value (NaN)
*   Handle the NaN (drop or impute) before use; the column can then be cast to an integer dtype



In [17]:
# Distinct medication counts, in order of first appearance (NaN included).
pd.unique(df['num_medications'])

array([ 1., 18., 13., 16.,  8., 21., 12., 28., 17., 11., 15., 31.,  2.,
       23., 19.,  7., 20., 14., 10., 22.,  9., 27., 25.,  4., 32.,  6.,
       30., 26., 24., 33.,  5., 39.,  3., 29., 61., 40., 46., 41., 36.,
       34., 35., 50., 43., 42., 37., 51., 38., 45., 54., 52., 49., 62.,
       55., 47., 44., 53., 48., 57., 59., 56., 60., 63., 58., 70., 67.,
       64., 69., 65., 68., 66., 81., 79., 75., 72., 74., nan])

In [18]:
# (minimum, maximum, number of missing entries) for the column.
medication_counts = df['num_medications']
medication_counts.min(), medication_counts.max(), medication_counts.isna().sum()

(1.0, 81.0, 1)

## number_outpatient: Number of outpatient visits (services/treatments that do not require hospitalization) the patient had in the year preceding the stay (float64 as loaded, because the column contains a NaN)


---



*   min = 0; max = 42
*   1 missing value (NaN)
*   Handle the NaN (drop or impute) before use; the column can then be cast to an integer dtype







In [19]:
# Distinct outpatient-visit counts, in order of first appearance (NaN included).
pd.unique(df['number_outpatient'])

array([ 0.,  2.,  1.,  5.,  7.,  9.,  3.,  8.,  4., 12., 11.,  6., 20.,
       15., 10., 13., 14., 16., 21., 35., 17., 29., 36., 18., 19., 27.,
       22., 24., 42., 39., 34., 26., 33., 25., 23., 28., 37., nan])

In [20]:
# (minimum, maximum, number of missing entries) for the column.
outpatient_visits = df['number_outpatient']
outpatient_visits.min(), outpatient_visits.max(), outpatient_visits.isna().sum()

(0.0, 42.0, 1)

## number_emergency: Number of emergency visits of the patient preceding the stay (float64 as loaded, because the column contains a NaN)


---



*   min = 0; max = 76
*   1 missing value (NaN)
*   Handle the NaN (drop or impute) before use; the column can then be cast to an integer dtype




In [21]:
# Distinct emergency-visit counts, in order of first appearance (NaN included).
pd.unique(df['number_emergency'])

array([ 0.,  1.,  2.,  4.,  3.,  9.,  5.,  7.,  6.,  8., 22., 25., 10.,
       13., 42., 16., 11., 28., 15., 14., 18., 12., 21., 20., 19., 46.,
       76., 37., 64., 63., 54., nan])

In [22]:
# (minimum, maximum, number of missing entries) for the column.
emergency_visits = df['number_emergency']
emergency_visits.min(), emergency_visits.max(), emergency_visits.isna().sum()

(0.0, 76.0, 1)

## number_inpatient: Number of inpatient visits (required hospitalization) in the year preceding the stay (float64 as loaded, because the column contains a NaN)


---



*   min = 0; max = 21
*   1 missing value (NaN)
*   Handle the NaN (drop or impute) before use; the column can then be cast to an integer dtype







In [23]:
# Distinct inpatient-visit counts, in order of first appearance (NaN included).
pd.unique(df['number_inpatient'])

array([ 0.,  1.,  2.,  3.,  6.,  5.,  4.,  7.,  8.,  9., 15., 10., 11.,
       14., 12., 13., 17., 16., 21., 18., 19., nan])

In [24]:
# (minimum, maximum, number of missing entries) for the column.
inpatient_visits = df['number_inpatient']
inpatient_visits.min(), inpatient_visits.max(), inpatient_visits.isna().sum()

(0.0, 21.0, 1)