In [3]:
import pandas as pd
file_path = r"LengthOfStay.csv"
data = pd.read_csv(file_path)
print(df.head())


   eid       vdate rcount gender  dialysisrenalendstage  asthma  irondef  \
0    1   8/29/2012      0      F                      0       0        0   
1    2   5/26/2012     5+      F                      0       0        0   
2    3   9/22/2012      1      F                      0       0        0   
3    4    8/9/2012      0      F                      0       0        0   
4    5  12/20/2012      0      F                      0       0        0   

   pneum  substancedependence  psychologicaldisordermajor  ...     glucose  \
0      0                    0                           0  ...  192.476918   
1      0                    0                           0  ...   94.078507   
2      0                    0                           0  ...  130.530524   
3      0                    0                           0  ...  163.377028   
4      1                    0                           1  ...   94.886654   

   bloodureanitro  creatinine        bmi  pulse  respiration  \
0         

In [4]:
# Step 1: Checking for missing values
missing_values = data.isnull().sum()
print(missing_values)


eid                           0
vdate                         0
rcount                        0
gender                        0
dialysisrenalendstage         0
asthma                        0
irondef                       0
pneum                         0
substancedependence           0
psychologicaldisordermajor    0
depress                       0
psychother                    0
fibrosisandother              0
malnutrition                  0
hemo                          0
hematocrit                    0
neutrophils                   0
sodium                        0
glucose                       0
bloodureanitro                0
creatinine                    0
bmi                           0
pulse                         0
respiration                   0
secondarydiagnosisnonicd9     0
discharged                    0
facid                         0
lengthofstay                  0
dtype: int64


In [5]:
# Step 2: Checking for duplicates
duplicates = data.duplicated().sum()

print(duplicates)


0


In [7]:
# Step 3: Examining data types and potential inconsistencies
data_types = data.dtypes
print("Data Types :",data_types)

Data Types : eid                             int64
vdate                          object
rcount                         object
gender                         object
dialysisrenalendstage           int64
asthma                          int64
irondef                         int64
pneum                           int64
substancedependence             int64
psychologicaldisordermajor      int64
depress                         int64
psychother                      int64
fibrosisandother                int64
malnutrition                    int64
hemo                            int64
hematocrit                    float64
neutrophils                   float64
sodium                        float64
glucose                       float64
bloodureanitro                float64
creatinine                    float64
bmi                           float64
pulse                           int64
respiration                   float64
secondarydiagnosisnonicd9       int64
discharged                     object

In [10]:
import numpy as np

# Step 4: Checking for outliers in numerical columns
numerical_cols = data.select_dtypes(include=[np.number])
outliers = numerical_cols.apply(lambda x: np.sum((x < (x.quantile(0.25) - 1.5 * (x.quantile(0.75) - x.quantile(0.25)))) | 
                                                  (x > (x.quantile(0.75) + 1.5 * (x.quantile(0.75) - x.quantile(0.25))))))

print(outliers)

eid                               0
dialysisrenalendstage          3642
asthma                         3527
irondef                        9494
pneum                          3945
substancedependence            6306
psychologicaldisordermajor    23904
depress                        5166
psychother                     4939
fibrosisandother                479
malnutrition                   4948
hemo                           8000
hematocrit                     5616
neutrophils                    6515
sodium                          711
glucose                         695
bloodureanitro                19592
creatinine                      675
bmi                             685
pulse                          1054
respiration                   35167
secondarydiagnosisnonicd9      5016
lengthofstay                    132
dtype: int64


In [17]:
# Removing outliers from the numerical columns

def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

for col in numerical_cols.columns:
    data = remove_outliers(data, col)

data_cleaned_shape = data.shape

data_cleaned_shape



(24098, 28)

In [18]:
# Step 5: Examining unique values in categorical columns to identify any irregularities
categorical_cols = data.select_dtypes(include=['object'])
unique_values = {col: data[col].unique() for col in categorical_cols.columns}

print(categorical_cols)

      rcount gender facid
0          0      F     B
3          0      F     A
14         1      F     A
21         0      M     A
27         3      F     B
...      ...    ...   ...
99976      0      M     B
99978      0      M     B
99995      3      M     B
99996      0      M     B
99999      0      F     B

[24098 rows x 3 columns]


In [19]:
# Data Cleaning Steps

# Step 1: Convert 'vdate' and 'discharged' to datetime format

data['vdate'] = pd.to_datetime(data['vdate'], format='%m/%d/%Y')
data['discharged'] = pd.to_datetime(data['discharged'], format='%m/%d/%Y')

data_cleaned = data.copy()
data_cleaned.dtypes



eid                                    int64
vdate                         datetime64[ns]
rcount                                object
gender                                object
dialysisrenalendstage                  int64
asthma                                 int64
irondef                                int64
pneum                                  int64
substancedependence                    int64
psychologicaldisordermajor             int64
depress                                int64
psychother                             int64
fibrosisandother                       int64
malnutrition                           int64
hemo                                   int64
hematocrit                           float64
neutrophils                          float64
sodium                               float64
glucose                              float64
bloodureanitro                       float64
creatinine                           float64
bmi                                  float64
pulse     

In [21]:
#convert data to csv file

cleaned_file_path = 'LengthOfStay_cleaned.csv'  # Replace with your desired file path
data_cleaned.to_csv(cleaned_file_path, index=False)