In [25]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [26]:
# Load datasets
train_df = pd.read_csv("/Users/thomassimmons/Downloads/healthcare/train_data.csv")
test_df = pd.read_csv("/Users/thomassimmons/Downloads/healthcare/test_data.csv")

In [27]:
# Combine for consistent preprocessing
train_df['is_train'] = 1 
test_df['is_train'] = 0
full_df = pd.concat([train_df, test_df], ignore_index=True)

In [28]:
# Make columns consistent
full_df.columns

Index(['case_id', 'Hospital_code', 'Hospital_type_code', 'City_Code_Hospital',
       'Hospital_region_code', 'Available Extra Rooms in Hospital',
       'Department', 'Ward_Type', 'Ward_Facility_Code', 'Bed Grade',
       'patientid', 'City_Code_Patient', 'Type of Admission',
       'Severity of Illness', 'Visitors with Patient', 'Age',
       'Admission_Deposit', 'Stay', 'is_train'],
      dtype='object')

In [29]:
full_df.columns = full_df.columns.str.strip().str.lower().str.replace(" ", "_")

In [30]:
# Fill missing numerical values with median
full_df.isnull().sum()

num_cols = ['bed_grade', 'city_code_patient']

for col in num_cols:
    median_val = full_df[col].median()
    full_df[col].fillna(median_val, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  full_df[col].fillna(median_val, inplace=True)


In [31]:
full_df.isnull().sum()


case_id                                   0
hospital_code                             0
hospital_type_code                        0
city_code_hospital                        0
hospital_region_code                      0
available_extra_rooms_in_hospital         0
department                                0
ward_type                                 0
ward_facility_code                        0
bed_grade                                 0
patientid                                 0
city_code_patient                         0
type_of_admission                         0
severity_of_illness                       0
visitors_with_patient                     0
age                                       0
admission_deposit                         0
stay                                 137057
is_train                                  0
dtype: int64

In [32]:
cat_cols = ['type_of_admission', 'severity_of_illness']

for col in cat_cols:
    mode_val = full_df[col].mode()[0]
    full_df[col].fillna(mode_val, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  full_df[col].fillna(mode_val, inplace=True)


In [33]:
# Encode categorical features using LabelEncoder
label_enc_cols = [
    'hospital_type_code', 'department', 'ward_type', 'ward_facility_code',
    'type_of_admission', 'severity_of_illness', 'age', 'hospital_region_code'
]

label_encoders = {}
for col in label_enc_cols:
    le = LabelEncoder()
    full_df[col] = le.fit_transform(full_df[col].astype(str))
    label_encoders[col] = le  # Store encoders for reverse transformation if needed

# Split back into train and test
clean_train_df = full_df[full_df['is_train'] == 1].drop(columns=['is_train'])
clean_test_df = full_df[full_df['is_train'] == 0].drop(columns=['is_train', 'stay'])  # no target in test


In [35]:
clean_test_df

Unnamed: 0,case_id,hospital_code,hospital_type_code,city_code_hospital,hospital_region_code,available_extra_rooms_in_hospital,department,ward_type,ward_facility_code,bed_grade,patientid,city_code_patient,type_of_admission,severity_of_illness,visitors_with_patient,age,admission_deposit
318438,318439,21,2,3,2,3,2,3,0,2.0,17006,2.0,0,2,2,7,3095.0
318439,318440,29,0,4,0,2,2,3,5,2.0,17006,2.0,1,2,4,7,4018.0
318440,318441,26,1,2,1,3,2,1,3,4.0,17006,2.0,0,2,3,7,4492.0
318441,318442,6,0,6,0,3,2,1,5,2.0,17006,2.0,1,2,3,7,4173.0
318442,318443,28,1,11,0,2,2,2,5,2.0,17006,2.0,1,2,4,7,4161.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455490,455491,11,1,2,1,4,1,1,3,3.0,41160,3.0,0,1,4,4,6313.0
455491,455492,25,4,1,0,2,3,2,4,4.0,30985,7.0,0,2,2,0,3510.0
455492,455493,30,2,3,2,2,1,2,0,4.0,81811,12.0,2,1,2,0,7190.0
455493,455494,5,0,1,0,2,1,2,4,4.0,57021,10.0,1,1,2,4,5435.0


In [37]:
clean_test_df.head()

Unnamed: 0,case_id,hospital_code,hospital_type_code,city_code_hospital,hospital_region_code,available_extra_rooms_in_hospital,department,ward_type,ward_facility_code,bed_grade,patientid,city_code_patient,type_of_admission,severity_of_illness,visitors_with_patient,age,admission_deposit
318438,318439,21,2,3,2,3,2,3,0,2.0,17006,2.0,0,2,2,7,3095.0
318439,318440,29,0,4,0,2,2,3,5,2.0,17006,2.0,1,2,4,7,4018.0
318440,318441,26,1,2,1,3,2,1,3,4.0,17006,2.0,0,2,3,7,4492.0
318441,318442,6,0,6,0,3,2,1,5,2.0,17006,2.0,1,2,3,7,4173.0
318442,318443,28,1,11,0,2,2,2,5,2.0,17006,2.0,1,2,4,7,4161.0
