In [1]:
# preprocessing.py

import pandas as pd
from sklearn.preprocessing import LabelEncoder

# -------------------------------
# 1️⃣ Load Dataset
# -------------------------------
df = pd.read_csv("../data/diabetic_data.csv")
print("Original shape:", df.shape)

# -------------------------------
# 2️⃣ Remove fully empty rows (not all NaNs)
# -------------------------------
df.dropna(how='all', inplace=True)

# -------------------------------
# 3️⃣ Handle Missing Values
# -------------------------------

# Fill numeric columns with median
for col in df.select_dtypes(include=['int64', 'float64']).columns:
    df[col].fillna(df[col].median(), inplace=True)

# Fill categorical columns with mode safely
for col in df.select_dtypes(include=['object']).columns:
    if df[col].dropna().shape[0] > 0:
        df[col].fillna(df[col].mode()[0], inplace=True)
    else:
        df[col].fillna("Unknown", inplace=True)

# -------------------------------
# 4️⃣ Encode Categorical Columns
# -------------------------------
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

# -------------------------------
# 5️⃣ Drop Duplicates (optional)
# -------------------------------
df.drop_duplicates(inplace=True)

# -------------------------------
# 6️⃣ Final Check
# -------------------------------
print("Cleaned shape:", df.shape)
print("Columns:", df.columns.tolist())
print("\nUnique target values (if readmitted present):")
if 'readmitted' in df.columns:
    print(df['readmitted'].unique())

# -------------------------------
# 7️⃣ Save Cleaned Data
# -------------------------------
df.to_csv("cleaned_patient_data.csv", index=False)
print("\n✅ Preprocessing complete! Cleaned file saved as 'cleaned_patient_data.csv'")


Original shape: (101766, 50)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

Cleaned shape: (101766, 50)
Columns: ['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'time_in_hospital', 'payer_code', 'medical_specialty', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted']

Unique target values (if readmitted present):
[2 1 0]

✅ Preprocessing complete! Cleaned file saved as 'cleaned_patient_data.csv'
