In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load dataset
df = pd.read_csv('../data/diabetic_data.csv')
print("Shape of data:", df.shape)
print("Columns:", df.columns.tolist())
print(df.head())

Shape of data: (101766, 50)
Columns: ['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'time_in_hospital', 'payer_code', 'medical_specialty', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted']
   encounter_id  patient_nbr             race  gender      age weight  \
0       2278392      8222157        Caucasian  Female   [0-10)      ?   

In [4]:
# Check for missing values
print(df.isnull().sum())

encounter_id                    0
patient_nbr                     0
race                            0
gender                          0
age                             0
weight                          0
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                      0
medical_specialty               0
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                          0
diag_2                          0
diag_3                          0
number_diagnoses                0
max_glu_serum               96420
A1Cresult                   84748
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide 

In [5]:
# Drop irrelevant columns (e.g., 'encounter_id', 'patient_nbr')
df = df.drop(['encounter_id', 'patient_nbr'], axis=1)

In [6]:
# Replace '?' with NaN
df.replace('?', np.nan, inplace=True)

In [7]:
# Handle missing values
df.dropna(inplace=True)

In [8]:
# Convert target variable: readmitted (">30" or "<30" -> 1, "NO" -> 0)
df['readmitted'] = df['readmitted'].apply(lambda x: 1 if x in ['>30', '<30'] else 0)

print(df['readmitted'].value_counts())

Series([], Name: count, dtype: int64)


In [9]:
# Save cleaned data
df.to_csv('../data/cleaned_data.csv', index=False)
print("Cleaned data saved!")

Cleaned data saved!
