In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def impute_missing(frame):
    """Fill missing values in *frame* in place and return it.

    Numeric columns are filled with their median; every other column
    (text, categorical, datetime, ...) is filled with its most frequent
    value. Columns that have no missing values are left untouched, and
    columns with no observed value at all stay empty because there is
    nothing to impute from.
    """
    for col in frame.columns:
        series = frame[col]
        if not series.isna().any():
            # Nothing to fill: skip the mode/median computation entirely.
            continue

        # Branch on "is numeric" rather than "is object": median() raises
        # TypeError on categorical and string-dtype columns.
        if pd.api.types.is_numeric_dtype(series):
            fill_value = series.median()
        else:
            modes = series.mode()
            if modes.empty:
                continue
            fill_value = modes.iloc[0]

        if pd.isna(fill_value):
            # Column is entirely missing; no statistic can be derived.
            continue
        frame[col] = series.fillna(fill_value)
    return frame


def completeness_pct(frame):
    """Return the percentage (0-100) of non-missing cells in *frame*.

    An empty frame has no missing cells and is reported as 100.0.
    """
    total_cells = frame.size
    if total_cells == 0:
        return 100.0
    filled_cells = int(frame.notna().sum().sum())
    return 100.0 * filled_cells / total_cells


# The guard keeps the helpers importable without triggering file I/O.
# Inside a notebook or when run as a script, __name__ is "__main__", so
# the steps below still execute and their variables stay module-level.
if __name__ == "__main__":
    # 1. Load dataset
    df = pd.read_csv('../datasets/MEDICALAPPOINTMENTNOSHOWS.csv')

    # 2. Identify missing values
    missing_counts = df.isnull().sum()
    print("Initial Missing Values:")
    print(missing_counts)

    # 3. Visualize missing data.
    # Plotting an empty selection raises IndexError, so only plot when at
    # least one column actually has missing values.
    if missing_counts.sum() > 0:
        missing_counts[missing_counts > 0].plot(kind='barh', color='salmon')
        plt.title('Missing Values in Medical Appointments')
        plt.xlabel('Count')
        plt.show()
    else:
        print("\n[INFO] No missing values found in this dataset. Plot skipped.")

    # 4. Apply imputation (median for numeric columns, mode otherwise).
    # Runs even when nothing is missing so future data is handled too.
    # NOTE(review): numeric identifier columns would also be median-filled
    # if they ever contained gaps - confirm that is acceptable.
    impute_missing(df)

    # 5. Validate the result using measured, not assumed, completeness.
    print("\nCleaning Validation:")
    print(f"Total Missing Values Now: {df.isnull().sum().sum()}")
    print(f"Dataset Quality: {completeness_pct(df):g}% Complete")

    # 6. Save final outcome
    df.to_csv('../datasets/Cleaned_Medical_Appointments.csv', index=False)
    print("File saved as Cleaned_Medical_Appointments.csv")

Initial Missing Values:
PatientId         0
AppointmentID     0
Gender            0
ScheduledDay      0
AppointmentDay    0
Age               0
Neighbourhood     0
Scholarship       0
Hipertension      0
Diabetes          0
Alcoholism        0
Handcap           0
SMS_received      0
NoShow            0
dtype: int64

[INFO] No missing values found in this dataset. Plot skipped.

Cleaning Validation:
Total Missing Values Now: 0
Dataset Quality: 100% Complete
File saved as Cleaned_Medical_Appointments.csv
