In [1]:
# Healthcare Data Cleaning and Preparation

import warnings
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
warnings.filterwarnings('ignore')

# Setup plotting
# Applies the "seaborn-v0_8" matplotlib style sheet, then sets seaborn's
# default colour palette to "husl".
# NOTE(review): the order looks deliberate - applying the style sheet after
# set_palette would presumably reset the colour cycle; confirm before reordering.
# NOTE(review): the "seaborn-v0_8" style name presumably requires a recent
# Matplotlib release; verify against the pinned environment.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
# IPython line magic (not plain Python): asks for figures to be rendered inline.
%matplotlib inline

# Banner: title line followed by a 50-character rule.
print("üè• Healthcare Data Cleaning", "=" * 50, sep="\n")

# Load data
# The three raw extracts are read in a fixed order (patients, treatments,
# facilities) and bound to the module-level names used by the rest of the notebook.
print("üìÅ Loading data...")
patients, treatments, facilities = map(
    pd.read_csv,
    (
        '../../data/raw/healthcare_patients.csv',
        '../../data/raw/healthcare_treatments.csv',
        '../../data/raw/healthcare_facilities.csv',
    ),
)

# Report (rows, columns) for each freshly loaded frame.
for label, frame in (
    ("Patients", patients),
    ("Treatments", treatments),
    ("Facilities", facilities),
):
    print(f"{label}: {frame.shape}")

# Data Overview
# For each raw frame, show its schema summary followed by the first five rows.
# DataFrame.info() writes its report directly to stdout and returns None, so it
# is called on its own; wrapping it in print() would emit a stray "None" line
# after every summary.
print("\nüîç Data Overview:")
for label, frame in (
    ("Patients", patients),
    ("Treatments", treatments),
    ("Facilities", facilities),
):
    print(f"\n{label} DataFrame Info:")
    frame.info()
    print(f"\nFirst 5 rows of {label}:")
    print(frame.head())

# Check for missing values
# Print the per-column null count of every raw frame, one section per frame.
print("\nüîé Missing Values Analysis:")
for heading, frame in (
    ("Patients missing values:", patients),
    ("\nTreatments missing values:", treatments),
    ("\nFacilities missing values:", facilities),
):
    null_counts = frame.isna().sum()
    print(heading)
    print(null_counts)

# Data Cleaning Functions
def clean_patients_data(df, min_age=0, max_age=120):
    """Clean patients dataset.

    Works on a copy of ``df``; the input frame is not modified.

    Steps, in order:
      1. Parse ``admission_date`` and ``discharge_date`` with ``pd.to_datetime``.
      2. Where discharge precedes admission, print the number of affected
         records and reset their discharge date to one day after admission.
      3. Recompute ``length_of_stay`` as the whole-day difference between the
         two dates, overwriting any existing column of that name.
      4. Keep only rows whose ``length_of_stay`` is non-negative and whose
         ``age`` lies in ``[min_age, max_age]`` (both ends inclusive). Rows
         where either value is missing fail these comparisons and are
         therefore dropped as well.

    Args:
        df: DataFrame with at least ``admission_date``, ``discharge_date``
            and ``age`` columns.
        min_age: Lowest accepted age, inclusive. Defaults to 0.
        max_age: Highest accepted age, inclusive. Defaults to 120.

    Returns:
        A new DataFrame with the surviving rows. Index labels of the input are
        kept as-is (the index is not reset).
    """
    df_clean = df.copy()

    # Convert dates
    for date_col in ("admission_date", "discharge_date"):
        df_clean[date_col] = pd.to_datetime(df_clean[date_col])

    # Handle any date inconsistencies
    mask = df_clean["discharge_date"] < df_clean["admission_date"]
    if mask.any():
        print(
            f"‚ö†Ô∏è  Found {mask.sum()} records with discharge before admission. Fixing..."
        )
        df_clean.loc[mask, "discharge_date"] = df_clean.loc[
            mask, "admission_date"
        ] + pd.Timedelta(days=1)

    # Calculate length of stay
    df_clean["length_of_stay"] = (
        df_clean["discharge_date"] - df_clean["admission_date"]
    ).dt.days

    # Both validity rules are evaluated on the same frame and combined, which
    # selects exactly the rows that pass the two filters applied one after another.
    valid_stay = df_clean["length_of_stay"] >= 0
    valid_age = df_clean["age"].between(min_age, max_age)

    return df_clean[valid_stay & valid_age]


def clean_treatments_data(df, iqr_multiplier=1.5):
    """Clean treatments dataset.

    Works on a copy of ``df``; the input frame is not modified.

    Steps:
      1. Parse ``treatment_date`` with ``pd.to_datetime``.
      2. Compute the 25th and 75th percentiles of ``cost`` over all rows and
         keep rows whose cost lies within
         ``[Q1 - iqr_multiplier * IQR, Q3 + iqr_multiplier * IQR]`` (inclusive).
      3. Keep only rows with ``duration_days`` strictly greater than zero.

    Rows with a missing ``cost`` or ``duration_days`` fail the comparisons and
    are dropped.

    NOTE(review): the lower fence can be negative, so this rule alone does not
    reject negative costs - confirm whether that is intended.

    Args:
        df: DataFrame with at least ``treatment_date``, ``cost`` and
            ``duration_days`` columns.
        iqr_multiplier: Width of the outlier fences in units of the
            interquartile range. Defaults to 1.5.

    Returns:
        A new DataFrame with the surviving rows; the index is not reset.
    """
    df_clean = df.copy()

    # Convert date
    df_clean["treatment_date"] = pd.to_datetime(df_clean["treatment_date"])

    # Cost validation - remove extreme outliers
    q1 = df_clean["cost"].quantile(0.25)
    q3 = df_clean["cost"].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - iqr_multiplier * iqr
    upper_bound = q3 + iqr_multiplier * iqr
    cost_ok = df_clean["cost"].between(lower_bound, upper_bound)

    # Duration validation
    duration_ok = df_clean["duration_days"] > 0

    # The fences are computed before any row is removed, so combining the two
    # masks selects the same rows as filtering on cost and then on duration.
    return df_clean[cost_ok & duration_ok]


def clean_facilities_data(df, min_year=1900, max_year=None):
    """Clean facilities dataset.

    Keeps rows with a positive ``bed_capacity`` and an ``established_year``
    within ``[min_year, max_year]`` (both ends inclusive). The input frame is
    not modified. Rows with a missing value in either column fail the
    comparisons and are dropped.

    Args:
        df: DataFrame with at least ``bed_capacity`` and ``established_year``
            columns.
        min_year: Earliest accepted establishment year. Defaults to 1900.
        max_year: Latest accepted establishment year. When ``None`` (the
            default) the current calendar year from ``datetime.now()`` is
            used; pass an explicit year for reproducible results.

    Returns:
        A new DataFrame with the surviving rows; the index is not reset.
    """
    if max_year is None:
        max_year = datetime.now().year

    # Bed capacity validation
    has_beds = df["bed_capacity"] > 0

    # Established year validation
    year_ok = df["established_year"].between(min_year, max_year)

    # .copy() keeps the result independent of the caller's frame.
    return df[has_beds & year_ok].copy()

# Apply cleaning
# Each raw frame goes through its dedicated cleaner; the raw frames themselves
# stay untouched so the before/after row counts can be compared below.
print("üßπ Cleaning data...")
patients_clean = clean_patients_data(patients)
treatments_clean = clean_treatments_data(treatments)
facilities_clean = clean_facilities_data(facilities)

print("‚úÖ Cleaning complete!")
for label, before, after in (
    ("Patients", patients, patients_clean),
    ("Treatments", treatments, treatments_clean),
    ("Facilities", facilities, facilities_clean),
):
    kept = len(after)
    print(f"{label}: {kept} (removed {len(before) - kept})")

# Data Validation
# Print value ranges and a few headline figures for each cleaned frame.
print("\nüìä Data Validation Summary:")

# --- Patients ---
ages = patients_clean["age"]
stays = patients_clean["length_of_stay"]
readmission_pct = patients_clean["readmission_30_days"].mean() * 100
print("\nPatients:")
print(f"‚Ä¢ Age range: {ages.min()} - {ages.max()} years")
print(f"‚Ä¢ Length of stay: {stays.min()} - {stays.max()} days")
print(f"‚Ä¢ Readmission rate: {readmission_pct:.1f}%")

# --- Treatments ---
costs = treatments_clean["cost"]
durations = treatments_clean["duration_days"]
print("\nTreatments:")
print(f"‚Ä¢ Cost range: ${costs.min():.2f} - ${costs.max():.2f}")
print(f"‚Ä¢ Duration range: {durations.min()} - {durations.max()} days")

# --- Facilities ---
beds = facilities_clean["bed_capacity"]
type_count = facilities_clean["facility_type"].nunique()
print("\nFacilities:")
print(f"‚Ä¢ Bed capacity: {beds.min()} - {beds.max()} beds")
print(f"‚Ä¢ Facility types: {type_count} types")

# Save cleaned data
# The output directory is defined once so that the file paths and the
# confirmation message cannot drift apart (the message previously named
# ../data/processed/ while the files were written to ../../data/processed/).
PROCESSED_DIR = Path("../../data/processed")
# to_csv does not create missing parent directories; make sure the target exists.
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

print("\nüíæ Saving cleaned data...")
patients_clean.to_csv(PROCESSED_DIR / "patients_clean.csv", index=False)
treatments_clean.to_csv(PROCESSED_DIR / "treatments_clean.csv", index=False)
facilities_clean.to_csv(PROCESSED_DIR / "facilities_clean.csv", index=False)

print(f"‚úÖ Cleaned data saved to {PROCESSED_DIR.as_posix()}/")

# Create a merged dataset for analysis
# Left joins keep every cleaned patient row: first attach that patient's
# treatments by patient_id, then the facility attributes by facility_id.
merged_data = patients_clean.merge(treatments_clean, on="patient_id", how="left")
merged_data = merged_data.merge(facilities_clean, on="facility_id", how="left")
merged_data.to_csv(PROCESSED_DIR / "healthcare_merged.csv", index=False)
print("‚úÖ Merged dataset created for analysis")
# Final Summary
# Closing banner plus the suggested follow-up steps, emitted one per line.
closing_lines = (
    "\nüéØ Healthcare Data Cleaning Complete!",
    "=" * 50,
    "Next steps:",
    "1. Run 02_eda.ipynb for exploratory data analysis",
    "2. Execute healthcare_queries.sql for SQL analysis",
    "3. Build insights in 03_patient_analytics.ipynb",
    "4. Launch dashboard with streamlit run dashboards/streamlit_app.py",
)
print(*closing_lines, sep="\n")





üè• Healthcare Data Cleaning
üìÅ Loading data...
Patients: (2000, 12)
Treatments: (5000, 9)
Facilities: (50, 8)

üîç Data Overview:

Patients DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   patient_id           2000 non-null   object
 1   age                  2000 non-null   int64 
 2   gender               2000 non-null   object
 3   blood_type           2000 non-null   object
 4   primary_condition    2000 non-null   object
 5   admission_date       2000 non-null   object
 6   facility_id          2000 non-null   object
 7   insurance_type       2000 non-null   object
 8   severity             2000 non-null   int64 
 9   discharge_date       2000 non-null   object
 10  length_of_stay       2000 non-null   int64 
 11  readmission_30_days  2000 non-null   int64 
dtypes: int64(4), object(8)
memory usage: 187.