# In [None]:
# pipeline_execution.ipynb
# This notebook executes the entire data preparation pipeline by calling functions
# from the 'data_preprocessing_functions' module.

# --- 1. SETUP AND IMPORTS ---

# Import standard libraries for analysis and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from collections import Counter

# Import all custom functions and constants from the local Python module
from data_preprocessing_functions import (
    load_data,
    replace_impossible_zeros,
    impute_missing_values,
    treat_outliers_iqr,
    feature_engineering,
    encode_features,
    scale_data,
    feature_selection_kbest,
    perform_pca,
    handle_imbalance_smote,
    FEATURES_WITH_IMPOSSIBLE_ZEROS
)

# --- 2. CONFIGURATION AND CONSTANTS ---

# File names: the raw input, one checkpoint CSV per phase, and the final result.
INPUT_FILE = 'Diabetes Missing Data.csv'
OUTPUT_P1 = 'diabetes_initial_clean.csv'
OUTPUT_P2 = 'diabetes_cleaned.csv'
OUTPUT_P3 = 'diabetes_transformed_scaled.csv'
OUTPUT_P4 = 'diabetes_reduced.csv'
FINAL_OUTPUT_FILE = 'diabetes_final_balanced.csv'

# Phase 2: columns handed to the IQR outlier treatment
# (the continuous features plus the 'Pregnant' count).
OUTLIER_COLS = [
    'Pregnant',
    'Glucose',
    'Diastolic_BP',
    'Skin_Fold',
    'Serum_Insulin',
    'BMI',
    'Diabetes_Pedigree',
    'Age',
]

# Phase 3: labels passed to encode_features for the 'Age_Group' column.
AGE_GROUP_LABELS = ['Young', 'Middle-Aged', 'Senior', 'Elderly']

# Phase 4: number of features kept by SelectKBest.
K_BEST_FEATURES = 5

# Echo the configuration so the notebook output records what was used.
print("Pipeline Configuration Loaded.")
print(f"Features where Zero is replaced by NaN: {FEATURES_WITH_IMPOSSIBLE_ZEROS}")
print("-" * 50)

# --- 3. PHASE 1: DATA LOADING AND ZERO REPLACEMENT ---
# Loads the raw CSV, turns the zeros in FEATURES_WITH_IMPOSSIBLE_ZEROS into NaN,
# plots the resulting missingness matrix and writes the checkpoint OUTPUT_P1.
print("--- PHASE 1: Data Loading & Initial Cleaning ---")

# Task 1: Load Data
# NOTE(review): the emptiness check below assumes load_data returns an empty
# DataFrame when the file cannot be read — confirm against the module.
df = load_data(INPUT_FILE)

if df.empty:
    # Stop the whole pipeline here. Falling through would let Phase 2 read
    # OUTPUT_P1, which is either left over from an earlier run (silently
    # producing results for the wrong data) or missing altogether.
    print("FATAL ERROR: Could not load the dataset. Check file path.")
    raise RuntimeError(
        f"Could not load the dataset from {INPUT_FILE!r}; aborting pipeline."
    )

print(f"1. Dataset loaded successfully. Shape: {df.shape}")

# Task 2: Replace impossible zeros (biologically impossible values) with NaN
df_p1 = replace_impossible_zeros(df, FEATURES_WITH_IMPOSSIBLE_ZEROS)
print("2. Impossible zeros replaced with NaN.")
print("   NaN counts after replacement:")
print(df_p1.isnull().sum())

# Task 3: Visualize Missing Data (Jupyter-specific visualization)
print("\n3. Missing Data Visualization (Matrix):")
msno.matrix(df_p1, figsize=(10, 5), fontsize=10)
plt.title("Missing Data Matrix (Post Zero-Replacement)")
plt.show()

# Save the processed DataFrame for the next phase
df_p1.to_csv(OUTPUT_P1, index=False)
print(f"4. DataFrame state saved to: {OUTPUT_P1}")
print("-" * 50)


# --- 4. PHASE 2: DATA CLEANING (IMPUTATION AND OUTLIER TREATMENT) ---
# Reads the Phase 1 checkpoint, fills missing values, treats outliers in
# OUTLIER_COLS, plots the result and writes the checkpoint OUTPUT_P2.
print("--- PHASE 2: Data Cleaning (Imputation & Outlier Treatment) ---")

# Task 1: start from the file written by Phase 1
df_loaded_p2 = load_data(OUTPUT_P1)
print(f"1. Loaded data for Phase 2. Shape: {df_loaded_p2.shape}")

# Task 2: median imputation. The median is robust to extreme values, which
# matters because outliers have not been treated yet at this point.
df_imputed = impute_missing_values(df_loaded_p2, strategy='median')
print("2. Missing values imputed using the median strategy.")
remaining_nans = df_imputed.isnull().sum().sum()
print(f"   Check for remaining NaNs: {remaining_nans}")

# Task 3: outlier treatment on the configured columns (IQR factor 1.5)
df_p2 = treat_outliers_iqr(df_imputed, OUTLIER_COLS, factor=1.5)
print("3. Outliers treated using IQR capping in specified columns.")

# Task 4: box plots of the first six treated columns on a 2x3 grid
print("\n4. Box Plots of key features after Outlier Treatment:")
fig = plt.figure(figsize=(15, 6))
for position, column in enumerate(OUTLIER_COLS[:6], start=1):
    ax = fig.add_subplot(2, 3, position)
    sns.boxplot(y=df_p2[column], ax=ax)
    ax.set_title(f'Box Plot of {column} (Post-Cleaning)')
fig.tight_layout()
plt.show()

# Checkpoint for Phase 3
df_p2.to_csv(OUTPUT_P2, index=False)
print(f"5. DataFrame state saved to: {OUTPUT_P2}")
print("-" * 50)


# --- 5. PHASE 3: DATA TRANSFORMATION (ENGINEERING, ENCODING, SCALING) ---
# Reads the Phase 2 checkpoint, derives 'Age_Group', encodes it, scales the
# data and writes the checkpoint OUTPUT_P3.
print("--- PHASE 3: Data Transformation ---")

# Task 1: start from the file written by Phase 2
df_loaded_p3 = load_data(OUTPUT_P2)
print(f"1. Loaded data for Phase 3. Shape: {df_loaded_p3.shape}")

# Task 2: feature engineering adds the 'Age_Group' column
df_engineered = feature_engineering(df_loaded_p3)
print("2. Feature 'Age_Group' engineered successfully.")
age_group_counts = df_engineered['Age_Group'].value_counts()
print("   Age_Group distribution:\n", age_group_counts)

# Task 3: encode 'Age_Group' using the configured label order
df_encoded = encode_features(df_engineered, 'Age_Group', AGE_GROUP_LABELS)
print("3. 'Age_Group' encoded and original column dropped.")
print(f"   New columns: {df_encoded.columns.tolist()}")

# Task 4: standardise the data. Extreme values were already capped in
# Phase 2, so they no longer dominate the mean and standard deviation
# that StandardScaler relies on.
df_p3, scaler_model = scale_data(df_encoded, scaler_type='StandardScaler')
print("4. Data scaled using StandardScaler.")

# Sanity check on everything except the target column 'Class'.
scaled_features = df_p3.drop(columns=['Class'])
print(f"   Scaled DataFrame mean (should be ~0): {scaled_features.mean().mean():.4f}")
print(f"   Scaled DataFrame std (should be ~1): {scaled_features.std().mean():.4f}")

# Checkpoint for Phase 4
df_p3.to_csv(OUTPUT_P3, index=False)
print(f"5. DataFrame state saved to: {OUTPUT_P3}")
print("-" * 50)


# --- 6. PHASE 4: DATA REDUCTION (FEATURE SELECTION/PCA) ---
# Reads the Phase 3 checkpoint, keeps the K_BEST_FEATURES columns chosen by
# SelectKBest, reports a PCA run for comparison and writes OUTPUT_P4.
print("--- PHASE 4: Data Reduction ---")

# Task 1: start from the file written by Phase 3
df_loaded_p4 = load_data(OUTPUT_P3)
print(f"1. Loaded data for Phase 4. Shape: {df_loaded_p4.shape}")

# Split into target ('Class') and everything else
y = df_loaded_p4['Class']
X = df_loaded_p4.drop(columns='Class')

print(f"Features (X) shape: {X.shape}, Target (y) shape: {y.shape}")

# Task 2: SelectKBest feature selection
X_fs = feature_selection_kbest(X, y, k=K_BEST_FEATURES)
selected_features = X_fs.columns.tolist()
print(f"2. SelectKBest selected {K_BEST_FEATURES} features:")
print(f"   Selected Features: {selected_features}")

# Task 3: PCA with a 90% variance threshold. Reported for context only;
# its output is not written to disk.
df_pca, pca_model = perform_pca(X, variance_threshold=0.90)
explained_variance = np.sum(pca_model.explained_variance_ratio_)
n_components = df_pca.shape[1]
print("\n3. PCA Analysis (for context):")
print(f"   PCA reduced features to {n_components} components.")
print(f"   Cumulative Explained Variance: {explained_variance*100:.2f}%")

# The saved dataset uses the SelectKBest columns (original, interpretable
# features) with the target appended as the last column.
df_p4 = pd.concat([X_fs, y], axis='columns')
print(f"\n4. Final reduced dataset shape (SelectKBest): {df_p4.shape}")

# Checkpoint for Phase 5
df_p4.to_csv(OUTPUT_P4, index=False)
print(f"5. DataFrame state saved to: {OUTPUT_P4}")
print("-" * 50)


# --- 7. PHASE 5: DATA IMBALANCE HANDLING ---
# Reads the Phase 4 checkpoint, oversamples with SMOTE, plots the resulting
# class counts and writes the final dataset to FINAL_OUTPUT_FILE.
print("--- PHASE 5: Data Imbalance Handling ---")

# Task 1: start from the file written by Phase 4
df_loaded_p5 = load_data(OUTPUT_P4)
print(f"1. Loaded data for Phase 5. Shape: {df_loaded_p5.shape}")

# Split into target ('Class') and everything else
y = df_loaded_p5['Class']
X = df_loaded_p5.drop(columns='Class')

# Class counts before resampling
print("2. Initial Class Distribution:")
initial_counts = Counter(y)
print(initial_counts)

# Task 2: SMOTE (Synthetic Minority Over-sampling Technique), seeded for
# reproducibility
X_resampled, y_resampled = handle_imbalance_smote(X, y, random_state=42)
print("\n3. SMOTE applied successfully.")

# Class counts after resampling
print("4. Final Class Distribution after SMOTE:")
final_counts = Counter(y_resampled)
print(final_counts)

# Task 3: bar chart of the resampled class counts
plt.figure(figsize=(6, 4))
count_ax = sns.countplot(x=y_resampled, palette='viridis')
count_ax.set_title('Target Class Distribution After SMOTE')
count_ax.set_xlabel('Class (0: Non-Diabetic, 1: Diabetic)')
count_ax.set_ylabel('Count')
plt.show()

# Reassemble features and target into one frame, target last
balanced_features = pd.DataFrame(X_resampled, columns=X.columns)
balanced_target = pd.Series(y_resampled, name='Class')
df_final = pd.concat([balanced_features, balanced_target], axis=1)
print(f"\n5. Final Balanced Dataset Shape: {df_final.shape}")
print(f"   Final Columns: {df_final.columns.tolist()}")
print("   Final DataFrame Head:")
print(df_final.head())

# Write the final dataset
df_final.to_csv(FINAL_OUTPUT_FILE, index=False)
print(f"\n*** COMPLETE: FINAL PROCESSED DATASET SAVED TO: {FINAL_OUTPUT_FILE} ***")