# In [None]:
# Heart Disease UCI Dataset - Data Preprocessing & EDA
# Comprehensive Machine Learning Pipeline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# Plot defaults: matplotlib style first, then the seaborn colour palette
plt.style.use('default')
sns.set_palette("husl")

# Announce the start of the pipeline
for banner_line in (
    "=== Heart Disease Prediction - Data Preprocessing ===",
    "Loading and analyzing the UCI Heart Disease dataset...",
):
    print(banner_line)

# Load the dataset
# Note: You'll need to download the dataset from UCI ML Repository
# URL: https://archive.ics.uci.edu/ml/datasets/Heart+Disease
#
# Candidate locations are tried in order; the first file that exists wins.
DATASET_CANDIDATES = ('../data/heart_disease.csv', 'heart.csv')

df = None
for dataset_path in DATASET_CANDIDATES:
    try:
        df = pd.read_csv(dataset_path)
    except FileNotFoundError:
        continue
    # Report the path that was actually read, so the message cannot drift
    # away from the location used in the read_csv call.
    print(f"✅ Dataset loaded successfully from {dataset_path}")
    break

if df is None:
    print("❌ Dataset not found. Please download the Heart Disease UCI dataset.")
    print("Expected filename: 'heart.csv' or '../data/heart_disease.csv'")
    # Create sample dataset structure for demonstration
    print("Creating sample dataset structure for demonstration...")
    # Fixed seed so the synthetic fallback is identical on every run.
    np.random.seed(42)
    n_samples = 303
    # Every column is drawn independently at random: the frame only mimics
    # the column layout and value ranges, it carries no real signal.
    df = pd.DataFrame({
        'age': np.random.randint(29, 78, n_samples),
        'sex': np.random.choice([0, 1], n_samples),
        'cp': np.random.choice([0, 1, 2, 3], n_samples),
        'trestbps': np.random.randint(94, 200, n_samples),
        'chol': np.random.randint(126, 564, n_samples),
        'fbs': np.random.choice([0, 1], n_samples),
        'restecg': np.random.choice([0, 1, 2], n_samples),
        'thalach': np.random.randint(71, 202, n_samples),
        'exang': np.random.choice([0, 1], n_samples),
        'oldpeak': np.random.uniform(0, 6.2, n_samples),
        'slope': np.random.choice([0, 1, 2], n_samples),
        'ca': np.random.choice([0, 1, 2, 3], n_samples),
        'thal': np.random.choice([0, 1, 2, 3], n_samples),
        'target': np.random.choice([0, 1], n_samples)
    })
    print("✅ Sample dataset created for demonstration")

print(f"\nDataset shape: {df.shape}")
print(f"Features: {df.columns.tolist()}")

# 1. INITIAL DATA EXPLORATION
print("\n" + "="*50)
print("1. INITIAL DATA EXPLORATION")
print("="*50)

print("\n📊 Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()

print("\n📈 Statistical Summary:")
print(df.describe())

# Absolute class counts followed by the class proportions (rounded to 3 dp)
print("\n🎯 Target Variable Distribution:")
print(df['target'].value_counts())
print(f"Target balance: {df['target'].value_counts(normalize=True).round(3)}")

# 2. MISSING VALUES ANALYSIS
section_rule = "=" * 50
print(f"\n{section_rule}")
print("2. MISSING VALUES ANALYSIS")
print(section_rule)

# Null count per column, and that count as a percentage of all rows
missing_data = df.isnull().sum()
missing_percent = (missing_data / len(df)) * 100

# One row per column, the columns with the most nulls first
missing_df = pd.DataFrame(
    {'Missing Count': missing_data, 'Missing Percentage': missing_percent}
).sort_values('Missing Count', ascending=False)

print("\n🔍 Missing Values Summary:")
has_missing = missing_df['Missing Count'] > 0
print(missing_df[has_missing])

total_missing = missing_df['Missing Count'].sum()
if total_missing != 0:
    print(f"⚠️ Total missing values: {total_missing}")
else:
    print("✅ No missing values found in the dataset!")

# 3. EXPLORATORY DATA ANALYSIS (EDA)
print("\n" + "="*50)
print("3. EXPLORATORY DATA ANALYSIS")
print("="*50)

# Create comprehensive EDA plots
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Heart Disease Dataset - Exploratory Data Analysis', fontsize=16, fontweight='bold')

# Target distribution
# value_counts() orders by frequency, not by class value. Sorting by index
# puts class 0 first so the fixed labels below line up with the wedges even
# when class 1 is the majority.
target_counts = df['target'].value_counts().sort_index()
axes[0,0].pie(target_counts, labels=['No Disease', 'Disease'], autopct='%1.1f%%',
              colors=['lightcoral', 'lightblue'])
axes[0,0].set_title('Target Distribution')

# Age distribution by target
sns.histplot(data=df, x='age', hue='target', bins=20, alpha=0.7, ax=axes[0,1])
axes[0,1].set_title('Age Distribution by Heart Disease')
# Relabel the legend seaborn already built instead of calling
# ax.legend(labels=...): a labels-only call pairs the labels with the bar
# artists in draw order, which can attach them to the wrong hue level.
# Assumes the seaborn legend lists the hue levels in ascending order (0, 1).
age_legend = axes[0,1].get_legend()
if age_legend is not None:
    age_legend.set_title('')
    for legend_text, label in zip(age_legend.get_texts(), ['No Disease', 'Disease']):
        legend_text.set_text(label)

# Gender vs Heart Disease
# crosstab sorts rows (sex) and columns (target) by value, so the bar groups
# and legend entries below follow the 0, 1 order.
gender_disease = pd.crosstab(df['sex'], df['target'])
gender_disease.plot(kind='bar', ax=axes[1,0], color=['lightcoral', 'lightblue'])
axes[1,0].set_title('Gender vs Heart Disease')
axes[1,0].set_xlabel('Sex (0=Female, 1=Male)')
axes[1,0].set_xticklabels(['Female', 'Male'], rotation=0)
axes[1,0].legend(labels=['No Disease', 'Disease'])

# Chest pain type distribution
cp_disease = pd.crosstab(df['cp'], df['target'])
cp_disease.plot(kind='bar', ax=axes[1,1], color=['lightcoral', 'lightblue'])
axes[1,1].set_title('Chest Pain Type vs Heart Disease')
axes[1,1].set_xlabel('Chest Pain Type')
axes[1,1].legend(labels=['No Disease', 'Disease'])

plt.tight_layout()
plt.show()

# Correlation heatmap
plt.figure(figsize=(12, 10))
# Correlate numeric columns only: on recent pandas versions DataFrame.corr()
# raises when the frame still contains string/object columns, and this script
# allows for such columns until they are encoded later on.
correlation_matrix = df.select_dtypes(include=[np.number]).corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
            square=True, linewidths=0.5)
plt.title('Feature Correlation Heatmap', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# Box plots for numerical features
# Candidates are all numeric columns except the label itself.
numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()
try:
    numerical_features.remove('target')
except ValueError:
    # 'target' was not among the numeric columns; nothing to remove
    pass

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Box Plots for Numerical Features', fontsize=16, fontweight='bold')
axes = axes.ravel()

# The 2 x 3 grid has six panels, so only the first six features are drawn.
plotted_features = numerical_features[:6]
for panel, feature in zip(axes, plotted_features):
    sns.boxplot(data=df, x='target', y=feature, ax=panel)
    panel.set_title(f'{feature} by Heart Disease')
    panel.set_xlabel('Heart Disease (0=No, 1=Yes)')

# Hide empty subplots
for unused_panel in axes[len(plotted_features):]:
    unused_panel.set_visible(False)

plt.tight_layout()
plt.show()

# 4. DATA PREPROCESSING
print("\n" + "="*50)
print("4. DATA PREPROCESSING")
print("="*50)

# Handle missing values (if any)
# The check reads df directly so it reflects the frame's current state.
if df.isnull().values.any():
    print("🔧 Handling missing values...")

    # A row without a label cannot be used for supervised learning, and
    # imputing the label would fabricate ground truth, so such rows are
    # dropped instead of filled. The index is reset to stay contiguous.
    n_unlabeled = int(df['target'].isnull().sum())
    if n_unlabeled:
        df = df.dropna(subset=['target']).reset_index(drop=True)
        print(f"⚠️ Dropped {n_unlabeled} rows with a missing target")

    # For numerical features, use median imputation (label column excluded)
    numerical_cols = df.select_dtypes(include=[np.number]).columns.drop(
        'target', errors='ignore'
    )
    df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())

    # For categorical features, use mode imputation
    categorical_cols = df.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        modes = df[col].mode()
        if modes.empty:
            # An entirely-null column has no mode; leave it untouched rather
            # than failing on modes[0].
            continue
        df[col] = df[col].fillna(modes.iloc[0])
    print("✅ Missing values handled")
else:
    print("✅ No missing values to handle")

# Split the frame into the label column and everything else
y = df['target']
X = df.drop(columns='target')

print(f"\nFeatures shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Partition the feature columns by dtype: object columns need encoding,
# numeric columns are usable as they are
categorical_features = list(X.select_dtypes(include=['object']).columns)
numerical_features = list(X.select_dtypes(include=[np.number]).columns)

print(f"\nCategorical features: {categorical_features}")
print(f"Numerical features: {numerical_features}")

# Integer-encode each object column in place (skipped when there are none).
# One encoder instance is re-fitted per column, so only the last column's
# fit remains on `le` afterwards.
if categorical_features:
    print("\n🔧 Encoding categorical features...")
    le = LabelEncoder()
    for feature in categorical_features:
        encoded_column = le.fit_transform(X[feature])
        X[feature] = encoded_column
    print("✅ Categorical features encoded")

# Feature scaling
# The scaler is fitted on every row of X; its output is wrapped back into a
# DataFrame so the original column names are kept.
print("\n🔧 Scaling numerical features...")
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

print("✅ Features scaled using StandardScaler")

# Alternative: MinMax scaling
# Same procedure with a MinMaxScaler, kept as a second version of the features.
print("\n🔧 Alternative: MinMax scaling...")
minmax_scaler = MinMaxScaler()
X_minmax = pd.DataFrame(minmax_scaler.fit_transform(X), columns=X.columns)
print("✅ Features scaled using MinMaxScaler")

# 5. TRAIN-TEST SPLIT
print("\n" + "="*50)
print("5. TRAIN-TEST SPLIT")
print("="*50)

# Split the unscaled features first, then fit the scaler on the training rows
# only. Splitting features that were standardised with whole-dataset
# statistics lets the test rows influence the mean/std applied to the
# training data (data leakage) and makes held-out scores optimistic.
# test_size, random_state and stratify are unchanged, so the same rows land
# in each partition as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

split_scaler = StandardScaler().fit(X_train_raw)
X_train = pd.DataFrame(
    split_scaler.transform(X_train_raw), columns=X.columns, index=X_train_raw.index
)
# The test rows are transformed with the training statistics, never re-fitted.
X_test = pd.DataFrame(
    split_scaler.transform(X_test_raw), columns=X.columns, index=X_test_raw.index
)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print(f"Training target distribution:\n{y_train.value_counts(normalize=True).round(3)}")
print(f"Test target distribution:\n{y_test.value_counts(normalize=True).round(3)}")

# 6. FEATURE STATISTICS AFTER PREPROCESSING
section_rule = "=" * 50
print(f"\n{section_rule}")
print("6. FEATURE STATISTICS AFTER PREPROCESSING")
print(section_rule)

# Summary statistics of the standardised feature frame
print("📊 Scaled features statistics:")
scaled_summary = X_scaled.describe()
print(scaled_summary)

# 7. SAVE PREPROCESSED DATA
print("\n" + "="*50)
print("7. SAVING PREPROCESSED DATA")
print("="*50)

# Save preprocessed data
try:
    import os

    # One variable names the output location, so the directory that gets
    # created is the directory that gets written to. (Creating '../data' but
    # writing to 'data/...' fails whenever ./data does not already exist.)
    output_dir = '../data'

    # Create data directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # File name -> object to write; insertion order is the write order.
    outputs = {
        # Different versions of the data
        'heart_disease_cleaned.csv': df,
        'X_scaled.csv': X_scaled,
        'X_minmax.csv': X_minmax,
        'y.csv': y,
        # Train-test splits
        'X_train.csv': X_train,
        'X_test.csv': X_test,
        'y_train.csv': y_train,
        'y_test.csv': y_test,
    }
    for filename, frame in outputs.items():
        frame.to_csv(os.path.join(output_dir, filename), index=False)

    print("✅ Preprocessed data saved successfully!")
    print("Files saved:")
    print("  - heart_disease_cleaned.csv")
    print("  - X_scaled.csv, X_minmax.csv")
    print("  - y.csv")
    print("  - X_train.csv, X_test.csv, y_train.csv, y_test.csv")

except Exception as e:
    # Saving is best-effort: report the problem and let the script finish.
    print(f"⚠️ Error saving files: {e}")

# 8. DATA PREPROCESSING SUMMARY
print("\n" + "="*50)
print("8. DATA PREPROCESSING SUMMARY")
print("="*50)

print("✅ Data preprocessing completed successfully!")
print(f"📊 Original dataset shape: {df.shape}")
print(f"🎯 Target classes: {y.unique()}")
# Series.to_dict() yields plain Python numbers; dict(series) keeps NumPy
# scalars, which newer NumPy versions print as np.float64(...).
class_balance = y.value_counts(normalize=True).round(3).to_dict()
print(f"⚖️ Class balance: {class_balance}")
print(f"🔢 Number of features: {X.shape[1]}")
print("📈 Features scaled and ready for modeling")

# Numbered list of the feature columns, starting at 1
print("\n📋 Feature list:")
for i, feature in enumerate(X.columns, 1):
    print(f"  {i:2d}. {feature}")

# Roadmap of the follow-up notebooks
print("\n🎯 Next steps:")
next_steps = (
    "  1. ✅ Data preprocessing complete",
    "  2. ⏳ Apply PCA for dimensionality reduction (02_pca_analysis.ipynb)",
    "  3. ⏳ Feature selection (03_feature_selection.ipynb)",
    "  4. ⏳ Train supervised learning models (04_supervised_learning.ipynb)",
    "  5. ⏳ Apply unsupervised learning (05_unsupervised_learning.ipynb)",
    "  6. ⏳ Hyperparameter tuning (06_hyperparameter_tuning.ipynb)",
)
for step in next_steps:
    print(step)

print("\n🎉 Ready to proceed to the next phase!")