In [None]:
```xml
<VSCode.Cell id="prep_001" language="markdown">
# 02 - Data Preprocessing & Feature Engineering

This notebook covers comprehensive data preprocessing and feature engineering steps to prepare data for model training.

## Steps Covered
1. Load and validate data
2. Handle missing values
3. Detect and treat outliers
4. Engineer domain-specific features
5. Encode categorical variables
6. Scale numerical features
7. Prepare train-test split
</VSCode.Cell>

<VSCode.Cell id="prep_002" language="python">
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import sys
sys.path.append('..')

from src.preprocessor import DataPreprocessor, FeatureEngineer
from src.data_loader import DataLoader

# Plot defaults applied to every figure created after this cell:
# seaborn's "whitegrid" theme and a (14, 8) default figure size.
sns.set_style(style="whitegrid")
plt.rcParams.update({'figure.figsize': (14, 8)})
</VSCode.Cell>

<VSCode.Cell id="prep_003" language="python">
# Step 1: Load and Validate Data
# Produces `df`, the frame every later cell starts from. It comes from the
# project's DataLoader when that succeeds, otherwise from a seeded synthetic
# sample so the rest of the notebook can still run.
print("=" * 60)
print("STEP 1: DATA LOADING & VALIDATION")
print("=" * 60)

# CSV location handed to DataLoader.
# NOTE(review): this path is relative to the working directory, while the
# setup cell adds '..' to sys.path -- confirm it resolves from wherever this
# notebook is executed; a wrong path silently triggers the fallback below.
DATA_PATH = 'data/raw/loan_applications.csv'

try:
    loader = DataLoader(DATA_PATH)
    df = loader.load_data()
    print("✓ Data loaded and validated successfully")

    # Print every entry of the loader's info dict except the two skipped keys.
    info = loader.get_data_info()
    print("\nDataset Information:")
    skipped_keys = ('missing_values', 'features')
    for key, value in info.items():
        if key in skipped_keys:
            continue
        print(f"  {key}: {value}")
except Exception as e:
    # Deliberate best-effort fallback: any failure above (including one raised
    # while printing the info) replaces `df` with synthetic data.
    print(f"✗ Error loading data: {e}")
    print("Creating sample dataset for demonstration...")

    # Fixed seed + fixed draw order => the same sample on every run.
    # Do not reorder the entries below: each draw consumes global RNG state.
    np.random.seed(42)
    n_samples = 1000
    sample_columns = {
        'income': np.random.randint(40000, 150000, n_samples),
        'loan_amount': np.random.randint(100000, 400000, n_samples),
        'credit_score': np.random.randint(600, 850, n_samples),
        'employment_years': np.random.randint(0, 30, n_samples),
        'age': np.random.randint(25, 65, n_samples),
        'education_level': np.random.choice(
            ['High School', 'Bachelor', 'Master', 'PhD'], n_samples
        ),
        'marital_status': np.random.choice(
            ['Single', 'Married', 'Divorced'], n_samples
        ),
        'fraud_label': np.random.binomial(1, 0.25, n_samples),
    }
    df = pd.DataFrame(sample_columns)
    print("✓ Sample dataset created for demonstration")
</VSCode.Cell>

<VSCode.Cell id="prep_004" language="python">
# Step 2: Handle Missing Values
# Counts missing cells in `df`, runs the project's imputation helper to build
# `df_filled`, then reports what is actually left instead of assuming success.
print("\n" + "=" * 60)
print("STEP 2: MISSING VALUES HANDLING")
print("=" * 60)

# Total number of missing cells across all columns
missing_before = df.isnull().sum().sum()
# df.size is rows * columns; guard the ratio so an empty frame prints 0.00%
# instead of dividing by zero.
missing_pct = (missing_before / df.size * 100) if df.size else 0.0
print(f"\nMissing values before: {missing_before}")
print(f"Missing value percentage: {missing_pct:.2f}%")

# Impute with the project helper.
# NOTE(review): presumably strategy='mean' only applies to numeric columns --
# confirm how DataPreprocessor treats the categorical ones.
df_filled = DataPreprocessor.handle_missing_values(df, strategy='mean')

missing_after = df_filled.isnull().sum().sum()
print(f"\nMissing values after: {missing_after}")
if missing_after == 0:
    print("✓ Missing values handled successfully")
else:
    # Surface leftover gaps rather than printing an unconditional success line.
    print(f"⚠ {missing_after} missing values remain after imputation")
</VSCode.Cell>

<VSCode.Cell id="prep_005" language="python">
# Step 3: Outlier Detection
print("\n" + "=" * 60)
print("STEP 3: OUTLIER DETECTION & ANALYSIS")
print("=" * 60)


# IQR-fence outlier helper used by the scan below
def detect_outliers_iqr(data, feature, iqr_multiplier=1.5):
    """Return the rows of ``data`` whose ``feature`` value is outside the IQR fences.

    The fences are ``Q1 - iqr_multiplier * IQR`` and
    ``Q3 + iqr_multiplier * IQR``, with Q1/Q3 the 25th/75th percentiles of the
    column. Values exactly on a fence are not flagged.

    Args:
        data: DataFrame containing the column to inspect.
        feature: Name of the column to inspect.
        iqr_multiplier: Fence width expressed in IQRs (default 1.5).

    Returns:
        A tuple ``(outliers, lower_bound, upper_bound)`` where ``outliers`` is
        the subset of ``data`` rows outside the fences.
    """
    column = data[feature]
    first_quartile = column.quantile(0.25)
    third_quartile = column.quantile(0.75)
    fence_offset = iqr_multiplier * (third_quartile - first_quartile)

    lower_bound = first_quartile - fence_offset
    upper_bound = third_quartile + fence_offset

    # Strict comparisons: a row is flagged only when strictly beyond a fence.
    beyond_fences = column.lt(lower_bound) | column.gt(upper_bound)
    return data[beyond_fences], lower_bound, upper_bound

# Columns scanned for outliers and drawn in the box plots below
numerical_features = ['income', 'loan_amount', 'credit_score', 'employment_years', 'age']

print("\nOutlier Detection Results (IQR Method):")
for feature in numerical_features:
    outliers, lower, upper = detect_outliers_iqr(df_filled, feature)
    print(f"\n{feature.upper()}:")
    print(f"  Outliers found: {len(outliers)}")
    print(f"  Bounds: [{lower:.2f}, {upper:.2f}]")

    # The fraud share is only meaningful when at least one row was flagged
    if len(outliers) == 0:
        continue
    fraud_rate = outliers['fraud_label'].mean() * 100
    print(f"  Fraud rate in outliers: {fraud_rate:.2f}%")

# One box plot per feature, with the raw points overlaid
fig, axes = plt.subplots(2, 3, figsize=(16, 10))
axes = axes.flatten()

for idx, feature in enumerate(numerical_features):
    ax = axes[idx]
    values = df_filled[feature]
    panel_label = feature.capitalize()

    # Filled box for the feature's distribution
    box_artists = ax.boxplot([values], vert=True, patch_artist=True)
    box_patch = box_artists['boxes'][0]
    box_patch.set_facecolor('#3498db')
    box_patch.set_alpha(0.6)

    # Overlay every observation, jittered horizontally around x = 1 so the
    # points do not stack on one vertical line. The jitter is drawn from the
    # global NumPy RNG without reseeding in this cell.
    x_jitter = np.random.normal(1, 0.04, size=len(df_filled))
    ax.scatter(x_jitter, values, alpha=0.3, s=30)

    ax.set_ylabel(panel_label, fontsize=11)
    ax.set_title(f'{panel_label} - Outlier Detection', fontsize=12, fontweight='bold')
    ax.grid(axis='y', alpha=0.3)

# The 2x3 grid has six panels but only five features: hide the last one
axes[-1].axis('off')
plt.suptitle('Outlier Detection via Box Plots', fontsize=14, fontweight='bold', y=1.00)
plt.tight_layout()
plt.show()

print("\n✓ Decision: Outliers retained (may represent legitimate extreme cases)")
</VSCode.Cell>

<VSCode.Cell id="prep_006" language="python">
# Step 4: Feature Engineering
# Builds `df_engineered` from `df_filled` with the project's FeatureEngineer
# and prints the value range plus summary statistics of the derived columns.
print("\n" + "=" * 60)
print("STEP 4: FEATURE ENGINEERING")
print("=" * 60)

engineer = FeatureEngineer()
df_engineered = engineer.engineer_features(df_filled)

# Derived columns this notebook reports on (and plots in the next cell).
# NOTE(review): the ':.4f' formatting below assumes every one of these is
# numeric -- confirm that holds for the two '*_category' columns.
engineered_features = [
    'income_to_loan_ratio',
    'credit_history_score',
    'employment_stability',
    'age_credit_interaction',
    'loan_amount_category',
    'income_category',
]

print("\nEngineered Features Created:")
for feature in engineered_features:
    print(f"  ✓ {feature}")
    column = df_engineered[feature]
    print(f"    Range: [{column.min():.4f}, {column.max():.4f}]")

# describe() output for the derived columns, rounded to 4 decimals
print("\nEngineered Features Statistics:")
engineered_stats = df_engineered[engineered_features].describe()
print(engineered_stats.round(4))
</VSCode.Cell>

<VSCode.Cell id="prep_007" language="python">
# Visualize engineered features
# One panel per engineered feature, overlaying the histogram of rows with
# fraud_label == 0 and the histogram of rows with fraud_label == 1.
fig, axes = plt.subplots(2, 3, figsize=(16, 10))
axes = axes.flatten()

# Row selectors per class; computed once because they do not depend on the feature
legit_rows = df_engineered['fraud_label'] == 0
fraud_rows = df_engineered['fraud_label'] == 1

for idx, feature in enumerate(engineered_features):
    ax = axes[idx]
    pretty_name = feature.replace('_', ' ').title()

    legitimate = df_engineered.loc[legit_rows, feature]
    fraudulent = df_engineered.loc[fraud_rows, feature]

    ax.hist(legitimate, bins=30, alpha=0.6, label='Legitimate', color='#2ecc71')
    ax.hist(fraudulent, bins=30, alpha=0.6, label='Fraudulent', color='#e74c3c')

    ax.set_xlabel(pretty_name, fontsize=11)
    ax.set_ylabel('Frequency', fontsize=11)
    ax.set_title(f'Distribution of {pretty_name}', fontsize=12, fontweight='bold')
    ax.legend()
    ax.grid(alpha=0.3)

plt.suptitle('Engineered Features Distributions', fontsize=14, fontweight='bold', y=1.00)
plt.tight_layout()
plt.show()

print("✓ Feature engineering complete")
</VSCode.Cell>

<VSCode.Cell id="prep_008" language="python">
# Step 5: Categorical Encoding
# One-hot encodes the two categorical columns into `df_encoded`, dropping the
# first level of each (drop_first=True).
print("\n" + "=" * 60)
print("STEP 5: CATEGORICAL VARIABLE ENCODING")
print("=" * 60)

# Work on a copy so df_engineered stays available for comparison below
df_encoded = df_engineered.copy()

categorical_features = ['education_level', 'marital_status']

print("\nCategorical Features Encoding:")
print("Original unique values:")
for feature in categorical_features:
    levels = df_encoded[feature].unique()
    print(f"  {feature}: {levels}")

# Replace each categorical column with its indicator columns
df_encoded = pd.get_dummies(df_encoded, columns=categorical_features, drop_first=True)

# Columns present after encoding that did not exist before it
new_columns = [col for col in df_encoded.columns if col not in df_engineered.columns]

print("\nAfter one-hot encoding:")
# This count still includes the 'fraud_label' target column
print(f"  Total features: {df_encoded.shape[1]}")
print(f"  New columns: {new_columns}")

# Final layout of the encoded frame
print(f"\nDataFrame shape after encoding: {df_encoded.shape}")
print(f"Column names:\n{df_encoded.columns.tolist()}")
</VSCode.Cell>

<VSCode.Cell id="prep_009" language="python">
# Step 6: Feature Scaling
# Splits `df_encoded` into the feature matrix X and target y, then runs the
# project's DataPreprocessor over X to obtain `X_processed`.
print("\n" + "=" * 60)
print("STEP 6: FEATURE SCALING & STANDARDIZATION")
print("=" * 60)

# Everything except the label is a model input
X = df_encoded.drop(columns='fraud_label')
y = df_encoded['fraud_label']

print(f"\nFeatures shape: {X.shape}")
print(f"Target shape: {y.shape}")

# NOTE(review): the preprocessor is fitted on ALL rows here, before the
# train/test split performed in Step 7. If fit_transform learns statistics
# from the data (e.g. means/variances), the test rows influence them --
# consider splitting first and fitting on the training rows only.
preprocessor = DataPreprocessor()
X_processed = preprocessor.fit_transform(X)

# Compare one column before and after; the column lookup assumes
# fit_transform returns a DataFrame that keeps the column names.
income_raw = X['income']
income_scaled = X_processed['income']

print("\nScaling Results:")
print(f"Before scaling - Income mean: {income_raw.mean():.2f}, std: {income_raw.std():.2f}")
print(f"After scaling - Income mean: {income_scaled.mean():.4f}, std: {income_scaled.std():.4f}")

# Summary statistics of every processed column, rounded to 4 decimals
print("\nProcessed Features Statistics (After Scaling):")
processed_stats = X_processed.describe()
print(processed_stats.round(4))
</VSCode.Cell>

<VSCode.Cell id="prep_010" language="python">
# Visualize scaled vs unscaled
# For six features, overlay the histogram of the raw column (from X) with the
# histogram of the processed column (from X_processed) in a 2x3 grid.
fig, axes = plt.subplots(2, 3, figsize=(16, 10))

features_to_show = ['income', 'loan_amount', 'credit_score', 'employment_years', 'age', 'income_to_loan_ratio']

for idx, feature in enumerate(features_to_show):
    # Map the running index onto the (row, column) position of the 3-wide grid
    row, col = divmod(idx, 3)
    ax = axes[row, col]
    pretty_name = feature.replace('_', ' ').title()

    # Raw values
    ax.hist(X[feature], bins=30, alpha=0.5, label='Unscaled', color='#3498db')

    # Processed values, drawn only when the column is present in X_processed
    if feature in X_processed.columns:
        ax.hist(X_processed[feature], bins=30, alpha=0.5, label='Scaled', color='#e74c3c')

    ax.set_xlabel(pretty_name, fontsize=11)
    ax.set_ylabel('Frequency', fontsize=11)
    ax.set_title(f'{pretty_name} - Before vs After Scaling', fontsize=12, fontweight='bold')
    ax.legend()
    ax.grid(alpha=0.3)

plt.suptitle('Scaling Impact on Feature Distributions', fontsize=14, fontweight='bold', y=1.00)
plt.tight_layout()
plt.show()

print("✓ Scaling complete")
</VSCode.Cell>

<VSCode.Cell id="prep_011" language="python">
# Step 7: Train-Test Split
# Splits X_processed / y into train and test parts (test_size=0.2, fixed
# random_state) and prints the size and class counts of each part.
print("\n" + "=" * 60)
print("STEP 7: TRAIN-TEST SPLIT")
print("=" * 60)

# stratify=y asks scikit-learn to keep the label proportions in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X_processed,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

n_total = len(X_processed)
n_train = X_train.shape[0]
n_test = X_test.shape[0]

print(f"\nTrain Set Size: {n_train} samples ({n_train / n_total * 100:.1f}%)")
print(f"Test Set Size: {n_test} samples ({n_test / n_total * 100:.1f}%)")

# Class counts per part: label 0 is reported as legitimate, label 1 as fraudulent
train_legit = (y_train == 0).sum()
train_fraud = (y_train == 1).sum()
test_legit = (y_test == 0).sum()
test_fraud = (y_test == 1).sum()

print("\nTrain Set Fraud Distribution:")
print(f"  Legitimate: {train_legit} ({train_legit / len(y_train) * 100:.1f}%)")
print(f"  Fraudulent: {train_fraud} ({train_fraud / len(y_train) * 100:.1f}%)")

print("\nTest Set Fraud Distribution:")
print(f"  Legitimate: {test_legit} ({test_legit / len(y_test) * 100:.1f}%)")
print(f"  Fraudulent: {test_fraud} ({test_fraud / len(y_test) * 100:.1f}%)")

print("\n✓ Stratified split maintains fraud distribution in both sets")
</VSCode.Cell>

<VSCode.Cell id="prep_012" language="python">
# Visualize train-test split
# Two side-by-side bar charts (train, test) showing how many rows carry
# label 0 ("Legitimate") and label 1 ("Fraudulent"), annotated with the
# count and its share of that set.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

datasets = ['Train Set', 'Test Set']
fraud_counts = [(y_train == 1).sum(), (y_test == 1).sum()]
legitimate_counts = [(y_train == 0).sum(), (y_test == 0).sum()]

# Bar order and colours are the same in both panels
categories = ['Legitimate', 'Fraudulent']
colors = ['#2ecc71', '#e74c3c']

for ax, dataset, fraud, legit in zip(axes, datasets, fraud_counts, legitimate_counts):
    counts = [legit, fraud]
    set_total = legit + fraud

    bars = ax.bar(categories, counts, color=colors, alpha=0.7, edgecolor='black')
    ax.set_ylabel('Count', fontsize=11)
    ax.set_title(f'{dataset} Distribution', fontsize=12, fontweight='bold')
    ax.grid(axis='y', alpha=0.3)

    # Write "<count>\n(<share>%)" centred just above each bar
    for bar, count in zip(bars, counts):
        bar_centre = bar.get_x() + bar.get_width() / 2.
        ax.text(bar_centre, bar.get_height(),
                f'{int(count)}\n({count / set_total * 100:.1f}%)',
                ha='center', va='bottom', fontsize=10)

plt.suptitle('Train-Test Split: Fraud Distribution Maintained', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()
</VSCode.Cell>

<VSCode.Cell id="prep_013" language="python">
# Summary and Next Steps
# Prints a dot-padded recap of the preprocessing run and points to the next
# notebook.
print("\n" + "=" * 60)
print("PREPROCESSING SUMMARY")
print("=" * 60)

# Sizes are read from the live frames; the remaining entries are fixed
# descriptions of the choices made earlier in this notebook.
summary_info = {
    'Original Dataset Size': len(df),
    'Final Dataset Size': len(X_processed),
    'Original Features': df.shape[1] - 1,  # Exclude target
    # X_processed.shape[1] is the total number of columns fed to the model
    # (original + engineered + one-hot), not the number of engineered
    # features, so the row is labelled accordingly.
    'Final Feature Count': X_processed.shape[1],
    'Missing Values Handled': 'Yes',
    'Outliers Detected': 'Retained for analysis',
    'Categorical Variables Encoded': 'Yes (One-Hot)',
    'Numerical Features Scaled': 'Yes (StandardScaler)',
    'Train-Test Split Ratio': '80-20',
    'Stratification Applied': 'Yes',
}

# '{key:.<45}' left-aligns the label and pads it with dots to 45 characters
for key, value in summary_info.items():
    print(f"{key:.<45} {value}")

print("\n✓ Data preprocessing complete!")
print("\nNext Step: Train multiple models on the preprocessed data")
print("See notebook 03_modeling.ipynb for model training and evaluation")
</VSCode.Cell>
```