### 1: Imports and Load Pruned Data

In [1]:
import pandas as pd
import numpy as np
import os
import warnings
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from statsmodels.stats.outliers_influence import variance_inflation_factor

# --- Setup ---
# NOTE(review): this silences *every* warning for the session, including
# scikit-learn / pandas deprecation and data-quality warnings.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

print("Libraries imported for Phase 3.")

# --- Load Processed Data (from Phase 2) ---
# The processed-data directory can be overridden with the
# CREDIT_RISK_PROCESSED_DIR environment variable so the notebook is not tied
# to a single machine; the original absolute path remains the default.
FULL_PATH_TO_PROCESSED = os.environ.get(
    'CREDIT_RISK_PROCESSED_DIR',
    '/Users/yogeshdhaliya/Desktop/DS Learning/11. Projects/Credit-Risk-Prediction/data/processed',
)

X_path = os.path.join(FULL_PATH_TO_PROCESSED, 'X_cleaned.csv')
y_path = os.path.join(FULL_PATH_TO_PROCESSED, 'y_target.csv')

print(f"Loading features from: {X_path}")
print(f"Loading target from: {y_path}")

try:
    X = pd.read_csv(X_path, index_col='PROSPECTID')
    # squeeze('columns') turns the single-column target frame into a Series.
    y = pd.read_csv(y_path, index_col='PROSPECTID').squeeze('columns')
except FileNotFoundError as e:
    print("\n[ERROR] File not found. Please double-check the path.")
    print(e)
    # Re-raise: every later cell needs X and y, so carrying on would only
    # resurface as a confusing NameError further down the notebook.
    raise

print("\n--- Pruned Data Loaded Successfully ---")
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

Libraries imported for Phase 3.
Loading features from: /Users/yogeshdhaliya/Desktop/DS Learning/11. Projects/Credit-Risk-Prediction/data/processed/X_cleaned.csv
Loading target from: /Users/yogeshdhaliya/Desktop/DS Learning/11. Projects/Credit-Risk-Prediction/data/processed/y_target.csv

--- Pruned Data Loaded Successfully ---
X shape: (51336, 54)
y shape: (51336,)


### 2: Define Feature Lists (From Phase 2 EDA)

In [2]:
print("\n--- Defining Feature Lists based on EDA ---")

# --- 1. Categorical Features (Finding 3) ---
# Domain-knowledge ordering of education levels; the position in this list is
# the integer code the OrdinalEncoder assigns to each category.
EDUCATION_MAP = [
    'OTHERS',           # 0
    'SSC',              # 1
    '12TH',             # 2
    'UNDER GRADUATE',   # 3
    'GRADUATE',         # 4
    'POST-GRADUATE',    # 5
    'PROFESSIONAL'      # 6
]
ORDINAL_FEATURE = ['EDUCATION']
NOMINAL_FEATURES = [
    'MARITALSTATUS',
    'GENDER',
    'last_prod_enq2',
    'first_prod_enq2'
]

# --- 2. Numerical Features (from Missing Data Analysis) ---
# Group 3: Systematic Missingness -> Impute with 0
NUM_IMPUTE_ZERO_FEATURES = [
    'tot_enq', 'CC_enq', 'CC_enq_L6m', 'CC_enq_L12m', 'PL_enq',
    'PL_enq_L6m', 'PL_enq_L12m', 'time_since_recent_enq',
    'enq_L12m', 'enq_L6m', 'enq_L3m'
]

# Group 4: Minor Missingness -> Impute with Median
NUM_IMPUTE_MEDIAN_FEATURES = [
    'Age_Oldest_TL',
    'Age_Newest_TL'
]

# --- 3. Consolidate All Lists ---
all_used_cols = (
    ORDINAL_FEATURE +
    NOMINAL_FEATURES +
    NUM_IMPUTE_ZERO_FEATURES +
    NUM_IMPUTE_MEDIAN_FEATURES
)

# Fail fast if a hand-written list names a column that X does not have;
# otherwise the mistake only shows up later inside ColumnTransformer.fit.
missing_cols = [col for col in all_used_cols if col not in X.columns]
if missing_cols:
    raise KeyError(f"Feature lists reference columns missing from X: {missing_cols}")

# Every column not explicitly assigned above is treated as a numerical
# feature that needs scaling but no imputation.
NUM_NO_IMPUTE_FEATURES = [
    col for col in X.columns
    if col not in all_used_cols
]

# Counts are derived from the lists so the printout cannot drift out of sync
# with the definitions above.
print(f"Ordinal features ({len(ORDINAL_FEATURE)}): {ORDINAL_FEATURE}")
print(f"Nominal features ({len(NOMINAL_FEATURES)}): {NOMINAL_FEATURES}")
print(f"Num (impute 0) features ({len(NUM_IMPUTE_ZERO_FEATURES)}): {NUM_IMPUTE_ZERO_FEATURES}")
print(f"Num (impute median) features ({len(NUM_IMPUTE_MEDIAN_FEATURES)}): {NUM_IMPUTE_MEDIAN_FEATURES}")
print(f"Num (no impute) features ({len(NUM_NO_IMPUTE_FEATURES)}): {NUM_NO_IMPUTE_FEATURES}")
print(f"\nTotal features categorized: {len(all_used_cols) + len(NUM_NO_IMPUTE_FEATURES)} / {X.shape[1]}")


--- Defining Feature Lists based on EDA ---
Ordinal features (1): ['EDUCATION']
Nominal features (4): ['MARITALSTATUS', 'GENDER', 'last_prod_enq2', 'first_prod_enq2']
Num (impute 0) features (11): ['tot_enq', 'CC_enq', 'CC_enq_L6m', 'CC_enq_L12m', 'PL_enq', 'PL_enq_L6m', 'PL_enq_L12m', 'time_since_recent_enq', 'enq_L12m', 'enq_L6m', 'enq_L3m']
Num (impute median) features (2): ['Age_Oldest_TL', 'Age_Newest_TL']
Num (no impute) features (36): ['AGE', 'NETMONTHLYINCOME', 'Time_With_Curr_Empr', 'pct_of_active_TLs_ever', 'pct_opened_TLs_L6m_of_L12m', 'CC_Flag', 'PL_Flag', 'pct_PL_enq_L6m_of_L12m', 'pct_CC_enq_L6m_of_L12m', 'pct_PL_enq_L6m_of_ever', 'pct_CC_enq_L6m_of_ever', 'HL_Flag', 'GL_Flag', 'Total_TL', 'Tot_Closed_TL', 'Tot_Active_TL', 'Total_TL_opened_L6M', 'Tot_TL_closed_L6M', 'pct_tl_open_L6M', 'pct_tl_closed_L6M', 'pct_active_tl', 'pct_closed_tl', 'Total_TL_opened_L12M', 'Tot_TL_closed_L12M', 'pct_tl_open_L12M', 'pct_tl_closed_L12M', 'Tot_Missed_Pmnt', 'Auto_TL', 'CC_TL', 'Consum

### 3: Build Preprocessing Pipeline

In [5]:
print("\n--- Building Preprocessing Pipelines (Corrected) ---")

# Ordinal branch (Finding 3): fill gaps with the modal value, then encode
# EDUCATION using the explicit EDUCATION_MAP order; unseen labels map to -1.
ordinal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    (
        'encoder',
        OrdinalEncoder(
            categories=[EDUCATION_MAP],
            handle_unknown='use_encoded_value',
            unknown_value=-1,
        ),
    ),
])

# Nominal branch (Finding 3): missing values become their own 'Missing'
# category, then one-hot encode with the first level of each feature dropped
# (k-1 dummies) to avoid the dummy-variable trap that inflated the VIFs.
nominal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    (
        'encoder',
        OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first'),
    ),
])

# Numerical branches (EDA & Finding 2): they differ only in how missing
# values are filled; all of them finish with a RobustScaler.
num_zero_impute_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', RobustScaler()),
])

num_median_impute_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler()),
])

num_no_impute_transformer = Pipeline([
    ('scaler', RobustScaler()),
])

# Master ColumnTransformer: route each feature list to its branch. Columns
# not named in any list are passed through untouched.
column_routes = [
    ('ordinal', ordinal_transformer, ORDINAL_FEATURE),
    ('nominal', nominal_transformer, NOMINAL_FEATURES),
    ('num_impute_zero', num_zero_impute_transformer, NUM_IMPUTE_ZERO_FEATURES),
    ('num_impute_median', num_median_impute_transformer, NUM_IMPUTE_MEDIAN_FEATURES),
    ('num_no_impute', num_no_impute_transformer, NUM_NO_IMPUTE_FEATURES),
]
preprocessor = ColumnTransformer(transformers=column_routes, remainder='passthrough')

print("ColumnTransformer 'preprocessor' created successfully with 'drop=first'.")


--- Building Preprocessing Pipelines (Corrected) ---
ColumnTransformer 'preprocessor' created successfully with 'drop=first'.


### 4: Apply Preprocessor & Run VIF

In [7]:
# --- 1. Apply the Preprocessing Pipeline ---
print("\n--- Applying 'preprocessor' to X ---")
X_processed = preprocessor.fit_transform(X)

# --- 2. Get Feature Names After Transformation ---
# Ask the fitted ColumnTransformer for its output names instead of
# re-assembling them by hand: a hand-built list has to mirror the transformer
# order (and an empty remainder) exactly, and a mismatch of equal length would
# silently mislabel columns. The names come back as "<transformer>__<column>",
# so the prefix is stripped to keep the plain column names used downstream.
feature_names = [
    name.split('__', 1)[-1]
    for name in preprocessor.get_feature_names_out()
]

X_processed_df = pd.DataFrame(X_processed, columns=feature_names, index=X.index)

print(f"Data processed. New shape with OHE (k-1): {X_processed_df.shape}")
print("Head of processed data:")
print(X_processed_df.head())

# --- 3. Calculate VIF (Validate Finding 2) ---
print("\n--- Calculating Variance Inflation Factor (VIF) ---")

# VIF function
def calculate_vif(df):
    """Compute the Variance Inflation Factor (VIF) of every column in ``df``.

    A column of ones (intercept) is appended to the design matrix before the
    per-feature calls to ``variance_inflation_factor``; only the original
    columns are scored, never the intercept itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix to score. It is converted to a float array, so all
        columns must be numeric.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``feature`` and ``VIF``, sorted by ``VIF`` in descending
        order. The input frame is not modified.
    """
    # Build the design matrix once, outside the loop. Appending the intercept
    # as an array column also avoids clobbering an input column that happens
    # to be named 'const'.
    design = np.column_stack([df.to_numpy(dtype=float), np.ones(len(df))])
    vif_data = pd.DataFrame({
        "feature": df.columns,
        "VIF": [variance_inflation_factor(design, i) for i in range(df.shape[1])],
    })
    return vif_data.sort_values(by='VIF', ascending=False)

# Score every processed feature, then show both ends of the ranking
# (calculate_vif returns rows sorted from highest to lowest VIF).
vif_results = calculate_vif(X_processed_df)

print("VIF results (Top 15):", vif_results.iloc[:15], sep="\n")

print("\n\nVIF results (Bottom 15):", vif_results.iloc[-15:], sep="\n")


--- Applying 'preprocessor' to X ---
Data processed. New shape with OHE (k-1): (51336, 62)
Head of processed data:
            EDUCATION  MARITALSTATUS_Single  GENDER_M  last_prod_enq2_CC  \
PROSPECTID                                                                 
1                 2.0                   0.0       1.0                0.0   
2                 4.0                   1.0       0.0                0.0   
3                 1.0                   0.0       1.0                0.0   
4                 1.0                   0.0       1.0                0.0   
5                 5.0                   0.0       1.0                0.0   

            last_prod_enq2_ConsumerLoan  last_prod_enq2_HL  last_prod_enq2_PL  \
PROSPECTID                                                                      
1                                   0.0                0.0                1.0   
2                                   1.0                0.0                0.0   
3                          

In [8]:
# Columns flagged for removal after the first VIF run, grouped by the reason
# each one is redundant.
VIF_DROP_COLS = (
    # Composite "Total" columns
    ['Total_TL', 'Secured_TL', 'Unsecured_TL']
    # Aggregate "Status" columns (the per-type columns are kept instead)
    + ['Tot_Active_TL', 'Tot_Closed_TL']
    # "Percentage" columns that are ratios of the columns above
    + ['pct_active_tl', 'pct_closed_tl', 'pct_of_active_TLs_ever']
)

### 5: Redefine X and Re-build the Entire Pipeline

In [9]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
print("\n--- PHASE 3: FINAL FEATURE SELECTION (Fixing Finding 2) ---")

# --- 1. VIF "kill list" from the first VIF run, grouped by reason ---
VIF_DROP_COLS = (
    # Composite "Total" columns
    ['Total_TL', 'Secured_TL', 'Unsecured_TL']
    # Aggregate "Status" columns (the per-type columns are kept instead)
    + ['Tot_Active_TL', 'Tot_Closed_TL']
    # "Percentage" columns that are ratios of the columns above
    + ['pct_active_tl', 'pct_closed_tl', 'pct_of_active_TLs_ever']
)
print(f"Original X shape: {X.shape}")
print(f"Dropping {len(VIF_DROP_COLS)} columns due to high VIF: {VIF_DROP_COLS}")

# --- 2. Feature set with the kill-list columns removed (X itself is untouched) ---
X_final_features = X.drop(VIF_DROP_COLS, axis='columns')
print(f"New X_final_features shape: {X_final_features.shape}") # Should be 46 columns

# --- 3. Re-Define Feature Lists ---
print("\n--- Re-defining Feature Lists ---")

# Categorical definitions carry over unchanged from the first pass.
EDUCATION_MAP = [
    'OTHERS',
    'SSC',
    '12TH',
    'UNDER GRADUATE',
    'GRADUATE',
    'POST-GRADUATE',
    'PROFESSIONAL',
]
ORDINAL_FEATURE = ['EDUCATION']
NOMINAL_FEATURES = ['MARITALSTATUS', 'GENDER', 'last_prod_enq2', 'first_prod_enq2']

# Numerical lists: the explicit imputation groups stay as before, and the
# "no impute" group is whatever remains in X_final_features.
NUM_IMPUTE_ZERO_FEATURES = [
    'tot_enq',
    'CC_enq', 'CC_enq_L6m', 'CC_enq_L12m',
    'PL_enq', 'PL_enq_L6m', 'PL_enq_L12m',
    'time_since_recent_enq',
    'enq_L12m', 'enq_L6m', 'enq_L3m',
]
NUM_IMPUTE_MEDIAN_FEATURES = ['Age_Oldest_TL', 'Age_Newest_TL']

all_used_cols = (
    ORDINAL_FEATURE
    + NOMINAL_FEATURES
    + NUM_IMPUTE_ZERO_FEATURES
    + NUM_IMPUTE_MEDIAN_FEATURES
)
already_assigned = set(all_used_cols)
NUM_NO_IMPUTE_FEATURES = [
    col for col in X_final_features.columns if col not in already_assigned
]

n_categorized = len(all_used_cols) + len(NUM_NO_IMPUTE_FEATURES)
print(f"Total features categorized: {n_categorized} / {X_final_features.shape[1]}")

# --- 4. Re-Build Preprocessing Pipeline ---
print("\n--- Re-building Preprocessor ---")
# Same branch definitions as the first preprocessor; only the feature lists
# routed into them have changed.

ordinal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    (
        'encoder',
        OrdinalEncoder(
            categories=[EDUCATION_MAP],
            handle_unknown='use_encoded_value',
            unknown_value=-1,
        ),
    ),
])

nominal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    (
        'encoder',
        OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first'),
    ),
])

num_zero_impute_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', RobustScaler()),
])

num_median_impute_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler()),
])

num_no_impute_transformer = Pipeline([
    ('scaler', RobustScaler()),
])

# Master ColumnTransformer: one (name, pipeline, columns) route per branch;
# any column not listed is passed through unchanged.
column_routes = [
    ('ordinal', ordinal_transformer, ORDINAL_FEATURE),
    ('nominal', nominal_transformer, NOMINAL_FEATURES),
    ('num_impute_zero', num_zero_impute_transformer, NUM_IMPUTE_ZERO_FEATURES),
    ('num_impute_median', num_median_impute_transformer, NUM_IMPUTE_MEDIAN_FEATURES),
    ('num_no_impute', num_no_impute_transformer, NUM_NO_IMPUTE_FEATURES),
]
final_preprocessor = ColumnTransformer(transformers=column_routes, remainder='passthrough')
print("Final 'final_preprocessor' created.")

# --- 5. Apply Final Preprocessor & Run VIF (Validation) ---
print("\n--- Applying Final Preprocessor & Running VIF Check ---")

# Apply the new preprocessor to the new X dataframe
X_processed_final = final_preprocessor.fit_transform(X_final_features)

# Take the output names from the fitted ColumnTransformer rather than
# rebuilding them by hand: a hand-built list must mirror the transformer order
# (and an empty remainder) exactly, and an equal-length mismatch would
# silently mislabel columns. Names come back as "<transformer>__<column>", so
# the prefix is stripped to keep the plain column names.
final_feature_names = [
    name.split('__', 1)[-1]
    for name in final_preprocessor.get_feature_names_out()
]

# Create the final processed DataFrame for VIF check
X_processed_final_df = pd.DataFrame(
    X_processed_final,
    columns=final_feature_names,
    index=X_final_features.index
)

print(f"Final processed data shape: {X_processed_final_df.shape}") # Should be 54 columns

# --- 6. Final VIF Calculation ---
print("\n--- Final VIF Calculation (Validation) ---")

def calculate_vif(df):
    """Compute the Variance Inflation Factor (VIF) of every column in ``df``.

    A column of ones (intercept) is appended to the design matrix before the
    per-feature calls to ``variance_inflation_factor``; only the original
    columns are scored, never the intercept itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix to score. It is converted to a float array, so all
        columns must be numeric.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``feature`` and ``VIF``, sorted by ``VIF`` in descending
        order. The input frame is not modified.
    """
    # Build the design matrix once, outside the loop. Appending the intercept
    # as an array column also avoids clobbering an input column that happens
    # to be named 'const'.
    design = np.column_stack([df.to_numpy(dtype=float), np.ones(len(df))])
    vif_data = pd.DataFrame({
        "feature": df.columns,
        "VIF": [variance_inflation_factor(design, i) for i in range(df.shape[1])],
    })
    return vif_data.sort_values(by='VIF', ascending=False)

try:
    final_vif_results = calculate_vif(X_processed_final_df)
except Exception as e:
    print("\n[ERROR] An error occurred during final VIF calculation:")
    print(e)
    print("This may be due to remaining collinearity. Please check the feature lists.")
    # Re-raise so the cell stops here: without this the completion banner
    # below would still be printed even though the validation never ran.
    raise
else:
    print("VIF calculation successful.")

    print("\nFINAL VIF results (Top 10):")
    print(final_vif_results.head(10))

    print("\nFINAL VIF results (Bottom 10):")
    print(final_vif_results.tail(10))

print("\n--- PHASE 3: FEATURE SELECTION COMPLETE ---")
print("We now have a clean, preprocessed, and multicollinearity-free feature set.")


--- PHASE 3: FINAL FEATURE SELECTION (Fixing Finding 2) ---
Original X shape: (51336, 54)
Dropping 8 columns due to high VIF: ['Total_TL', 'Secured_TL', 'Unsecured_TL', 'Tot_Active_TL', 'Tot_Closed_TL', 'pct_active_tl', 'pct_closed_tl', 'pct_of_active_TLs_ever']
New X_final_features shape: (51336, 46)

--- Re-defining Feature Lists ---
Total features categorized: 46 / 46

--- Re-building Preprocessor ---
Final 'final_preprocessor' created.

--- Applying Final Preprocessor & Running VIF Check ---
Final processed data shape: (51336, 54)

--- Final VIF Calculation (Validation) ---
VIF calculation successful.

FINAL VIF results (Top 10):
                   feature        VIF
21                enq_L12m  25.762880
22                 enq_L6m  22.851271
32  pct_PL_enq_L6m_of_L12m  20.019594
34  pct_PL_enq_L6m_of_ever  19.442076
33  pct_CC_enq_L6m_of_L12m  18.826684
19             PL_enq_L12m  17.045485
35  pct_CC_enq_L6m_of_ever  16.886400
13                 tot_enq  12.668333
18             

### 6: Final VIF Pruning

In [11]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
print("\n--- PHASE 3: FINAL VIF PRUNING (Round 2) ---")

# --- 1. Enquiry "kill list" from the second VIF run, grouped by reason ---
VIF_DROP_COLS_2 = (
    # Composite "Total" column
    ['tot_enq']
    # 12-month windows (the 6-month versions are kept)
    + ['enq_L12m', 'PL_enq_L12m', 'CC_enq_L12m']
    # "Percentage" ratio columns
    + [
        'pct_PL_enq_L6m_of_L12m',
        'pct_PL_enq_L6m_of_ever',
        'pct_CC_enq_L6m_of_L12m',
        'pct_CC_enq_L6m_of_ever',
    ]
)

n_before = X_final_features.shape[1]
print(f"Starting with {n_before} features from last step.")
print(f"Dropping {len(VIF_DROP_COLS_2)} more columns due to high VIF: {VIF_DROP_COLS_2}")

# --- 2. Final feature set: the round-1 frame minus the round-2 kill list ---
X_final_final_features = X_final_features.drop(VIF_DROP_COLS_2, axis='columns')
print(f"Absolute Final Feature Set shape: {X_final_final_features.shape}") # Should be 38 columns

# --- 3. Re-Define Feature Lists (Final Time) ---
print("\n--- Re-defining Final Feature Lists ---")

# Categorical lists are unchanged
EDUCATION_MAP = [
    'OTHERS', 'SSC', '12TH', 'UNDER GRADUATE',
    'GRADUATE', 'POST-GRADUATE', 'PROFESSIONAL'
]
ORDINAL_FEATURE = ['EDUCATION']
NOMINAL_FEATURES = [
    'MARITALSTATUS', 'GENDER', 'last_prod_enq2', 'first_prod_enq2'
]

# Numerical lists must be rebuilt from X_final_final_features: the zero-impute
# group no longer contains the enquiry columns dropped in round 2.
NUM_IMPUTE_ZERO_FEATURES = [
    'CC_enq', 'CC_enq_L6m', 'PL_enq', 'PL_enq_L6m',
    'time_since_recent_enq', 'enq_L6m', 'enq_L3m'
]
NUM_IMPUTE_MEDIAN_FEATURES = ['Age_Oldest_TL', 'Age_Newest_TL']

all_used_cols = (
    ORDINAL_FEATURE + NOMINAL_FEATURES +
    NUM_IMPUTE_ZERO_FEATURES + NUM_IMPUTE_MEDIAN_FEATURES
)
# Everything not explicitly assigned above is scaled without imputation.
NUM_NO_IMPUTE_FEATURES = [
    col for col in X_final_final_features.columns if col not in all_used_cols
]

# The count is derived from the list (it was a hard-coded "7") so the
# printout cannot drift out of sync with the definition above.
print(f"Final Num (impute 0) features ({len(NUM_IMPUTE_ZERO_FEATURES)}): {NUM_IMPUTE_ZERO_FEATURES}")
print(f"Final Num (no impute) features ({len(NUM_NO_IMPUTE_FEATURES)}): {NUM_NO_IMPUTE_FEATURES}")
print(f"Total features categorized: {len(all_used_cols) + len(NUM_NO_IMPUTE_FEATURES)} / {X_final_final_features.shape[1]}")


# --- 4. Re-Build Final Preprocessing Pipeline ---
print("\n--- Re-building Final Preprocessor ---")

# Ordinal branch: modal imputation, then EDUCATION encoded in EDUCATION_MAP
# order (unseen labels map to -1).
ordinal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    (
        'encoder',
        OrdinalEncoder(
            categories=[EDUCATION_MAP],
            handle_unknown='use_encoded_value',
            unknown_value=-1,
        ),
    ),
])

# Nominal branch: explicit 'Missing' category, then k-1 one-hot columns.
nominal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    (
        'encoder',
        OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first'),
    ),
])

# Numerical branches: differ only in the imputation step; all are scaled
# with RobustScaler.
num_zero_impute_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', RobustScaler()),
])
num_median_impute_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler()),
])
num_no_impute_transformer = Pipeline([
    ('scaler', RobustScaler()),
])

# Master ColumnTransformer. Every column is expected to be claimed by one of
# the routes, so the passthrough remainder should be empty.
column_routes = [
    ('ordinal', ordinal_transformer, ORDINAL_FEATURE),
    ('nominal', nominal_transformer, NOMINAL_FEATURES),
    ('num_impute_zero', num_zero_impute_transformer, NUM_IMPUTE_ZERO_FEATURES),
    ('num_impute_median', num_median_impute_transformer, NUM_IMPUTE_MEDIAN_FEATURES),
    ('num_no_impute', num_no_impute_transformer, NUM_NO_IMPUTE_FEATURES),
]
final_preprocessor = ColumnTransformer(transformers=column_routes, remainder='passthrough')
print("Final 'final_preprocessor' created.")

# --- 5. Apply Final Preprocessor & Run VIF (Final Validation) ---
print("\n--- Applying Final Preprocessor & Running FINAL VIF Check ---")

X_processed_final = final_preprocessor.fit_transform(X_final_final_features)

# Take the output names from the fitted ColumnTransformer rather than
# rebuilding them by hand: a hand-built list must mirror the transformer order
# (and an empty remainder) exactly, and an equal-length mismatch would
# silently mislabel columns. Names come back as "<transformer>__<column>", so
# the prefix is stripped to keep the plain column names.
final_feature_names = [
    name.split('__', 1)[-1]
    for name in final_preprocessor.get_feature_names_out()
]
X_processed_final_df = pd.DataFrame(
    X_processed_final,
    columns=final_feature_names,
    index=X_final_final_features.index
)
print(f"Final processed data shape: {X_processed_final_df.shape}") # Should be 46 columns

# --- 6. Final VIF Calculation ---
print("\n--- Final VIF Calculation (Validation) ---")
def calculate_vif(df):
    """Compute the Variance Inflation Factor (VIF) of every column in ``df``.

    A column of ones (intercept) is appended to the design matrix before the
    per-feature calls to ``variance_inflation_factor``; only the original
    columns are scored, never the intercept itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix to score. It is converted to a float array, so all
        columns must be numeric.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``feature`` and ``VIF``, sorted by ``VIF`` in descending
        order. The input frame is not modified.
    """
    # Build the design matrix once, outside the loop. Appending the intercept
    # as an array column also avoids clobbering an input column that happens
    # to be named 'const'.
    design = np.column_stack([df.to_numpy(dtype=float), np.ones(len(df))])
    vif_data = pd.DataFrame({
        "feature": df.columns,
        "VIF": [variance_inflation_factor(design, i) for i in range(df.shape[1])],
    })
    return vif_data.sort_values(by='VIF', ascending=False)

# Score the final feature set and show the highest-VIF rows
# (calculate_vif returns rows sorted from highest to lowest VIF).
final_vif_results = calculate_vif(X_processed_final_df)

print("\nFINAL VIF results (Top 10):", final_vif_results.iloc[:10], sep="\n")


--- PHASE 3: FINAL VIF PRUNING (Round 2) ---
Starting with 46 features from last step.
Dropping 8 more columns due to high VIF: ['tot_enq', 'enq_L12m', 'PL_enq_L12m', 'CC_enq_L12m', 'pct_PL_enq_L6m_of_L12m', 'pct_PL_enq_L6m_of_ever', 'pct_CC_enq_L6m_of_L12m', 'pct_CC_enq_L6m_of_ever']
Absolute Final Feature Set shape: (51336, 38)

--- Re-defining Final Feature Lists ---
Final Num (impute 0) features (7): ['CC_enq', 'CC_enq_L6m', 'PL_enq', 'PL_enq_L6m', 'time_since_recent_enq', 'enq_L6m', 'enq_L3m']
Final Num (no impute) features (24): ['AGE', 'NETMONTHLYINCOME', 'Time_With_Curr_Empr', 'pct_opened_TLs_L6m_of_L12m', 'CC_Flag', 'PL_Flag', 'HL_Flag', 'GL_Flag', 'Total_TL_opened_L6M', 'Tot_TL_closed_L6M', 'pct_tl_open_L6M', 'pct_tl_closed_L6M', 'Total_TL_opened_L12M', 'Tot_TL_closed_L12M', 'pct_tl_open_L12M', 'pct_tl_closed_L12M', 'Tot_Missed_Pmnt', 'Auto_TL', 'CC_TL', 'Consumer_TL', 'Gold_TL', 'Home_TL', 'PL_TL', 'Other_TL']
Total features categorized: 38 / 38

--- Re-building Final Prepr

## Phase 3 Summary: Feature Engineering & Selection

Our primary goal in this phase was to operationalize our EDA findings by building a robust preprocessing pipeline and, most importantly, rigorously validating and resolving **Finding 2 (Multicollinearity)**.

### 1. Building the `ColumnTransformer` (Validating Finding 3)

First, we translated all our rules from Phase 2 into a `scikit-learn` `ColumnTransformer`:

* **Ordinal Features (Finding 3):** We successfully implemented our custom, business-logic mapping for `EDUCATION` (e.g., `'SSC'`: 1, `'12TH'`: 2, ... `'PROFESSIONAL'`: 6) using an `OrdinalEncoder`.
* **Nominal Features (Finding 3):** We configured a `OneHotEncoder` for our 4 nominal features. We fixed a critical "Dummy Variable Trap" by setting `drop='first'`, which is essential for VIF analysis and modeling.
* **Numerical Features (EDA Rules):** We built distinct pipelines to handle all our numerical imputation rules from Phase 2:
    * **Impute with 0:** All 11 "enquiry" features (where `NaN` means "zero").
    * **Impute with Median:** The 2 "Age" features with minor missingness.
    * **Scale:** Applied `RobustScaler` to *all* numerical features to protect our model from the extreme outliers (like in `NETMONTHLYINCOME`) we identified in our EDA.

### 2. Iterative VIF Analysis (Resolving Finding 2)

This was the core task of Phase 3 and required a rigorous, multi-step process to reduce multicollinearity to an acceptable level.

* **Round 1: Identifying "Total" Redundancy**
    * Our first VIF run (after fixing the dummy trap) perfectly validated **Finding 2**. We saw `inf` and multi-million VIFs for all the "total" and "status" columns (e.g., `Total_TL`, `Tot_Active_TL`, `Secured_TL`, `pct_active_tl`).
    * **Action:** We **dropped 8 composite/total columns**, adhering to our project rule to "keep only the most granular features" (like `Auto_TL`, `CC_TL`, `Home_TL`, etc.).

* **Round 2: Identifying "Enquiry" Redundancy**
    * With the main problem fixed, a second VIF run revealed a more subtle multicollinearity (VIFs ~25) within the "enquiry" features.
    * The logic was the same: features like `enq_L12m` were redundant with `enq_L6m`, and `tot_enq` was a redundant composite.
    * **Action:** We **dropped 8 more redundant "enquiry" columns**, keeping only the most granular and recent time windows (e.g., `enq_L6m`, `enq_L3m`, `PL_enq_L6m`).

### 3. Final Outcome & Key Deliverables

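* **Clean VIF:** Our final VIF report is much cleaner. The highest remaining value is ~10.3, which sits right at the commonly used rule-of-thumb cutoff of 10, so the severe multicollinearity seen in Rounds 1 and 2 has been removed; the few features near that cutoff are worth re-checking during modeling.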
* **Final Feature Set:** We have successfully engineered our final, model-ready feature set. We have rigorously pruned our data from 58 initial features (post-leakage; 54 after Phase 2 pruning, as loaded at the top of this notebook) down to **38 high-quality, granular, and far less redundant features** (54 − 8 − 8 = 38).
* **Key Asset:** The main deliverable of this phase is our `final_preprocessor` object. This `ColumnTransformer` contains all our validated rules for imputation, scaling, and encoding, ready to be the first step in our modeling pipeline.