## 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 100)

print(" Libraries imported successfully!")

 Libraries imported successfully!


## 2. Load Cleaned Data

In [2]:
# Load cleaned data
print(" Loading cleaned data...")

train_df = pd.read_csv("../data/processed/train_cleaned.csv")
test_df = pd.read_csv("../data/processed/test_cleaned.csv")

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

# Separate features and target
X_train = train_df.drop(['TARGET', 'SK_ID_CURR'], axis=1)
y_train = train_df['TARGET']
X_test = test_df.drop(['SK_ID_CURR'], axis=1)

print(f"\nFeatures shape: {X_train.shape}")
print(f"Target distribution:\n{y_train.value_counts(normalize=True)}")

 Loading cleaned data...
Train shape: (307511, 495)
Test shape: (48744, 494)

Features shape: (307511, 493)
Target distribution:
TARGET
0    0.919271
1    0.080729
Name: proportion, dtype: float64


## 3. Create Domain-Specific Features

### 3.1 Credit Utilization Features

In [3]:
def create_credit_features(df):
    """Create credit-related features"""
    df_new = df.copy()
    
    # Credit to Annuity ratio
    df_new['CREDIT_TO_ANNUITY_RATIO'] = df_new['AMT_CREDIT'] / (df_new['AMT_ANNUITY'] + 1)
    
    # Credit to Goods Price ratio
    df_new['CREDIT_TO_GOODS_RATIO'] = df_new['AMT_CREDIT'] / (df_new['AMT_GOODS_PRICE'] + 1)
    
    # Annuity to Income ratio
    df_new['ANNUITY_TO_INCOME_RATIO'] = df_new['AMT_ANNUITY'] / (df_new['AMT_INCOME_TOTAL'] + 1)
    
    # Credit to Income ratio
    df_new['CREDIT_TO_INCOME_RATIO'] = df_new['AMT_CREDIT'] / (df_new['AMT_INCOME_TOTAL'] + 1)
    
    # Income per family member
    df_new['INCOME_PER_PERSON'] = df_new['AMT_INCOME_TOTAL'] / (df_new['CNT_FAM_MEMBERS'] + 1)
    
    return df_new

print(" Creating credit features...")
X_train = create_credit_features(X_train)
X_test = create_credit_features(X_test)

print(f" Credit features created. New shape: {X_train.shape}")

 Creating credit features...
 Credit features created. New shape: (307511, 498)


### 3.2 Time-based Features

In [4]:
def create_time_features(df):
    """Create time-related features"""
    df_new = df.copy()
    
    # Age in years
    df_new['AGE_YEARS'] = -df_new['DAYS_BIRTH'] / 365
    
    # Employment years
    df_new['EMPLOYMENT_YEARS'] = -df_new['DAYS_EMPLOYED'] / 365
    
    # Registration years
    df_new['REGISTRATION_YEARS'] = -df_new['DAYS_REGISTRATION'] / 365
    
    # ID publish years ago
    df_new['ID_PUBLISH_YEARS'] = -df_new['DAYS_ID_PUBLISH'] / 365
    
    # Employment to age ratio
    df_new['EMPLOYMENT_TO_AGE_RATIO'] = df_new['EMPLOYMENT_YEARS'] / (df_new['AGE_YEARS'] + 1)
    
    return df_new

print(" Creating time-based features...")
X_train = create_time_features(X_train)
X_test = create_time_features(X_test)

print(f" Time features created. New shape: {X_train.shape}")

 Creating time-based features...
 Time features created. New shape: (307511, 503)


### 3.3 Aggregated Bureau Features

In [5]:
def create_bureau_features(df):
    """Create features from bureau aggregations"""
    df_new = df.copy()
    
    # Check if bureau columns exist
    bureau_cols = [col for col in df.columns if 'BUREAU' in col]
    
    if len(bureau_cols) > 0:
        # Average bureau credit
        credit_cols = [col for col in bureau_cols if 'AMT_CREDIT' in col and 'mean' in col]
        if len(credit_cols) > 0:
            df_new['BUREAU_AVG_CREDIT'] = df_new[credit_cols].mean(axis=1)
        
        # Bureau credit count
        count_cols = [col for col in bureau_cols if 'count' in col]
        if len(count_cols) > 0:
            df_new['BUREAU_TOTAL_COUNT'] = df_new[count_cols].sum(axis=1)
    
    return df_new

print("ðŸ”„ Creating bureau features...")
X_train = create_bureau_features(X_train)
X_test = create_bureau_features(X_test)

print(f" Bureau features created. New shape: {X_train.shape}")

ðŸ”„ Creating bureau features...
 Bureau features created. New shape: (307511, 505)


### 3.4 Document and External Source Features

In [6]:
def create_document_features(df):
    """Create features from documents and external sources"""
    df_new = df.copy()
    
    # Total documents provided
    doc_cols = [col for col in df.columns if 'FLAG_DOCUMENT' in col]
    if len(doc_cols) > 0:
        df_new['TOTAL_DOCUMENTS'] = df_new[doc_cols].sum(axis=1)
    
    # Average external source score
    ext_cols = [col for col in df.columns if 'EXT_SOURCE' in col]
    if len(ext_cols) > 0:
        df_new['EXT_SOURCE_MEAN'] = df_new[ext_cols].mean(axis=1)
        df_new['EXT_SOURCE_STD'] = df_new[ext_cols].std(axis=1)
        df_new['EXT_SOURCE_MIN'] = df_new[ext_cols].min(axis=1)
        df_new['EXT_SOURCE_MAX'] = df_new[ext_cols].max(axis=1)
    
    return df_new

print(" Creating document and external source features...")
X_train = create_document_features(X_train)
X_test = create_document_features(X_test)

print(f" Document features created. New shape: {X_train.shape}")

 Creating document and external source features...
 Document features created. New shape: (307511, 510)


## 4. Handle Categorical Variables

In [7]:
# Identify categorical columns
cat_cols = X_train.select_dtypes(include=['object']).columns.tolist()

print(f"Categorical columns: {len(cat_cols)}")
print(cat_cols)

# One-hot encode categorical variables
if len(cat_cols) > 0:
    print("\n One-hot encoding categorical variables...")
    X_train = pd.get_dummies(X_train, columns=cat_cols, drop_first=True)
    X_test = pd.get_dummies(X_test, columns=cat_cols, drop_first=True)
    
    # Align columns
    X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)
    
    print(f" Encoding complete. New shape: {X_train.shape}")

Categorical columns: 16
['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE']

 One-hot encoding categorical variables...
 Encoding complete. New shape: (307511, 618)


## 5. Feature Selection

In [8]:
# Remove features with zero variance
print(" Removing zero-variance features...")

variance = X_train.var()
zero_var_cols = variance[variance == 0].index.tolist()

print(f"   Features with zero variance: {len(zero_var_cols)}")

if len(zero_var_cols) > 0:
    X_train = X_train.drop(columns=zero_var_cols)
    X_test = X_test.drop(columns=zero_var_cols)

print(f" Shape after removing zero variance: {X_train.shape}")

 Removing zero-variance features...
   Features with zero variance: 2
 Shape after removing zero variance: (307511, 616)


In [9]:
# Feature importance using SelectKBest
print("\n Selecting top features using ANOVA F-test...")

# Select top 200 features
k_best = min(200, X_train.shape[1])
selector = SelectKBest(f_classif, k=k_best)

X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

# Get selected feature names
selected_features = X_train.columns[selector.get_support()].tolist()

print(f" Selected {len(selected_features)} features")
print(f"\nTop 20 selected features:")
print(selected_features[:20])


 Selecting top features using ANOVA F-test...
 Selected 200 features

Top 20 selected features:
['AMT_CREDIT', 'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_PHONE', 'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 'HOUR_APPR_PROCESS_START', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'APARTMENTS_AVG']


In [10]:
# Create DataFrames with selected features
X_train_final = pd.DataFrame(X_train_selected, columns=selected_features)
X_test_final = pd.DataFrame(X_test_selected, columns=selected_features)

print(f"Final training shape: {X_train_final.shape}")
print(f"Final test shape: {X_test_final.shape}")

Final training shape: (307511, 200)
Final test shape: (48744, 200)


## 6. Feature Scaling

In [11]:
# Use RobustScaler (better for outliers)
print(" Scaling features using RobustScaler...")

scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train_final)
X_test_scaled = scaler.transform(X_test_final)

# Convert back to DataFrame
X_train_scaled = pd.DataFrame(X_train_scaled, columns=selected_features)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=selected_features)

print(f" Scaling complete!")
print(f"\nScaled data statistics:")
print(X_train_scaled.describe())

 Scaling features using RobustScaler...
 Scaling complete!

Scaled data statistics:
          AMT_CREDIT  AMT_GOODS_PRICE  REGION_POPULATION_RELATIVE  \
count  307511.000000    307511.000000               307511.000000   
mean        0.158721         0.200264                    0.108169   
std         0.747221         0.837390                    0.741345   
min        -0.869825        -0.928571                   -0.994801   
25%        -0.452114        -0.479592                   -0.474031   
50%         0.000000         0.000000                    0.000000   
75%         0.547886         0.520408                    0.525969   
max         6.565430         8.163265                    2.876025   

          DAYS_BIRTH  DAYS_EMPLOYED  DAYS_REGISTRATION  DAYS_ID_PUBLISH  \
count  307511.000000  307511.000000      307511.000000    307511.000000   
mean       -0.039482      -0.330381          -0.088147         0.100736   
std         0.600356       1.169236           0.644097         0.5852

## 7. Handle Class Imbalance (SMOTE)

In [12]:
print(" Applying SMOTE to balance classes...")
print(f"\nBefore SMOTE:")
print(y_train.value_counts())

# Apply SMOTE
smote = SMOTE(sampling_strategy=0.5, random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

print(f"\nAfter SMOTE:")
print(pd.Series(y_train_resampled).value_counts())
print(f"\n Resampled training shape: {X_train_resampled.shape}")

 Applying SMOTE to balance classes...

Before SMOTE:
TARGET
0    282686
1     24825
Name: count, dtype: int64

After SMOTE:
TARGET
0    282686
1    141343
Name: count, dtype: int64

 Resampled training shape: (424029, 200)


## 8. Save Engineered Features

In [13]:
import os

# Create features directory
features_dir = "../data/features"
os.makedirs(features_dir, exist_ok=True)

print(" Saving engineered features...")

# Save without SMOTE (for testing different strategies)
pd.DataFrame(X_train_scaled).to_csv(f"{features_dir}/X_train_scaled.csv", index=False)
pd.DataFrame(X_test_scaled).to_csv(f"{features_dir}/X_test_scaled.csv", index=False)
pd.Series(y_train).to_csv(f"{features_dir}/y_train.csv", index=False, header=['TARGET'])

# Save with SMOTE
pd.DataFrame(X_train_resampled, columns=selected_features).to_csv(f"{features_dir}/X_train_smote.csv", index=False)
pd.Series(y_train_resampled).to_csv(f"{features_dir}/y_train_smote.csv", index=False, header=['TARGET'])

# Save feature names
pd.DataFrame({'feature': selected_features}).to_csv(f"{features_dir}/feature_names.csv", index=False)

print(" Features saved successfully!")
print(f"\nSaved files:")
for f in os.listdir(features_dir):
    print(f"   - {f}")

 Saving engineered features...
 Features saved successfully!

Saved files:
   - feature_names.csv
   - X_test_scaled.csv
   - X_train_scaled.csv
   - X_train_smote.csv
   - y_train.csv
   - y_train_smote.csv


## 9. Summary

In [16]:
print("="*60)
print(" FEATURE ENGINEERING SUMMARY")
print("="*60)

print(f"\n Feature Creation:")
print(f"   Original features: {train_df.shape[1] - 2}")
print(f"   After engineering: {X_train_final.shape[1]}")
print(f"   Selected features: {len(selected_features)}")

print(f"\n Class Balance:")
print(f"   Original: {y_train.value_counts().to_dict()}")
print(f"   After SMOTE: {pd.Series(y_train_resampled).value_counts().to_dict()}")

print(f"\n Output Files:")
print(f"   - X_train_scaled.csv ({X_train_scaled.shape})")
print(f"   - X_train_smote.csv ({X_train_resampled.shape})")
print(f"   - X_test_scaled.csv ({X_test_scaled.shape})")
print(f"   - y_train.csv, y_train_smote.csv")
print(f"   - feature_names.csv")




 FEATURE ENGINEERING SUMMARY

 Feature Creation:
   Original features: 493
   After engineering: 200
   Selected features: 200

 Class Balance:
   Original: {0: 282686, 1: 24825}
   After SMOTE: {0: 282686, 1: 141343}

 Output Files:
   - X_train_scaled.csv ((307511, 200))
   - X_train_smote.csv ((424029, 200))
   - X_test_scaled.csv ((48744, 200))
   - y_train.csv, y_train_smote.csv
   - feature_names.csv
