In [None]:
# Summary statistics: recap table sizes, per-source feature counts and exported files
summary_rule = "=" * 80
print("\n" + summary_rule)
print("FEATURE ENGINEERING SUMMARY")
print(summary_rule)

# Starting table
print(f"\nOriginal base table rows: {base.shape[0]}")
print(f"Original base table features: {base.shape[1]}")

# Encoded dataset compared with the base table
encoded_feature_count = engineered_df_encoded.shape[1]
print(f"\nEngineered dataset rows: {engineered_df_encoded.shape[0]}")
print(f"Engineered dataset features: {encoded_feature_count}")
print(f"Features added: {encoded_feature_count - base.shape[1]}")

# Correlation-filtered dataset compared with the encoded dataset
filtered_feature_count = engineered_df_filtered.shape[1]
print(f"\nFiltered dataset features: {filtered_feature_count}")
print(f"Features removed (due to high correlation): {encoded_feature_count - filtered_feature_count}")

print("\nFeature sources:")
print(f"  - Base table: {base.shape[1]} features")
# (label, feature frame, wording used after the count)
feature_sources = (
    ("Application history (applprev)", applprev_features, "engineered features"),
    ("Credit bureau", creditbureau_features, "engineered features"),
    ("Person data", person_features, "features"),
    ("Tax registry", tax_features, "features"),
    ("Interaction features", interaction_features, "features"),
)
for source_label, source_frame, source_suffix in feature_sources:
    print(f"  - {source_label}: {len(source_frame.columns)} {source_suffix}")

print("\nOutput files created:")
output_descriptions = (
    "  1. test_engineered_features_full.csv - All engineered features with encoding",
    "  2. test_engineered_features_filtered.csv - Features after correlation filtering",
    "  3. test_engineered_features_scaled.csv - Scaled features for ML models",
    "  4. test_engineered_features_metadata.csv - Feature metadata and missing value info",
)
for output_description in output_descriptions:
    print(output_description)

In [None]:
# Export different versions of the engineered features

# Destination paths; every file is written next to the input data in test_folder
output_file_1 = f'{test_folder}/test_engineered_features_full.csv'
output_file_2 = f'{test_folder}/test_engineered_features_filtered.csv'
output_file_3 = f'{test_folder}/test_engineered_features_scaled.csv'
output_file_4 = f'{test_folder}/test_engineered_features_metadata.csv'

# 1. Full engineered dataset with encoded categorical variables
engineered_df_encoded.to_csv(output_file_1, index=False)
print(f"✓ Exported full engineered features to: {output_file_1}")

# 2. Filtered dataset (removed highly correlated features)
engineered_df_filtered.to_csv(output_file_2, index=False)
print(f"✓ Exported filtered engineered features to: {output_file_2}")

# 3. Scaled dataset (for models that benefit from scaling)
engineered_df_scaled.to_csv(output_file_3, index=False)
print(f"✓ Exported scaled engineered features to: {output_file_3}")

# 4. Feature metadata: one row per column of the encoded dataset
missing_per_feature = engineered_df_encoded.isnull().sum()
feature_metadata = pd.DataFrame({
    'Feature': engineered_df_encoded.columns,
    'Data_Type': engineered_df_encoded.dtypes.values,
    'Missing_Count': missing_per_feature.values,
    'Missing_Percentage': (missing_per_feature / len(engineered_df_encoded) * 100).values
})

feature_metadata.to_csv(output_file_4, index=False)
print(f"✓ Exported feature metadata to: {output_file_4}")

## 13. Export Engineered Features

In [None]:
# Remove highly correlated features to reduce multicollinearity
def remove_highly_correlated_features(df, corr_matrix, threshold=0.95):
    """Drop the later feature of every highly correlated pair.

    The columns of ``corr_matrix`` are scanned left to right. Each column that
    is still kept causes every later column whose absolute correlation with it
    exceeds ``threshold`` to be dropped. A column that has already been dropped
    is not used as a reference, so a feature is removed only when it is
    redundant with a feature that remains in the output.

    Args:
        df: DataFrame to filter; it is not modified.
        corr_matrix: Square correlation matrix whose columns name the features.
        threshold: Absolute correlation above which two features count as
            highly correlated.

    Returns:
        A new DataFrame without the dropped columns. Names selected for
        dropping that do not exist in ``df`` are ignored.
    """
    feature_names = list(corr_matrix.columns)
    # Work on a plain array: per-cell .iloc lookups in a Python double loop are slow.
    # NaN correlations compare as False and therefore never trigger a drop.
    abs_corr = np.abs(corr_matrix.to_numpy(dtype=float))
    to_drop = set()

    for i, name in enumerate(feature_names):
        if name in to_drop:
            # Already removed: it must not cause further removals.
            continue
        later = abs_corr[i, i + 1:]
        for offset in np.flatnonzero(later > threshold):
            # Drop the later feature of the pair to keep the earlier one
            to_drop.add(feature_names[i + 1 + offset])

    print(f"Removing {len(to_drop)} highly correlated features")
    df_filtered = df.drop(columns=list(to_drop), errors='ignore')
    return df_filtered

# Filter the encoded dataset with the correlation matrix computed from its numeric columns
engineered_df_filtered = remove_highly_correlated_features(
    df=engineered_df_encoded,
    corr_matrix=correlation_matrix,
    threshold=0.95,
)

# Report the effect of the filter on the column count
print(f"Dataset shape after removing highly correlated features: {engineered_df_filtered.shape}")
print(
    f"Original features: {len(engineered_df_encoded.columns)}, "
    f"Filtered features: {len(engineered_df_filtered.columns)}"
)

In [None]:
# Feature correlation analysis: only numeric columns take part
numeric_features_df = engineered_df_encoded.select_dtypes(include='number')

# Pairwise Pearson correlation between every pair of numeric columns
correlation_matrix = numeric_features_df.corr(method='pearson')

# Find highly correlated features
def find_highly_correlated_pairs(corr_matrix, threshold=0.95):
    """List feature pairs whose absolute correlation exceeds ``threshold``.

    Only cells above the diagonal are inspected, so each unordered pair is
    reported once, in row-major order of the matrix.

    Returns:
        A list of dicts with keys 'Feature 1', 'Feature 2' and 'Correlation'
        (the signed correlation value).
    """
    names = corr_matrix.columns
    values = corr_matrix.to_numpy()
    # Row/column positions of every cell strictly above the diagonal
    upper_rows, upper_cols = np.triu_indices(len(names), k=1)
    return [
        {
            'Feature 1': names[row],
            'Feature 2': names[col],
            'Correlation': values[row, col],
        }
        for row, col in zip(upper_rows.tolist(), upper_cols.tolist())
        if abs(values[row, col]) > threshold
    ]

highly_correlated = find_highly_correlated_pairs(correlation_matrix, threshold=0.95)
print(f"Found {len(highly_correlated)} highly correlated pairs (>0.95 correlation):")
if not highly_correlated:
    print("  No highly correlated pairs found")
else:
    # Preview at most the first 10 pairs to keep the output short
    for pair in highly_correlated[:10]:
        print(f"  {pair['Feature 1']} <-> {pair['Feature 2']}: {pair['Correlation']:.4f}")

## 12. Feature Selection - Correlation Analysis

In [None]:
# Feature scaling - apply StandardScaler to numeric features
numeric_cols_all = engineered_df_encoded.select_dtypes(include=[np.number]).columns

# case_id is the row key used by every merge in this notebook, not a feature.
# Standardising it would make the scaled export impossible to join back to the
# other outputs, so it is excluded from scaling and kept as-is.
cols_to_scale = numeric_cols_all.drop('case_id', errors='ignore')

# Create a separate scaled version for features that might need scaling
scaler = StandardScaler()
engineered_df_scaled = engineered_df_encoded.copy()

# Scale numeric features (skipped when there is nothing to scale, since the
# scaler cannot be fitted on zero columns)
if len(cols_to_scale) > 0:
    engineered_df_scaled[cols_to_scale] = scaler.fit_transform(engineered_df_encoded[cols_to_scale])

print(f"Features scaled using StandardScaler")
print(f"Scaled dataset shape: {engineered_df_scaled.shape}")
print(f"\nScaled features statistics (first 5 columns):")
print(engineered_df_scaled.iloc[:, :5].describe())

## 11. Feature Scaling and Normalization

In [None]:
# Identify categorical columns (object dtype)
categorical_cols = list(engineered_df.select_dtypes(include=['object']).columns)
print(f"Categorical columns to encode: {categorical_cols}")

# Label-encode each categorical column on a copy; the fitted encoders are kept
# in le_dict, keyed by column name
le_dict = {}
engineered_df_encoded = engineered_df.copy()

for col in categorical_cols:
    le = LabelEncoder()
    # Missing entries become their own 'Missing' category, and every value is
    # cast to str before the encoder sees it
    column_as_text = engineered_df_encoded[col].fillna('Missing').astype(str)
    engineered_df_encoded[col] = le.fit_transform(column_as_text)
    le_dict[col] = le
    print(f"Encoded {col}: {len(le.classes_)} unique values")

print(f"\nFinal engineered dataset shape: {engineered_df_encoded.shape}")
print("Data types after encoding:")
print(engineered_df_encoded.dtypes.value_counts())

## 10. Encode Categorical Variables

In [None]:
def handle_outliers_iqr(df, columns=None, multiplier=1.5):
    """Cap outliers with the IQR rule and return a capped copy of ``df``.

    For each column, values outside
    ``[Q1 - multiplier * IQR, Q3 + multiplier * IQR]`` are clipped to the
    nearest bound. Rows are never removed.

    Columns whose IQR is zero or undefined (for example sparse counts or 0/1
    flags where at least half the values are identical) are left untouched:
    both bounds would collapse to the same value and clipping would flatten
    the whole column to a constant.

    Args:
        df: Input DataFrame; it is not modified.
        columns: Columns to process. Defaults to all numeric columns of ``df``.
        multiplier: Width of the accepted range in IQR units.

    Returns:
        A copy of ``df`` with the selected columns capped.
    """
    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns

    df_clean = df.copy()

    for col in columns:
        q1 = df_clean[col].quantile(0.25)
        q3 = df_clean[col].quantile(0.75)
        iqr = q3 - q1

        # `not iqr > 0` is also True for NaN (all-missing column)
        if not iqr > 0:
            continue

        lower_bound = q1 - multiplier * iqr
        upper_bound = q3 + multiplier * iqr

        # Cap outliers instead of removing
        df_clean[col] = df_clean[col].clip(lower=lower_bound, upper=upper_bound)

    return df_clean

# Apply outlier handling to numeric features
numeric_cols = engineered_df.select_dtypes(include=[np.number]).columns

# case_id is the join key, not a measurement, so it must never be capped
outlier_cols = numeric_cols.drop('case_id', errors='ignore')

# handle_outliers_iqr returns the whole frame (including non-numeric columns).
# Select the treated columns so the right-hand side has exactly the columns
# named on the left; assigning a frame with a different number of columns to a
# column list fails or pairs columns by position.
engineered_df[outlier_cols] = handle_outliers_iqr(
    engineered_df, outlier_cols, multiplier=1.5
)[outlier_cols]

print("Outliers handled using IQR method (capped rather than removed)")
print(f"Dataset shape after outlier handling: {engineered_df.shape}")

## 9. Handle Outliers

In [None]:
# Analyze missing values in engineered dataset
analyze_missing_values(engineered_df, "Engineered Dataset")

# Fill missing values: columns are split by dtype because each group gets its
# own imputation strategy
numeric_features = engineered_df.select_dtypes(include='number').columns
categorical_features = engineered_df.select_dtypes(include='object').columns

# Numeric features: replace missing values with the column median
numeric_imputer = SimpleImputer(strategy='median')
imputed_numeric = numeric_imputer.fit_transform(engineered_df[numeric_features])
engineered_df[numeric_features] = imputed_numeric

# Categorical features: replace missing values with the most frequent value
if not categorical_features.empty:
    categorical_imputer = SimpleImputer(strategy='most_frequent')
    imputed_categorical = categorical_imputer.fit_transform(engineered_df[categorical_features])
    engineered_df[categorical_features] = imputed_categorical

print(f"After imputation - Missing values: {engineered_df.isna().sum().sum()}")
print(f"Engineered dataset shape after imputation: {engineered_df.shape}")

## 8. Handle Missing Values in Engineered Features

In [None]:
# Start with base table
engineered_df = base.copy()

# Merge applprev features
if len(applprev_features) > 0:
    # One feature row per base row, looked up by case_id; cases without any
    # previous application get 0 in every feature.
    applprev_features_reindex = applprev_features.reindex(engineered_df['case_id'], fill_value=0)
    # reindex labels the rows by case_id, whereas engineered_df keeps its own
    # row index. pd.concat(axis=1) aligns on index labels, so the feature block
    # must carry the same index to be attached row by row.
    applprev_features_reindex.index = engineered_df.index
    engineered_df = pd.concat([engineered_df, applprev_features_reindex], axis=1)
    print(f"After applprev merge: {engineered_df.shape}")

# Merge credit bureau features
if len(creditbureau_features) > 0:
    cb_features_reindex = creditbureau_features.reindex(engineered_df['case_id'], fill_value=0)
    # Same index realignment as for the applprev block above
    cb_features_reindex.index = engineered_df.index
    engineered_df = pd.concat([engineered_df, cb_features_reindex], axis=1)
    print(f"After creditbureau merge: {engineered_df.shape}")

# Merge person features (indexed by case_id, matched against the case_id column)
if len(person_features) > 0:
    engineered_df = engineered_df.join(person_features, on='case_id', how='left')
    print(f"After person merge: {engineered_df.shape}")

# Merge tax features
if len(tax_features) > 0:
    engineered_df = engineered_df.join(tax_features, on='case_id', how='left')
    print(f"After tax merge: {engineered_df.shape}")

# Merge interaction features
if len(interaction_features) > 0:
    engineered_df = engineered_df.join(interaction_features, on='case_id', how='left')
    print(f"After interaction features merge: {engineered_df.shape}")

# Every merge is a per-case lookup, so the row count must match the base table
assert len(engineered_df) == len(base), "feature merge changed the number of rows"

print(f"\nFinal engineered dataset shape: {engineered_df.shape}")
print(f"Columns: {engineered_df.shape[1]}")
print(f"Sample of engineered dataset:")
print(engineered_df.head())

## 7. Merge All Features with Base Table

In [None]:
# Create interaction features, indexed by the case_id values of the base table
interaction_features = pd.DataFrame(index=base['case_id'])

# Ratios need both the applprev and the credit bureau feature tables
has_both_sources = len(applprev_features) > 0 and len(creditbureau_features) > 0
if has_both_sources:
    # Align both tables to the base case_ids; cases absent from a table get 0
    applprev_aligned = applprev_features.reindex(base['case_id'], fill_value=0)
    creditbureau_aligned = creditbureau_features.reindex(base['case_id'], fill_value=0)

    # (output column, applprev numerator, credit bureau denominator)
    ratio_specs = (
        ('applprev_cb_count_ratio', 'applprev_count', 'cb_num_records'),
        ('credit_to_outstanding_ratio', 'applprev_total_credit_amount', 'cb_total_outstanding'),
    )
    for ratio_name, numerator_col, denominator_col in ratio_specs:
        # A ratio is only built when both of its inputs were engineered
        if numerator_col in applprev_aligned.columns and denominator_col in creditbureau_aligned.columns:
            # 1 is added to the denominator, so a denominator of 0 divides by 1
            interaction_features[ratio_name] = (
                applprev_aligned[numerator_col] / (creditbureau_aligned[denominator_col] + 1)
            )

print(f"Interaction features shape: {interaction_features.shape}")
print("Interaction features:")
print(interaction_features.head())

## 6. Create Interaction Features

Generate interaction features to capture non-linear relationships.

In [None]:
# Process tax registry table; features are indexed by the base table's case_id
tax_features = pd.DataFrame(index=base['case_id'])

if tax_registry is None:
    print("No tax registry table available")
else:
    # One row per case_id: the first record encountered is kept
    tax_unique = tax_registry.drop_duplicates(subset=['case_id'], keep='first')
    tax_by_case = tax_unique.set_index('case_id')

    # Every numeric column, plus at most the first 3 object-typed columns
    numeric_tax = tax_unique.select_dtypes(include=[np.number]).columns.tolist()
    categorical_tax = tax_unique.select_dtypes(include=['object']).columns.tolist()

    for col in numeric_tax + categorical_tax[:3]:
        if col == 'case_id':
            continue
        # Assignment aligns on case_id; base cases without a tax record get NaN
        tax_features[f'tax_{col}'] = tax_by_case[col]

    print(f"Tax registry features shape: {tax_features.shape}")
    print("Tax registry features sample:")
    print(tax_features.head())

In [None]:
# Process person table; features are indexed by the base table's case_id
person_features = pd.DataFrame(index=base['case_id'])

if person is None:
    print("No person table available")
else:
    # Keep one person record per case_id (the first one encountered)
    person_unique = person.drop_duplicates(subset=['case_id'], keep='first')
    person_by_case = person_unique.set_index('case_id')

    # Every numeric column, plus at most the first 5 object-typed columns
    # (the cap limits how many categorical columns are carried forward)
    numeric_person = person_unique.select_dtypes(include=[np.number]).columns.tolist()
    categorical_person = person_unique.select_dtypes(include=['object']).columns.tolist()

    for col in numeric_person + categorical_person[:5]:
        if col == 'case_id':
            continue
        # Assignment aligns on case_id; base cases without a person record get NaN
        person_features[f'person_{col}'] = person_by_case[col]

    print(f"Person features shape: {person_features.shape}")
    print("Person features sample:")
    print(person_features.head())

## 5. Create Features from Person and Tax Registry Tables

In [None]:
# Additional credit bureau features - counts and key metrics
creditbureau_features = pd.DataFrame(index=creditbureau['case_id'].unique())

# Group once and reuse the grouping for every aggregate below
cb_by_case = creditbureau.groupby('case_id')

# Count number of credit bureau records
creditbureau_features['cb_num_records'] = cb_by_case.size()

# Count contracts by status
if 'contractst_545M' in creditbureau.columns and creditbureau['contractst_545M'].notna().any():
    # size().unstack() gives one row per case_id and one column per status.
    # (A per-group apply that returns a Series yields a stacked MultiIndex
    # Series instead, which cannot be joined as one column per status.)
    contract_counts = (
        creditbureau.groupby(['case_id', 'contractst_545M'])
        .size()
        .unstack(fill_value=0)
    )
    contract_counts.columns = [
        f'cb_contractst_{status}_count' for status in contract_counts.columns
    ]
    # Rows with a missing status are not counted, so a case whose statuses are
    # all missing has no row here; give it explicit zero counts.
    contract_counts = contract_counts.reindex(creditbureau_features.index, fill_value=0)
    creditbureau_features = creditbureau_features.join(contract_counts)

# Overdue amount features
if 'overdueamount_31A' in creditbureau.columns:
    overdue_by_case = cb_by_case['overdueamount_31A']
    creditbureau_features['cb_total_overdue_amount'] = overdue_by_case.sum()
    creditbureau_features['cb_max_overdue_amount'] = overdue_by_case.max()
    creditbureau_features['cb_avg_overdue_amount'] = overdue_by_case.mean()

# Outstanding debt features
if 'outstandingamount_354A' in creditbureau.columns:
    outstanding_by_case = cb_by_case['outstandingamount_354A']
    creditbureau_features['cb_total_outstanding'] = outstanding_by_case.sum()
    creditbureau_features['cb_max_outstanding'] = outstanding_by_case.max()

# DPD (Days Past Due) features
if 'dpdmax_139P' in creditbureau.columns:
    dpd_by_case = cb_by_case['dpdmax_139P']
    creditbureau_features['cb_max_dpd'] = dpd_by_case.max()
    creditbureau_features['cb_avg_dpd'] = dpd_by_case.mean()

print(f"Credit bureau engineered features shape: {creditbureau_features.shape}")
print(f"Sample engineered features:")
print(creditbureau_features.head())

In [None]:
# Select numeric columns from credit bureau for aggregation
numeric_cols_cb = creditbureau.select_dtypes(include=[np.number]).columns.tolist()
print(f"Numeric columns in credit bureau (count: {len(numeric_cols_cb)}):")
print(numeric_cols_cb[:20])

# case_id is the grouping key and is numeric itself; aggregating it would only
# add meaningless columns such as cb_case_id_sum, so it is left out.
agg_cols_cb = [col for col in numeric_cols_cb if col != 'case_id']

# Aggregate features by case_id; statistics that are undefined for a group
# (all values missing) are replaced by 0
creditbureau_agg = creditbureau.groupby('case_id')[agg_cols_cb].agg([
    'sum', 'mean', 'max', 'min', 'count'
]).fillna(0)

# Flatten the (column, statistic) MultiIndex into names like cb_<column>_<statistic>
creditbureau_agg.columns = ['cb_' + '_'.join(col).strip() for col in creditbureau_agg.columns]

print(f"\nAggregated credit bureau features shape: {creditbureau_agg.shape}")
print(f"Sample aggregated features:")
print(creditbureau_agg.head())

## 4. Create Numerical Features from Credit Bureau Table

Engineer aggregated features from the credit bureau data.

In [None]:
# Additional application/previous features - counts and ratios
applprev_by_case = applprev.groupby('case_id')

# Number of previous applications = number of rows per case_id. size() counts
# every row and does not depend on status_219L being present or non-null.
applprev_features = applprev_by_case.size().to_frame('applprev_count')

# Count by status if available
if 'status_219L' in applprev.columns and applprev['status_219L'].notna().any():
    # size().unstack() gives one row per case_id and one column per status.
    # (A per-group apply that returns a Series yields a stacked MultiIndex
    # Series instead; joining that would turn applprev_features into a
    # multi-row-per-case table.)
    status_counts = (
        applprev.groupby(['case_id', 'status_219L'])
        .size()
        .unstack(fill_value=0)
    )
    status_counts.columns = [
        f'applprev_status_{status}_count' for status in status_counts.columns
    ]
    # Rows with a missing status are not counted, so a case whose statuses are
    # all missing has no row here; give it explicit zero counts.
    status_counts = status_counts.reindex(applprev_features.index, fill_value=0)
    applprev_features = applprev_features.join(status_counts)

# Add amount-based features if available
if 'credamount_590A' in applprev.columns:
    credit_by_case = applprev_by_case['credamount_590A']
    applprev_features['applprev_total_credit_amount'] = credit_by_case.sum()
    applprev_features['applprev_avg_credit_amount'] = credit_by_case.mean()
    applprev_features['applprev_max_credit_amount'] = credit_by_case.max()

if 'annuity_853A' in applprev.columns:
    annuity_by_case = applprev_by_case['annuity_853A']
    applprev_features['applprev_total_annuity'] = annuity_by_case.sum()
    applprev_features['applprev_avg_annuity'] = annuity_by_case.mean()

print(f"Applprev engineered features shape: {applprev_features.shape}")
print(f"Sample engineered features:")
print(applprev_features.head())

In [None]:
# Select numeric columns from applprev for aggregation
numeric_cols_applprev = applprev.select_dtypes(include=[np.number]).columns.tolist()
print(f"Numeric columns in applprev (count: {len(numeric_cols_applprev)}):")
print(numeric_cols_applprev[:20])  # Show first 20

# case_id is the grouping key and is numeric itself; aggregating it would only
# add meaningless columns such as case_id_sum, so it is left out.
agg_cols_applprev = [col for col in numeric_cols_applprev if col != 'case_id']

# Aggregate features by case_id; undefined statistics (e.g. std of a single
# record, or groups with only missing values) are replaced by 0
applprev_agg = applprev.groupby('case_id')[agg_cols_applprev].agg([
    'sum', 'mean', 'max', 'min', 'std', 'count'
]).fillna(0)

# Flatten the (column, statistic) MultiIndex into names like <column>_<statistic>
applprev_agg.columns = ['_'.join(col).strip() for col in applprev_agg.columns]

print(f"\nAggregated applprev features shape: {applprev_agg.shape}")
print(f"Sample aggregated features:")
print(applprev_agg.head())

## 3. Create Numerical Features from Application/Previous Table

Engineer aggregated numerical features from the applprev table.

In [None]:
def analyze_missing_values(df, name=""):
    """Print and return a per-column report of missing values.

    Args:
        df: DataFrame to inspect.
        name: Label used in the printed heading.

    Returns:
        DataFrame with columns 'Column', 'Missing_Count' and
        'Missing_Percentage', sorted by percentage (highest first) and limited
        to columns that have at least one missing value.
    """
    null_counts = df.isnull().sum()

    report = pd.DataFrame({
        'Column': null_counts.index,
        'Missing_Count': null_counts.values,
        'Missing_Percentage': ((null_counts / len(df)) * 100).values,
    })
    report = report.sort_values('Missing_Percentage', ascending=False)
    # Keep only the columns that actually have gaps
    report = report.loc[report['Missing_Count'] > 0]

    if report.empty:
        print(f"\n{name} - No missing values found!")
    else:
        print(f"\n{name} - Missing Values Summary:")
        print(report.to_string(index=False))

    return report

# Analyze missing values in each table
for table, table_label in (
    (base, "Base Table"),
    (applprev, "Application/Previous Table"),
    (creditbureau, "Credit Bureau Table"),
):
    analyze_missing_values(table, table_label)

# The person table is optional: it is None when no person files were found
if person is not None:
    analyze_missing_values(person, "Person Table")

## 2. Handle Missing Values

Analyze and handle missing values in the dataset using appropriate techniques.

In [None]:
# Load other important tables: every file whose name starts with 'test_person'
person_files = sorted(f for f in os.listdir(test_folder) if f.startswith('test_person'))
person_tables = []
for file in person_files:
    df = pd.read_csv(f'{test_folder}/{file}')
    person_tables.append(df)
    print(f"Loaded {file}: shape {df.shape}")

# None marks "no person files found"; later cells check for it
person = None if not person_tables else pd.concat(person_tables, ignore_index=False)
print(f"Combined person shape: {'None' if person is None else person.shape}")

# Load tax registry tables: every file whose name starts with 'test_tax_registry'
tax_registry_files = sorted(f for f in os.listdir(test_folder) if f.startswith('test_tax_registry'))
tax_registry_tables = []
for file in tax_registry_files:
    df = pd.read_csv(f'{test_folder}/{file}')
    tax_registry_tables.append(df)
    print(f"Loaded {file}: shape {df.shape}")

# None marks "no tax registry files found"
tax_registry = None if not tax_registry_tables else pd.concat(tax_registry_tables, ignore_index=False)
print(f"Combined tax registry shape: {'None' if tax_registry is None else tax_registry.shape}")

In [None]:
# Load credit bureau tables: every file whose name starts with 'test_credit_bureau'
creditbureau_files = sorted(
    f for f in os.listdir(test_folder) if f.startswith('test_credit_bureau')
)
print(f"Found {len(creditbureau_files)} credit bureau files")

creditbureau_tables = []
for file in creditbureau_files:
    df = pd.read_csv(f'{test_folder}/{file}')
    creditbureau_tables.append(df)
    print(f"  {file}: shape {df.shape}")

# Stack all credit bureau tables row-wise; each file's own row labels are kept
creditbureau = pd.concat(creditbureau_tables, ignore_index=False)
print(f"\nCombined credit bureau shape: {creditbureau.shape}")
print(f"Credit bureau columns (first 10): {list(creditbureau.columns)[:10]}")

In [None]:
# Load application/previous credit tables: every file whose name starts with 'test_applprev'
applprev_files = sorted(
    f for f in os.listdir(test_folder) if f.startswith('test_applprev')
)
print(f"\nFound {len(applprev_files)} applprev files: {applprev_files}")

applprev_tables = []
for file in applprev_files:
    df = pd.read_csv(f'{test_folder}/{file}')
    applprev_tables.append(df)
    print(f"  {file}: shape {df.shape}")

# Stack all applprev tables row-wise; each file's own row labels are kept
applprev = pd.concat(applprev_tables, ignore_index=False)
print(f"\nCombined applprev shape: {applprev.shape}")
print(f"Applprev columns (first 10): {list(applprev.columns)[:10]}")

In [None]:
# Load base table
base = pd.read_csv(f'{test_folder}/test_base.csv')
print("Base table shape:", base.shape)
print("\nBase table columns:")
print(base.columns.tolist())
print("\nBase table sample:")
print(base.head())
print("\nBase table info:")
# info() prints its report itself and returns None, so it must not be wrapped
# in print() (that would append a stray "None" line).
base.info()

## 1. Load and Explore Data

In [None]:
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer
import glob

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Define the test folder path.
# The HOME_CREDIT_TEST_DIR environment variable overrides the location so the
# notebook can run on another machine without editing this cell; when it is
# not set, the original local path is used.
test_folder = os.environ.get(
    'HOME_CREDIT_TEST_DIR',
    '/Users/ryanzhang/Documents/Uni stuff/sc4000/home-credit-credit-risk-model-stability/csv_files/test',
)
print(f"Working directory: {test_folder}")
print(f"Files in directory: {len(os.listdir(test_folder))}")

# Home Credit - Credit Risk Model Stability
## Feature Engineering Notebook

This notebook provides comprehensive feature engineering for the Home Credit Credit Risk Model Stability Kaggle project. We'll load test data, create aggregated features from multiple tables, handle missing values, and prepare engineered features for model training.