# BNPL Feature Engineering Strategy

**Objective**: Create predictive features for BNPL default risk using only available data

**Critical Constraints**:
- No payment behavior data available (major limitation acknowledged)
- Features must be computable at transaction time (<100ms)
- Beat current 3.5x risk discrimination baseline

**Available Data Sources**: Transaction records with customer demographics and transaction context

In [1]:
# Add project root to Python path
# Allows us to use flit-ml modules directly in notebooks
import sys
import os
project_root = os.path.abspath(os.path.join(os.getcwd(), '../..'))
if project_root not in sys.path:
    sys.path.insert(0, project_root)
    
print(f"Project root added to path: {project_root}")

Project root added to path: /Users/kevin/Documents/repos/flit-ml


In [2]:
# Environment setup
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# BigQuery integration
from google.cloud import bigquery
from flit_ml.config import config

# Configuration
pd.set_option('display.max_columns', 50)
sns.set_style("whitegrid")

print("Feature engineering environment ready!")

Feature engineering environment ready!


In [3]:
# Connect to BigQuery and load feature engineering dataset
client = config.get_client()

# Based on EDA findings, load key features for ML model development
# Focus on validated pre-transaction features that showed predictive power
feature_data_query = """
WITH customer_sample AS (
  SELECT DISTINCT unique_customer_id as customer_id
  FROM `flit-data-platform.flit_intermediate.int_bnpl_customer_tenure_adjusted`
  ORDER BY RAND()
  LIMIT 1000
)
SELECT 
    t.unique_customer_id as customer_id,
    t.transaction_id,
    t.amount,
    t.will_default,
    t.transaction_timestamp,
    t.days_to_first_missed_payment,
    
    -- EDA-validated customer features (legitimate pre-transaction)
    t.customer_credit_score_range,
    t.customer_age_bracket,
    t.customer_income_bracket,
    t.customer_verification_level,
    t.adjusted_customer_tenure as customer_tenure_days,
    t.customer_state,
    
    -- EDA-validated transaction context features
    t.product_category,
    t.product_risk_category,
    t.product_price,
    t.device_type,
    t.device_is_trusted,
    
    -- EDA-validated current underwriting features (baseline to beat)
    t.risk_score,
    t.risk_level,
    t.risk_scenario,
    
    -- Additional context features
    -- first_payment_amount is excluded as it is ideally a pct of amount
    -- payment_frequency is excluded as it is ideally a standard term (biweekly)
    t.payment_provider,
    t.installment_count,
    t.payment_credit_limit,
    t.payment_type,
    t.time_on_site_seconds,
    t.purchase_context,
    t.price_comparison_time
    
FROM `flit-data-platform.flit_intermediate.int_bnpl_customer_tenure_adjusted` t
INNER JOIN customer_sample cs ON t.unique_customer_id = cs.customer_id
ORDER BY t.unique_customer_id, t.transaction_timestamp
"""

print("📥 Loading feature engineering dataset based on EDA insights...")
df = client.query(feature_data_query).to_dataframe()

print(f"✅ Data loaded: {df.shape[0]:,} transactions for {df['customer_id'].nunique():,} customers")
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")
print(f"Default rate: {df['will_default'].mean():.1%}")

# Display schema for feature engineering reference
print(f"\n📋 Available columns for feature engineering:")
for i, col in enumerate(df.columns, 1):
    print(f"{i:2}. {col}")
    
print(f"\n🎯 Target variable: 'will_default' (Binary: {df['will_default'].value_counts().to_dict()})")

📥 Loading feature engineering dataset based on EDA insights...


E0000 00:00:1758618641.076854 24772345 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


✅ Data loaded: 4,063 transactions for 1,000 customers
Memory usage: 4.4 MB
Default rate: 4.8%

📋 Available columns for feature engineering:
 1. customer_id
 2. transaction_id
 3. amount
 4. will_default
 5. transaction_timestamp
 6. days_to_first_missed_payment
 7. customer_credit_score_range
 8. customer_age_bracket
 9. customer_income_bracket
10. customer_verification_level
11. customer_tenure_days
12. customer_state
13. product_category
14. product_risk_category
15. product_price
16. device_type
17. device_is_trusted
18. risk_score
19. risk_level
20. risk_scenario
21. payment_provider
22. installment_count
23. payment_credit_limit
24. payment_type
25. time_on_site_seconds
26. purchase_context
27. price_comparison_time

🎯 Target variable: 'will_default' (Binary: {False: 3868, True: 195})


In [4]:
# Data exploration: Understand what we actually have
print("📊 Available Fields for Feature Engineering:")
print("=" * 50)

for i, col in enumerate(df.columns, 1):
    dtype = df[col].dtype
    unique_vals = df[col].nunique() if df[col].nunique() < 10 else f"{df[col].nunique():,} unique values"
    print(f"{i:2}. {col:<30} | {str(dtype):<15} | {unique_vals}")

print(f"\n🎯 Target Variable: will_default")
print(f"   Distribution: {df['will_default'].value_counts().to_dict()}")
print(f"   Default rate: {df['will_default'].mean():.1%}")

📊 Available Fields for Feature Engineering:
 1. customer_id                    | object          | 1,000 unique values
 2. transaction_id                 | object          | 3,933 unique values
 3. amount                         | float64         | 3,717 unique values
 4. will_default                   | boolean         | 2
 5. transaction_timestamp          | datetime64[us, UTC] | 4,062 unique values
 6. days_to_first_missed_payment   | object          | 6
 7. customer_credit_score_range    | object          | 4
 8. customer_age_bracket           | object          | 5
 9. customer_income_bracket        | object          | 5
10. customer_verification_level    | object          | 3
11. customer_tenure_days           | float64         | 904 unique values
12. customer_state                 | object          | 59 unique values
13. product_category               | object          | 5
14. product_risk_category          | object          | 3
15. product_price                  | float64       

# Feature Engineering Strategy

## Data Reality Check
**What We DON'T Have** (Critical Gap):
- Payment history/behavior data
- Previous default information
- Account balance information
- Payment timing/lateness data

**What We DO Have**:
- Customer demographics at transaction time
- Transaction context and characteristics
- Current underwriting features (baseline)
- Device and behavioral signals

## Proposed Feature Engineering Steps

### Step 1: Temporal Feature Extraction
**What**: Extract time-based components from transaction_timestamp
**Why**: Temporal patterns often correlate with financial behavior:
- Hour of day (impulse vs planned purchases)
- Day of week (weekend vs weekday spending)
- Month/season (holiday spending, month-end behavior)
**Features**: hour, day_of_week, month, is_weekend, is_month_end, is_holiday_season

### Step 2: Categorical Variable Encoding
**What**: Convert categorical variables to numerical representations

**Why**: ML algorithms require numerical inputs

**Categorical Variables Identified**:

- customer_credit_score_range
- customer_age_bracket  
- customer_income_bracket
- customer_verification_level
- product_category
- product_risk_category
- device_type
- risk_level
- payment_provider
- purchase_context
**Method**: Ordinal encoding for ordered categories, one-hot for nominal

### Step 3: Customer Historical Aggregation Features
**What**: Create customer-level behavioral patterns from transaction history

**Why**: Even without payment data, transaction patterns reveal behavior:

- Spending consistency/volatility
- Purchase frequency patterns
- Product category preferences
- Device/channel preferences
**Features**: transaction_count, avg_amount, amount_volatility, category_diversity, device_consistency

### Step 4: Transaction Context Risk Indicators
**What**: Engineer risk signals from available transaction context

**Why**: Certain transaction characteristics correlate with default risk:

- High amounts relative to income/credit limit
- High-risk product categories
- Unverified customers
- Untrusted devices
**Features**: amount_to_income_ratio, amount_to_credit_ratio, risk_score_normalized

### Step 5: Interaction Features
**What**: Create features that capture relationships between variables

**Why**: Risk often emerges from combinations of factors

**Examples**: 

- young_customer_high_amount (age + amount interaction)
- unverified_high_risk_product (verification + product risk)
- mobile_impulse_purchase (device + context)

### Step 6: Feature Validation and Selection
**What**: Validate feature quality and remove redundant/low-value features

**Why**: Ensure features are predictive and production-ready

**Methods**: 

- Correlation analysis
- Feature importance
- Missing value analysis
- Distribution validation

## Expected Limitations
1. **No Payment Behavior**: Severely limits predictive power
2. **Cross-sectional Only**: No longitudinal payment performance
3. **Synthetic Data**: May not reflect real-world patterns
4. **Limited Risk Signals**: Current features may not capture default risk effectively

## Success Metrics
- Beat current 3.5x risk discrimination ratio
- Achieve >40% precision on high-risk segment
- Maintain feature computation <100ms
- Demonstrate statistical significance vs baseline

## Step 1: Temporal Feature Extraction

Extract time-based components from transaction_timestamp to capture temporal patterns.

In [5]:
def extract_temporal_features(df):
    """
    Extract temporal features from transaction_timestamp.
    
    Business Logic:
    - Hour patterns: Late night/early morning may indicate impulsive behavior
    - Day patterns: Weekend vs weekday spending behaviors
    - Month patterns: Holiday seasons, month-end financial stress
    """
    df_tmp = df.copy()
    
    # Ensure transaction_timestamp is datetime
    df_tmp['transaction_timestamp'] = pd.to_datetime(df_tmp['transaction_timestamp'])
    
    # Extract basic temporal components
    df_tmp['transaction_hour'] = df_tmp['transaction_timestamp'].dt.hour
    df_tmp['transaction_day_of_week'] = df_tmp['transaction_timestamp'].dt.dayofweek  # 0=Monday, 6=Sunday
    df_tmp['transaction_month'] = df_tmp['transaction_timestamp'].dt.month
    df_tmp['transaction_day_of_month'] = df_tmp['transaction_timestamp'].dt.day
    
    # Create categorical temporal features
    df_tmp['is_weekend'] = df_tmp['transaction_day_of_week'].isin([5, 6]).astype(int)  # Saturday, Sunday
    df_tmp['is_month_end'] = (df_tmp['transaction_day_of_month'] >= 25).astype(int)  # Last week of month
    df_tmp['is_holiday_season'] = df_tmp['transaction_month'].isin([11, 12]).astype(int)  # Nov, Dec
    df_tmp['is_business_hours'] = df_tmp['transaction_hour'].between(9, 17).astype(int)  # 9 AM - 5 PM
    df_tmp['week_of_month'] = ((df_tmp['transaction_day_of_month'] - 1) // 7) + 1  # 1-5
    df_tmp['is_late_night'] = df_tmp['transaction_hour'].isin([22, 23, 0, 1, 2, 3, 4, 5]).astype(int)  # 10 PM - 5 AM
    
    # Time of day categories
    df_tmp['time_of_day'] = pd.cut(
        df_tmp['transaction_hour'], 
        bins=[-1, 6, 12, 18, 24], 
        labels=['night', 'morning', 'afternoon', 'evening']
    )
    
    print(f"✅ Temporal features extracted:")
    temporal_cols = [col for col in df_tmp.columns if col.startswith(('transaction_', 'is_', 'time_of_day'))]
    for col in temporal_cols:
        unique_vals = df_tmp[col].nunique()
        print(f"   {col}: {unique_vals} unique values")
    
    return df_tmp

In [6]:
# Apply temporal feature extraction
print("⏰ Extracting temporal features...")
df_temporal = extract_temporal_features(df)

# Quick validation of temporal features
print(f"\n📊 Temporal Feature Distribution:")
print(f"Weekend transactions: {df_temporal['is_weekend'].mean():.1%}")
print(f"Month-end transactions: {df_temporal['is_month_end'].mean():.1%}")
print(f"Holiday season transactions: {df_temporal['is_holiday_season'].mean():.1%}")
print(f"Business hours transactions: {df_temporal['is_business_hours'].mean():.1%}")
print(f"Late night transactions: {df_temporal['is_late_night'].mean():.1%}")

# Show sample of temporal features
print(f"\n📋 Sample temporal features:")
temporal_sample = df_temporal[['transaction_timestamp', 'transaction_hour', 'transaction_day_of_week', 
                     'is_weekend', 'is_month_end', 'time_of_day']].head()
temporal_sample

⏰ Extracting temporal features...
✅ Temporal features extracted:
   transaction_id: 3933 unique values
   transaction_timestamp: 4062 unique values
   transaction_hour: 24 unique values
   transaction_day_of_week: 7 unique values
   transaction_month: 12 unique values
   transaction_day_of_month: 31 unique values
   is_weekend: 2 unique values
   is_month_end: 2 unique values
   is_holiday_season: 2 unique values
   is_business_hours: 2 unique values
   is_late_night: 2 unique values
   time_of_day: 4 unique values

📊 Temporal Feature Distribution:
Weekend transactions: 20.4%
Month-end transactions: 16.7%
Holiday season transactions: 19.3%
Business hours transactions: 69.4%
Late night transactions: 10.8%

📋 Sample temporal features:


Unnamed: 0,transaction_timestamp,transaction_hour,transaction_day_of_week,is_weekend,is_month_end,time_of_day
0,2025-04-18 05:44:57+00:00,5,4,0,0,night
1,2024-09-08 21:55:45+00:00,21,6,1,0,evening
2,2024-09-23 17:20:27+00:00,17,0,0,0,afternoon
3,2024-09-24 19:06:04+00:00,19,1,0,0,evening
4,2024-09-26 13:52:06+00:00,13,3,0,1,afternoon


In [7]:
# Inspect cols
df_temporal.head()

Unnamed: 0,customer_id,transaction_id,amount,will_default,transaction_timestamp,days_to_first_missed_payment,customer_credit_score_range,customer_age_bracket,customer_income_bracket,customer_verification_level,customer_tenure_days,customer_state,product_category,product_risk_category,product_price,device_type,device_is_trusted,risk_score,risk_level,risk_scenario,payment_provider,installment_count,payment_credit_limit,payment_type,time_on_site_seconds,purchase_context,price_comparison_time,transaction_hour,transaction_day_of_week,transaction_month,transaction_day_of_month,is_weekend,is_month_end,is_holiday_season,is_business_hours,week_of_month,is_late_night,time_of_day
0,001fdb72f977946700300a9f0b128674,txn_00048205,120.01,False,2025-04-18 05:44:57+00:00,,good,18-24,<25k,verified,809.0,HI,clothing,medium,120.01,mobile,True,0.39,medium,credit_stretched,afterpay,6.0,1000.0,bnpl_account,380,normal,191.0,5,4,4,18,0,0,0,0,3,1,night
1,004b2eefb6066ebd682dd992eaa6c76f,txn_00056059,207.28,False,2024-09-08 21:55:45+00:00,,poor,35-44,50k-75k,unverified,390.0,NY,electronics,medium,207.28,mobile,True,0.55,medium,low_risk_purchase,klarna,6.0,1500.0,bnpl_account,405,normal,109.0,21,6,9,8,1,0,0,0,2,0,evening
2,006f4ece53a086769251a1db24ba43c8,txn_00011418,72.63,False,2024-09-23 17:20:27+00:00,,fair,25-34,100k+,unverified,503.0,NE,beauty,medium,72.63,mobile,True,0.42,medium,low_risk_purchase,afterpay,4.0,500.0,bnpl_account,822,normal,135.0,17,0,9,23,0,0,0,1,4,0,afternoon
3,006f4ece53a086769251a1db24ba43c8,txn_00017328,346.26,False,2024-09-24 19:06:04+00:00,,fair,25-34,100k+,unverified,504.0,NE,electronics,medium,346.26,desktop,True,0.42,medium,low_risk_purchase,afterpay,6.0,500.0,bnpl_account,610,normal,159.0,19,1,9,24,0,0,0,0,4,0,evening
4,006f4ece53a086769251a1db24ba43c8,txn_00024634,301.05,False,2024-09-26 13:52:06+00:00,,fair,25-34,100k+,unverified,506.0,NE,electronics,low,301.05,desktop,True,0.4,medium,low_risk_purchase,afterpay,4.0,500.0,bnpl_account,263,normal,67.0,13,3,9,26,0,1,0,1,4,0,afternoon


## Step 2: Categorical Variable Encoding

Convert categorical variables to numerical representations using sklearn encoders.

In [8]:
def encode_categorical_features(df):
    """
    Encode categorical variables using appropriate sklearn encoders.
    
    Strategy:
    - Ordinal encoding for ordered categories (preserves ranking)
    - One-hot encoding for nominal categories (no inherent order)
    """
    
    df_encoded = df.copy()
    
    # Define ordinal mappings (order matters!)
    ordinal_mappings = {
        'customer_credit_score_range': ['poor', 'fair', 'good', 'excellent'],
        'customer_age_bracket': ['18-24', '25-34', '35-44', '45-54', '55+'],
        'customer_income_bracket': ['<25k', '25k-50k', '50k-75k', '75k-100k', '100k+'],
        'customer_verification_level': ['unverified', 'partial', 'verified'],
        'product_risk_category': ['low', 'medium', 'high'],
        'risk_level': ['low', 'medium', 'high']
    }
    
    # Apply ordinal encoding
    print("🔢 Applying ordinal encoding...")
    for feature, categories in ordinal_mappings.items():
        if feature in df_encoded.columns:
            # Check if all expected categories exist
            actual_categories = df_encoded[feature].unique()
            print(f"   {feature}: {actual_categories}")
            
            # Create ordinal encoder
            ordinal_encoder = OrdinalEncoder(categories=[categories], handle_unknown='use_encoded_value', unknown_value=-1)
            
            # Apply encoding
            encoded_values = ordinal_encoder.fit_transform(df_encoded[[feature]])
            df_encoded[f"{feature}_encoded"] = encoded_values.flatten()
    
    # Define nominal features for one-hot encoding
    nominal_features = [
        'device_type', 'payment_provider', 'product_category', 
        'purchase_context', 'risk_scenario', 'time_of_day'
    ]
    
    # Check cardinality before one-hot encoding
    print(f"\n🏷️  Checking cardinality for one-hot encoding...")
    for feature in nominal_features:
        if feature in df_encoded.columns:
            cardinality = df_encoded[feature].nunique()
            unique_vals = df_encoded[feature].unique()
            print(f"   {feature}: {cardinality} unique values {unique_vals}")
    
    # Apply one-hot encoding
    print(f"\n🎯 Applying one-hot encoding...")
    for feature in nominal_features:
        if feature in df_encoded.columns:
            # Create OneHot encoder
            onehot_encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
            # Fit and transform
            encoded_values = onehot_encoder.fit_transform(df_encoded[[feature]])
            # Get feature names
            feature_names = onehot_encoder.get_feature_names_out([feature])
            # Add encoded columns to dataframe
            encoded_df = pd.DataFrame(encoded_values, columns=feature_names, index=df_encoded.index)
            df_encoded = pd.concat([df_encoded, encoded_df], axis=1)

            print(f"   {feature}: Created {len(feature_names)} dummy variables")
    
    # Boolean features are already encoded (device_is_trusted: True/False → 1/0)
    print(f"\n✅ Boolean features already encoded:")
    boolean_features = df_encoded.select_dtypes(include=['bool']).columns
    for feature in boolean_features:
        print(f"   {feature}: {df_encoded[feature].dtype}")
    
    print(f"\n📊 Encoding Summary:")
    original_categorical = len([col for col in df.columns if df[col].dtype == 'object'])
    new_encoded_features = len([col for col in df_encoded.columns if col.endswith('_encoded') or '_' in col and col not in df.columns])
    print(f"   Original categorical features: {original_categorical}")
    print(f"   New encoded features created: {new_encoded_features}")
    print(f"   Total features: {len(df_encoded.columns)}")
    
    return df_encoded

In [9]:
# Apply categorical encoding
print("🔧 Encoding categorical features...")
df_encoded = encode_categorical_features(df_temporal)

# Show sample of encoded features
print(f"\n📋 Sample of encoded features:")
encoded_cols = [col for col in df_encoded.columns if '_encoded' in col or col.startswith(('device_type_', 'product_category_', 'payment_provider_'))][:10]
if encoded_cols:
    df_encoded[encoded_cols].head()
else:
    print("No encoded columns found yet")

🔧 Encoding categorical features...
🔢 Applying ordinal encoding...
   customer_credit_score_range: ['good' 'poor' 'fair' 'excellent']
   customer_age_bracket: ['18-24' '35-44' '25-34' '55+' '45-54']
   customer_income_bracket: ['<25k' '50k-75k' '100k+' '25k-50k' '75k-100k']
   customer_verification_level: ['verified' 'unverified' 'partial']
   product_risk_category: ['medium' 'low' 'high']
   risk_level: ['medium' 'low' 'high']

🏷️  Checking cardinality for one-hot encoding...
   device_type: 3 unique values ['mobile' 'desktop' 'tablet']
   payment_provider: 4 unique values ['afterpay' 'klarna' 'sezzle' 'affirm']
   product_category: 5 unique values ['clothing' 'electronics' 'beauty' 'sports' 'home']
   purchase_context: 3 unique values ['normal' 'rushed' 'impulse']
   risk_scenario: 4 unique values ['credit_stretched' 'low_risk_purchase' 'high_risk_behavior'
 'impulse_purchase']
   time_of_day: 4 unique values ['night', 'evening', 'afternoon', 'morning']
Categories (4, object): ['night

In [10]:
# Inspect cols
df_encoded.head(5)


Unnamed: 0,customer_id,transaction_id,amount,will_default,transaction_timestamp,days_to_first_missed_payment,customer_credit_score_range,customer_age_bracket,customer_income_bracket,customer_verification_level,customer_tenure_days,customer_state,product_category,product_risk_category,product_price,device_type,device_is_trusted,risk_score,risk_level,risk_scenario,payment_provider,installment_count,payment_credit_limit,payment_type,time_on_site_seconds,...,is_late_night,time_of_day,customer_credit_score_range_encoded,customer_age_bracket_encoded,customer_income_bracket_encoded,customer_verification_level_encoded,product_risk_category_encoded,risk_level_encoded,device_type_mobile,device_type_tablet,payment_provider_afterpay,payment_provider_klarna,payment_provider_sezzle,product_category_clothing,product_category_electronics,product_category_home,product_category_sports,purchase_context_normal,purchase_context_rushed,risk_scenario_high_risk_behavior,risk_scenario_impulse_purchase,risk_scenario_low_risk_purchase,time_of_day_evening,time_of_day_morning,time_of_day_night
0,001fdb72f977946700300a9f0b128674,txn_00048205,120.01,False,2025-04-18 05:44:57+00:00,,good,18-24,<25k,verified,809.0,HI,clothing,medium,120.01,mobile,True,0.39,medium,credit_stretched,afterpay,6.0,1000.0,bnpl_account,380,...,1,night,2.0,0.0,0.0,2.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,004b2eefb6066ebd682dd992eaa6c76f,txn_00056059,207.28,False,2024-09-08 21:55:45+00:00,,poor,35-44,50k-75k,unverified,390.0,NY,electronics,medium,207.28,mobile,True,0.55,medium,low_risk_purchase,klarna,6.0,1500.0,bnpl_account,405,...,0,evening,0.0,2.0,2.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,006f4ece53a086769251a1db24ba43c8,txn_00011418,72.63,False,2024-09-23 17:20:27+00:00,,fair,25-34,100k+,unverified,503.0,NE,beauty,medium,72.63,mobile,True,0.42,medium,low_risk_purchase,afterpay,4.0,500.0,bnpl_account,822,...,0,afternoon,1.0,1.0,4.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,006f4ece53a086769251a1db24ba43c8,txn_00017328,346.26,False,2024-09-24 19:06:04+00:00,,fair,25-34,100k+,unverified,504.0,NE,electronics,medium,346.26,desktop,True,0.42,medium,low_risk_purchase,afterpay,6.0,500.0,bnpl_account,610,...,0,evening,1.0,1.0,4.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,006f4ece53a086769251a1db24ba43c8,txn_00024634,301.05,False,2024-09-26 13:52:06+00:00,,fair,25-34,100k+,unverified,506.0,NE,electronics,low,301.05,desktop,True,0.4,medium,low_risk_purchase,afterpay,4.0,500.0,bnpl_account,263,...,0,afternoon,1.0,1.0,4.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [11]:
len(df.columns)

27

In [12]:
len(df_temporal.columns)

38

In [13]:
len(df_encoded.columns)

61

In [14]:
def clean_and_select_features(df):
    """
    Clean data and select optimal feature set for production deployment.
    
    Strategy:
    1. Remove redundant features (original categorical after encoding)
    2. Drop identifier fields (not predictive)
    3. Remove low-value temporal components (keep engineered features)
    4. Handle problematic fields (data leakage, high cardinality)
    5. Address missing values appropriately
    """

    df_clean = df.copy()

    print("🧹 Starting data cleaning and feature selection...")
    print(f"   Starting features: {len(df_clean.columns)}")

    # 1. Remove redundant original categorical features (keep encoded versions)
    redundant_categorical = [
        'customer_credit_score_range',  # Keep: customer_credit_score_range_encoded
        'customer_age_bracket',         # Keep: customer_age_bracket_encoded  
        'customer_income_bracket',      # Keep: customer_income_bracket_encoded
        'customer_verification_level',  # Keep: customer_verification_level_encoded
        'product_risk_category',        # Keep: product_risk_category_encoded
        'risk_level',                   # Keep: risk_level_encoded
        'device_type',                  # Keep: device_type_* dummies
        'payment_provider',             # Keep: payment_provider_* dummies
        'product_category',             # Keep: product_category_* dummies
        'purchase_context',             # Keep: purchase_context_* dummies
        'risk_scenario',                # Keep: risk_scenario_* dummies
        'time_of_day'                   # Keep: time_of_day_* dummies
    ]

    # 2. Remove identifier fields (not predictive)
    identifier_fields = [
        'customer_id',                  # Just identifier
        'transaction_id',                # Just identifier
        'record_id',
        'unique_transaction_id',
        'unique_customer_id',
        'first_customer_transaction_date',
        'first_customer_tenure',
        'days_from_first_transaction',
        'device_id',
        'product_id',
        'transaction_timestamp',  # Keep derived temporal features, drop raw
        'ingestion_timestamp',  # Metadata, not predictive


        
        
    ]


    # 3. Remove raw temporal components (keep engineered features)
    raw_temporal = [
        'transaction_timestamp',        # Keep derived temporal features
        'transaction_hour',             # Keep: is_late_night, is_business_hours
        'transaction_day_of_week',      # Keep: is_weekend
        'transaction_month',            # Keep: is_holiday_season
        'transaction_day_of_month'      # Keep: is_month_end, week_of_month
    ]

    # 4. Handle problematic fields
    problematic_fields = []

    # Check for data leakage indicators
    # if 'days_to_first_missed_payment' in df_clean.columns:
    #     unique_vals = df_clean['days_to_first_missed_payment'].unique()
    #     print(f"   ⚠️  days_to_first_missed_payment values: {unique_vals}")
    #     if len(unique_vals) > 1:  # If not all the same, might be leakage
    #         print(f"      → Potential data leakage detected. Consider removing.")
    #         problematic_fields.append('days_to_first_missed_payment')

    # Check for single-value features (no predictive power)
    single_value_features = []
    for col in df_clean.columns:
        if df_clean[col].nunique() <= 1:
            single_value_features.append(col)

    if single_value_features:
        print(f"   🗑️  Single-value features to remove: {single_value_features}")
        problematic_fields.extend(single_value_features)

    # Check high cardinality categorical features
    high_cardinality_features = []
    categorical_cols = df_clean.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        cardinality = df_clean[col].nunique()
        if cardinality > 20:  # Threshold for high cardinality
            high_cardinality_features.append(col)
            print(f"   📊 High cardinality feature: {col} ({cardinality} values)")

    # 5. Plain unnecessary features or alraedy adjusted fields:
    unnecessary_fields = [
        '_generator',
        '_timestamp',
        'currency',
        'customer_state',
        'customer_tenure_days',
        'device_os',
        'payment_frequency',        # removing it for now, because we only have biweekly. Would be critical when we intro new terms
        'payment_method_id',
        'payment_type',
        'product_price',             # Not needed as we have amount
        'status',
        'json_body',
        'data_source',
        '_loaded_at',
        ]



    # Combine all features to drop
    features_to_drop = (redundant_categorical + identifier_fields +
                        raw_temporal + problematic_fields + high_cardinality_features + unnecessary_fields)

    # Remove duplicates and features that don't exist
    features_to_drop = list(set(features_to_drop))
    existing_features_to_drop = [f for f in features_to_drop if f in df_clean.columns]

    print(f"\n🗑️  Removing {len(existing_features_to_drop)} features:")
    for feature in existing_features_to_drop:
        print(f"   - {feature}")

    # Drop the features
    df_clean = df_clean.drop(columns=existing_features_to_drop)

    # 5. Handle missing values
    print(f"\n🔍 Checking for missing values...")
    missing_summary = df_clean.isnull().sum()
    missing_features = missing_summary[missing_summary > 0]

    if len(missing_features) > 0:
        print(f"   Missing values found:")
        for feature, count in missing_features.items():
            pct = (count / len(df_clean)) * 100
            print(f"   - {feature}: {count:,} ({pct:.1f}%)")

            # Handle missing values based on data type
            if df_clean[feature].dtype in ['float64', 'int64']:
                # Numeric: fill with median
                median_val = df_clean[feature].median()
                df_clean[feature].fillna(median_val, inplace=True)
                print(f"     → Filled with median: {median_val}")
            else:
                # Categorical: fill with mode or 'unknown'
                if df_clean[feature].mode().empty:
                    df_clean[feature].fillna('unknown', inplace=True)
                    print(f"     → Filled with: 'unknown'")
                else:
                    mode_val = df_clean[feature].mode().iloc[0]
                    df_clean[feature].fillna(mode_val, inplace=True)
                    print(f"     → Filled with mode: {mode_val}")
    else:
        print(f"   ✅ No missing values found")

    # 6. Convert string numeric fields to proper numeric types
    string_numeric_fields = ['time_on_site_seconds', 'price_comparison_time']
    for field in string_numeric_fields:
        if field in df_clean.columns and df_clean[field].dtype == 'object':
            df_clean[field] = pd.to_numeric(df_clean[field], errors='coerce')
            print(f"   🔢 Converted {field} to numeric")

    # 7. Final feature summary
    print(f"\n📊 Final Feature Set Summary:")
    print(f"   Final features: {len(df_clean.columns)}")
    print(f"   Features removed: {len(existing_features_to_drop)}")
    print(f"   Data shape: {df_clean.shape}")

    # Categorize remaining features
    numeric_features = df_clean.select_dtypes(include=[np.number]).columns.tolist()
    boolean_features = df_clean.select_dtypes(include=['bool']).columns.tolist()
    categorical_features = df_clean.select_dtypes(include=['object']).columns.tolist()

    print(f"\n📋 Feature Types:")
    print(f"   Numeric features: {len(numeric_features)}")
    print(f"   Boolean features: {len(boolean_features)}")
    print(f"   Categorical features: {len(categorical_features)}")

    if categorical_features:
        print(f"   ⚠️  Remaining categorical features may need encoding: {categorical_features}")

    # Save feature list for production deployment
    final_features = {
        'all_features': df_clean.columns.tolist(),
        'numeric_features': numeric_features,
        'boolean_features': boolean_features,
        'categorical_features': categorical_features,
        'target_variable': 'will_default'
    }

    return df_clean, final_features

In [15]:
# Apply data cleaning and feature selection
print("🔧 Cleaning data and selecting optimal feature set...")
df_final, feature_metadata = clean_and_select_features(df_encoded)

# Display sample of final dataset
print(f"\n📋 Sample of final feature set:")
print(f"Shape: {df_final.shape}")
df_final.head()

🔧 Cleaning data and selecting optimal feature set...
🧹 Starting data cleaning and feature selection...
   Starting features: 61
   🗑️  Single-value features to remove: ['payment_type']
   📊 High cardinality feature: customer_id (1000 values)
   📊 High cardinality feature: transaction_id (3933 values)
   📊 High cardinality feature: customer_state (59 values)
   📊 High cardinality feature: time_on_site_seconds (845 values)

🗑️  Removing 24 features:
   - time_of_day
   - risk_scenario
   - transaction_timestamp
   - payment_type
   - customer_state
   - product_category
   - customer_tenure_days
   - time_on_site_seconds
   - product_price
   - payment_provider
   - transaction_day_of_week
   - customer_verification_level
   - transaction_month
   - risk_level
   - customer_age_bracket
   - customer_id
   - product_risk_category
   - transaction_id
   - transaction_day_of_month
   - device_type
   - customer_credit_score_range
   - customer_income_bracket
   - purchase_context
   - trans

Unnamed: 0,amount,will_default,days_to_first_missed_payment,device_is_trusted,risk_score,installment_count,payment_credit_limit,price_comparison_time,is_weekend,is_month_end,is_holiday_season,is_business_hours,week_of_month,is_late_night,customer_credit_score_range_encoded,customer_age_bracket_encoded,customer_income_bracket_encoded,customer_verification_level_encoded,product_risk_category_encoded,risk_level_encoded,device_type_mobile,device_type_tablet,payment_provider_afterpay,payment_provider_klarna,payment_provider_sezzle,product_category_clothing,product_category_electronics,product_category_home,product_category_sports,purchase_context_normal,purchase_context_rushed,risk_scenario_high_risk_behavior,risk_scenario_impulse_purchase,risk_scenario_low_risk_purchase,time_of_day_evening,time_of_day_morning,time_of_day_night
0,120.01,False,28,True,0.39,6.0,1000.0,191.0,0,0,0,0,3,1,2.0,0.0,0.0,2.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,207.28,False,28,True,0.55,6.0,1500.0,109.0,1,0,0,0,2,0,0.0,2.0,2.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,72.63,False,28,True,0.42,4.0,500.0,135.0,0,0,0,1,4,0,1.0,1.0,4.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,346.26,False,28,True,0.42,6.0,500.0,159.0,0,0,0,0,4,0,1.0,1.0,4.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,301.05,False,28,True,0.4,4.0,500.0,67.0,0,1,0,1,4,0,1.0,1.0,4.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
