# Customer Feature Engineering (Memory Optimized)

This notebook uses the CustomerFeatureEngineer module to create comprehensive customer features with memory-efficient processing for large datasets. Features are created incrementally and saved to `data/features/final`.


In [None]:
# When launched from the notebooks/ folder, step up one level so the
# relative data/ paths used throughout this notebook resolve.
import os

launch_dir = os.getcwd()
if os.path.basename(launch_dir) == 'notebooks':
    os.chdir('..')
print('Working directory:', os.getcwd())

In [None]:
import polars as pl
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from pathlib import Path
import logging
import warnings
import gc  # For garbage collection
warnings.filterwarnings('ignore')

# Import our feature engineering module
from hnm_data_analysis.feature_engineering.customer_features import CustomerFeatureEngineer

# Logging setup: INFO level, timestamp - level - message.
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger(__name__)

print('Imports successful!')

In [None]:
# Check available memory and dataset sizes
import psutil

def check_memory_status():
    """Print available memory and the usage percentage.

    Returns:
        float: ``psutil.virtual_memory().available`` divided by 1024**3.
    """
    bytes_per_gb = 1024 ** 3
    vm = psutil.virtual_memory()
    free_gb = vm.available / bytes_per_gb
    print(f"Available memory: {free_gb:.2f} GB")
    print(f"Memory usage: {vm.percent}%")
    return free_gb

def check_file_sizes(files=None):
    """Print the on-disk size in MB of each dataset file.

    Args:
        files: Optional mapping of display name -> file path. When omitted,
            the three cleaned last-3-months parquet files are checked, which
            keeps the original zero-argument call working.

    Returns:
        None. One line is printed per entry: ``"<name>: <size> MB"`` or
        ``"<name>: FILE NOT FOUND"`` when the path does not exist.
    """
    if files is None:
        files = {
            'customers': 'data/cleaned/customers_last_3_months_cleaned.parquet',
            'transactions': 'data/cleaned/transactions_last_3_months_cleaned.parquet',
            'articles': 'data/cleaned/articles_last_3_months_cleaned.parquet'
        }

    bytes_per_mb = 1024 * 1024
    for name, path in files.items():
        candidate = Path(path)
        if candidate.exists():
            size_mb = candidate.stat().st_size / bytes_per_mb
            print(f"{name}: {size_mb:.1f} MB")
        else:
            print(f"{name}: FILE NOT FOUND")

# Report memory headroom and input sizes before any heavy work starts.
print("System Memory Status:")
available_gb = check_memory_status()

print("\nDataset File Sizes:")
check_file_sizes()

# Below this much available memory (GB) the notebook switches to sampling mode.
MIN_FULL_RUN_MEMORY_GB = 8.0
USE_SAMPLING = available_gb < MIN_FULL_RUN_MEMORY_GB
SAMPLE_SIZE = None if not USE_SAMPLING else 50000

if not USE_SAMPLING:
    print("\n✅ Sufficient memory available for full dataset processing.")
else:
    print(f"\n⚠️ Limited memory detected. Using sampling with {SAMPLE_SIZE:,} customers for development.")

In [None]:
# Input locations for the cleaned last-3-months extracts
customers_path = 'data/cleaned/customers_last_3_months_cleaned.parquet'
transactions_path = 'data/cleaned/transactions_last_3_months_cleaned.parquet'
articles_path = 'data/cleaned/articles_last_3_months_cleaned.parquet'

# Fail fast on the first missing input, checked in the order listed
required_inputs = (customers_path, transactions_path, articles_path)
first_missing = next((p for p in required_inputs if not Path(p).exists()), None)
if first_missing is not None:
    raise FileNotFoundError(f"Required file not found: {first_missing}")

print('✅ All required data files found!')

# Reference date used for the date-relative features
analysis_date = datetime(2020, 9, 22)
print(f'Analysis date: {analysis_date.date()}')

## Step 1: Create Core Features Using CustomerFeatureEngineer

We'll start with the proven CustomerFeatureEngineer module to create the core features: RFM analysis, product preferences, demographics, and shopping metrics.


In [None]:
# Build the engineer pinned to the notebook's analysis date
feature_engineer = CustomerFeatureEngineer(analysis_date=analysis_date)

print('Creating core customer features...')
print('This may take a few minutes for large datasets...')

# Keyword arguments for the core feature run, gathered in one place
core_feature_inputs = {
    'customers_path': customers_path,
    'transactions_path': transactions_path,
    'articles_path': articles_path,
    'output_dir': 'data/features/intermediate',
}

try:
    customer_features = feature_engineer.create_customer_features(**core_feature_inputs)

    print('\n✅ Core features created successfully!')
    print(f'Dataset shape: {customer_features.shape}')
    memory_pct = psutil.virtual_memory().percent
    print(f'Memory usage after core features: {memory_pct}%')

except Exception as err:
    # Print a hint, then re-raise so the notebook stops at this cell.
    print(f"❌ Error creating core features: {err}")
    print("This might be due to memory constraints. Try restarting the kernel and running with sampling.")
    raise

In [None]:
# Headline counts for the core feature table
print('Core Feature Summary:')
n_customers = customer_features.height
n_columns = len(customer_features.columns)
print(f'Customers: {n_customers:,}')
print(f'Features: {n_columns}')

# Preview only the key columns that are actually present
key_columns = ['customer_id', 'age_group', 'rfm_segment', 'total_spent', 'transaction_count']
present = set(customer_features.columns)
available_columns = [name for name in key_columns if name in present]
print('\nSample data:')
print(customer_features.select(available_columns).head(3))

# Force garbage collection, then report memory usage
gc.collect()
memory_pct = psutil.virtual_memory().percent
print(f'\nMemory after cleanup: {memory_pct}%')

## Step 2: Add Complementary Features (Memory Efficient)

Now we'll add complementary features one group at a time (temporal, lifecycle, channel), running garbage collection after each group to limit memory use.


In [None]:
# Load datasets efficiently for additional features
print('Loading datasets for additional feature engineering...')

customers = pl.scan_parquet(customers_path).collect()
print(f'Customers loaded: {customers.shape}')

if USE_SAMPLING:
    print('Sampling transactions for memory efficiency...')
    # Keep only transactions whose customer appears in the core feature table.
    # A lazy semi-join does the filtering inside polars, so the full id column
    # is never materialised as a Python list of strings.
    # NOTE(review): this keeps every customer already in customer_features;
    # SAMPLE_SIZE is not applied in this cell. Confirm whether real
    # down-sampling is intended.
    feature_customer_ids = customer_features.lazy().select('customer_id').unique()
    transactions = (
        pl.scan_parquet(transactions_path)
        .join(feature_customer_ids, on='customer_id', how='semi')
        .collect()
    )
    del feature_customer_ids
else:
    transactions = pl.scan_parquet(transactions_path).collect()

print(f'Transactions loaded: {transactions.shape}')

# Load articles (smaller dataset)
articles = pl.scan_parquet(articles_path).collect()
print(f'Articles loaded: {articles.shape}')

print(f'Memory after loading: {psutil.virtual_memory().percent}%')

In [None]:
# Create temporal features (lightweight)
def create_temporal_features(transactions_df):
    """Create per-customer temporal and seasonality features.

    Args:
        transactions_df: polars DataFrame with at least ``customer_id`` and a
            temporal ``t_dat`` column (the ``.dt`` namespace is used on it).

    Returns:
        polars DataFrame with one row per ``customer_id`` and the columns
        ``dominant_month``, ``weekend_ratio``, ``weekday_consistency``,
        ``purchase_span`` (days between first and last purchase) and
        ``shopping_time_type``.
    """
    print('Creating temporal features...')

    # polars dt.weekday() is ISO-numbered (Monday=1 ... Sunday=7), so 6 and 7
    # are Saturday and Sunday.
    weekday = pl.col('t_dat').dt.weekday()

    temporal_features = (
        transactions_df
        .with_columns([
            pl.col('t_dat').dt.month().alias('month'),
            weekday.alias('weekday'),
            weekday.is_in([6, 7]).alias('is_weekend')
        ])
        .group_by('customer_id')
        .agg([
            # mode() can return several values when months tie, in no fixed
            # order. Sorting first makes the result reproducible: the earliest
            # tied month wins.
            pl.col('month').mode().sort().first().alias('dominant_month'),
            pl.col('is_weekend').mean().alias('weekend_ratio'),
            pl.col('weekday').std().alias('weekday_consistency'),

            # Purchase timing
            (pl.col('t_dat').max() - pl.col('t_dat').min()).dt.total_days().alias('purchase_span')
        ])
        .with_columns([
            # Bucket customers by their share of weekend purchases
            pl.when(pl.col('weekend_ratio') > 0.6).then(pl.lit('Weekend_Shopper'))
            .when(pl.col('weekend_ratio') < 0.2).then(pl.lit('Weekday_Shopper'))
            .otherwise(pl.lit('Mixed_Shopper')).alias('shopping_time_type')
        ])
    )

    print(f'Temporal features created for {temporal_features.height:,} customers')
    return temporal_features

temporal_features = create_temporal_features(transactions)
gc.collect()  # Clean up memory

In [None]:
# Create lifecycle features
def create_lifecycle_features(transactions_df, analysis_date):
    """Derive tenure, recency, intensity and lifecycle labels per customer.

    Args:
        transactions_df: polars DataFrame with ``customer_id`` and ``t_dat``.
        analysis_date: Reference date; it is cast to ``pl.Date`` before being
            compared with the first and last purchase dates.

    Returns:
        polars DataFrame with the columns ``customer_id``, ``tenure_days``,
        ``recency_days``, ``purchase_intensity``, ``lifecycle_stage`` and
        ``engagement_level``.
    """
    print('Creating customer lifecycle features...')

    as_of = pl.lit(analysis_date).cast(pl.Date)

    # One row per customer: purchase window and activity counts
    per_customer = transactions_df.group_by('customer_id').agg([
        pl.col('t_dat').min().alias('first_purchase'),
        pl.col('t_dat').max().alias('last_purchase'),
        pl.len().alias('total_purchases'),
        pl.col('t_dat').n_unique().alias('active_days')
    ])

    # Day offsets from the analysis date, plus purchases per active day
    with_metrics = per_customer.with_columns([
        (as_of - pl.col('first_purchase')).dt.total_days().alias('tenure_days'),
        (as_of - pl.col('last_purchase')).dt.total_days().alias('recency_days'),
        (pl.col('total_purchases') / pl.col('active_days')).alias('purchase_intensity')
    ])

    # Branches are evaluated in order, so 'New' takes priority over 'Inactive', etc.
    stage_expr = (
        pl.when(pl.col('tenure_days') < 30).then(pl.lit('New'))
        .when(pl.col('recency_days') > 60).then(pl.lit('Inactive'))
        .when(pl.col('total_purchases') >= 10).then(pl.lit('Loyal'))
        .when(pl.col('total_purchases') >= 3).then(pl.lit('Regular'))
        .otherwise(pl.lit('Occasional'))
        .alias('lifecycle_stage')
    )
    engagement_expr = (
        pl.when(pl.col('purchase_intensity') >= 1.5).then(pl.lit('High'))
        .when(pl.col('purchase_intensity') >= 0.5).then(pl.lit('Medium'))
        .otherwise(pl.lit('Low'))
        .alias('engagement_level')
    )

    output_columns = ['customer_id', 'tenure_days', 'recency_days', 'purchase_intensity',
                      'lifecycle_stage', 'engagement_level']
    lifecycle_features = (
        with_metrics
        .with_columns([stage_expr, engagement_expr])
        .select(output_columns)
    )

    print(f'Lifecycle features created for {lifecycle_features.height:,} customers')
    return lifecycle_features

lifecycle_features = create_lifecycle_features(transactions, analysis_date)
gc.collect()

In [None]:
# Create channel features
def create_channel_features(transactions_df, online_channel_id=1):
    """Create per-customer sales channel behavior features.

    Args:
        transactions_df: polars DataFrame with ``customer_id`` and
            ``sales_channel_id``.
        online_channel_id: Value of ``sales_channel_id`` counted as an online
            transaction. Defaults to 1, the value that was previously
            hard-coded.
            NOTE(review): this file does not show which id is the online
            channel; confirm against the dataset documentation and pass the
            correct id if it is not 1.

    Returns:
        polars DataFrame with the columns ``customer_id``, ``channels_used``,
        ``online_ratio``, ``is_omnichannel`` and ``channel_preference``.
    """
    print('Creating channel behavior features...')

    channel_features = (
        transactions_df
        .group_by('customer_id')
        .agg([
            pl.col('sales_channel_id').n_unique().alias('channels_used'),
            (pl.col('sales_channel_id') == online_channel_id).sum().alias('online_transactions'),
            pl.len().alias('total_transactions')
        ])
        .with_columns([
            (pl.col('online_transactions') / pl.col('total_transactions')).alias('online_ratio'),
            (pl.col('channels_used') > 1).alias('is_omnichannel')
        ])
        .with_columns([
            # Branches are evaluated in order: the ratio cut-offs win over the
            # omnichannel flag.
            pl.when(pl.col('online_ratio') >= 0.8).then(pl.lit('Online_Focused'))
            .when(pl.col('online_ratio') <= 0.2).then(pl.lit('Store_Focused'))
            .when(pl.col('is_omnichannel')).then(pl.lit('Omnichannel'))
            .otherwise(pl.lit('Mixed')).alias('channel_preference')
        ])
        .select(['customer_id', 'channels_used', 'online_ratio', 'is_omnichannel', 'channel_preference'])
    )

    print(f'Channel features created for {channel_features.height:,} customers')
    return channel_features

channel_features = create_channel_features(transactions)
gc.collect()

## Step 3: Combine Features and Create Final Dataset

Now we'll carefully combine all feature sets and create the final clean dataset.


In [None]:
# Left-join each complementary feature set onto the core table, in order
print('Combining all feature sets...')

print(f'Starting with core features: {customer_features.shape}')
final_features = customer_features

feature_groups = [
    ('temporal', temporal_features),
    ('lifecycle', lifecycle_features),
    ('channel', channel_features),
]
for group_name, group_frame in feature_groups:
    print(f'Adding {group_name} features...')
    final_features = final_features.join(group_frame, on='customer_id', how='left')
    print(f'After {group_name}: {final_features.shape}')

print('\n✅ All features combined!')
print(f'Final dataset shape: {final_features.shape}')
n_features = len(final_features.columns) - 1  # Subtract customer_id
print(f'Total features: {n_features}')

# Drop the loop helpers as well; they would otherwise keep the joined frames
# referenced after the deletes below.
del feature_groups, group_name, group_frame
del temporal_features, lifecycle_features, channel_features
del transactions, customers, articles
gc.collect()
print(f'Memory after cleanup: {psutil.virtual_memory().percent}%')

In [None]:
# Clean up the dataset - remove cleaning flags and handle nulls
print('Cleaning up final dataset...')

# Columns ending in '_imputed' / '_corrected', plus the completeness score, are dropped.
cleaning_flags = [col for col in final_features.columns if col.endswith(('_imputed', '_corrected'))]
other_cleanup_cols = ['data_completeness_score'] if 'data_completeness_score' in final_features.columns else []

columns_to_remove = cleaning_flags + other_cleanup_cols
print(f'Removing columns: {columns_to_remove}')

final_features_clean = final_features.drop(columns_to_remove) if columns_to_remove else final_features
print(f'After cleanup: {final_features_clean.shape}')

# Report per-column null counts (null_count() returns a single-row frame)
print('Handling null values...')
null_counts = final_features_clean.null_count().row(0, named=True)
columns_with_nulls = {col: count for col, count in null_counts.items() if count > 0}
for col, count in columns_with_nulls.items():
    print(f'  {col}: {count:,} nulls')

if columns_with_nulls:
    # Choose a fill value per dtype. The previous expression passed a column
    # name and dtypes together to `exclude`, which polars rejects, and it
    # aimed `fill_null(0)` at non-numeric columns as well.
    schema = final_features_clean.schema
    fill_exprs = []
    left_null = []
    for col in columns_with_nulls:
        if col == 'customer_id':
            continue  # never fabricate an identifier
        dtype = schema[col]
        if dtype.is_numeric():
            fill_exprs.append(pl.col(col).fill_null(0))
        elif dtype == pl.Boolean:
            # Boolean counterpart of the numeric 0 default
            fill_exprs.append(pl.col(col).fill_null(False))
        elif dtype in (pl.Utf8, pl.Categorical):
            fill_exprs.append(pl.col(col).fill_null('Unknown'))
        else:
            # Dates and other dtypes have no meaningful default; leave and report
            left_null.append(col)

    if fill_exprs:
        final_features_clean = final_features_clean.with_columns(fill_exprs)
    if left_null:
        print(f'  Left as null (no default for dtype): {left_null}')

    print('✅ Null values handled')
else:
    print('✅ No null values found')

print(f'Final clean dataset: {final_features_clean.shape}')

In [None]:
# Headline numbers for the cleaned feature table
print('=== FINAL CUSTOMER FEATURES DATASET SUMMARY ===')
n_rows = final_features_clean.height
n_features = len(final_features_clean.columns) - 1
print(f'Dataset shape: {final_features_clean.shape}')
print(f'Customers: {n_rows:,}')
print(f'Features: {n_features}')

print('\nSample of key features:')
sample_cols = ['customer_id', 'age_group', 'rfm_segment', 'lifecycle_stage',
               'channel_preference', 'shopping_time_type']
present = set(final_features_clean.columns)
available_sample_cols = [name for name in sample_cols if name in present]
print(final_features_clean.select(available_sample_cols).head(5))

# Count features per dtype, in order of first appearance
print('\nFeature types:')
feature_types = {}
for col in final_features_clean.columns:
    if col == 'customer_id':
        continue
    type_name = str(final_features_clean.schema[col])
    feature_types[type_name] = feature_types.get(type_name, 0) + 1

for type_name, n_cols in feature_types.items():
    print(f'  {type_name}: {n_cols} features')

## Step 4: Save Final Dataset

Save the comprehensive customer features to the final location with documentation.


In [None]:
# Create output directory
output_dir = Path('data/features/final')
output_dir.mkdir(parents=True, exist_ok=True)

# Define output paths
parquet_path = output_dir / 'customer_features_final.parquet'
csv_path = output_dir / 'customer_features_final.csv'

print('Saving final customer features dataset...')
print(f'Parquet: {parquet_path}')
print(f'CSV: {csv_path}')

BYTES_PER_MB = 1024 * 1024

# Parquet is the primary artifact: report a failure and stop the notebook.
try:
    final_features_clean.write_parquet(parquet_path)
except Exception as e:
    print(f'❌ Critical error: {e}')
    raise
print('✅ Parquet file saved successfully')
parquet_size = parquet_path.stat().st_size / BYTES_PER_MB

# CSV is a best-effort compatibility copy. It gets its own handler so a CSV
# failure neither rewrites the Parquet file nor is blamed on memory.
try:
    final_features_clean.write_csv(csv_path)
except Exception as e:
    csv_size = None
    # Remove any partially written file so nothing downstream reads a truncated CSV
    csv_path.unlink(missing_ok=True)
    print(f'⚠️ CSV export skipped: {e}')
else:
    print('✅ CSV file saved successfully')
    csv_size = csv_path.stat().st_size / BYTES_PER_MB

# Display file sizes
print('\nFile sizes:')
print(f'  Parquet: {parquet_size:.2f} MB')
if csv_size is not None:
    print(f'  CSV: {csv_size:.2f} MB')
    if parquet_size > 0:
        print(f'  Compression ratio: {csv_size/parquet_size:.1f}x')

In [None]:
# Build the plain-text documentation in memory, then write it in one pass
print('Creating feature documentation...')

documentation_path = output_dir / 'customer_features_documentation.txt'

feature_columns = [col for col in final_features_clean.columns if col != 'customer_id']

doc_lines = [
    'CUSTOMER FEATURES DATASET DOCUMENTATION\n',
    '=' * 50 + '\n\n',
    f'Generated: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n',
    f'Analysis Date: {analysis_date.strftime("%Y-%m-%d")}\n',
    f'Dataset Shape: {final_features_clean.shape}\n',
    f'Total Features: {len(final_features_clean.columns) - 1}\n',
]

if USE_SAMPLING:
    doc_lines.append('Note: Dataset created with sampling due to memory constraints\n')

doc_lines.append('\n\nFEATURE CATEGORIES:\n')
doc_lines.append('-' * 20 + '\n')

# A column belongs to a category when any keyword is a substring of its
# lower-cased name; a column can match more than one category.
categories = {
    'Demographics': ['age', 'age_group', 'life_stage', 'club_member', 'fashion_engagement'],
    'RFM Analysis': ['recency', 'frequency', 'monetary', 'rfm_score', 'rfm_segment'],
    'Product Preferences': ['preferred_', 'diversity', 'color', 'department'],
    'Shopping Behavior': ['transaction_count', 'total_spent', 'avg_', 'purchase_'],
    'Temporal Patterns': ['month', 'weekend', 'weekday', 'seasonal'],
    'Customer Lifecycle': ['lifecycle_stage', 'engagement_level', 'tenure', 'recency'],
    'Channel Behavior': ['channel', 'online', 'omnichannel', 'store']
}

for category, keywords in categories.items():
    matching_cols = [
        col for col in feature_columns
        if any(kw in col.lower() for kw in keywords)
    ]
    doc_lines.append(f'{category}: {len(matching_cols)} features\n')
    examples = matching_cols[:3]  # Show first 3 examples
    if examples:
        doc_lines.append(f'  Examples: {", ".join(examples)}\n')
    doc_lines.append('\n')

doc_lines.append('\nALL FEATURES:\n')
doc_lines.append('-' * 15 + '\n')
doc_lines.extend(f'{col}\n' for col in sorted(feature_columns))

doc_lines.append('\n\nUSAGE NOTES:\n')
doc_lines.append('-' * 12 + '\n')
doc_lines.extend([
    '- Ready for customer segmentation and clustering analysis\n',
    '- Suitable for ML algorithms (K-means, DBSCAN, etc.)\n',
    '- All cleaning flags removed for production use\n',
    '- Null values handled appropriately\n',
    '- Both numeric and categorical features included\n',
])

with open(documentation_path, 'w', encoding='utf-8') as f:
    f.writelines(doc_lines)

print(f'📝 Documentation saved to: {documentation_path}')

# Final success message
print('\n🎉 Customer feature engineering completed successfully!')
print(f'\nFinal dataset ready for analysis: {parquet_path}')
print(f'Features created: {len(final_features_clean.columns) - 1}')
print(f'Customers processed: {final_features_clean.height:,}')

if USE_SAMPLING:
    print('\n⚠️ Note: This was a sampled run due to memory constraints.')
    print('For full dataset processing, ensure sufficient memory is available.')