# H&M Feature Engineering

This notebook performs comprehensiv feature engineering on the H&M dataset to extract relevant features for machine learning models. The engineered features focus on customer behaviour, product preferences, demographics, and temporal patterns.


## Import Libraries and Setup

Import necessary libraries for feature engineering and data processing.


In [17]:
import polars as pl
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
import os
import warnings
import gc
warnings.filterwarnings('ignore')

# Set up plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Configure Polars for memory efficiency
try:
    pl.Config.set_streaming_chunk_size(5000)  # Reduced chunk size
    pl.Config.set_fmt_str_lengths(50)
    if hasattr(pl.Config, 'set_table_width'):
        pl.Config.set_table_width(120)
except AttributeError:
    pass

# Memory monitoring function
def check_memory_usage():
    import psutil
    process = psutil.Process(os.getpid())
    memory_mb = process.memory_info().rss / 1024 / 1024
    print(f"Current memory usage: {memory_mb:.1f} MB")
    return memory_mb

print(f"Libraries imported successfully")
print(f"Polars version: {pl.__version__}")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")
check_memory_usage()

Libraries imported successfully
Polars version: 1.32.0
Pandas version: 2.2.0
NumPy version: 1.26.4
Current memory usage: 111.8 MB


111.75

## Load Dataset

Load the integrated H&M dataset that was created in the data exploration phase.


In [18]:
# Load integrated dataset with memory optimization
data_dir = '../data'
integrated_path = os.path.join(data_dir, 'processed', 'hm_integrated_dataset.parquet')

if os.path.exists(integrated_path):
    print("Loading integrated dataset with memory optimization...")
    
    # Load with lazy evaluation for memory efficiency
    df_integrated = pl.scan_parquet(integrated_path)
    
    # Get basic info without loading everything into memory
    print("Analyzing dataset structure...")
    schema_info = df_integrated.schema
    print(f"✓ Found {len(schema_info)} columns")
    
    # Sample to check data without loading full dataset
    sample_df = df_integrated.head(1000).collect()
    print(f"✓ Sample loaded: {sample_df.height:,} records")
    
    # Get estimated size
    full_df = df_integrated.collect()
    print(f"✓ Full dataset: {full_df.height:,} records with {len(full_df.columns)} columns")
    print(f"✓ Memory usage: {full_df.estimated_size('mb'):.1f} MB")
    
    # Free up the sample
    del sample_df
    gc.collect()
    
else:
    print("Integrated dataset not found. Please run the data exploration notebook first.")
    raise FileNotFoundError(f"File not found: {integrated_path}")

# Display basic information
print(f"\nDataset Overview:")
print(f"• Records: {full_df.height:,}")
print(f"• Unique customers: {full_df['customer_id'].n_unique():,}")
print(f"• Unique articles: {full_df['article_id'].n_unique():,}")

# Check date range for temporal features - optimized
if 't_dat' in full_df.columns:
    print(f"Sample dates: {full_df.select('t_dat').head(3).to_pandas()['t_dat'].tolist()}")
    
    # Get date range efficiently
    date_stats = full_df.select([
        pl.col('t_dat').str.to_date().min().alias('min_date'),
        pl.col('t_dat').str.to_date().max().alias('max_date')
    ]).to_pandas().iloc[0]
    
    print(f"• Date range: {date_stats['min_date']} to {date_stats['max_date']}")
    
    # Set reference date for recency calculations
    REFERENCE_DATE = pd.to_datetime(date_stats['max_date'])
    print(f"• Reference date for recency: {REFERENCE_DATE.date()}")

# Store the main dataframe for processing
df_integrated = full_df
del full_df
gc.collect()
check_memory_usage()

Loading integrated dataset with memory optimization...
Analyzing dataset structure...
✓ Found 35 columns
✓ Sample loaded: 1,000 records
✓ Full dataset: 3,178,832 records with 35 columns
✓ Memory usage: 1636.3 MB

Dataset Overview:
• Records: 3,178,832
• Unique customers: 822,211
• Unique articles: 86,988
Sample dates: ['2020-05-18', '2019-05-12', '2020-09-08']
• Date range: 2018-09-20 00:00:00 to 2020-09-22 00:00:00
• Reference date for recency: 2020-09-22
Current memory usage: 2168.4 MB


2168.40625

## Customer Behavioural Features (RFM Analysis)

Create features based on customer purchasing behaviour including Recency, Frequency, and Monetary value analysis.


In [19]:
print("Creating customer behavioural features with memory optimization...")

# Clear memory first
gc.collect()
initial_memory = check_memory_usage()

# Prepare temporal data efficiently - only select needed columns
df_temporal = df_integrated.select([
    'customer_id', 'article_id', 'price', 't_dat'
]).with_columns([
    pl.col('t_dat').str.to_date().alias('transaction_date')
])

print("Processing RFM features in batches...")

# Get unique customers and process in batches to manage memory
unique_customers = df_temporal.select('customer_id').unique().to_pandas()['customer_id'].tolist()
total_customers = len(unique_customers)
batch_size = 50000  # Process 50k customers at a time

print(f"Processing {total_customers:,} customers in batches of {batch_size:,}")

# Set reference date
reference_date = pd.to_datetime('2020-09-22')

# Initialize list to store batch results
rfm_batches = []

for i in range(0, total_customers, batch_size):
    batch_customers = unique_customers[i:i+batch_size]
    print(f"Processing batch {i//batch_size + 1}/{(total_customers-1)//batch_size + 1}: customers {i:,} to {min(i+batch_size, total_customers):,}")
    
    # Filter data for current batch
    batch_data = df_temporal.filter(pl.col('customer_id').is_in(batch_customers))
    
    # Calculate RFM for this batch
    batch_rfm = (
        batch_data
        .group_by('customer_id')
        .agg([
            # Recency: Days since last purchase
            ((pl.lit(reference_date) - pl.col('transaction_date').max()).dt.total_days()).alias('recency_days'),
            
            # Frequency: Number of transactions
            pl.col('transaction_date').count().alias('frequency'),
            
            # Monetary: Total amount spent
            pl.col('price').sum().alias('monetary_value'),
            
            # Additional behavioural metrics
            pl.col('price').mean().alias('avg_transaction_value'),
            pl.col('article_id').n_unique().alias('unique_products_purchased'),
            pl.col('price').std().alias('spending_variability'),
            
            # Temporal behaviour
            ((pl.col('transaction_date').max() - pl.col('transaction_date').min()).dt.total_days()).alias('customer_lifespan_days'),
            pl.col('transaction_date').min().alias('first_purchase_date'),
            pl.col('transaction_date').max().alias('last_purchase_date')
        ])
    )
    
    # Apply safe calculations
    batch_rfm = batch_rfm.with_columns([
        # Purchases per day (safely handling division by zero)
        pl.when(pl.col('customer_lifespan_days') > 0)
        .then(pl.col('frequency').cast(pl.Float64) / pl.col('customer_lifespan_days').cast(pl.Float64))
        .otherwise(pl.col('frequency').cast(pl.Float64))
        .alias('purchase_frequency_per_day'),
        
        # Fill null spending variability with 0
        pl.col('spending_variability').fill_null(0.0).alias('spending_variability'),
        
        # Ensure recency is positive
        pl.col('recency_days').abs().alias('recency_days')
    ])
    
    rfm_batches.append(batch_rfm)
    
    # Clear batch data
    del batch_data
    gc.collect()
    
    # Monitor memory every few batches
    if (i // batch_size + 1) % 5 == 0:
        current_memory = check_memory_usage()
        print(f"  Memory usage: {current_memory:.1f} MB (change: +{current_memory - initial_memory:.1f} MB)")

# Combine all batches
print("Combining RFM batches...")
customer_rfm = pl.concat(rfm_batches)
del rfm_batches
gc.collect()

print(f"✓ Created RFM features for {customer_rfm.height:,} customers")

# Calculate summary statistics on a sample to avoid memory issues
sample_rfm = customer_rfm.head(10000)
rfm_stats = sample_rfm.select([
    pl.col('recency_days').mean().alias('avg_recency'),
    pl.col('frequency').mean().alias('avg_frequency'),
    pl.col('monetary_value').mean().alias('avg_monetary')
]).to_pandas().iloc[0]

print(f"\nRFM Summary Statistics (sample):")
for metric, value in rfm_stats.items():
    print(f"  • {metric}: {value:.2f}")

# Display sample of RFM features
print(f"\nSample RFM Features:")
print(customer_rfm.head(3).to_pandas())

# Clear temporal data to free memory
del df_temporal
gc.collect()
final_memory = check_memory_usage()
print(f"Memory after RFM processing: {final_memory:.1f} MB")

Creating customer behavioural features with memory optimization...
Current memory usage: 2161.8 MB
Processing RFM features in batches...
Processing 822,211 customers in batches of 50,000
Processing batch 1/17: customers 0 to 50,000
Processing batch 2/17: customers 50,000 to 100,000
Processing batch 3/17: customers 100,000 to 150,000
Processing batch 4/17: customers 150,000 to 200,000
Processing batch 5/17: customers 200,000 to 250,000
Current memory usage: 2496.8 MB
  Memory usage: 2496.8 MB (change: +334.9 MB)
Processing batch 6/17: customers 250,000 to 300,000
Processing batch 7/17: customers 300,000 to 350,000
Processing batch 8/17: customers 350,000 to 400,000
Processing batch 9/17: customers 400,000 to 450,000
Processing batch 10/17: customers 450,000 to 500,000
Current memory usage: 2527.7 MB
  Memory usage: 2527.7 MB (change: +365.8 MB)
Processing batch 11/17: customers 500,000 to 550,000
Processing batch 12/17: customers 550,000 to 600,000
Processing batch 13/17: customers 600,

## Product Preference Features

Engineer features related to customer product preferences, including category affinity and brand loyalty.


In [20]:
print("Creating product preference features with memory optimization...")

# Clear memory and monitor
gc.collect()
initial_memory = check_memory_usage()

# Select only needed columns for product preferences
product_data = df_integrated.select([
    'customer_id', 'product_type_name', 'department_name', 
    'colour_group_name', 'price', 'sales_channel_id'
])

print("Processing product preferences...")

# Customer product category preferences - optimized
print("• Processing category preferences...")
category_preferences = (
    product_data
    .group_by(['customer_id', 'product_type_name'])
    .agg([
        pl.col('price').count().alias('category_purchase_count'),
        pl.col('price').sum().alias('category_total_spent')
    ])
    .group_by('customer_id')
    .agg([
        # Most purchased category
        pl.col('product_type_name').first().alias('most_purchased_category'),
        
        # Number of different categories purchased
        pl.col('product_type_name').n_unique().alias('category_diversity'),
        
        # Concentration ratio (top category percentage)
        (pl.col('category_purchase_count').max() / pl.col('category_purchase_count').sum()).alias('category_concentration')
    ])
)

gc.collect()
print(f"  Memory usage: {check_memory_usage():.1f} MB")

# Department preferences - optimized
print("• Processing department preferences...")
dept_preferences = (
    product_data
    .group_by(['customer_id', 'department_name'])
    .agg(pl.col('price').count().alias('dept_purchase_count'))
    .group_by('customer_id')
    .agg([
        pl.col('department_name').first().alias('preferred_department'),
        pl.col('department_name').n_unique().alias('department_diversity')
    ])
)

gc.collect()
print(f"  Memory usage: {check_memory_usage():.1f} MB")

# Colour preferences - optimized
print("• Processing colour preferences...")
colour_preferences = (
    product_data
    .group_by(['customer_id', 'colour_group_name'])
    .agg(pl.col('price').count().alias('colour_purchase_count'))
    .group_by('customer_id')
    .agg([
        pl.col('colour_group_name').first().alias('preferred_colour'),
        pl.col('colour_group_name').n_unique().alias('colour_diversity')
    ])
)

gc.collect()
print(f"  Memory usage: {check_memory_usage():.1f} MB")

# Price sensitivity features - using streaming approach
print("• Processing price behaviour...")
price_behaviour = (
    product_data
    .group_by('customer_id')
    .agg([
        pl.col('price').min().alias('min_price_paid'),
        pl.col('price').max().alias('max_price_paid'),
        pl.col('price').quantile(0.25).alias('price_q1'),
        pl.col('price').quantile(0.75).alias('price_q3'),
        pl.col('price').median().alias('median_price_paid')
    ])
    .with_columns([
        (pl.col('max_price_paid') - pl.col('min_price_paid')).alias('price_range'),
        (pl.col('price_q3') - pl.col('price_q1')).alias('price_iqr')
    ])
)

gc.collect()
print(f"  Memory usage: {check_memory_usage():.1f} MB")

# Sales channel preferences - optimized
print("• Processing channel preferences...")
channel_preferences = (
    product_data
    .group_by(['customer_id', 'sales_channel_id'])
    .agg(pl.col('price').count().alias('channel_purchases'))
    .with_columns([
        pl.when(pl.col('sales_channel_id') == 1)
        .then(pl.lit('online'))
        .otherwise(pl.lit('store'))
        .alias('channel_type')
    ])
    .group_by('customer_id')
    .agg([
        pl.col('channel_type').first().alias('preferred_channel'),
        pl.col('sales_channel_id').n_unique().alias('channel_diversity'),
        
        # Calculate online percentage
        (pl.when(pl.col('channel_type') == 'online')
         .then(pl.col('channel_purchases'))
         .otherwise(0).sum() / pl.col('channel_purchases').sum()).alias('online_purchase_ratio')
    ])
)

# Clear product data to free memory
del product_data
gc.collect()

print(f"✓ Created product preference features")
print(f"  • Category preferences: {category_preferences.height:,} customers")
print(f"  • Department preferences: {dept_preferences.height:,} customers")
print(f"  • Colour preferences: {colour_preferences.height:,} customers")
print(f"  • Price behaviour: {price_behaviour.height:,} customers")
print(f"  • Channel preferences: {channel_preferences.height:,} customers")

final_memory = check_memory_usage()
print(f"Final memory usage: {final_memory:.1f} MB (change: +{final_memory - initial_memory:.1f} MB)")

Creating product preference features with memory optimization...
Current memory usage: 2560.5 MB
Processing product preferences...
• Processing category preferences...
Current memory usage: 2073.4 MB
  Memory usage: 2073.4 MB
• Processing department preferences...
Current memory usage: 2281.4 MB
  Memory usage: 2281.4 MB
• Processing colour preferences...
Current memory usage: 2468.4 MB
  Memory usage: 2468.4 MB
• Processing price behaviour...
Current memory usage: 2551.0 MB
  Memory usage: 2551.0 MB
• Processing channel preferences...
✓ Created product preference features
  • Category preferences: 822,211 customers
  • Department preferences: 822,211 customers
  • Colour preferences: 822,211 customers
  • Price behaviour: 822,211 customers
  • Channel preferences: 822,211 customers
Current memory usage: 2685.9 MB
Final memory usage: 2685.9 MB (change: +125.4 MB)


## Demographic and Temporal Features

Create features based on customer demographics and temporal purchasing patterns.


In [21]:
print("Creating demographic and temporal features with memory optimization...")

# Clear memory and monitor
gc.collect()
initial_memory = check_memory_usage()

# Demographic features from customer data - select needed columns only
demographic_data = df_integrated.select([
    'customer_id', 'age', 'club_member_status', 'fashion_news_frequency', 'FN', 'Active'
]).unique(subset=['customer_id'])

demographic_features = (
    demographic_data
    .with_columns([
        # Age groups
        pl.when(pl.col('age') < 25).then(pl.lit('18-24'))
        .when(pl.col('age') < 35).then(pl.lit('25-34'))
        .when(pl.col('age') < 45).then(pl.lit('35-44'))
        .when(pl.col('age') < 55).then(pl.lit('45-54'))
        .when(pl.col('age') < 65).then(pl.lit('55-64'))
        .otherwise(pl.lit('65+'))
        .alias('age_group'),
        
        # Fashion engagement level
        pl.when(pl.col('fashion_news_frequency') == 'Regularly').then(pl.lit('high'))
        .when(pl.col('fashion_news_frequency') == 'Monthly').then(pl.lit('medium'))
        .otherwise(pl.lit('low'))
        .alias('fashion_engagement')
    ])
)

# Clean up demographic data
del demographic_data
gc.collect()
print(f"  Memory after demographics: {check_memory_usage():.1f} MB")

# Temporal purchasing patterns - recreate temporal data efficiently
print("Processing temporal features...")
temporal_data = df_integrated.select([
    'customer_id', 't_dat'
]).with_columns([
    pl.col('t_dat').str.to_date().alias('transaction_date')
]).with_columns([
    pl.col('transaction_date').dt.year().alias('year'),
    pl.col('transaction_date').dt.month().alias('month'),
    pl.col('transaction_date').dt.weekday().alias('weekday'),
    pl.col('transaction_date').dt.quarter().alias('quarter')
])

temporal_features = (
    temporal_data
    .group_by('customer_id')
    .agg([
        # Seasonal preferences - use mode or most frequent
        pl.col('quarter').mode().first().alias('preferred_quarter'),
        pl.col('month').mode().first().alias('preferred_month'), 
        pl.col('weekday').mode().first().alias('preferred_weekday'),
        
        # Shopping pattern diversity
        pl.col('quarter').n_unique().alias('quarter_diversity'),
        pl.col('month').n_unique().alias('month_diversity'),
        pl.col('weekday').n_unique().alias('weekday_diversity'),
        
        # Activity span
        (pl.col('year').max() - pl.col('year').min() + 1).alias('active_years')
    ])
    .with_columns([
        # Weekend vs weekday preference
        pl.when(pl.col('preferred_weekday').is_in([6, 7]))
        .then(pl.lit('weekend'))
        .otherwise(pl.lit('weekday'))
        .alias('weekend_preference'),
        
        # Season mapping
        pl.when(pl.col('preferred_quarter') == 1).then(pl.lit('winter'))
        .when(pl.col('preferred_quarter') == 2).then(pl.lit('spring'))
        .when(pl.col('preferred_quarter') == 3).then(pl.lit('summer'))
        .otherwise(pl.lit('autumn'))
        .alias('preferred_season')
    ])
)

# Clean up temporal data
del temporal_data
gc.collect()

print(f"✓ Created demographic features for {demographic_features.height:,} customers")
print(f"✓ Created temporal features for {temporal_features.height:,} customers")

# Display age group distribution
age_dist = demographic_features.group_by('age_group').agg(pl.count().alias('count')).sort('count', descending=True)
print(f"\nAge Group Distribution:")
for row in age_dist.head(6).to_dicts():
    print(f"  • {row['age_group']}: {row['count']:,} customers")

final_memory = check_memory_usage()
print(f"Final memory usage: {final_memory:.1f} MB (change: +{final_memory - initial_memory:.1f} MB)")

Creating demographic and temporal features with memory optimization...
Current memory usage: 2686.4 MB
Current memory usage: 2939.3 MB
  Memory after demographics: 2939.3 MB
Processing temporal features...
✓ Created demographic features for 822,211 customers
✓ Created temporal features for 822,211 customers

Age Group Distribution:
  • 25-34: 260,168 customers
  • 18-24: 201,741 customers
  • 45-54: 154,496 customers
  • 35-44: 101,189 customers
  • 55-64: 78,056 customers
  • 65+: 26,561 customers
Current memory usage: 2867.5 MB
Final memory usage: 2867.5 MB (change: +181.1 MB)


## Text Features (Product Descriptions)

Apply TF-IDF to extract features from product text data if available.


In [22]:
print("Creating text features from product descriptions with memory optimization...")

# Clear memory and monitor
gc.collect()
initial_memory = check_memory_usage()

# Check if we have text columns for TF-IDF
text_columns = ['product_type_name', 'product_group_name', 'department_name', 'colour_group_name']
available_text_cols = [col for col in text_columns if col in df_integrated.columns]

if available_text_cols:
    print(f"Available text columns: {available_text_cols}")
    
    # Select only needed text columns to reduce memory usage
    text_data = df_integrated.select(['customer_id'] + available_text_cols)
    
    # Create combined text features for each customer - compatible with Polars 0.20.31
    print("Processing customer text profiles...")
    customer_text_features = (
        text_data
        .group_by('customer_id')
        .agg([
            # Concatenate unique values using str.concat (compatible with 0.20.31)
            pl.col('product_type_name').unique().str.concat(' ').alias('purchased_product_types'),
            pl.col('product_group_name').unique().str.concat(' ').alias('purchased_product_groups'),
            pl.col('department_name').unique().str.concat(' ').alias('purchased_departments'),
            pl.col('colour_group_name').unique().str.concat(' ').alias('purchased_colours')
        ])
        .with_columns([
            # Combine all text into a single customer profile
            pl.concat_str([
                pl.col('purchased_product_types'),
                pl.col('purchased_product_groups'),
                pl.col('purchased_departments'),
                pl.col('purchased_colours')
            ], separator=' ').alias('customer_text_profile')
        ])
    )
    
    # Clean up text data
    del text_data
    gc.collect()
    print(f"  Memory after text aggregation: {check_memory_usage():.1f} MB")
    
    # Convert to pandas for sklearn TF-IDF (process in smaller batches if needed)
    print("Converting to pandas for TF-IDF...")
    customer_text_pd = customer_text_features.to_pandas()
    
    # Apply TF-IDF to customer text profiles
    print("Applying TF-IDF vectorisation...")
    
    # Configure TF-IDF with reduced features for memory efficiency
    tfidf = TfidfVectorizer(
        max_features=50,  # Reduced from 100 to 50 for memory efficiency
        stop_words='english',
        lowercase=True,
        ngram_range=(1, 1),  # Only unigrams to reduce memory
        min_df=10,  # Increased minimum document frequency
        max_df=0.9  # Reduced maximum document frequency
    )
    
    try:
        # Fit and transform the customer text profiles
        print("Fitting TF-IDF...")
        tfidf_matrix = tfidf.fit_transform(customer_text_pd['customer_text_profile'].fillna(''))
        
        # Get feature names
        feature_names = tfidf.get_feature_names_out()
        
        print(f"  TF-IDF matrix shape: {tfidf_matrix.shape}")
        print(f"  Memory usage: {check_memory_usage():.1f} MB")
        
        # Apply PCA immediately to reduce memory footprint
        print("Applying PCA to reduce dimensionality...")
        pca = PCA(n_components=min(15, len(feature_names)), random_state=42)
        tfidf_pca = pca.fit_transform(tfidf_matrix.toarray())
        
        # Create PCA feature DataFrame
        pca_df = pd.DataFrame(
            tfidf_pca,
            columns=[f'tfidf_pca_{i+1}' for i in range(tfidf_pca.shape[1])],
            index=customer_text_pd['customer_id']
        ).reset_index()
        
        tfidf_pca_features = pl.from_pandas(pca_df)
        
        print(f"✓ Created {tfidf_pca.shape[1]} PCA components explaining {pca.explained_variance_ratio_.sum():.2%} of variance")
        print(f"  Top 5 TF-IDF features: {list(feature_names[:5])}")
        
        # Clean up intermediate variables
        del tfidf_matrix, pca_df, customer_text_pd
        gc.collect()
        
    except Exception as e:
        print(f"Warning: TF-IDF processing failed due to memory constraints: {e}")
        print("Skipping TF-IDF features to continue with other features...")
        tfidf_pca_features = None
    
    # Clean up text features
    del customer_text_features
    gc.collect()
    
else:
    print("No suitable text columns found for TF-IDF analysis")
    tfidf_pca_features = None

final_memory = check_memory_usage()
print(f"Final memory usage: {final_memory:.1f} MB (change: +{final_memory - initial_memory:.1f} MB)")

Creating text features from product descriptions with memory optimization...
Current memory usage: 2868.2 MB
Available text columns: ['product_type_name', 'product_group_name', 'department_name', 'colour_group_name']
Processing customer text profiles...
Current memory usage: 3927.8 MB
  Memory after text aggregation: 3927.8 MB
Converting to pandas for TF-IDF...
Applying TF-IDF vectorisation...
Fitting TF-IDF...
  TF-IDF matrix shape: (822211, 50)
Current memory usage: 3449.5 MB
  Memory usage: 3449.5 MB
Applying PCA to reduce dimensionality...
✓ Created 15 PCA components explaining 63.77% of variance
  Top 5 TF-IDF features: ['accessories', 'basic', 'beige', 'bikini', 'black']
Current memory usage: 1166.6 MB
Final memory usage: 1166.6 MB (change: +-1701.6 MB)


## Interaction Features

Create derived features by combining existing features to capture complex relationships.


In [23]:
print("Creating interaction features...")

# Start with RFM as base for interactions
interaction_features = customer_rfm.select([
    'customer_id', 'recency_days', 'frequency', 'monetary_value', 
    'avg_transaction_value', 'unique_products_purchased'
])

# Add key features from other datasets
if 'age' in demographic_features.columns:
    interaction_features = interaction_features.join(
        demographic_features.select(['customer_id', 'age']),
        on='customer_id',
        how='left'
    )

# Create interaction features
interaction_features = interaction_features.with_columns([
    # RFM score (simple scoring using percentile ranks)
    (pl.col('recency_days').rank("min") * 0.3 + 
     pl.col('frequency').rank("min") * 0.4 + 
     pl.col('monetary_value').rank("min") * 0.3).alias('rfm_score'),
    
    # Customer value segments
    (pl.col('frequency') * pl.col('avg_transaction_value')).alias('customer_value'),
    
    # Product diversity ratio
    (pl.col('unique_products_purchased').cast(pl.Float64) / pl.col('frequency').cast(pl.Float64)).alias('product_diversity_ratio'),
    
    # Spending efficiency (monetary per transaction)
    (pl.col('monetary_value') / pl.col('frequency')).alias('spending_efficiency')
])

# Add age-related interactions if age is available
if 'age' in interaction_features.columns:
    interaction_features = interaction_features.with_columns([
        # Age-spending interaction
        (pl.col('age') * pl.col('avg_transaction_value')).alias('age_spending_interaction'),
        
        # Age-frequency interaction
        (pl.col('age') * pl.col('frequency')).alias('age_frequency_interaction')
    ])

# Calculate quantiles separately using to_numpy() to extract scalar values
recency_q25 = interaction_features.select(pl.col('recency_days').quantile(0.25)).to_numpy()[0, 0]
recency_q50 = interaction_features.select(pl.col('recency_days').quantile(0.5)).to_numpy()[0, 0]
recency_q75 = interaction_features.select(pl.col('recency_days').quantile(0.75)).to_numpy()[0, 0]

frequency_q25 = interaction_features.select(pl.col('frequency').quantile(0.25)).to_numpy()[0, 0]
frequency_q50 = interaction_features.select(pl.col('frequency').quantile(0.5)).to_numpy()[0, 0]
frequency_q75 = interaction_features.select(pl.col('frequency').quantile(0.75)).to_numpy()[0, 0]

monetary_q25 = interaction_features.select(pl.col('monetary_value').quantile(0.25)).to_numpy()[0, 0]
monetary_q50 = interaction_features.select(pl.col('monetary_value').quantile(0.5)).to_numpy()[0, 0]
monetary_q75 = interaction_features.select(pl.col('monetary_value').quantile(0.75)).to_numpy()[0, 0]

# Create segments using when/then logic
interaction_features = interaction_features.with_columns([
    # Recency segments (lower recency = more recent = better)
    pl.when(pl.col('recency_days') <= recency_q25).then(pl.lit('recent'))
    .when(pl.col('recency_days') <= recency_q50).then(pl.lit('moderate'))
    .when(pl.col('recency_days') <= recency_q75).then(pl.lit('old'))
    .otherwise(pl.lit('very_old'))
    .alias('recency_segment'),
    
    # Frequency segments (higher frequency = better)
    pl.when(pl.col('frequency') >= frequency_q75).then(pl.lit('very_high'))
    .when(pl.col('frequency') >= frequency_q50).then(pl.lit('high'))
    .when(pl.col('frequency') >= frequency_q25).then(pl.lit('moderate'))
    .otherwise(pl.lit('low'))
    .alias('frequency_segment'),
    
    # Monetary segments (higher monetary = better)
    pl.when(pl.col('monetary_value') >= monetary_q75).then(pl.lit('very_high'))
    .when(pl.col('monetary_value') >= monetary_q50).then(pl.lit('high'))
    .when(pl.col('monetary_value') >= monetary_q25).then(pl.lit('moderate'))
    .otherwise(pl.lit('low'))
    .alias('monetary_segment')
])

print(f"✓ Created interaction features for {interaction_features.height:,} customers")
print(f"\nInteraction Feature Summary:")
print(f"  • RFM score range: {interaction_features['rfm_score'].min():.2f} - {interaction_features['rfm_score'].max():.2f}")
print(f"  • Customer value range: £{interaction_features['customer_value'].min():.2f} - £{interaction_features['customer_value'].max():.2f}")
print(f"  • Product diversity ratio: {interaction_features['product_diversity_ratio'].mean():.3f} (avg)")

Creating interaction features...
✓ Created interaction features for 822,211 customers

Interaction Feature Summary:
  • RFM score range: 234.40 - 796712.40
  • Customer value range: £0.00 - £5.75
  • Product diversity ratio: 0.989 (avg)


## Combine All Features

Merge all engineered features into a comprehensive customer feature dataset.


In [24]:
print("Combining all engineered features...")

# Start with customer RFM features as the base
final_features = customer_rfm

# Join product preference features
final_features = final_features.join(category_preferences, on='customer_id', how='left')
final_features = final_features.join(dept_preferences, on='customer_id', how='left')
final_features = final_features.join(colour_preferences, on='customer_id', how='left')
final_features = final_features.join(price_behaviour, on='customer_id', how='left')
final_features = final_features.join(channel_preferences, on='customer_id', how='left')

# Join demographic and temporal features
final_features = final_features.join(demographic_features, on='customer_id', how='left')
final_features = final_features.join(temporal_features, on='customer_id', how='left')

# Join TF-IDF features if available
if tfidf_pca_features is not None:
    final_features = final_features.join(tfidf_pca_features, on='customer_id', how='left')
    print(f"✓ Added TF-IDF PCA features")

# Join interaction features (excluding duplicates)
interaction_cols_to_add = [
    'customer_id', 'rfm_score', 'customer_value', 'product_diversity_ratio', 
    'spending_efficiency', 'recency_segment', 'frequency_segment', 'monetary_segment'
]

# Add age interactions if available
if 'age_spending_interaction' in interaction_features.columns:
    interaction_cols_to_add.extend(['age_spending_interaction', 'age_frequency_interaction'])

final_features = final_features.join(
    interaction_features.select(interaction_cols_to_add), 
    on='customer_id', 
    how='left'
)

print(f"\n✓ Final feature dataset created!")
print(f"  • Total customers: {final_features.height:,}")
print(f"  • Total features: {len(final_features.columns)}")
print(f"  • Memory usage: {final_features.estimated_size('mb'):.1f} MB")

# Display feature categories
feature_categories = {
    'RFM Features': ['recency_days', 'frequency', 'monetary_value', 'avg_transaction_value'],
    'Product Preferences': ['most_purchased_category', 'category_diversity', 'preferred_department'],
    'Demographics': ['age', 'age_group', 'club_member_status', 'fashion_engagement'],
    'Temporal': ['preferred_season', 'weekend_preference', 'active_years'],
    'Interaction': ['rfm_score', 'customer_value', 'product_diversity_ratio']
}

print(f"\nFeature Categories:")
for category, features in feature_categories.items():
    available_features = [f for f in features if f in final_features.columns]
    print(f"  • {category}: {len(available_features)} features")

# Display sample of final features
print(f"\nSample of Final Features:")
sample_features = final_features.head(3)
for col in final_features.columns[:10]:  # Show first 10 columns
    print(f"  • {col}: {sample_features[col].to_list()}")
if len(final_features.columns) > 10:
    print(f"  ... and {len(final_features.columns) - 10} more features")

Combining all engineered features...
✓ Added TF-IDF PCA features

✓ Final feature dataset created!
  • Total customers: 822,211
  • Total features: 68
  • Memory usage: 400.5 MB

Feature Categories:
  • RFM Features: 4 features
  • Product Preferences: 3 features
  • Demographics: 4 features
  • Temporal: 3 features
  • Interaction: 3 features

Sample of Final Features:
  • customer_id: ['c5ead8322e1bb92b4d779bd3da8029933dbf0dab8ecbee93a6f23db48013c801', '37aab04a0edc482deb4995b113d53a9ce4ce64b4c8465688a21b1eb9ab237972', '9d7ac305fad67e32843a80947e0ce76be9833e61eb109e06d18120d2ed3c3a93']
  • recency_days: [174, 138, 616]
  • frequency: [2, 1, 2]
  • monetary_value: [0.030474576271186438, 0.023711864406779665, 0.048779661016949145]
  • avg_transaction_value: [0.015237288135593219, 0.023711864406779665, 0.024389830508474573]
  • unique_products_purchased: [2, 1, 2]
  • spending_variability: [0.002396972139615415, 0.0, 0.013423043981846324]
  • customer_lifespan_days: [544, 0, 110]
  • fi

## Feature Quality Analysis

Analyse the quality and distribution of engineered features.


In [25]:
print("Analysing feature quality...")

# Check for missing values
null_counts = final_features.null_count()
total_records = final_features.height

missing_analysis = []
for col_name in final_features.columns:
    if col_name != 'customer_id':
        missing_count = null_counts[col_name][0]
        missing_percentage = (missing_count / total_records) * 100
        if missing_percentage > 0:
            missing_analysis.append({
                'feature': col_name,
                'missing_count': missing_count,
                'missing_percentage': missing_percentage
            })

if missing_analysis:
    missing_df = pd.DataFrame(missing_analysis).sort_values('missing_percentage', ascending=False)
    print(f"\nFeatures with Missing Values:")
    for _, row in missing_df.head(10).iterrows():
        print(f"  • {row['feature']}: {row['missing_percentage']:.1f}% ({row['missing_count']:,} records)")
else:
    print(f"\n✓ No missing values detected in engineered features!")

# Analyse numerical feature distributions
numerical_features = [
    'recency_days', 'frequency', 'monetary_value', 'avg_transaction_value',
    'unique_products_purchased', 'category_diversity', 'rfm_score'
]

available_numerical = [f for f in numerical_features if f in final_features.columns]

if available_numerical:
    print(f"\nNumerical Feature Statistics:")
    stats = final_features.select(available_numerical).describe().to_pandas()
    
    for feature in available_numerical[:5]:  # Show first 5 for brevity
        mean_val = final_features[feature].mean()
        std_val = final_features[feature].std()
        print(f"  • {feature}: μ={mean_val:.2f}, σ={std_val:.2f}")

# Analyse categorical feature distributions
categorical_features = [
    'age_group', 'club_member_status', 'preferred_channel', 
    'preferred_season', 'recency_segment', 'frequency_segment'
]

available_categorical = [f for f in categorical_features if f in final_features.columns]

if available_categorical:
    print(f"\nCategorical Feature Distributions:")
    for feature in available_categorical[:3]:  # Show first 3 for brevity
        dist = final_features.group_by(feature).agg(pl.count().alias('count')).sort('count', descending=True)
        top_categories = dist.head(3).to_dicts()
        # Fix the f-string syntax error by breaking it into separate parts
        category_strings = []
        for cat in top_categories:
            category_strings.append(f"{cat[feature]}({cat['count']})")
        print(f"  • {feature}: {', '.join(category_strings)}...")

print(f"\n✓ Feature quality analysis completed!")

Analysing feature quality...

Features with Missing Values:
  • Active: 62.9% (516,989 records)
  • FN: 62.1% (510,590 records)
  • fashion_news_frequency: 0.8% (6,934 records)
  • age: 0.8% (6,661 records)
  • age_spending_interaction: 0.8% (6,661 records)
  • age_frequency_interaction: 0.8% (6,661 records)
  • club_member_status: 0.3% (2,389 records)

Numerical Feature Statistics:
  • recency_days: μ=269.47, σ=213.96
  • frequency: μ=3.87, σ=4.81
  • monetary_value: μ=0.11, σ=0.15
  • avg_transaction_value: μ=0.03, σ=0.01
  • unique_products_purchased: μ=3.79, σ=4.66

Categorical Feature Distributions:
  • age_group: 25-34(260168), 18-24(201741), 45-54(154496)...
  • club_member_status: ACTIVE(782830), PRE-CREATE(36728), None(2389)...
  • preferred_channel: store(520548), online(301663)...

✓ Feature quality analysis completed!


## Save Engineered Features

Save the final engineered feature dataset for use in machine learning models.


In [26]:
print("Saving engineered features...")

# Create output directory if it doesn't exist
output_dir = os.path.join(data_dir, 'processed')
os.makedirs(output_dir, exist_ok=True)

# Save main feature dataset
features_path = os.path.join(output_dir, 'hm_engineered_features.parquet')
final_features.write_parquet(features_path)
print(f"✓ Saved engineered features to: {features_path}")

# Save individual feature components for analysis
components = {
    'rfm_features': customer_rfm,
    'product_preferences': category_preferences,
    'demographic_features': demographic_features,
    'temporal_features': temporal_features
}

for name, dataset in components.items():
    component_path = os.path.join(output_dir, f'hm_{name}.parquet')
    dataset.write_parquet(component_path)
    print(f"✓ Saved {name} to: {component_path}")

# Save TF-IDF features separately if they exist
if tfidf_pca_features is not None:
    tfidf_path = os.path.join(output_dir, 'hm_tfidf_features.parquet')
    tfidf_pca_features.write_parquet(tfidf_path)
    print(f"✓ Saved TF-IDF features to: {tfidf_path}")

# Create feature documentation
feature_docs = {
    'dataset_info': {
        'total_customers': final_features.height,
        'total_features': len(final_features.columns),
        'creation_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'memory_usage_mb': final_features.estimated_size('mb')
    },
    'feature_categories': {
        'behavioural_rfm': ['recency_days', 'frequency', 'monetary_value', 'avg_transaction_value'],
        'product_preferences': ['most_purchased_category', 'category_diversity', 'preferred_department'],
        'demographics': ['age', 'age_group', 'club_member_status', 'fashion_engagement'],
        'temporal_patterns': ['preferred_season', 'weekend_preference', 'active_years'],
        'interactions': ['rfm_score', 'customer_value', 'product_diversity_ratio']
    }
}

import json
docs_path = os.path.join(output_dir, 'feature_documentation.json')
with open(docs_path, 'w') as f:
    json.dump(feature_docs, f, indent=2)
print(f"✓ Saved feature documentation to: {docs_path}")

print(f"\n🎉 Feature engineering completed successfully!")
print(f"\nSummary:")
print(f"  • Customers processed: {final_features.height:,}")
print(f"  • Features created: {len(final_features.columns)}")
print(f"  • Output files: {len(components) + 2} datasets saved")
print(f"  • Ready for machine learning model training!")

Saving engineered features...
✓ Saved engineered features to: ../data/processed/hm_engineered_features.parquet
✓ Saved rfm_features to: ../data/processed/hm_rfm_features.parquet
✓ Saved product_preferences to: ../data/processed/hm_product_preferences.parquet
✓ Saved demographic_features to: ../data/processed/hm_demographic_features.parquet
✓ Saved temporal_features to: ../data/processed/hm_temporal_features.parquet
✓ Saved TF-IDF features to: ../data/processed/hm_tfidf_features.parquet
✓ Saved feature documentation to: ../data/processed/feature_documentation.json

🎉 Feature engineering completed successfully!

Summary:
  • Customers processed: 822,211
  • Features created: 68
  • Output files: 6 datasets saved
  • Ready for machine learning model training!
