In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

from etl.util import prepare_dataset_without_leakage
from ml_features.features import prepare_features
from ml_features.customer_features import create_customer_features
from ml_features.sequence_features  import create_sequence_features
from ml_training.train_rf import train_rf
from ml_evaluation.dashboard import model_evaluation_report

import warnings
warnings.filterwarnings('ignore')

# Load original clean quote data
df_quotes = pd.read_csv('cleaned_quote_data.csv')
df_quotes['dt_creation_devis'] = pd.to_datetime(df_quotes['dt_creation_devis'])

print(f"\nüìä Original quote data: {len(df_quotes):,} quotes from {df_quotes['numero_compte'].nunique():,} customers")


print("\n" + "="*80)
print("STRATEGY: CREATE MEANINGFUL SEQUENCE FEATURES")
print("="*80)

import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

from etl.util import prepare_dataset_without_leakage
from ml_features.features import prepare_features
from ml_features.customer_features import create_customer_features
from ml_features.sequence_features  import create_sequence_features
from ml_training.train_rf import train_rf
from ml_evaluation.dashboard import model_evaluation_report

import warnings
warnings.filterwarnings('ignore')

# Load original clean quote data
df_quotes = pd.read_csv('cleaned_quote_data.csv')
df_quotes['dt_creation_devis'] = pd.to_datetime(df_quotes['dt_creation_devis'])

print(f"\nüìä Original quote data: {len(df_quotes):,} quotes from {df_quotes['numero_compte'].nunique():,} customers")


print("\n" + "="*80)
print("STRATEGY: CREATE MEANINGFUL SEQUENCE FEATURES")
print("="*80)

# 1. Enhanced customer features
customer_df = create_customer_features(df_quotes)
len(customer_df)

# 2. Sequence features (for multi-quote customers)
sequence_df = create_sequence_features(df_quotes)
print(sequence_df.columns)

sequence_df = pd.merge(
    sequence_df, 
    customer_df, 
    on='numero_compte', 
    how='left',
    suffixes=('_seq', '')  # Explicit suffixes
)

print("Columns:", sequence_df.columns.tolist())

# Now it's clear which column is which
y_sequence = sequence_df['converted']  # From sequence features
y_customer = sequence_df['converted']  # From customer features

# For modeling, use the sequence version
X_customer = customer_df.drop(columns=['numero_compte', 'converted'], errors='ignore')
X_customer_clean, y_customer_clean = prepare_features(X_customer, y_customer, "Customer Features")

columns_to_drop =  [x for x in sequence_df.columns if '_seq' in x]
columns_to_drop.extend(['numero_compte', 'converted'])
X_sequence = sequence_df.drop(columns=columns_to_drop, errors='ignore')

X_sequence_clean, y_sequence_clean = prepare_features(X_sequence, y_sequence, "Sequence Features")


üìä Original quote data: 34,014 quotes from 23,888 customers

STRATEGY: CREATE MEANINGFUL SEQUENCE FEATURES

üìä Original quote data: 34,014 quotes from 23,888 customers

STRATEGY: CREATE MEANINGFUL SEQUENCE FEATURES
Creating enhanced customer features...
  Total customers: 23,888
‚úì Created features for 23,888 customers
‚úì New features: ['numero_compte', 'total_quotes', 'converted', 'avg_days_between_quotes', 'std_days_between_quotes', 'max_days_between_quotes', 'engagement_density', 'price_trajectory', 'unique_product_families', 'product_consistency']...
Creating sequence features (this may take a moment)...
  Total customers: 23,888
‚úì Created features for 23,888 customers
‚úì New features: ['numero_compte', 'total_quotes', 'converted', 'avg_days_since_first_quote', 'std_days_since_first_quote', 'max_days_since_first_quote', 'avg_recent_quote_count', 'std_recent_quote_count', 'avg_recent_avg_price', 'std_recent_avg_price']...
Index(['numero_compte', 'total_quotes', 'converted'

In [2]:
from dl_training.train import train_advanced_dl_model
from dl_features.features import create_dl_specific_features

X_dl_optimized, y_dl = create_dl_specific_features(X_sequence_clean, y_sequence_clean)


CREATING SAFE DL-OPTIMIZED FEATURES (V2)
üìä Input shape: (23888, 32)
üìã Found 32 numeric columns

üîß Step 1: Scaling features to reasonable range...

üîß Step 2: Adding safe transformations to ALL numeric features...

üîß Step 3: Adding safe interactions...
    ‚úì Added interaction: avg_days_since_first_quote / std_days_since_first_quote

üîß Step 4: Clipping all features to safe range...

‚úÖ SAFE DL Features Created:
  Original: 32 features
  Final: 129 features
  Added: 97 new features

üìä Safe value ranges:
  Min: -10.00
  Max: 10.00
  Mean: 0.34


In [3]:
def create_focused_features(X, y):
    """Create new features focused on what matters most"""
    X_focused = X.copy()
    
    # Double down on conversion rate features
    if 'avg_recent_conversion_rate' in X.columns:
        # More transformations of the most important feature
        X_focused['conversion_rate_exp'] = np.exp(X['avg_recent_conversion_rate'].clip(-10, 10))
        X_focused['conversion_rate_power3'] = X['avg_recent_conversion_rate'] ** 3
        X_focused['conversion_rate_sigmoid'] = 1 / (1 + np.exp(-X['avg_recent_conversion_rate']))
    
    # Agency-conversion interactions
    if 'main_agency_log' in X.columns and 'avg_recent_conversion_rate' in X.columns:
        X_focused['agency_conversion_interaction'] = X['main_agency_log'] * X['avg_recent_conversion_rate']
    
    # Discount-conversion interactions  
    if 'avg_discount_pct_abs_sqrt' in X.columns and 'avg_recent_conversion_rate' in X.columns:
        X_focused['discount_conversion_interaction'] = X['avg_discount_pct_abs_sqrt'] * X['avg_recent_conversion_rate']
    
    # Price-conversion interactions
    price_cols = [c for c in X.columns if 'price' in c.lower() and 'conversion' not in c.lower()]
    if price_cols and 'avg_recent_conversion_rate' in X.columns:
        for price_col in price_cols[:3]:  # Top 3 price features
            X_focused[f'{price_col}_conversion_interaction'] = X[price_col] * X['avg_recent_conversion_rate']
    
    print(f"Added {X_focused.shape[1] - X.shape[1]} focused features")
    return X_focused

# Create focused features
X_focused = create_focused_features(X_dl_optimized, y_dl)

Added 8 focused features


In [4]:
def enhance_region_features(X):
    """Enhance region-related features since they're most important"""
    X_enhanced = X.copy()
    
    if 'main_region' in X.columns:
        # Region is categorical but encoded as numeric - create better features
        # Create region clusters if you have more info
        X_enhanced['region_is_popular'] = (X['main_region'] == X['main_region'].mode()[0]).astype(int)
        
        # Region interactions with price
        if 'avg_current_price' in X.columns:
            X_enhanced['region_price_interaction'] = X['main_region'] * X['avg_current_price']
        
        # Region interactions with discount
        if 'avg_discount_pct' in X.columns:
            X_enhanced['region_discount_interaction'] = X['main_region'] * X['avg_discount_pct']
    
    print(f"Added {X_enhanced.shape[1] - X.shape[1]} region-focused features")
    return X_enhanced

X_region_enhanced = enhance_region_features(X_focused)

Added 3 region-focused features


In [5]:
def enhance_discount_features(X):
    """Discounts are #2 important - enhance them"""
    X_enhanced = X.copy()
    
    if 'avg_discount_pct' in X.columns:
        # Discount tiers
        X_enhanced['discount_tier'] = pd.cut(
            X['avg_discount_pct'], 
            bins=[-np.inf, 0, 5, 10, 20, np.inf],
            labels=['negative', 'small', 'medium', 'large', 'very_large']
        ).cat.codes
        
        # Is there any discount?
        X_enhanced['has_discount'] = (X['avg_discount_pct'] > 0).astype(int)
        
        # Discount effectiveness (interact with price)
        if 'avg_current_price' in X.columns:
            X_enhanced['discount_price_ratio'] = X['avg_discount_pct'] / (X['avg_current_price'] + 1)
    
    print(f"Added {X_enhanced.shape[1] - X.shape[1]} discount-focused features")
    return X_enhanced

X_discount_enhanced = enhance_region_features(X_region_enhanced)

Added 0 region-focused features


In [6]:
model, auc = train_advanced_dl_model(
    X_discount_enhanced, y_dl 
)


üöÄ Training ADVANCED DL Model...
üîß Normalizing features for DL...
  Before: min=-20.00, max=20.00
  Before: mean=0.33, std=1.47
  After: min=-5.43, max=6.06
  After: mean=0.65, std=1.53
  Normalization complete!
üîß Normalizing features for DL...
  Before: min=-20.00, max=13.41
  Before: mean=0.33, std=1.48
  After: min=-1.12, max=4.58
  After: mean=1.25, std=1.59
  Normalization complete!
  Parameters: 96,333
  Model: advanced
  Input dim: 140
  Parameters: 96,333
  Training samples: 19,110
  Validation samples: 4,778
  ‚úì Epoch 1: Loss=0.8667, Val AUC=0.6608
  ‚úì Epoch 2: Loss=0.8149, Val AUC=0.6718
  ‚úì Epoch 3: Loss=0.7968, Val AUC=0.6743
  ‚úì Epoch 4: Loss=0.7900, Val AUC=0.6784
  ‚úì Epoch 5: Loss=0.7834, Val AUC=0.6800
  ‚úì Epoch 6: Loss=0.7808, Val AUC=0.6802
  ‚úì Epoch 7: Loss=0.7792, Val AUC=0.6807
  ‚úì Epoch 9: Loss=0.7735, Val AUC=0.6808
  ‚úì Epoch 10: Loss=0.7725, Val AUC=0.6842
  ‚úì Epoch 19: Loss=0.7620, Val AUC=0.6872
  ‚èπÔ∏è Early stopping at epoch 44


In [7]:
import torch
def analyze_attention_weights(model, X_sample):
    """
    Analyze which HIDDEN features the attention mechanism focuses on
    """
    model.eval()
    
    # Get a sample batch
    if isinstance(X_sample, pd.DataFrame):
        X_tensor = torch.FloatTensor(X_sample.values[:100])  # First 100 samples
    else:
        X_tensor = torch.FloatTensor(X_sample[:100])
    
    with torch.no_grad():
        # Forward pass through network
        features = model.net(X_tensor)  # Shape: [100, 64] (hidden features)
        attention_weights = model.attention(features)  # Shape: [100, 64]
        
        # Get average attention per HIDDEN feature
        avg_attention = attention_weights.mean(dim=0).squeeze().numpy()
    
    print(f"Input features: {X_sample.shape[1]}")
    print(f"Hidden features: {features.shape[1]}")
    
    # Create importance DataFrame for HIDDEN features
    importance_df = pd.DataFrame({
        'hidden_feature_idx': list(range(len(avg_attention))),  # FIX: list of ints
        'attention_weight': avg_attention
    }).sort_values('attention_weight', ascending=False)
    
    print("\n" + "="*80)
    print("ATTENTION-BASED HIDDEN FEATURE IMPORTANCE")
    print("="*80)
    print(f"\nTop 20 hidden features by attention weight:")
    for i, row in importance_df.head(20).iterrows():
        # FIX: Convert to int for formatting
        feat_idx = int(row['hidden_feature_idx'])
        print(f"  Hidden feature {feat_idx:3d} | Attention: {row['attention_weight']:.4f}")
    
    # Check if attention is actually working
    variance = importance_df['attention_weight'].var()
    print(f"\nüîç Attention variance: {variance:.6f}")
    if variance < 0.001:
        print("‚ö†Ô∏è  WARNING: Attention weights are nearly identical!")
    else:
        print("‚úÖ GOOD: Attention weights vary across features")
    
    return importance_df

# BETTER: Analyze which INPUT features matter using gradients
def analyze_input_feature_importance(model, X_sample):
    """
    Analyze which INPUT features matter using gradient-based importance
    """
    model.eval()
    
    # Convert to tensor with gradient tracking
    X_tensor = torch.FloatTensor(X_sample.values[:100])
    X_tensor.requires_grad = True
    
    # Forward pass
    output = model(X_tensor)
    
    # Create dummy target for gradient computation
    dummy_target = torch.ones_like(output)
    
    # Backward pass to get gradients w.r.t inputs
    model.zero_grad()
    output.backward(dummy_target)
    
    # Get average absolute gradient per INPUT feature
    gradients = X_tensor.grad.abs().mean(dim=0).numpy()
    
    # Create importance DataFrame
    importance_df = pd.DataFrame({
        'feature': X_sample.columns.tolist(),
        'gradient_importance': gradients
    }).sort_values('gradient_importance', ascending=False)
    
    print("\n" + "="*80)
    print("GRADIENT-BASED INPUT FEATURE IMPORTANCE")
    print("="*80)
    print(f"\nTop 20 input features by gradient magnitude:")
    for i, row in importance_df.head(20).iterrows():
        print(f"  {row['feature']:40s} | Gradient: {row['gradient_importance']:.6f}")
    
    # Also show feature categories
    print(f"\nüîç FEATURE CATEGORY ANALYSIS:")
    
    categories = {
        'Price': ['price'],
        'Quote': ['quote'],
        'Day': ['day'],
        'Average': ['avg_'],
        'Std Dev': ['std_'],
        'Trend': ['trend'],
        'Ratio': ['ratio', 'div', 'per'],
        'Log': ['log'],
        'Squared': ['squared'],
        'Tanh': ['tanh'],
        'Sqrt': ['sqrt']
    }
    
    for cat_name, keywords in categories.items():
        cat_features = [f for f in importance_df['feature'] 
                       if any(kw in f.lower() for kw in keywords)]
        
        if cat_features:
            cat_importance = importance_df[
                importance_df['feature'].isin(cat_features)
            ]['gradient_importance'].mean()
            
            print(f"  {cat_name:10s}: {len(cat_features):2d} features | "
                  f"Avg importance: {cat_importance:.6f}")
    
    return importance_df

# Usage
importance_df = analyze_input_feature_importance(model, X_region_enhanced)


GRADIENT-BASED INPUT FEATURE IMPORTANCE

Top 20 input features by gradient magnitude:
  region_discount_interaction              | Gradient: 0.206889
  avg_price                                | Gradient: 0.169857
  avg_discount_pct_abs_sqrt                | Gradient: 0.169613
  avg_recent_conversion_rate_tanh          | Gradient: 0.157406
  avg_current_price                        | Gradient: 0.127763
  avg_days_between_quotes_log              | Gradient: 0.122048
  avg_discount_pct_tanh                    | Gradient: 0.112039
  avg_discount_pct_log                     | Gradient: 0.101935
  std_days_since_first_quote_log           | Gradient: 0.098437
  main_region_tanh                         | Gradient: 0.098250
  std_recent_conversion_rate_abs_sqrt      | Gradient: 0.098126
  region_price_interaction                 | Gradient: 0.096191
  std_recent_avg_price                     | Gradient: 0.095091
  main_region_log                          | Gradient: 0.090296
  avg_price_abs_s

In [8]:
def gradient_based_importance(model, X_sample, y_sample):
    """
    Compute feature importance using gradients (Integrated Gradients-like)
    """
    model.eval()
    
    # Convert to tensor
    X_tensor = torch.FloatTensor(X_sample.values[:100])
    X_tensor.requires_grad = True
    
    # Forward pass
    output = model(X_tensor)
    
    # Create dummy target (we want gradients w.r.t inputs)
    dummy_target = torch.ones_like(output)
    
    # Backward pass to get gradients
    model.zero_grad()
    output.backward(dummy_target)
    
    # Get average absolute gradient per feature
    gradients = X_tensor.grad.abs().mean(dim=0).numpy()
    
    # Create importance DataFrame
    importance_df = pd.DataFrame({
        'feature': X_sample.columns.tolist(),
        'gradient_importance': gradients
    }).sort_values('gradient_importance', ascending=False)
    
    print("\n" + "="*80)
    print("GRADIENT-BASED FEATURE IMPORTANCE")
    print("="*80)
    print(f"\nTop 20 features by gradient magnitude:")
    for i, row in importance_df.head(20).iterrows():
        print(f"  {row['feature']:40s} | Gradient: {row['gradient_importance']:.6f}")
    
    return importance_df

# Usage
grad_importance = gradient_based_importance(model, X_region_enhanced, y_dl)


GRADIENT-BASED FEATURE IMPORTANCE

Top 20 features by gradient magnitude:
  region_discount_interaction              | Gradient: 0.206889
  avg_price                                | Gradient: 0.169857
  avg_discount_pct_abs_sqrt                | Gradient: 0.169613
  avg_recent_conversion_rate_tanh          | Gradient: 0.157406
  avg_current_price                        | Gradient: 0.127763
  avg_days_between_quotes_log              | Gradient: 0.122048
  avg_discount_pct_tanh                    | Gradient: 0.112039
  avg_discount_pct_log                     | Gradient: 0.101935
  std_days_since_first_quote_log           | Gradient: 0.098437
  main_region_tanh                         | Gradient: 0.098250
  std_recent_conversion_rate_abs_sqrt      | Gradient: 0.098126
  region_price_interaction                 | Gradient: 0.096191
  std_recent_avg_price                     | Gradient: 0.095091
  main_region_log                          | Gradient: 0.090296
  avg_price_abs_sqrt         