In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

from etl.util import prepare_dataset_without_leakage
from ml_features.features import prepare_features
from ml_features.customer_features import create_customer_features
from ml_features.sequence_features  import create_sequence_features
from ml_features.brand_features import create_brand_features
from ml_features.model_features import create_model_features
from ml_features.market_features import create_market_features
from ml_features.equipment_features import create_equipment_features
from ml_features.solution_complexity_features import create_solution_complexity_features
from ml_features.timeline_features import create_timeline_features, create_advanced_timeline_features, create_timeline_interaction_features
from ml_features.role_features import create_commercial_role_features
from ml_features.process_features import create_process_features
from ml_features.correction_features import create_correction_features
from ml_training.train_rf import train_rf
from ml_evaluation.dashboard import model_evaluation_report

import warnings
warnings.filterwarnings('ignore')

# Load the cleaned quote export (path is relative to the notebook's working directory)
# and parse the quote-creation timestamp column into datetimes.
df_quotes = pd.read_csv('cleaned_quote_data.csv')
df_quotes['dt_creation_devis'] = pd.to_datetime(df_quotes['dt_creation_devis'])

# Quick size check: number of rows (quotes) and number of distinct 'numero_compte' values (customers).
print(f"\nüìä Original quote data: {len(df_quotes):,} quotes from {df_quotes['numero_compte'].nunique():,} customers")

# Section banner for the feature-creation stage.
print("\n" + "="*80)
print("STRATEGY: CREATE FEATURES")
print("="*80)

# Feature builders, applied in this order. Each one is called with the raw quote
# frame and its result is joined on 'numero_compte'.
feature_funcs = [
    create_customer_features, create_sequence_features, create_brand_features,
    create_model_features, create_market_features,
    create_equipment_features, create_solution_complexity_features,
    create_timeline_features, create_advanced_timeline_features,
    create_commercial_role_features, create_process_features, create_correction_features,
]

# Seed the merged frame with the first builder's output. `customer_df` keeps
# pointing at that customer-only frame because `new_df` is rebound (never
# mutated in place) inside the loop.
new_df = feature_funcs[0](df_quotes)
customer_df = new_df
for func in feature_funcs[1:]:
    new_df_ = func(df_quotes)

    # When both frames carry the same non-key column (e.g. 'converted'), keep
    # the incoming right-hand copy and discard the older left-hand one.
    # Dropping the overlap explicitly (instead of suffixing with '_dup' and
    # then removing every column whose name contains '_dup') avoids deleting
    # unrelated features that merely have '_dup' somewhere in their name.
    overlapping = [
        col for col in new_df.columns
        if col != 'numero_compte' and col in new_df_.columns
    ]
    new_df = pd.merge(
        new_df.drop(columns=overlapping),
        new_df_,
        on='numero_compte',
        how='left',
    )
    # Row count after each merge; growth would indicate duplicate keys on the right.
    print(len(new_df))
    # Snapshot of customer + sequence features, taken right after that merge.
    if func is create_sequence_features:
        sequence_df = new_df


# Targets for the three feature sets. When several feature builders return a
# 'converted' column, the merge loop keeps the most recently merged copy, so the
# source of each target depends on which builders emit that column.
y_new = new_df['converted']  # 'converted' as it stands in the fully merged frame
y_sequence = sequence_df['converted']  # from the customer + sequence snapshot
y_customer = customer_df['converted']  # From customer features

# Customer-only feature set: drop the key and the target before cleaning.
X_customer = customer_df.drop(columns=['numero_compte', 'converted'], errors='ignore')
X_customer_clean, y_customer_clean = prepare_features(X_customer, y_customer, "Customer Features")

# Customer + sequence feature set: additionally drop every column whose name
# contains '_seq' (substring match, not just a suffix).
columns_to_drop =  [x for x in sequence_df.columns if '_seq' in x]
columns_to_drop.extend(['numero_compte', 'converted'])
X_sequence = sequence_df.drop(columns=columns_to_drop, errors='ignore')
X_sequence_clean, y_sequence_clean = prepare_features(X_sequence, y_sequence, "Sequence Features")

# Full feature set: add timeline interaction features on top of the merged frame.
# NOTE(review): y_new was taken before this call; this assumes
# create_timeline_interaction_features preserves row order and count - TODO confirm.
new_df = create_timeline_interaction_features(new_df)
X_new = new_df.drop(columns=['numero_compte', 'converted'], errors='ignore')
X_new_clean, y_new_clean = prepare_features(X_new, y_new, "New Features")


üìä Original quote data: 34,014 quotes from 23,888 customers

STRATEGY: CREATE FEATURES
Creating OPTIMIZED customer features (mode: first_conversion)...
  Filtering post-first-purchase data...
  Customers: 23,888, Quotes: 33,247
  Calculating features...
  Calculating price trajectory (optimized)...
‚úì Created 14 leakage-free features
‚Üí 23,888 customers | 39.6% converters
‚è±Ô∏è  Execution time: 10.4 seconds
‚ö†Ô∏è  10.4s (target was 3s)
CREATING FIRST CONVERSION PREDICTION FEATURES (LEAKAGE-FREE)
  Total customers: 23,888
‚ö° Processing customers with corrected first-conversion logic...
  Processed 0/23,888 customers
  Processed 5,000/23,888 customers
  Processed 10,000/23,888 customers
  Processed 15,000/23,888 customers
  Processed 20,000/23,888 customers
‚úÖ First-conversion features calculation complete

üîç VALIDATION REPORT:
   Total customers: 23,888
   First converters: 9,458 (39.6%)
   Never converters: 14,430

üìä Distribution check:
   Converters with 0 historical qu

In [2]:
from dl_training.train import train_advanced_dl_model
from dl_features.features import create_dl_specific_features

# Expand the cleaned full feature set into the DL-specific feature matrix.
# In the recorded run this turned 209 input columns into 837 features.
X_dl_optimized, y_dl = create_dl_specific_features(X_new_clean, y_new_clean)


CREATING SAFE DL-OPTIMIZED FEATURES (V2)
üìä Input shape: (23888, 209)
üìã Found 209 numeric columns

üîß Step 1: Scaling features to reasonable range...

üîß Step 2: Adding safe transformations to ALL numeric features...

üîß Step 3: Adding safe interactions...
    ‚úì Added interaction: std_days_between_quotes / price_trajectory

üîß Step 4: Clipping all features to safe range...

‚úÖ SAFE DL Features Created:
  Original: 209 features
  Final: 837 features
  Added: 628 new features

üìä Safe value ranges:
  Min: -10.00
  Max: 10.00
  Mean: 0.15


In [3]:
def create_focused_features(X, y):
    """Return a copy of ``X`` with extra features built around the conversion rate.

    Every derived column depends on ``'avg_recent_conversion_rate'``; when that
    column is absent the copy is returned unchanged (0 features added).

    Args:
        X: DataFrame of input features. It is not modified.
        y: Target values. Accepted for call-site symmetry but not used.

    Returns:
        A new DataFrame containing all columns of ``X`` followed by, when the
        required source columns exist: exp / cube / sigmoid transforms of the
        conversion rate, its products with ``'main_agency_log'`` and
        ``'avg_discount_pct_abs_sqrt'``, and its products with the first three
        columns whose name contains "price" but not "conversion".
    """
    rate_col = 'avg_recent_conversion_rate'
    focused = X.copy()

    # Price-like columns, in frame order; names mentioning "conversion" are excluded.
    price_like = [
        name for name in X.columns
        if 'price' in name.lower() and 'conversion' not in name.lower()
    ]

    if rate_col in X.columns:
        rate = X[rate_col]

        # Non-linear views of the conversion rate (exp input is clipped to [-10, 10]).
        focused['conversion_rate_exp'] = np.exp(rate.clip(-10, 10))
        focused['conversion_rate_power3'] = rate ** 3
        focused['conversion_rate_sigmoid'] = 1 / (1 + np.exp(-rate))

        # Agency and discount interactions, each only if its source column exists.
        pairings = (
            ('main_agency_log', 'agency_conversion_interaction'),
            ('avg_discount_pct_abs_sqrt', 'discount_conversion_interaction'),
        )
        for source, target in pairings:
            if source in X.columns:
                focused[target] = X[source] * rate

        # Interactions with at most the first three price-like columns.
        for source in price_like[:3]:
            focused[f'{source}_conversion_interaction'] = X[source] * rate

    n_added = focused.shape[1] - X.shape[1]
    print(f"Added {n_added} focused features")
    return focused

# Create focused features on top of the DL feature matrix.
# In the recorded run this added 0 columns, i.e. 'avg_recent_conversion_rate'
# was not present in X_dl_optimized.
X_focused = create_focused_features(X_dl_optimized, y_dl)

Added 0 focused features


In [4]:
def enhance_region_features(X):
    """Return a copy of ``X`` with extra features derived from ``'main_region'``.

    Nothing is added when ``'main_region'`` is missing. Calling the function on
    a frame that already contains the derived columns overwrites them in place
    and therefore reports 0 added features.

    Args:
        X: DataFrame of input features. It is not modified.

    Returns:
        A new DataFrame with, when the source columns exist:
        ``'region_is_popular'`` (1 where the region equals the most frequent
        region, else 0), ``'region_price_interaction'`` (region code times
        ``'avg_current_price'``) and ``'region_discount_interaction'`` (region
        code times ``'avg_discount_pct'``).
    """
    X_enhanced = X.copy()

    if 'main_region' in X.columns:
        region = X['main_region']

        # mode() is empty for an empty or all-NaN column; indexing it would
        # raise, so in that case no row is flagged as "popular".
        modes = region.mode()
        if modes.empty:
            X_enhanced['region_is_popular'] = 0
        else:
            X_enhanced['region_is_popular'] = (region == modes.iloc[0]).astype(int)

        # Region interactions with price
        if 'avg_current_price' in X.columns:
            X_enhanced['region_price_interaction'] = region * X['avg_current_price']

        # Region interactions with discount
        if 'avg_discount_pct' in X.columns:
            X_enhanced['region_discount_interaction'] = region * X['avg_discount_pct']

    print(f"Added {X_enhanced.shape[1] - X.shape[1]} region-focused features")
    return X_enhanced

# Add the region-derived columns to the focused feature matrix
# (3 columns were added in the recorded run).
X_region_enhanced = enhance_region_features(X_focused)

Added 3 region-focused features


In [5]:
def enhance_discount_features(X):
    """Return a copy of ``X`` with extra features derived from ``'avg_discount_pct'``.

    Nothing is added when ``'avg_discount_pct'`` is missing.

    Args:
        X: DataFrame of input features. It is not modified.

    Returns:
        A new DataFrame with, when the source columns exist:
        ``'discount_tier'`` (integer code of the bin the discount falls in,
        using right-closed bins with edges at 0, 5, 10 and 20; -1 for missing
        values), ``'has_discount'`` (1 where the discount is strictly positive)
        and ``'discount_price_ratio'`` (discount divided by
        ``'avg_current_price' + 1``).
    """
    enriched = X.copy()

    if 'avg_discount_pct' in X.columns:
        pct = X['avg_discount_pct']

        # Bucket the discount, then keep only the ordinal bucket code.
        tier_edges = [-np.inf, 0, 5, 10, 20, np.inf]
        tier_names = ['negative', 'small', 'medium', 'large', 'very_large']
        tiers = pd.cut(pct, bins=tier_edges, labels=tier_names)
        enriched['discount_tier'] = tiers.cat.codes

        # Flag rows with any positive discount.
        enriched['has_discount'] = pct.gt(0).astype(int)

        # Discount relative to price; the +1 offsets the denominator.
        if 'avg_current_price' in X.columns:
            enriched['discount_price_ratio'] = pct / (X['avg_current_price'] + 1)

    n_added = enriched.shape[1] - X.shape[1]
    print(f"Added {n_added} discount-focused features")
    return enriched

# NOTE(review): this calls enhance_region_features, not the
# enhance_discount_features defined in this cell, so no discount features are
# added (the recorded run printed "Added 0 region-focused features").
# Switching the call would change the model's input width, so the frames
# passed to the importance analyses further down must be switched to
# X_discount_enhanced at the same time.
X_discount_enhanced = enhance_region_features(X_region_enhanced)

Added 0 region-focused features


In [6]:
# Train the DL model on the final feature matrix; the call returns the fitted
# model and an AUC value (best validation AUC was 0.7225 in the recorded run).
model, auc = train_advanced_dl_model(
    X_discount_enhanced, y_dl 
)


üöÄ Training ADVANCED DL Model...
üîß Normalizing features for DL...
  Before: min=-20.00, max=20.00
  Before: mean=0.14, std=0.80
  After: min=-6.17, max=6.60
  After: mean=0.54, std=1.78
  Normalization complete!
üîß Normalizing features for DL...
  Before: min=-20.00, max=13.58
  Before: mean=0.15, std=0.80
  After: min=-2.69, max=13.72
  After: mean=0.86, std=2.09
  Normalization complete!
  Parameters: 365,833
  Model: advanced
  Input dim: 840
  Parameters: 365,833
  Training samples: 19,110
  Validation samples: 4,778
  ‚úì Epoch 1: Loss=0.9000, Val AUC=0.6598
  ‚úì Epoch 2: Loss=0.8266, Val AUC=0.6832
  ‚úì Epoch 3: Loss=0.7953, Val AUC=0.7052
  ‚úì Epoch 4: Loss=0.7723, Val AUC=0.7072
  ‚úì Epoch 5: Loss=0.7627, Val AUC=0.7172
  ‚úì Epoch 6: Loss=0.7525, Val AUC=0.7188
  ‚úì Epoch 11: Loss=0.7202, Val AUC=0.7188
  ‚úì Epoch 12: Loss=0.7172, Val AUC=0.7225
  ‚èπÔ∏è Early stopping at epoch 37

‚úÖ Training Complete!
  Best Val AUC: 0.7225


In [7]:
import torch
def analyze_attention_weights(model, X_sample):
    """Rank the model's hidden units by their average attention weight.

    The first (at most) 100 rows of ``X_sample`` are pushed through
    ``model.net`` and then ``model.attention`` without gradient tracking; the
    attention output is averaged over rows to give one weight per hidden unit.

    Args:
        model: Module exposing ``eval()``, ``net`` and ``attention``. It is
            switched to eval mode.
        X_sample: DataFrame or 2-D array-like of model inputs.

    Returns:
        DataFrame with columns ``'hidden_feature_idx'`` and
        ``'attention_weight'``, sorted by weight in descending order.
    """
    model.eval()

    # Only the leading 100 rows are inspected.
    rows = X_sample.values if isinstance(X_sample, pd.DataFrame) else X_sample
    X_tensor = torch.FloatTensor(rows[:100])

    with torch.no_grad():
        # Hidden representation, then the attention output computed from it.
        features = model.net(X_tensor)
        attention_weights = model.attention(features)

        # Collapse the batch dimension: one mean weight per hidden unit.
        avg_attention = attention_weights.mean(dim=0).squeeze().numpy()

    print(f"Input features: {X_sample.shape[1]}")
    print(f"Hidden features: {features.shape[1]}")

    # One row per hidden unit, strongest attention first.
    importance_df = pd.DataFrame({
        'hidden_feature_idx': list(range(len(avg_attention))),
        'attention_weight': avg_attention,
    })
    importance_df = importance_df.sort_values('attention_weight', ascending=False)

    rule = "=" * 80
    print("\n" + rule)
    print("ATTENTION-BASED HIDDEN FEATURE IMPORTANCE")
    print(rule)
    print(f"\nTop 20 hidden features by attention weight:")
    top = importance_df.head(20)
    for unit, weight in zip(top['hidden_feature_idx'], top['attention_weight']):
        print(f"  Hidden feature {int(unit):3d} | Attention: {float(weight):.4f}")

    # Near-zero variance means the attention output barely differs between units.
    variance = importance_df['attention_weight'].var()
    print(f"\nüîç Attention variance: {variance:.6f}")
    verdict = (
        "‚ö†Ô∏è  WARNING: Attention weights are nearly identical!"
        if variance < 0.001
        else "‚úÖ GOOD: Attention weights vary across features"
    )
    print(verdict)

    return importance_df

# BETTER: Analyze which INPUT features matter using gradients
def analyze_input_feature_importance(model, X_sample):
    """Rank input features by the mean absolute gradient of the model output.

    The first (at most) 100 rows of ``X_sample`` are run through ``model`` with
    gradient tracking on the inputs. A single backward pass (seeded with ones)
    yields d(output)/d(input); its absolute value is averaged over rows to
    give one score per input column. A per-category summary, based on keyword
    matches in the column names, is printed as well.

    Args:
        model: Callable torch module. It is switched to eval mode and its
            parameter gradients are reset before the backward pass.
        X_sample: DataFrame of model inputs; column names label the features.

    Returns:
        DataFrame with columns ``'feature'`` and ``'gradient_importance'``,
        sorted by importance in descending order.
    """
    model.eval()

    # Inputs are the leaf tensor the gradients are taken with respect to.
    inputs = torch.FloatTensor(X_sample.values[:100]).requires_grad_(True)
    scores = model(inputs)

    # Backpropagate a vector of ones so every output element contributes equally.
    model.zero_grad()
    scores.backward(torch.ones_like(scores))

    # Mean absolute sensitivity per input column.
    gradients = inputs.grad.abs().mean(dim=0).numpy()

    importance_df = pd.DataFrame({
        'feature': X_sample.columns.tolist(),
        'gradient_importance': gradients,
    })
    importance_df = importance_df.sort_values('gradient_importance', ascending=False)

    rule = "=" * 80
    print("\n" + rule)
    print("GRADIENT-BASED INPUT FEATURE IMPORTANCE")
    print(rule)
    print(f"\nTop 20 input features by gradient magnitude:")
    top = importance_df.head(20)
    for name, score in zip(top['feature'], top['gradient_importance']):
        print(f"  {name:40s} | Gradient: {float(score):.6f}")

    # Summarise by keyword group; a feature may fall into several groups.
    print(f"\nüîç FEATURE CATEGORY ANALYSIS:")

    categories = {
        'Price': ['price'],
        'Quote': ['quote'],
        'Day': ['day'],
        'Average': ['avg_'],
        'Std Dev': ['std_'],
        'Trend': ['trend'],
        'Ratio': ['ratio', 'div', 'per'],
        'Log': ['log'],
        'Squared': ['squared'],
        'Tanh': ['tanh'],
        'Sqrt': ['sqrt']
    }

    for label, needles in categories.items():
        members = [
            name for name in importance_df['feature']
            if any(needle in name.lower() for needle in needles)
        ]
        # Groups without any matching feature are not reported.
        if not members:
            continue

        in_group = importance_df['feature'].isin(members)
        group_mean = importance_df.loc[in_group, 'gradient_importance'].mean()

        print(f"  {label:10s}: {len(members):2d} features | "
              f"Avg importance: {group_mean:.6f}")

    return importance_df

# Usage: analyse the same feature matrix the model was trained on, so the
# column count always matches the model's input dimension.
importance_df = analyze_input_feature_importance(model, X_discount_enhanced)


GRADIENT-BASED INPUT FEATURE IMPORTANCE

Top 20 input features by gradient magnitude:
  had_historical_quotes_tanh               | Gradient: 0.401832
  total_historical_quotes                  | Gradient: 0.376201
  total_historical_quotes_abs_sqrt         | Gradient: 0.345707
  total_historical_quotes_log              | Gradient: 0.330201
  had_historical_quotes_log                | Gradient: 0.325407
  total_historical_quotes_tanh             | Gradient: 0.278469
  had_historical_quotes_abs_sqrt           | Gradient: 0.272736
  region_discount_interaction              | Gradient: 0.251772
  total_quotes                             | Gradient: 0.240863
  avg_price_tanh                           | Gradient: 0.216465
  _total_quotes                            | Gradient: 0.208724
  avg_discount_pct_abs_sqrt                | Gradient: 0.199091
  avg_discount_abs_sqrt                    | Gradient: 0.198241
  peak_weekday                             | Gradient: 0.182736
  avg_price      

In [8]:
def gradient_based_importance(model, X_sample, y_sample):
    """Rank input features by the mean absolute gradient of the model output.

    This is a plain input-gradient (saliency) score from one backward pass over
    the first (at most) 100 rows of ``X_sample``; no baseline or path
    integration is involved.

    Args:
        model: Callable torch module. It is switched to eval mode and its
            parameter gradients are reset before the backward pass.
        X_sample: DataFrame of model inputs; column names label the features.
        y_sample: Target values. Accepted but not used.

    Returns:
        DataFrame with columns ``'feature'`` and ``'gradient_importance'``,
        sorted by importance in descending order.
    """
    model.eval()

    # Leaf tensor for which input gradients are collected.
    batch = torch.FloatTensor(X_sample.values[:100])
    batch.requires_grad_(True)

    prediction = model(batch)

    # Seed the backward pass with ones: every output element counts equally.
    model.zero_grad()
    prediction.backward(torch.ones_like(prediction))

    # Average absolute gradient across rows -> one score per input column.
    per_feature = batch.grad.abs().mean(dim=0).numpy()

    ranking = pd.DataFrame({
        'feature': X_sample.columns.tolist(),
        'gradient_importance': per_feature,
    })
    ranking = ranking.sort_values('gradient_importance', ascending=False)

    banner = "=" * 80
    print("\n" + banner)
    print("GRADIENT-BASED FEATURE IMPORTANCE")
    print(banner)
    print(f"\nTop 20 features by gradient magnitude:")
    leaders = ranking.head(20)
    for name, score in zip(leaders['feature'], leaders['gradient_importance']):
        print(f"  {name:40s} | Gradient: {float(score):.6f}")

    return ranking

# Usage: analyse the same feature matrix the model was trained on, so the
# column count always matches the model's input dimension.
grad_importance = gradient_based_importance(model, X_discount_enhanced, y_dl)


GRADIENT-BASED FEATURE IMPORTANCE

Top 20 features by gradient magnitude:
  had_historical_quotes_tanh               | Gradient: 0.401832
  total_historical_quotes                  | Gradient: 0.376201
  total_historical_quotes_abs_sqrt         | Gradient: 0.345707
  total_historical_quotes_log              | Gradient: 0.330201
  had_historical_quotes_log                | Gradient: 0.325407
  total_historical_quotes_tanh             | Gradient: 0.278469
  had_historical_quotes_abs_sqrt           | Gradient: 0.272736
  region_discount_interaction              | Gradient: 0.251772
  total_quotes                             | Gradient: 0.240863
  avg_price_tanh                           | Gradient: 0.216465
  _total_quotes                            | Gradient: 0.208724
  avg_discount_pct_abs_sqrt                | Gradient: 0.199091
  avg_discount_abs_sqrt                    | Gradient: 0.198241
  peak_weekday                             | Gradient: 0.182736
  avg_price                  