In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

from etl.util import prepare_dataset_without_leakage
from ml_features.features import prepare_features
from ml_features.customer_features import create_customer_features
from ml_features.sequence_features  import create_sequence_features
from ml_features.brand_features import create_brand_features
from ml_features.model_features import create_model_features
from ml_features.market_features import create_market_features
from ml_features.equipment_features import create_equipment_features
from ml_features.solution_complexity_features import create_solution_complexity_features
from ml_features.timeline_features import create_timeline_features, create_advanced_timeline_features, create_timeline_interaction_features
from ml_features.role_features import create_commercial_role_features
from ml_features.process_features import create_process_features
from ml_features.correction_features import create_correction_features
from ml_training.train_rf import train_rf
from ml_evaluation.dashboard import model_evaluation_report
from dl_evaluation.evaluation import analyze_input_feature_importance

import warnings
warnings.filterwarnings('ignore')

# Load the cleaned quote export and parse the quote-creation column as datetimes.
df_quotes = pd.read_csv('cleaned_quote_data.csv')
df_quotes['dt_creation_devis'] = pd.to_datetime(df_quotes['dt_creation_devis'])

# Summary line: number of quote rows and number of distinct customer accounts.
print(f"\n📊 Original quote data: {len(df_quotes):,} quotes from {df_quotes['numero_compte'].nunique():,} customers")

# Section banner for the feature-engineering stage.
print(f"\n{'=' * 80}", "STRATEGY: CREATE FEATURES", "=" * 80, sep="\n")

# Feature builders, in merge order. Each one is called with df_quotes and its
# result is joined on 'numero_compte' by the loop that follows.
# create_timeline_interaction_features is not listed here; it is applied to the
# merged frame later in this cell.
feature_funcs = [
    create_customer_features,   # first entry seeds the merge and is kept as customer_df
    create_sequence_features,   # the loop snapshots the merged frame after this one
    create_brand_features,
    create_model_features,
    create_market_features,
    create_equipment_features,
    create_solution_complexity_features,
    create_timeline_features,
    create_advanced_timeline_features,
    create_commercial_role_features,
    create_process_features,
    create_correction_features,
]


# Build the modelling table: start from the first builder's output and
# left-join every other feature frame onto it by customer account.
new_df = feature_funcs[0](df_quotes)
customer_df = new_df  # snapshot of the customer-only features (new_df is rebound below, not mutated)
for func in feature_funcs[1:]:
    new_df_ = func(df_quotes)

    # When both frames carry the same column, the incoming frame's copy wins.
    # The shared columns are removed from the accumulated frame by exact name
    # before merging, so a genuine feature whose name merely contains '_dup'
    # (e.g. a '..._duplicate_...' column) is never deleted by accident.
    overlapping = [col for col in new_df_.columns
                   if col in new_df.columns and col != 'numero_compte']
    new_df = new_df.drop(columns=overlapping).merge(new_df_, on='numero_compte', how='left')

    # A left merge only gains rows when the incoming frame repeats a
    # 'numero_compte', so this count is expected to stay constant.
    print(len(new_df))
    if func is create_sequence_features:
        sequence_df = new_df  # snapshot of customer + sequence features


# Three feature sets are prepared from the frames built above. In each case the
# id column and the 'converted' target are removed from the feature matrix.
# NOTE(review): the merge loop keeps the incoming frame's copy of any shared
# column, so 'converted' in sequence_df / new_df comes from the most recently
# merged frame that carried one - confirm that is the intended label.

# --- Customer-only features (output of the first builder, before any merge) ---
y_customer = customer_df['converted']
X_customer = customer_df.drop(columns=['numero_compte', 'converted'], errors='ignore')
X_customer_clean, y_customer_clean = prepare_features(X_customer, y_customer, "Customer Features")

# --- Customer + sequence features (frame captured right after that merge) ---
y_sequence = sequence_df['converted']
# Every column whose name contains '_seq' is excluded as well.
# NOTE(review): the reason for excluding them is not visible here - confirm.
columns_to_drop = [col for col in sequence_df.columns if '_seq' in col] + ['numero_compte', 'converted']
X_sequence = sequence_df.drop(columns=columns_to_drop, errors='ignore')
X_sequence_clean, y_sequence_clean = prepare_features(X_sequence, y_sequence, "Sequence Features")

# --- Full feature set, extended with the timeline interaction features ---
# The target is read before new_df is replaced by the interaction-feature frame.
# NOTE(review): this assumes create_timeline_interaction_features keeps the rows
# and their order, otherwise X_new and y_new no longer line up - confirm.
y_new = new_df['converted']
new_df = create_timeline_interaction_features(new_df)
X_new = new_df.drop(columns=['numero_compte', 'converted'], errors='ignore')
X_new_clean, y_new_clean = prepare_features(X_new, y_new, "New Features")

# Deep-learning stage. These helpers are imported mid-cell, right before use.
# NOTE(review): only create_dl_specific_features is used in the visible code;
# the other three dl_features imports may serve cells that are not shown here.
from dl_training.train import train_advanced_dl_model
from dl_features.features import (
    create_dl_specific_features,
    create_focused_features,
    enhance_region_features,
    enhance_discount_features,
)

# Derive the DL inputs from the full cleaned feature set.
X_dl, y_dl = create_dl_specific_features(X_new_clean, y_new_clean)

# Train the model; the feature-importance cell later reads result['model']
# and result['X_test'].
result = train_advanced_dl_model(X_dl, y_dl)


📊 Original quote data: 34,014 quotes from 23,888 customers

STRATEGY: CREATE FEATURES
Creating OPTIMIZED customer features (mode: first_conversion)...
  Filtering post-first-purchase data...
  Customers: 23,888, Quotes: 33,247
  Calculating features...
  Calculating price trajectory (optimized)...
✓ Created 14 leakage-free features
→ 23,888 customers | 39.6% converters
⏱️  Execution time: 10.2 seconds
⚠️  10.2s (target was 3s)
CREATING FIRST CONVERSION PREDICTION FEATURES (LEAKAGE-FREE)
  Total customers: 23,888
⚡ Processing customers with corrected first-conversion logic...
  Processed 0/23,888 customers
  Processed 5,000/23,888 customers
  Processed 10,000/23,888 customers
  Processed 15,000/23,888 customers
  Processed 20,000/23,888 customers
✅ First-conversion features calculation complete

🔍 VALIDATION REPORT:
   Total customers: 23,888
   First converters: 9,458 (39.6%)
   Never converters: 14,430

📊 Distribution check:
   Converters with 0 historical qu

In [3]:
# Unpack the two entries of the training result that the importance analysis needs.
# NOTE(review): 'X_test' is presumably the held-out split produced during
# training - confirm against train_advanced_dl_model.
model, X_test = result['model'], result['X_test']

# Rank the input features; the recorded output below labels this as a
# gradient-magnitude ranking.
importance_df = analyze_input_feature_importance(model, X_test)


GRADIENT-BASED INPUT FEATURE IMPORTANCE

Top 20 input features by gradient magnitude:
  total_historical_quotes_tanh             | Gradient: 0.464343
  total_historical_quotes_abs_sqrt         | Gradient: 0.421016
  total_historical_quotes_log              | Gradient: 0.375627
  total_historical_quotes                  | Gradient: 0.345736
  had_historical_quotes_tanh               | Gradient: 0.341350
  had_historical_quotes                    | Gradient: 0.339938
  min_price                                | Gradient: 0.306772
  primary_brand_log                        | Gradient: 0.262006
  min_price_tanh                           | Gradient: 0.260713
  avg_price_tanh                           | Gradient: 0.255805
  avg_current_price_tanh                   | Gradient: 0.249723
  max_price_tanh                           | Gradient: 0.234198
  avg_price_log                            | Gradient: 0.212739
  main_region                              | Gradient: 0.203347
  quote_count    