In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from ml_features.features import create_features

import warnings
warnings.filterwarnings('ignore')

# Load data
df_quotes = pd.read_csv('cleaned_quote_data.csv')
df_quotes['dt_creation_devis'] = pd.to_datetime(df_quotes['dt_creation_devis'])
df_quotes['dt_signature_devis'] = pd.to_datetime(df_quotes['dt_signature_devis'])

In [2]:
# 1. Customer Split (95%/5%)
# --------------------------

# First quote per customer
cust_first = df_quotes.groupby('numero_compte')['dt_creation_devis'].min().reset_index()
cust_first = cust_first.sort_values('dt_creation_devis')

# 95th percentile split
split_idx = int(len(cust_first) * 0.95)
split_date = cust_first.iloc[split_idx]['dt_creation_devis']

# Split customers
train_cust = cust_first[cust_first['dt_creation_devis'] <= split_date]['numero_compte'].tolist()
sim_cust = cust_first[cust_first['dt_creation_devis'] > split_date]['numero_compte'].tolist()

# Split data
df_train = df_quotes[df_quotes['numero_compte'].isin(train_cust)].copy()
df_sim = df_quotes[df_quotes['numero_compte'].isin(sim_cust)].copy()

print(f"Split: {len(train_cust)} train, {len(sim_cust)} sim customers")

Split: 22708 train, 1180 sim customers


In [3]:
# 2. Model Training (Silent)
# --------------------------
from ml_features.features import create_features
from ml_training.train_xgb import train_xgb
import sys
import os

# Suppress create_features output
class HiddenPrints:
    def __enter__(self):
        self._original_stdout = sys.stdout
        sys.stdout = open(os.devnull, 'w')
    def __exit__(self, exc_type, exc_val, exc_tb):
        sys.stdout.close()
        sys.stdout = self._original_stdout

# Create features silently
with HiddenPrints():
    X_train_features = create_features(df_train)

# Prepare target
if 'converted' not in X_train_features.columns:
    if 'fg_devis_accepte' in X_train_features.columns:
        X_train_features['converted'] = (X_train_features['fg_devis_accepte'] == 1).astype(int)
    else:
        # Calculate from raw data
        cust_conv = df_train.groupby('numero_compte')['fg_devis_accepte'].max().reset_index()
        cust_conv.columns = ['numero_compte', 'converted']
        X_train_features = pd.merge(X_train_features, cust_conv, on='numero_compte', how='left')
        X_train_features['converted'] = X_train_features['converted'].fillna(0).astype(int)

y_train = X_train_features['converted']

# Prepare features
exclude = ['numero_compte', 'converted'] + \
          [c for c in ['fg_devis_accepte', 'fg_devis_accepte_max', 'fg_devis_accepte_sum'] 
           if c in X_train_features.columns]

X_train = X_train_features.drop(exclude, axis=1)
feature_names = X_train.columns.tolist()

# Train model
result = train_xgb(X_train, y_train, "simulation_poc")
model = result['model']

print(f"Model trained: {len(feature_names)} features")

✓ Model saved: simulation_poc.pkl
✓ AUC: 0.729
✓ F1 Score: 0.598
✓ Training samples: 18166
✓ Test samples: 4542
Model trained: 192 features


In [4]:
# 3. Select Simulation Customers
# ------------------------------
# Filter non-converted customers from simulation pool
sim_conv = df_sim.groupby('numero_compte')['fg_devis_accepte'].max()
non_conv_customers = sim_conv[sim_conv == 0].index.tolist()

# Get quote counts
quote_counts = df_sim.groupby('numero_compte').size()

# Eligible customers with their quote counts
eligible = [(cust, quote_counts.get(cust, 0)) for cust in non_conv_customers 
            if quote_counts.get(cust, 0) >= 1]

# Stratified sample
import random
random.seed(42)

single_quote = [c for c, cnt in eligible if cnt == 1]
multi_quote = [c for c, cnt in eligible if cnt >= 2]

selected = []
if len(single_quote) >= 2:
    selected.extend(random.sample(single_quote, 2))
if len(multi_quote) >= 3:
    selected.extend(random.sample(multi_quote, 3))

selected = selected[:5]

print(f"Selected {len(selected)} non-converted customers")

Selected 5 non-converted customers


In [None]:
# 4. Baseline Predictions (Fixed)
# -------------------------------
baseline_results = []

for cust_id in selected:
    cust_quotes = df_sim[df_sim['numero_compte'] == cust_id]
    
    with HiddenPrints():
        # Process all selected customers together once
        all_selected_quotes = df_sim[df_sim['numero_compte'].isin(selected)]
        features = create_features(all_selected_quotes)
    
    cust_features = features[features['numero_compte'] == cust_id]
    
    # If no features created, create minimal
    if len(cust_features) == 0:
        cust_features = pd.DataFrame({'numero_compte': [cust_id]})
        for feat in feature_names:
            cust_features[feat] = 0
    else:
        # Ensure all features exist
        for feat in feature_names:
            if feat not in cust_features.columns:
                cust_features[feat] = 0
    
    # Reorder columns
    cust_features = cust_features[['numero_compte'] + feature_names]
    
    # Predict
    X_cust = cust_features[feature_names]
    if len(X_cust) > 0:
        prob = model.predict_proba(X_cust)[:, 1][0]
    else:
        prob = 0.5  # Default
    
    baseline_results.append({
        'customer_id': cust_id,
        'quote_count': len(cust_quotes),
        'baseline_prob': prob
    })

baseline_df = pd.DataFrame(baseline_results)
print(f"Baseline: {len(baseline_df)} customers")

In [8]:
# 5. Feature Change Analysis (Short)
# ----------------------------------
# Test if features change with actions
test_cust = selected[0]
test_quotes = df_sim[df_sim['numero_compte'] == test_cust]

# Apply changes
discount_modified = test_quotes.copy()
discount_modified['mt_remise_exceptionnelle_ht'] -= 100

product_modified = test_quotes.copy()
product_modified['famille_equipement_produit'] = 'Pompe à chaleur'

# Create features
with HiddenPrints():
    other = df_sim[df_sim['numero_compte'].isin([c for c in selected if c != test_cust])]
    
    orig = create_features(pd.concat([test_quotes, other]))
    disc = create_features(pd.concat([discount_modified, other]))
    prod = create_features(pd.concat([product_modified, other]))

# Check key features
key_features = ['avg_discount_pct', 'latest_equipment', 'quote_consistency_score']

print("Feature changes:")
for feat in key_features:
    if feat in orig.columns:
        o = orig[orig['numero_compte'] == test_cust][feat].iloc[0]
        d = disc[disc['numero_compte'] == test_cust][feat].iloc[0] if feat in disc.columns else o
        p = prod[prod['numero_compte'] == test_cust][feat].iloc[0] if feat in prod.columns else o
        
        print(f"{feat}: {o:.3f} → discount:{d:.3f} product:{p:.3f}")

Feature changes:
avg_discount_pct: 0.000 → discount:-0.010 product:0.000
latest_equipment: 3.000 → discount:3.000 product:3.000
quote_consistency_score: 1.000 → discount:1.000 product:1.000


In [6]:
# 5. Feature Change Analysis (Fixed)
# ----------------------------------
print("Step 5: Analyzing how raw changes affect engineered features")

# Test with one customer
test_cust = selected[0]
test_quotes = df_sim[df_sim['numero_compte'] == test_cust].copy()

print(f"\nTesting customer {test_cust}:")
print(f"Original discount: €{test_quotes['mt_remise_exceptionnelle_ht'].sum():.2f}")
print(f"Original product: {test_quotes['famille_equipement_produit'].iloc[0]}")
print(f"Quote count: {len(test_quotes)}")

# Create features for ALL selected customers together (to ensure feature creation)
all_selected_quotes = df_sim[df_sim['numero_compte'].isin(selected)]

with HiddenPrints():
    all_features = create_features(all_selected_quotes)

print(f"\nFeatures created for all selected customers:")
print(f"Feature shape: {all_features.shape}")
print(f"Customers in features: {all_features['numero_compte'].nunique()}")

# Check if our test customer is in the features
cust_in_features = test_cust in all_features['numero_compte'].values
print(f"Test customer in features: {cust_in_features}")

if cust_in_features:
    orig_cust = all_features[all_features['numero_compte'] == test_cust].iloc[0]
    
    print("\n" + "="*60)
    print("SAMPLE OF ENGINEERED FEATURES FOR THIS CUSTOMER:")
    print("="*60)
    
    # Show some key features
    sample_features = [
        'total_quotes', 'avg_price', 'max_price', 'avg_discount_pct',
        'equipment_variety_count', 'brand_loyalty_index'
    ]
    
    for feat in sample_features:
        if feat in orig_cust:
            print(f"{feat}: {orig_cust[feat]}")
    
    # Check if avg_discount_pct exists
    if 'avg_discount_pct' in orig_cust:
        print(f"\navg_discount_pct: {orig_cust['avg_discount_pct']}")
        print("This is what changes when we modify discount amount")
    else:
        print("\n⚠️ avg_discount_pct not in features!")
        
    # Check model's top features
    print("\n" + "="*60)
    print("MODEL'S TOP 5 FEATURES (should change with our actions):")
    print("="*60)
    
    top_5 = feature_importance.head(5)['feature'].tolist()
    for feat in top_5:
        if feat in orig_cust:
            print(f"{feat}: {orig_cust[feat]:.6f}")
        else:
            print(f"{feat}: NOT IN FEATURES ⚠️")

else:
    print(f"\n⚠️ CRITICAL: Customer {test_cust} not in features!")
    print("This explains why scenarios have no impact.")
    print("\nChecking feature columns:")
    print(f"Feature columns: {len(all_features.columns)}")
    print(f"Sample columns: {list(all_features.columns[:10])}")

Step 5: Analyzing how raw changes affect engineered features

Testing customer CL00345652:
Original discount: €0.00
Original product: Poêle
Quote count: 1

Features created for all selected customers:
Feature shape: (5, 193)
Customers in features: 5
Test customer in features: True

SAMPLE OF ENGINEERED FEATURES FOR THIS CUSTOMER:
total_quotes: 1
avg_price: 9997.16
max_price: 9997.16
avg_discount_pct: 0.0
equipment_variety_count: 1
brand_loyalty_index: 1.0

avg_discount_pct: 0.0
This is what changes when we modify discount amount

MODEL'S TOP 5 FEATURES (should change with our actions):


NameError: name 'feature_importance' is not defined

In [7]:
# Debug: Why customer isn't in features
print("Debugging feature creation...")

# Check which customers ARE in features
customers_in_features = all_features['numero_compte'].unique()
print(f"\nCustomers in features: {customers_in_features}")

# Check our selected customers
print(f"\nOur selected customers: {selected}")

# Find mismatch
missing = [c for c in selected if c not in customers_in_features]
print(f"\nMissing from features: {missing}")

if missing:
    # Check data for missing customer
    missing_cust = missing[0]
    missing_data = df_sim[df_sim['numero_compte'] == missing_cust]
    
    print(f"\nData for missing customer {missing_cust}:")
    print(f"Rows: {len(missing_data)}")
    print(f"Columns: {len(missing_data.columns)}")
    
    # Check specific columns that might cause issues
    critical_cols = ['famille_equipement_produit', 'mt_remise_exceptionnelle_ht', 
                     'dt_creation_devis', 'fg_devis_accepte']
    
    print(f"\nCritical column check:")
    for col in critical_cols:
        exists = col in missing_data.columns
        null_count = missing_data[col].isnull().sum() if exists else 'N/A'
        print(f"  {col}: {'✓' if exists else '✗'} (null: {null_count})")
    
    # Try creating features for JUST this customer
    print(f"\nTrying to create features for single customer...")
    with HiddenPrints():
        single_features = create_features(missing_data)
    
    print(f"Single customer features shape: {single_features.shape}")
    if len(single_features) > 0:
        print(f"Customer now in features: {missing_cust in single_features['numero_compte'].values}")
    else:
        print("⚠️ No features created even for single customer!")

Debugging feature creation...

Customers in features: ['CL00130455' 'CL00270457' 'CL00335136' 'CL00345183' 'CL00345652']

Our selected customers: ['CL00345652', 'CL00270457', 'CL00130455', 'CL00345183', 'CL00335136']

Missing from features: []
