In [1]:
# ==============================================
# STEP 1: LOAD & PREPARE ACTUAL MARGIN DATA
# ==============================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.neighbors import NearestNeighbors

print("="*80)
print("STEP 1: USING ACTUAL mt_marge DATA")
print("="*80)

# Load the cleaned quote export and parse the quote-creation timestamp.
# The encoding is spelled out (it is also pandas' default) so that the accented
# French labels in the file decode the same way as the UTF-8 literals below.
df = pd.read_csv('cleaned_quote_data.csv', encoding='utf-8')
df['dt_creation_devis'] = pd.to_datetime(df['dt_creation_devis'])

# Product categories, assigned by case-insensitive substring match on the
# equipment family. Rules are applied in order, so a row matching several
# keywords keeps the LAST matching category (Heat Pump wins over Gas Boiler),
# exactly as the original sequential assignments did.
# NOTE: these literals were previously mojibake ('chaudiÃ¨re', 'pompe Ã  chaleur'),
# which cannot match correctly decoded text.
category_keywords = {
    'Gas Boiler': 'chaudière',
    'Heat Pump': 'pompe à chaleur',
}

df['product_category'] = 'Other'
for category, keyword in category_keywords.items():
    matches = df['famille_equipement_produit'].str.contains(keyword, case=False, na=False)
    df.loc[matches, 'product_category'] = category

# Keep only the two product lines under study; .copy() so that adding columns
# below does not write into a view of df.
analysis_df = df[df['product_category'].isin(list(category_keywords))].copy()

# Quote statuses counted as a win. Matching is exact (isin), so spacing and
# accents must equal the labels in statut_devis character for character.
# NOTE(review): the second label has no space after the comma — presumably that
# mirrors the source system's label; verify against df['statut_devis'].unique().
win_statuses = ['Validé, signé, réalisé et conforme',
                'Validé,signé mais abandonné',
                "Validé, signé - en attente d'intervention"]
analysis_df['is_win'] = analysis_df['statut_devis'].isin(win_statuses)

# Label mismatches fail silently (everything becomes 'Other' / not a win), so
# surface them here instead of letting later steps run on empty data.
if analysis_df.empty:
    print("WARNING: no rows matched the Gas Boiler / Heat Pump keywords - "
          "check the text encoding of famille_equipement_produit")
elif not analysis_df['is_win'].any():
    print("WARNING: no quote matched win_statuses - "
          "check the labels in statut_devis")

STEP 1: USING ACTUAL mt_marge DATA


In [2]:
# Inventory every monetary column (prefix mt_, "montant") of the full quote
# table df, then print summary statistics for the margin-related ones.
print("="*80)
print("ANALYZING ALL mt_ (montant) COLUMNS")
print("="*80)

# List all mt_ columns with their fill rate
mt_columns = [col for col in df.columns if col.startswith('mt_')]
# The emoji / euro literals in this cell were previously mojibake
# ('ðŸ’°', 'ðŸ“Š', 'â‚¬') and printed as garbage; they are restored here.
print(f"\n💰 Found {len(mt_columns)} monetary columns:")

for i, col in enumerate(mt_columns, 1):
    non_null = df[col].notna().sum()
    pct_non_null = non_null / len(df) * 100
    print(f"{i:2d}. {col:35s} {non_null:6,} values ({pct_non_null:.1f}% non-null)")

# Examine key columns
print("\n" + "="*80)
print("KEY MARGIN-RELATED COLUMNS ANALYSIS")
print("="*80)

key_columns = ['mt_marge', 'mt_apres_remise_ht_devis', 'mt_remise_exceptionnelle_ht',
               'mt_prime_cee', 'mt_prime_maprimerenov', 'mt_ttc_apres_aide_devis',
               'mt_ttc_avant_aide_devis']

for col in key_columns:
    # Columns absent from this export are skipped rather than raising KeyError.
    if col not in df.columns:
        continue

    values = df[col]
    present = values.notna()  # computed once, reused for count, share and guard
    non_null = present.sum()

    print(f"\n📊 {col}:")
    print(f"   Non-null: {non_null:,} ({present.mean()*100:.1f}%)")
    # Statistics of an all-null column would only print NaN, so omit them.
    if non_null > 0:
        print(f"   Mean: {values.mean():,.2f}€")
        print(f"   Median: {values.median():,.2f}€")
        print(f"   Min: {values.min():,.2f}€")
        print(f"   Max: {values.max():,.2f}€")

ANALYZING ALL mt_ (montant) COLUMNS

💰 Found 9 monetary columns:
 1. mt_apres_remise_ht_devis            34,014 values (100.0% non-null)
 2. mt_marge                            33,900 values (99.7% non-null)
 3. mt_apres_remise_ht_emis_devis       33,445 values (98.3% non-null)
 4. mt_marge_emis_devis                 33,331 values (98.0% non-null)
 5. mt_remise_exceptionnelle_ht         34,014 values (100.0% non-null)
 6. mt_ttc_apres_aide_devis             34,014 values (100.0% non-null)
 7. mt_ttc_avant_aide_devis             34,014 values (100.0% non-null)
 8. mt_prime_cee                        32,818 values (96.5% non-null)
 9. mt_prime_maprimerenov               32,314 values (95.0% non-null)

KEY MARGIN-RELATED COLUMNS ANALYSIS

📊 mt_marge:
   Non-null: 33,900 (99.7%)
   Mean: 3,396.25€
   Median: 2,539.02€
   Min: -8,121.64€
   Max: 174,761.60€

📊 mt_apres_remise_ht_devis:
   Non-null: 34,014 (100.0%)
   Mean: 7,423.83€
   Median: 5,656.49€
   Min: 1,000.00