In [1]:
# Opening banner for this cell: rule, title, rule -- emitted by one print call
print("=" * 80, "IMPLEMENTING 3 AGGREGATION STRATEGIES", "=" * 80, sep="\n")

# Load the clean data
import pandas as pd
import numpy as np
from datetime import timedelta

# Read the cleaned quote export and parse the quote-creation column to datetime
df_clean = pd.read_csv('cleaned_quote_data.csv').assign(
    dt_creation_devis=lambda frame: pd.to_datetime(frame['dt_creation_devis'])
)
print(f"Loaded clean data: {len(df_clean):,} quotes from {df_clean['numero_compte'].nunique():,} customers")

# Strategy 1: Customer Lifetime Aggregation
print("\n" + "=" * 80, "STRATEGY 1: CUSTOMER LIFETIME (Ever bought?)", "=" * 80, sep="\n")

def aggregate_customer_lifetime(df):
    """
    Collapse the quote table to one row per customer (``numero_compte``).

    The ``converted`` column is the maximum of ``fg_devis_accepte`` over the
    customer's quotes, i.e. 1 as soon as any quote was accepted.

    Returns a DataFrame with the customer key, quote-amount statistics
    (mean/min/max/std/count), the most frequent region, agency and product
    family, the first and last quote dates, and three derived columns:
    ``customer_duration_days``, ``quotes_per_month`` and ``price_range``.
    """
    def _dominant(values):
        # Most frequent value of the group; 'missing' when mode() is empty
        modes = values.mode()
        return modes[0] if len(modes) > 0 else 'missing'

    amount_stats = ['mean', 'min', 'max', 'std', 'count']
    per_customer = df.groupby('numero_compte').agg({
        'fg_devis_accepte': 'max',
        'mt_apres_remise_ht_devis': amount_stats,
        'nom_region': _dominant,
        'nom_agence': _dominant,
        'famille_equipement_produit': _dominant,
        'dt_creation_devis': ['min', 'max'],
    })

    # Replace the two-level column index with flat names (same order as above)
    per_customer.columns = [
        'converted',
        'avg_quote_amount', 'min_quote_amount', 'max_quote_amount',
        'std_quote_amount', 'total_quotes',
        'main_region', 'main_agency', 'main_product_family',
        'first_quote_date', 'last_quote_date',
    ]
    per_customer = per_customer.reset_index()

    # Inclusive span between first and last quote (a single quote counts as 1 day)
    first_seen = pd.to_datetime(per_customer['first_quote_date'])
    last_seen = pd.to_datetime(per_customer['last_quote_date'])
    per_customer['customer_duration_days'] = (last_seen - first_seen).dt.days + 1

    # 30.44 is used as the number of days per month
    months_active = per_customer['customer_duration_days'] / 30.44
    per_customer['quotes_per_month'] = (
        per_customer['total_quotes'] / months_active
    ).fillna(0)

    per_customer['price_range'] = (
        per_customer['max_quote_amount'] - per_customer['min_quote_amount']
    )

    return per_customer

# Apply Strategy 1 and report the headline numbers
customer_lifetime_df = aggregate_customer_lifetime(df_clean)
print(
    f"Customer-level dataset created: {len(customer_lifetime_df):,} customers",
    f"Conversion rate: {customer_lifetime_df['converted'].mean():.1%}",
    f"Avg quotes per customer: {customer_lifetime_df['total_quotes'].mean():.2f}",
    f"Sample features: {list(customer_lifetime_df.columns)[:8]}...",
    sep="\n",
)

# Strategy 2: 30-Day Rolling Windows
print("\n" + "=" * 80, "STRATEGY 2: 30-DAY ROLLING WINDOWS", "=" * 80, sep="\n")

def aggregate_30day_windows(df, window_days=30):
    """
    Group each customer's quotes into "opportunities" split by quiet gaps.

    Quotes are ordered per customer by ``dt_creation_devis``. A quote opens a
    new opportunity when it is the customer's first quote, or when the number
    of whole days since that customer's previous quote is greater than
    ``window_days``. Grouping is gap-based (chained): an opportunity can span
    more than ``window_days`` overall as long as no two consecutive quotes
    are further apart than ``window_days``.

    Parameters
    ----------
    df : DataFrame with columns ``numero_compte``, ``dt_creation_devis``
        (datetime), ``fg_devis_accepte``, ``mt_apres_remise_ht_devis``,
        ``famille_equipement_produit``, ``nom_region`` and ``nom_agence``.
        The input frame is not modified.
    window_days : maximum gap, in whole days, between consecutive quotes of
        the same opportunity (default 30).

    Returns
    -------
    DataFrame with one row per (customer, opportunity): the keys
    ``numero_compte``, ``opportunity_id`` and ``customer_opportunity_id``,
    price statistics, start/end dates, the list of product families quoted,
    region, agency, derived duration/intensity features, and two
    customer-level columns (``customer_total_sales``, ``customer_avg_price``).
    """
    # Sort by customer and date (on a copy, so the caller's frame is untouched)
    df = df.sort_values(['numero_compte', 'dt_creation_devis']).copy()

    # Whole days since the same customer's previous quote (NaN for the first)
    df['days_since_last'] = df.groupby('numero_compte')['dt_creation_devis'].diff().dt.days

    # New opportunity if the gap exceeds the window, or it is the first quote
    df['new_opportunity'] = (df['days_since_last'] > window_days) | df['days_since_last'].isna()

    # Running count of opportunity starts gives a 1-based id per customer
    df['opportunity_id'] = df.groupby('numero_compte')['new_opportunity'].cumsum()
    # Cast the account id to str first: numeric ids (common after read_csv)
    # cannot be concatenated with a string and would raise TypeError.
    df['customer_opportunity_id'] = (
        df['numero_compte'].astype(str) + '_opp_' + df['opportunity_id'].astype(str)
    )

    # Aggregate by opportunity
    opportunity_data = df.groupby(['numero_compte', 'opportunity_id', 'customer_opportunity_id']).agg({
        'fg_devis_accepte': 'max',  # 1 if any quote in opportunity converted
        'mt_apres_remise_ht_devis': ['mean', 'min', 'max', 'count', 'std'],
        'dt_creation_devis': ['min', 'max'],  # Opportunity timeframe
        'famille_equipement_produit': lambda x: list(x.unique()),  # Products considered
        'nom_region': 'first',  # Assuming region constant per opportunity
        'nom_agence': 'first'   # Assuming agency constant
    })

    # Flatten columns (order must match the aggregation spec above)
    opportunity_data.columns = [
        'converted',
        'avg_price', 'min_price', 'max_price', 'quote_count', 'price_std',
        'start_date', 'end_date',
        'products_considered', 'region', 'agency'
    ]

    opportunity_data = opportunity_data.reset_index()

    # Inclusive duration: a single-quote opportunity lasts 1 day
    opportunity_data['opportunity_duration_days'] = (
        pd.to_datetime(opportunity_data['end_date']) -
        pd.to_datetime(opportunity_data['start_date'])
    ).dt.days + 1

    opportunity_data['quotes_per_day'] = opportunity_data['quote_count'] / opportunity_data['opportunity_duration_days']
    opportunity_data['price_range'] = opportunity_data['max_price'] - opportunity_data['min_price']
    opportunity_data['product_variety'] = opportunity_data['products_considered'].apply(len)

    # Add customer-level context.
    # NOTE(review): customer_total_sales sums accepted quotes over the
    # customer's whole history, including the opportunity on the same row.
    # If it is used as a feature to predict `converted` it leaks the target;
    # confirm how downstream modelling uses it.
    customer_stats = df.groupby('numero_compte').agg({
        'fg_devis_accepte': 'sum',
        'mt_apres_remise_ht_devis': 'mean'
    }).rename(columns={
        'fg_devis_accepte': 'customer_total_sales',
        'mt_apres_remise_ht_devis': 'customer_avg_price'
    })

    opportunity_data = opportunity_data.merge(customer_stats, on='numero_compte', how='left')

    return opportunity_data

# Apply Strategy 2 and report the headline numbers
opportunity_30day_df = aggregate_30day_windows(df_clean, window_days=30)
print(
    f"30-day opportunities created: {len(opportunity_30day_df):,} opportunities",
    f"Conversion rate: {opportunity_30day_df['converted'].mean():.1%}",
    f"Avg quotes per opportunity: {opportunity_30day_df['quote_count'].mean():.2f}",
    f"Avg opportunity duration: {opportunity_30day_df['opportunity_duration_days'].mean():.1f} days",
    sep="\n",
)

# Strategy 3: Product Session Groups
print("\n" + "=" * 80, "STRATEGY 3: PRODUCT SESSION GROUPS", "=" * 80, sep="\n")

def aggregate_product_sessions(df, max_gap_days=90):
    """
    Group each customer's quotes into sessions by product family and time gap.

    Quotes are ordered per customer by ``dt_creation_devis``. A quote opens a
    new session when it is the customer's first quote, when its
    ``famille_equipement_produit`` differs from that customer's previous
    quote, or when more than ``max_gap_days`` whole days have passed since
    the previous quote. Two consecutive quotes whose product family is both
    missing are treated as "unchanged".

    Parameters
    ----------
    df : DataFrame with columns ``numero_compte``, ``dt_creation_devis``
        (datetime), ``fg_devis_accepte``, ``mt_apres_remise_ht_devis``,
        ``famille_equipement_produit``, ``type_equipement_produit``,
        ``nom_region`` and ``nom_agence``. The input frame is not modified.
    max_gap_days : gap, in whole days, above which a new session is started
        even if the product family is unchanged (default 90).

    Returns
    -------
    DataFrame with one row per (customer, session): the keys
    ``numero_compte``, ``session_id`` and ``customer_session_id``, price
    statistics, start/end dates, the most frequent product family, up to
    three product types, region, agency, derived features, and two
    customer-level columns (``customer_total_sales``,
    ``customer_product_variety``).
    """
    max_listed_types = 3  # how many distinct product types are kept in `product_types`

    # Sort by customer and date (on a copy, so the caller's frame is untouched)
    df = df.sort_values(['numero_compte', 'dt_creation_devis']).copy()

    # Track product family changes against the same customer's previous quote
    current_family = df['famille_equipement_produit']
    previous_family = df.groupby('numero_compte')['famille_equipement_produit'].shift()
    df['prev_product_family'] = previous_family
    # `!=` is True whenever either side is missing, so two consecutive quotes
    # with an unknown family would each start a session; mask that case out.
    both_missing = current_family.isna() & previous_family.isna()
    df['product_changed'] = (current_family != previous_family) & ~both_missing

    # Also consider long time gaps as a new session
    df['days_since_last'] = df.groupby('numero_compte')['dt_creation_devis'].diff().dt.days
    df['long_gap'] = df['days_since_last'] > max_gap_days

    # New session if product changed OR long gap OR first quote
    df['new_session'] = df['product_changed'] | df['long_gap'] | df['days_since_last'].isna()

    # Running count of session starts gives a 1-based id per customer
    df['session_id'] = df.groupby('numero_compte')['new_session'].cumsum()
    # Cast the account id to str first: numeric ids (common after read_csv)
    # cannot be concatenated with a string and would raise TypeError.
    df['customer_session_id'] = (
        df['numero_compte'].astype(str) + '_session_' + df['session_id'].astype(str)
    )

    # Aggregate by session
    session_data = df.groupby(['numero_compte', 'session_id', 'customer_session_id']).agg({
        'fg_devis_accepte': 'max',  # 1 if any quote in session converted
        'mt_apres_remise_ht_devis': ['mean', 'min', 'max', 'count', 'std'],
        'dt_creation_devis': ['min', 'max'],
        'famille_equipement_produit': lambda x: x.mode()[0] if len(x.mode()) > 0 else 'mixed',
        # All distinct product types in date order; truncated further below
        'type_equipement_produit': lambda x: list(x.unique()),
        'nom_region': 'first',
        'nom_agence': 'first'
    })

    # Flatten columns (order must match the aggregation spec above)
    session_data.columns = [
        'converted',
        'avg_price', 'min_price', 'max_price', 'quote_count', 'price_std',
        'start_date', 'end_date',
        'main_product_family', 'product_types', 'region', 'agency'
    ]

    session_data = session_data.reset_index()

    # Inclusive duration: a single-quote session lasts 1 day
    session_data['session_duration_days'] = (
        pd.to_datetime(session_data['end_date']) -
        pd.to_datetime(session_data['start_date'])
    ).dt.days + 1

    session_data['quotes_per_day'] = session_data['quote_count'] / session_data['session_duration_days']
    session_data['price_range'] = session_data['max_price'] - session_data['min_price']
    # Count BEFORE truncating the list, otherwise the count is capped at 3
    session_data['product_type_count'] = session_data['product_types'].apply(len)
    # Keep only the first few distinct types seen (date order, not frequency)
    session_data['product_types'] = session_data['product_types'].apply(
        lambda types: types[:max_listed_types]
    )

    # Add customer-level context.
    # NOTE(review): customer_total_sales sums accepted quotes over the
    # customer's whole history, including the session on the same row. If it
    # is used as a feature to predict `converted` it leaks the target;
    # confirm how downstream modelling uses it.
    customer_stats = df.groupby('numero_compte').agg({
        'fg_devis_accepte': 'sum',
        'famille_equipement_produit': 'nunique'
    }).rename(columns={
        'fg_devis_accepte': 'customer_total_sales',
        'famille_equipement_produit': 'customer_product_variety'
    })

    session_data = session_data.merge(customer_stats, on='numero_compte', how='left')

    return session_data

# Apply Strategy 3 and report the headline numbers
product_sessions_df = aggregate_product_sessions(df_clean)
print(
    f"Product sessions created: {len(product_sessions_df):,} sessions",
    f"Conversion rate: {product_sessions_df['converted'].mean():.1%}",
    f"Avg quotes per session: {product_sessions_df['quote_count'].mean():.2f}",
    f"Avg session duration: {product_sessions_df['session_duration_days'].mean():.1f} days",
    sep="\n",
)

# Summary Comparison: one table row per aggregation strategy
print("\n" + "=" * 80, "SUMMARY: ALL 3 STRATEGIES COMPARISON", "=" * 80, sep="\n")

summary_data = {
    'Strategy': ['Customer Lifetime', '30-Day Windows', 'Product Sessions'],
    # Raw integer row counts
    'Units': [
        len(frame)
        for frame in (customer_lifetime_df, opportunity_30day_df, product_sessions_df)
    ],
    # Share of units whose `converted` flag is set, as a percentage string
    'Conversion Rate': [
        f"{frame['converted'].mean():.1%}"
        for frame in (customer_lifetime_df, opportunity_30day_df, product_sessions_df)
    ],
    # The quote-count column is named differently in the customer-level frame
    'Avg Quotes/Unit': [
        f"{frame[count_column].mean():.2f}"
        for frame, count_column in (
            (customer_lifetime_df, 'total_quotes'),
            (opportunity_30day_df, 'quote_count'),
            (product_sessions_df, 'quote_count'),
        )
    ],
    # Same counts as 'Units', formatted with thousands separators
    'Sample Size': [
        f"{len(frame):,}"
        for frame in (customer_lifetime_df, opportunity_30day_df, product_sessions_df)
    ],
}

summary_df = pd.DataFrame(summary_data)
print(summary_df.to_string(index=False))

# Save all datasets for Day 3
# The status line originally contained a floppy-disk emoji whose UTF-8 bytes
# had been decoded as cp1252 (mojibake); the intended character is restored.
# NOTE(review): `products_considered` and `product_types` hold Python lists,
# which to_csv writes as their string representation -- whoever reads these
# files back must parse those columns explicitly.
print("\n💾 SAVING AGGREGATED DATASETS")
customer_lifetime_df.to_csv('customer_lifetime_data.csv', index=False)
opportunity_30day_df.to_csv('30day_opportunities_data.csv', index=False)
product_sessions_df.to_csv('product_sessions_data.csv', index=False)

IMPLEMENTING 3 AGGREGATION STRATEGIES
Loaded clean data: 38,333 quotes from 25,930 customers

STRATEGY 1: CUSTOMER LIFETIME (Ever bought?)
Customer-level dataset created: 25,930 customers
Conversion rate: 41.2%
Avg quotes per customer: 1.48
Sample features: ['numero_compte', 'converted', 'avg_quote_amount', 'min_quote_amount', 'max_quote_amount', 'std_quote_amount', 'total_quotes', 'main_region']...

STRATEGY 2: 30-DAY ROLLING WINDOWS
30-day opportunities created: 28,648 opportunities
Conversion rate: 39.3%
Avg quotes per opportunity: 1.34
Avg opportunity duration: 2.0 days

STRATEGY 3: PRODUCT SESSION GROUPS
Product sessions created: 30,109 sessions
Conversion rate: 38.2%
Avg quotes per session: 1.27
Avg session duration: 3.1 days

SUMMARY: ALL 3 STRATEGIES COMPARISON
         Strategy  Units Conversion Rate Avg Quotes/Unit Sample Size
Customer Lifetime  25930           41.2%            1.48      25,930
   30-Day Windows  28648           39.3%            1.34      28,648
 Product Sess