In [1]:
import sys
print(f"Python executable: {sys.executable}")
# Should point to poetry's virtual environment

import wandb
wandb.login()

Python executable: /home/valeriya/project/homeserve/credit_policy/.direnv/python-3.12.0/bin/python


[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from /home/valeriya/.netrc.
[34m[1mwandb[0m: Currently logged in as: [33mvslovik[0m ([33mhomeserve[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
import json
import pickle
from etl.util import prepare_dataset_without_leakage

import warnings
warnings.filterwarnings('ignore')

# ============ ADD WANDB IMPORTS ============
import wandb
from wandb.sklearn import plot_roc, plot_confusion_matrix, plot_precision_recall, plot_feature_importances

print("\n" + "="*80)
print("TRAINING MODELS WITHOUT DATA LEAKAGE")
print("="*80)

# ============ INITIALIZE WANDB ============
print("\nüîó Initializing Weights & Biases tracking...")
wandb.init(
    project="france-hvac",  # Your project name
    name="credit-policy-realistic-models",
    config={
        "test_size": 0.2,
        "random_state": 42,
        "n_estimators": 100,
        "max_depth": 8,
        "min_samples_split": 50,
        "class_weight": "balanced",
        "n_jobs": -1,
        "stratify": True
    }
)

# Access config
config = wandb.config
print(f"‚úì Wandb initialized. Run URL: {wandb.run.get_url()}")

print("\nüîÑ Preparing datasets with proper time-based features...")

# Load datasets
customer_lifetime = pd.read_csv('customer_lifetime_data.csv')
opportunity_30day = pd.read_csv('30day_opportunities_data.csv')
product_sessions = pd.read_csv('product_sessions_data.csv')

datasets_fixed = {
    'Customer Lifetime': prepare_dataset_without_leakage(customer_lifetime, 'Customer Lifetime'),
    '30-Day Windows': prepare_dataset_without_leakage(opportunity_30day, '30-Day Windows'),
    'Product Sessions': prepare_dataset_without_leakage(product_sessions, 'Product Sessions')
}

# Re-train models
results_fixed = {}
run_metrics = []  # Store metrics for wandb comparison

for dataset_name, (X, y) in datasets_fixed.items():
    print(f"\nüéØ Re-training {dataset_name} ({len(X):,} samples)...")
    
    # ============ LOG DATASET INFO TO WANDB ============
    wandb.log({
        f"{dataset_name}/dataset_size": len(X),
        f"{dataset_name}/conversion_rate": y.mean(),
        f"{dataset_name}/n_features": X.shape[1]
    })
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=config.test_size, random_state=config.random_state, stratify=y
    )
    
    # Train Random Forest
    rf_model = RandomForestClassifier(
        n_estimators=config.n_estimators,
        max_depth=config.max_depth,
        min_samples_split=config.min_samples_split,
        class_weight=config.class_weight,
        random_state=config.random_state,
        n_jobs=config.n_jobs
    )
    
    rf_model.fit(X_train, y_train)
    
    # Predictions
    rf_y_pred = rf_model.predict(X_test)
    rf_y_pred_proba = rf_model.predict_proba(X_test)[:, 1]
    
    # Calculate metrics
    rf_auc = roc_auc_score(y_test, rf_y_pred_proba)
    rf_accuracy = accuracy_score(y_test, rf_y_pred)
    rf_precision = precision_score(y_test, rf_y_pred, zero_division=0)
    rf_recall = recall_score(y_test, rf_y_pred, zero_division=0)
    rf_f1 = f1_score(y_test, rf_y_pred, zero_division=0)
    
    # Feature importance
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': rf_model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    results_fixed[dataset_name] = {
        'AUC': rf_auc,
        'Accuracy': rf_accuracy,
        'Precision': rf_precision,
        'Recall': rf_recall,
        'F1': rf_f1,
        'Top Features': feature_importance.head(5)['feature'].tolist(),
        'Feature Importances': feature_importance.head(10),
        'Sample Size': len(X),
        'Conversion Rate': y.mean()
    }
    
    # ============ LOG METRICS TO WANDB ============
    wandb.log({
        f"{dataset_name}/auc": rf_auc,
        f"{dataset_name}/accuracy": rf_accuracy,
        f"{dataset_name}/precision": rf_precision,
        f"{dataset_name}/recall": rf_recall,
        f"{dataset_name}/f1": rf_f1,
        f"{dataset_name}/train_size": len(X_train),
        f"{dataset_name}/test_size": len(X_test)
    })
    
    # ============ LOG VISUALIZATIONS TO WANDB ============
    # ROC Curve
    try:
        roc_fig = plot_roc(y_test, rf_y_pred_proba, labels=['No Convert', 'Convert'])
        wandb.log({f"{dataset_name}/roc_curve": roc_fig})
    except:
        print(f"  Note: Could not plot ROC for {dataset_name}")
    
    # Confusion Matrix
    try:
        cm_fig = plot_confusion_matrix(y_test, rf_y_pred, labels=['No Convert', 'Convert'])
        wandb.log({f"{dataset_name}/confusion_matrix": cm_fig})
    except:
        pass
    
    # Feature Importance Plot
    try:
        if len(feature_importance) > 0:
            # Create feature importance table for wandb
            importance_table = wandb.Table(dataframe=feature_importance.head(20))
            wandb.log({f"{dataset_name}/feature_importance": importance_table})
            
            # Log top features as summary
            wandb.run.summary[f"{dataset_name}_top_feature"] = feature_importance.iloc[0]['feature']
            wandb.run.summary[f"{dataset_name}_top_importance"] = feature_importance.iloc[0]['importance']
    except:
        pass
    
    # Store for comparison
    run_metrics.append({
        'dataset': dataset_name,
        'auc': rf_auc,
        'accuracy': rf_accuracy,
        'f1': rf_f1
    })
    
    print(f"  ‚úì Fixed Random Forest AUC: {rf_auc:.3f}")
    print(f"  ‚úì Top 3 features: {feature_importance.head(3)['feature'].tolist()}")

# ============ CREATE COMPARISON VISUALIZATION ============
print("\nüìä Creating model comparison dashboard...")

# Create comparison table for wandb
comparison_data = []
for dataset_name, result in results_fixed.items():
    comparison_data.append([
        dataset_name,
        result['Sample Size'],
        result['Conversion Rate'],
        result['AUC'],
        result['Accuracy'],
        result['F1'],
        result['Top Features'][0] if result['Top Features'] else 'N/A'
    ])

comparison_df = pd.DataFrame(comparison_data, 
    columns=['Dataset', 'Samples', 'Conv_Rate', 'AUC', 'Accuracy', 'F1', 'Top_Feature'])

# Log comparison table
comparison_table = wandb.Table(dataframe=comparison_df)
wandb.log({"model_comparison": comparison_table})

# Log comparison chart data
wandb.log({
    "comparison/auc_by_dataset": wandb.plot.bar(
        comparison_table, "Dataset", "AUC",
        title="AUC by Dataset"
    ),
    "comparison/f1_by_dataset": wandb.plot.bar(
        comparison_table, "Dataset", "F1",
        title="F1 Score by Dataset"
    )
})

# Performance Comparison After Fix
print("\n" + "="*80)
print("REALISTIC PERFORMANCE COMPARISON (NO LEAKAGE)")
print("="*80)

comparison_fixed = []
for dataset_name, result in results_fixed.items():
    comparison_fixed.append({
        'Aggregation Strategy': dataset_name,
        'Samples': f"{result['Sample Size']:,}",
        'Conv Rate': f"{result['Conversion Rate']:.1%}",
        'Realistic AUC': f"{result['AUC']:.3f}",
        'Accuracy': f"{result['Accuracy']:.3f}",
        'F1 Score': f"{result['F1']:.3f}",
        'Top Feature': result['Top Features'][0] if result['Top Features'] else 'N/A'
    })

comparison_fixed_df = pd.DataFrame(comparison_fixed)
comparison_fixed_df = comparison_fixed_df.sort_values('Realistic AUC', ascending=False)

print("\nüìà REALISTIC PERFORMANCE RANKING:")
for i, row in comparison_fixed_df.iterrows():
    print(f"{i+1}. {row['Aggregation Strategy']}:")
    print(f"   Conversion Rate: {row['Conv Rate']}")
    print(f"   Realistic AUC: {row['Realistic AUC']}")
    print(f"   Accuracy: {row['Accuracy']}")
    print(f"   F1 Score: {row['F1 Score']}")
    print(f"   Top Feature: {row['Top Feature']}")
    print()

# Find best realistic strategy
best_realistic_strategy = comparison_fixed_df.iloc[0]['Aggregation Strategy']
best_realistic_auc = float(comparison_fixed_df.iloc[0]['Realistic AUC'])

print("\n" + "="*80)
print("BUSINESS INSIGHTS")
print("="*80)

print(f"\nüèÜ REAL BEST STRATEGY: {best_realistic_strategy}")
print(f"   Realistic AUC: {best_realistic_auc:.3f}")

print(f"\nüîç REAL TOP CONVERSION DRIVERS:")
if best_realistic_strategy in results_fixed:
    top_features = results_fixed[best_realistic_strategy]['Top Features']
    for i, feature in enumerate(top_features[:5], 1):
        print(f"   {i}. {feature}")

# ============ SAVE BEST MODEL AND LOG TO WANDB ============
if best_realistic_strategy in datasets_fixed:
    X, y = datasets_fixed[best_realistic_strategy]
    
    # Train final model on full data
    final_model = RandomForestClassifier(
        n_estimators=config.n_estimators,
        max_depth=config.max_depth,
        min_samples_split=config.min_samples_split,
        class_weight=config.class_weight,
        random_state=config.random_state
    )
    final_model.fit(X, y)
    
    # Save model locally
    model_data = {
        'model': final_model,
        'feature_names': X.columns.tolist(),
        'aggregation_strategy': best_realistic_strategy,
        'realistic_auc': best_realistic_auc,
        'config': dict(config)
    }
    
    model_filename = 'realistic_conversion_model.pkl'
    with open(model_filename, 'wb') as f:
        pickle.dump(model_data, f)
    
    print(f"\nüíæ Saving best model...")
    print(f"‚úì Model saved: {model_filename}")
    print(f"‚úì Strategy: {best_realistic_strategy}")
    print(f"‚úì Realistic AUC: {best_realistic_auc:.3f}")
    print(f"‚úì Features used: {len(X.columns)}")
    
    # ============ LOG MODEL AS ARTIFACT ============
    print("\nüì§ Logging model to wandb...")
    
    # Create wandb artifact
    artifact = wandb.Artifact(
        name=f"conversion-model-{best_realistic_strategy.lower().replace(' ', '-')}",
        type="model",
        description=f"Random Forest model for {best_realistic_strategy} with AUC {best_realistic_auc:.3f}",
        metadata={
            "strategy": best_realistic_strategy,
            "auc": best_realistic_auc,
            "n_features": len(X.columns),
            "n_samples": len(X),
            "conversion_rate": y.mean()
        }
    )
    
    # Add model file
    artifact.add_file(model_filename)
    
    # Add feature importance CSV
    feature_importance_df = pd.DataFrame({
        'feature': X.columns,
        'importance': final_model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    feature_importance_filename = 'feature_importance.csv'
    feature_importance_df.to_csv(feature_importance_filename, index=False)
    artifact.add_file(feature_importance_filename)
    
    # Log artifact
    wandb.log_artifact(artifact)
    
    print(f"‚úì Model logged to wandb as artifact")
    print(f"‚úì Feature importance saved: {feature_importance_filename}")

# ============ LOG FINAL SUMMARY METRICS ============
wandb.run.summary["best_strategy"] = best_realistic_strategy
wandb.run.summary["best_auc"] = best_realistic_auc
wandb.run.summary["n_datasets_evaluated"] = len(datasets_fixed)
wandb.run.summary["total_samples"] = sum([len(X) for X, _ in datasets_fixed.values()])

# ============ FINISH WANDB RUN ============
print("\n" + "="*80)
print("WANDB TRACKING COMPLETE")
print("="*80)
print(f"‚úÖ Run complete! View results at: {wandb.run.get_url()}")
print(f"üìä Project: france-hvac")
print(f"üè∑Ô∏è  Run name: credit-policy-realistic-models")

wandb.finish()
print("‚úì Wandb run finished successfully")


TRAINING MODELS WITHOUT DATA LEAKAGE

üîó Initializing Weights & Biases tracking...




‚úì Wandb initialized. Run URL: https://wandb.ai/homeserve/france-hvac/runs/cdt74l71

üîÑ Preparing datasets with proper time-based features...

üìä Re-preparing Customer Lifetime without leakage...
  Features after removing leakage: 10
  Removed leaky features: []

üìä Re-preparing 30-Day Windows without leakage...
  Features after removing leakage: 11
  Removed leaky features: ['customer_total_sales', 'customer_avg_price', 'products_considered']

üìä Re-preparing Product Sessions without leakage...
  Features after removing leakage: 11
  Removed leaky features: ['customer_total_sales', 'customer_product_variety', 'product_types']

üéØ Re-training Customer Lifetime (25,930 samples)...
  Note: Could not plot ROC for Customer Lifetime
  ‚úì Fixed Random Forest AUC: 0.660
  ‚úì Top 3 features: ['customer_duration_days', 'quotes_per_month', 'min_quote_amount']

üéØ Re-training 30-Day Windows (28,648 samples)...
  Note: Could not plot ROC for 30-Day Windows
  ‚úì Fixed Random Forest 

0,1
30-Day Windows/accuracy,‚ñÅ
30-Day Windows/auc,‚ñÅ
30-Day Windows/conversion_rate,‚ñÅ
30-Day Windows/dataset_size,‚ñÅ
30-Day Windows/f1,‚ñÅ
30-Day Windows/n_features,‚ñÅ
30-Day Windows/precision,‚ñÅ
30-Day Windows/recall,‚ñÅ
30-Day Windows/test_size,‚ñÅ
30-Day Windows/train_size,‚ñÅ

0,1
30-Day Windows/accuracy,0.60157
30-Day Windows/auc,0.61843
30-Day Windows/conversion_rate,0.39298
30-Day Windows/dataset_size,28648
30-Day Windows/f1,0.50998
30-Day Windows/n_features,11
30-Day Windows/precision,0.49356
30-Day Windows/recall,0.52753
30-Day Windows/test_size,5730
30-Day Windows/train_size,22918


‚úì Wandb run finished successfully
