In [3]:
# Authenticate with Weights & Biases. wandb.login() is the cell's last
# expression, so its return value is what the notebook shows as the cell result.
# NOTE(review): `wandb` is not referenced again in the cells visible here —
# confirm a later cell actually logs to it, otherwise this cell can be dropped.
import wandb
wandb.login()

[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from /home/valeriya/.netrc.
[34m[1mwandb[0m: Currently logged in as: [33mvslovik[0m ([33mhomeserve[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
import json
import pickle
from etl.util import prepare_dataset_without_leakage


# Install a blanket "ignore" filter: from this point on every warning, of every
# category and from every module, is silenced in this kernel.
# NOTE(review): this also hides any pandas / scikit-learn warnings raised by the
# training code below; consider narrowing it to a specific category or module.
import warnings
warnings.filterwarnings('ignore')


# ---------------------------------------------------------------------------
# Stage 1: announce the run, read the three aggregation-level CSV files and
# pass each frame through prepare_dataset_without_leakage. The training loop
# below unpacks every prepared value as an (X, y) pair.
# ---------------------------------------------------------------------------
banner_rule = "=" * 80
print("\n" + banner_rule)
print("TRAINING MODELS WITHOUT DATA LEAKAGE")
print(banner_rule)

print("\nüîÑ Preparing datasets with proper time-based features...")

# The raw frames keep their own top-level names so they stay available for
# inspection in the kernel. Paths are relative to the working directory.
customer_lifetime = pd.read_csv('customer_lifetime_data.csv')
opportunity_30day = pd.read_csv('30day_opportunities_data.csv')
product_sessions = pd.read_csv('product_sessions_data.csv')

# Display name of each aggregation strategy -> its raw frame. The same name is
# also handed to the preparation helper as its second argument.
raw_frames_by_strategy = {
    'Customer Lifetime': customer_lifetime,
    '30-Day Windows': opportunity_30day,
    'Product Sessions': product_sessions,
}

datasets_fixed = {
    strategy_name: prepare_dataset_without_leakage(raw_frame, strategy_name)
    for strategy_name, raw_frame in raw_frames_by_strategy.items()
}

# ---------------------------------------------------------------------------
# Stage 2: fit one random forest per aggregation strategy on an 80/20
# stratified split, then record the hold-out AUC, the ranked feature
# importances, the sample count and the mean of the target.
# ---------------------------------------------------------------------------
results_fixed = {}

# One set of forest settings for every strategy. Depth is capped at 8 (reduced
# to prevent overfitting) and a node needs at least 50 samples to be split.
rf_hyperparams = dict(
    n_estimators=100,
    max_depth=8,
    min_samples_split=50,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)

for dataset_name, prepared_pair in datasets_fixed.items():
    X, y = prepared_pair
    n_samples = len(X)
    print(f"\nüéØ Re-training {dataset_name} ({n_samples:,} samples)...")

    # Hold out 20% of the rows; stratifying on y keeps the class ratio equal
    # in both partitions, and the fixed seed makes the split repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    rf_model = RandomForestClassifier(**rf_hyperparams)
    rf_model.fit(X_train, y_train)

    # AUC is computed from column 1 of predict_proba on the held-out rows.
    rf_y_pred_proba = rf_model.predict_proba(X_test)[:, 1]
    rf_auc = roc_auc_score(y_test, rf_y_pred_proba)

    # Pair every input column with its importance, most important first.
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': rf_model.feature_importances_
    }).sort_values('importance', ascending=False)
    top_feature_names = feature_importance['feature'].head(5).tolist()

    results_fixed[dataset_name] = {
        'AUC': rf_auc,
        'Top Features': top_feature_names,
        'Feature Importances': feature_importance.head(10),
        'Sample Size': n_samples,
        'Conversion Rate': y.mean()
    }

    print(f"  ‚úì Fixed Random Forest AUC: {rf_auc:.3f}")
    print(f"  ‚úì Top 3 features: {top_feature_names[:3]}")

# Performance Comparison After Fix
# Builds one summary row per aggregation strategy, ordered best AUC first, and
# prints a numbered ranking. `comparison_fixed_df` is read again further down,
# so its column names and string-formatted cells are kept as they were.
print("\n" + "="*80)
print("REALISTIC PERFORMANCE COMPARISON (NO LEAKAGE)")
print("="*80)

# Rank on the numeric AUC *before* the values are formatted for display.
# Sorting the finished table on 'Realistic AUC' would compare strings, and
# DataFrame.sort_values keeps the pre-sort index labels, which previously made
# the printed "rank" the insertion position instead of the sorted position.
# sorted() is stable, so strategies with equal AUC keep their insertion order.
ranked_results = sorted(
    results_fixed.items(),
    key=lambda item: item[1]['AUC'],
    reverse=True,
)

comparison_fixed = [
    {
        'Aggregation Strategy': dataset_name,
        'Samples': f"{result['Sample Size']:,}",
        'Conv Rate': f"{result['Conversion Rate']:.1%}",
        'Realistic AUC': f"{result['AUC']:.3f}",
        'Top Feature': result['Top Features'][0] if result['Top Features'] else 'N/A'
    }
    for dataset_name, result in ranked_results
]

# Rows are already in rank order, so the default 0..n-1 index matches the rank
# and `.iloc[0]` is the best strategy.
comparison_fixed_df = pd.DataFrame(comparison_fixed)

print("\nüìà REALISTIC PERFORMANCE RANKING:")
# Number the rows by position (1-based) rather than by index label.
for rank, (_, row) in enumerate(comparison_fixed_df.iterrows(), start=1):
    print(f"{rank}. {row['Aggregation Strategy']}:")
    print(f"   Conversion Rate: {row['Conv Rate']}")
    print(f"   Realistic AUC: {row['Realistic AUC']}")
    print(f"   Top Feature: {row['Top Feature']}")
    print()

# Find best realistic strategy
# The comparison table is ordered best AUC first, so its first row (by
# position) names the winning aggregation strategy.
best_realistic_strategy = comparison_fixed_df.iloc[0]['Aggregation Strategy']

# Read the AUC from results_fixed instead of parsing the table cell: the cell
# is a display string rounded to 3 decimals (and never contains a '%'), so
# parsing it would store a rounded value in the saved model metadata.
best_realistic_auc = float(results_fixed[best_realistic_strategy]['AUC'])

print("\n" + "="*80)
print("BUSINESS INSIGHTS")
print("="*80)

print(f"\nüèÜ REAL BEST STRATEGY: {best_realistic_strategy}")
print(f"   Realistic AUC: {best_realistic_auc:.3f}")
# NOTE(review): 0.982 is hard-coded and nothing in this cell computes it —
# confirm it still matches the earlier (leaky) run being compared against.
print(f"   (Previous inflated AUC: 0.982 ‚Üí Real AUC: {best_realistic_auc:.3f})")

print(f"\nüîç REAL TOP CONVERSION DRIVERS:")
# Every row of the comparison table was built from a results_fixed entry, so
# this lookup needs no membership guard.
top_features = results_fixed[best_realistic_strategy]['Top Features']
for i, feature in enumerate(top_features[:5], 1):
    print(f"   {i}. {feature}")

# Save the realistic model
# Refit a forest on every row of the winning strategy's dataset (no hold-out)
# and pickle it together with the metadata needed to use it later.
if best_realistic_strategy in datasets_fixed:
    X, y = datasets_fixed[best_realistic_strategy]

    # Settings of the final estimator that gets pickled.
    final_rf_params = {
        'n_estimators': 100,
        'max_depth': 8,
        'min_samples_split': 50,
        'class_weight': 'balanced',
        'random_state': 42,
    }
    final_model = RandomForestClassifier(**final_rf_params)
    final_model.fit(X, y)

    # Bundle the estimator with its training column order, the strategy it was
    # trained on, and the hold-out AUC measured for that strategy.
    saved_feature_names = X.columns.tolist()
    model_data = dict(
        model=final_model,
        feature_names=saved_feature_names,
        aggregation_strategy=best_realistic_strategy,
        realistic_auc=best_realistic_auc,
    )

    # Written to the current working directory; an existing file is overwritten.
    with open('realistic_conversion_model.pkl', 'wb') as f:
        pickle.dump(model_data, f)

    print(f"‚úì Realistic model saved for {best_realistic_strategy}")
    print(f"‚úì Realistic AUC: {best_realistic_auc:.3f}")
    print(f"‚úì Features used: {len(X.columns)}")

