# Auto-Regressive Fair Market Price Model

Build a pricing model that predicts fair market value using:
- **Historical sales** (auto-regressive features)
- **Related cards** (same set, rarity, treatment)
- **Liquidity metrics** (sales velocity, days since last sale)
- **Order book depth** (active listings, spread)
- **Treatment multipliers** (learned from data)

## Model Architecture

```
FMP(card, treatment, t) = 
    α₀ + 
    α₁ × price(t-1) +           # AR(1) - last sale
    α₂ × price(t-7d) +          # Weekly lag
    α₃ × floor_4_lowest +       # Current floor
    β₁ × treatment_mult +       # Treatment effect
    β₂ × rarity_mult +          # Rarity effect
    γ₁ × liquidity_score +      # How often it sells
    γ₂ × depth_score +          # Order book depth
    δ₁ × related_cards_avg      # Similar cards' prices
```

In [None]:
import sys
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
from datetime import datetime, timedelta, timezone
from sqlalchemy import text
from sqlmodel import Session
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
# Suppress every warning category for the remainder of the session; remove
# this line when debugging pandas/sklearn deprecation or convergence issues.
warnings.filterwarnings('ignore')

# Database engine used by the fetch helpers in the data-collection cells.
from app.db import engine

plt.style.use('dark_background')
print("Libraries loaded")

## 1. Data Collection

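Fetch all non-bulk sold listings in the lookback window, together with card metadata, treatment info, and sale timestamps. Temporal features are derived from the timestamps later, in Section 2.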

In [None]:
def fetch_sales_data(days: int = 180) -> pd.DataFrame:
    """Load sold, non-bulk listings from the last ``days`` days.

    Each row is one sale joined to its card (and rarity, when present).
    The sale timestamp is ``sold_date`` with ``scraped_at`` as fallback.
    Rows priced at 0 or below, or at 10000 or above, are filtered out in SQL.

    Args:
        days: Size of the lookback window, counted back from now (UTC).

    Returns:
        DataFrame with one row per sale; ``sale_date`` is parsed as UTC and
        missing ``treatment`` / ``rarity`` values are replaced by 'Unknown'.
    """
    column_names = [
        'sale_id', 'card_id', 'card_name', 'card_number', 'rarity',
        'set_name', 'treatment', 'price', 'sale_date', 'platform', 'is_bulk_lot'
    ]
    window_start = datetime.now(timezone.utc) - timedelta(days=days)

    sales_query = text("""
        SELECT 
            mp.id as sale_id,
            mp.card_id,
            c.name as card_name,
            c.card_number,
            r.name as rarity,
            c.set_name,
            COALESCE(NULLIF(mp.product_subtype, ''), mp.treatment) as treatment,
            mp.price,
            COALESCE(mp.sold_date, mp.scraped_at) as sale_date,
            mp.platform,
            mp.is_bulk_lot
        FROM marketprice mp
        JOIN card c ON c.id = mp.card_id
        LEFT JOIN rarity r ON r.id = c.rarity_id
        WHERE mp.listing_type = 'sold'
          AND mp.is_bulk_lot = FALSE
          AND COALESCE(mp.sold_date, mp.scraped_at) >= :cutoff
          AND mp.price > 0
          AND mp.price < 10000  -- Filter obvious errors
        ORDER BY mp.card_id, sale_date
    """)

    # Materialise the rows before the session closes.
    with Session(engine) as session:
        records = session.execute(sales_query, {"cutoff": window_start}).fetchall()

    frame = pd.DataFrame(records, columns=column_names)
    frame['sale_date'] = pd.to_datetime(frame['sale_date'], utc=True)
    for label_col in ('treatment', 'rarity'):
        frame[label_col] = frame[label_col].fillna('Unknown')

    return frame

# Load the last 180 days of sales and print a quick sanity summary
# (row count, distinct cards, covered date range).
sales_df = fetch_sales_data(days=180)
print(f"Loaded {len(sales_df):,} sales")
print(f"Cards: {sales_df['card_id'].nunique()}")
print(f"Date range: {sales_df['sale_date'].min()} to {sales_df['sale_date'].max()}")
sales_df.head()

In [None]:
def fetch_active_listings() -> pd.DataFrame:
    """Summarise active (non-bulk) listings per card/treatment.

    Only listings scraped within the last 30 days are considered. The
    aggregation (count, min/max/avg/median ask) is done in SQL.

    Returns:
        DataFrame with one row per card/treatment pair, plus ``ask_spread``:
        (highest - lowest) / lowest, set to 0 where it cannot be computed.
    """
    column_names = [
        'card_id', 'treatment', 'listing_count', 'lowest_ask',
        'highest_ask', 'avg_ask', 'median_ask'
    ]
    window_start = datetime.now(timezone.utc) - timedelta(days=30)

    listings_query = text("""
        SELECT 
            card_id,
            COALESCE(NULLIF(product_subtype, ''), treatment) as treatment,
            COUNT(*) as listing_count,
            MIN(price) as lowest_ask,
            MAX(price) as highest_ask,
            AVG(price) as avg_ask,
            PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY price) as median_ask
        FROM marketprice
        WHERE listing_type = 'active'
          AND is_bulk_lot = FALSE
          AND scraped_at >= :cutoff
        GROUP BY card_id, COALESCE(NULLIF(product_subtype, ''), treatment)
    """)

    # Materialise the rows before the session closes.
    with Session(engine) as session:
        records = session.execute(listings_query, {"cutoff": window_start}).fetchall()

    frame = pd.DataFrame(records, columns=column_names)
    frame['treatment'] = frame['treatment'].fillna('Unknown')

    # Relative spread; a zero lowest ask becomes NaN first so the division
    # yields NaN, which is then reported as a spread of 0.
    nonzero_floor = frame['lowest_ask'].replace(0, np.nan)
    raw_spread = (frame['highest_ask'] - frame['lowest_ask']) / nonzero_floor
    frame['ask_spread'] = raw_spread.fillna(0)

    return frame

# Snapshot of the current order book, one row per card/treatment pair.
listings_df = fetch_active_listings()
print(f"Loaded {len(listings_df):,} card/treatment combinations with active listings")
listings_df.head()

## 2. Feature Engineering

Build features for each sale that capture:
- Auto-regressive features (past prices)
- Treatment/rarity effects
- Liquidity metrics
- Related card prices

In [None]:
def compute_ar_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a sorted copy of ``df`` with auto-regressive price features.

    Rows are ordered by card, treatment and sale date; every feature is
    computed within a card/treatment group and only from earlier rows of
    that group (the current sale's price is never part of its own features).

    Added columns: ``price_lag1..3``, ``price_roll_mean_5``,
    ``price_roll_std_5``, ``price_roll_min_4``, ``price_momentum``,
    ``price_pct_change`` and ``sale_seq`` (0-based position in the group).
    """
    keys = ['card_id', 'treatment']
    out = df.sort_values(keys + ['sale_date']).copy()
    prices = out.groupby(keys)['price']

    # Prices of the 1st, 2nd and 3rd previous sale in the same group.
    for lag in (1, 2, 3):
        out[f'price_lag{lag}'] = prices.shift(lag)

    def _prior_rolling(window, min_periods, stat):
        # Shift by one first so the window only covers earlier sales.
        return prices.transform(
            lambda s: getattr(s.shift(1).rolling(window, min_periods=min_periods), stat)()
        )

    out['price_roll_mean_5'] = _prior_rolling(5, 1, 'mean')
    out['price_roll_std_5'] = _prior_rolling(5, 2, 'std')
    out['price_roll_min_4'] = _prior_rolling(4, 1, 'min')  # Floor-like feature

    # Change between the two most recent previous sales, absolute and
    # relative; a zero denominator becomes NaN instead of inf.
    out['price_momentum'] = out['price_lag1'] - out['price_lag2']
    out['price_pct_change'] = out['price_momentum'] / out['price_lag2'].replace(0, np.nan)

    # Number of earlier sales seen for this card/treatment.
    out['sale_seq'] = out.groupby(keys).cumcount()

    return out

# Rebind sales_df to the sorted copy that carries the lag/rolling columns.
sales_df = compute_ar_features(sales_df)
print(f"AR features added")
sales_df[['card_name', 'treatment', 'price', 'price_lag1', 'price_roll_mean_5', 'price_roll_min_4']].head(10)

In [None]:
def compute_liquidity_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a sorted copy of ``df`` with liquidity features added.

    Rows are ordered by card, treatment and sale date. Added columns:

    - ``prev_sale_date``: date of the previous sale in the same
      card/treatment group (NaT for the first sale).
    - ``days_since_last_sale``: gap to that previous sale, in fractional days.
    - ``sales_last_30d``: number of earlier sales in the same group whose
      date lies in ``[sale_date - 30 days, sale_date)``.
    - ``card_total_sales``: number of rows in ``df`` for the card, across
      all treatments.

    The final merge resets the index of the returned frame.
    """
    df = df.sort_values(['card_id', 'treatment', 'sale_date']).copy()
    group_cols = ['card_id', 'treatment']

    # Days since previous sale
    df['prev_sale_date'] = df.groupby(group_cols)['sale_date'].shift(1)
    df['days_since_last_sale'] = (df['sale_date'] - df['prev_sale_date']).dt.total_seconds() / 86400

    # Sales count in last 30 days (proxy for liquidity).
    # Dates are sorted within each group, so the window count is the distance
    # between two binary-search positions. This avoids groupby.apply, whose
    # result is a wide DataFrame (not a Series) when only one group exists,
    # and replaces the quadratic per-row scan.
    window = np.timedelta64(30, 'D')
    all_dates = df['sale_date'].values
    recent_counts = np.zeros(len(df), dtype=np.int64)
    for positions in df.groupby(group_cols, sort=False).indices.values():
        dates = all_dates[positions]
        window_start = np.searchsorted(dates, dates - window, side='left')
        # side='left' excludes earlier rows that share the exact timestamp,
        # matching the strict "< date" bound.
        window_end = np.searchsorted(dates, dates, side='left')
        recent_counts[positions] = window_end - window_start
    df['sales_last_30d'] = recent_counts

    # Card-level total sales (popularity)
    # NOTE(review): this counts every sale of the card in df, including sales
    # dated after the row itself, so the feature carries future information.
    card_sales_count = df.groupby('card_id').size().rename('card_total_sales')
    df = df.merge(card_sales_count, on='card_id', how='left')

    return df

# Rebind sales_df to the copy that carries the liquidity columns.
sales_df = compute_liquidity_features(sales_df)
print(f"Liquidity features added")
sales_df[['card_name', 'treatment', 'price', 'days_since_last_sale', 'sales_last_30d', 'card_total_sales']].head(10)

In [None]:
def compute_related_card_features(df: pd.DataFrame) -> pd.DataFrame:
    """Attach rarity/treatment-level and set-level price statistics.

    Adds ``rarity_mean``, ``rarity_median``, ``rarity_std``, ``rarity_count``
    (grouped by rarity + treatment), ``price_vs_rarity`` (price divided by
    ``rarity_mean``), and ``set_mean`` / ``set_median`` (grouped by set).

    The statistics are taken over every row of ``df``; the row's own card
    and its own price are included, nothing is excluded.

    Returns:
        A new merged DataFrame; the input frame is not modified.
    """
    rarity_keys = ['rarity', 'treatment']

    # Both aggregate tables depend only on the incoming prices, so build
    # them up front and merge afterwards.
    per_rarity = (
        df.groupby(rarity_keys)['price']
        .agg(['mean', 'median', 'std', 'count'])
        .add_prefix('rarity_')
        .reset_index()
    )
    per_set = (
        df.groupby('set_name')['price']
        .agg(['mean', 'median'])
        .add_prefix('set_')
        .reset_index()
    )

    enriched = df.merge(per_rarity, on=rarity_keys, how='left')

    # Price relative to rarity average (NaN when that average is 0).
    enriched['price_vs_rarity'] = enriched['price'] / enriched['rarity_mean'].replace(0, np.nan)

    return enriched.merge(per_set, on='set_name', how='left')

# Rebind sales_df to the merged frame with rarity- and set-level statistics.
sales_df = compute_related_card_features(sales_df)
print(f"Related card features added")
sales_df[['card_name', 'rarity', 'price', 'rarity_mean', 'price_vs_rarity']].head(10)

In [None]:
def add_order_book_features(sales_df: pd.DataFrame, listings_df: pd.DataFrame) -> pd.DataFrame:
    """Left-join per-card/treatment listing statistics onto the sales rows.

    Sales without a matching listing row get ``listing_count`` and
    ``ask_spread`` of 0, and ``lowest_ask`` / ``avg_ask`` fall back to the
    row's ``price_lag1``. Also adds ``price_vs_ask`` = price / lowest_ask
    (NaN when the lowest ask is 0).

    Returns:
        A new merged DataFrame; the inputs are not modified.
    """
    join_keys = ['card_id', 'treatment']
    book_cols = ['listing_count', 'lowest_ask', 'avg_ask', 'ask_spread']

    merged = sales_df.merge(listings_df[join_keys + book_cols], on=join_keys, how='left')

    # Fallbacks for rows with no active listings.
    last_sale = merged['price_lag1']
    fallbacks = (
        ('listing_count', 0),
        ('lowest_ask', last_sale),
        ('avg_ask', last_sale),
        ('ask_spread', 0),
    )
    for column, fallback in fallbacks:
        merged[column] = merged[column].fillna(fallback)

    # Sale price relative to the lowest ask (deal indicator).
    safe_lowest_ask = merged['lowest_ask'].replace(0, np.nan)
    merged['price_vs_ask'] = merged['price'] / safe_lowest_ask

    return merged

# Join the current listing snapshot onto every sale row.
sales_df = add_order_book_features(sales_df, listings_df)
print(f"Order book features added")
sales_df[['card_name', 'treatment', 'price', 'listing_count', 'lowest_ask', 'ask_spread']].head(10)

In [None]:
def encode_categorical_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add numeric encodings for treatment and rarity.

    Adds, in this order: ``treatment_code``, ``treatment_mult``,
    ``rarity_code``, ``rarity_mult``. The ``*_code`` columns come from fixed
    ordinal tables (labels not in the table map to 0). The ``*_mult`` columns
    are the category's mean price divided by the mean price of a base
    category ('Classic Paper' / 'Common'); when the base category is absent
    the mean of the category means is used instead.

    The columns are written onto ``df`` itself, which is also returned.
    """
    treatment_ranks = {
        'Classic Paper': 1,
        'Classic Foil': 2,
        'Stonefoil': 3,
        'Serialized': 4,
        'Unknown': 0,
    }
    rarity_ranks = {
        'Common': 1,
        'Uncommon': 2,
        'Rare': 3,
        'Epic': 4,
        'Legendary': 5,
        'Mythic': 6,
        'Unknown': 0,
    }

    def _ordinal(column, ranks):
        # Labels missing from the table fall back to 0.
        return df[column].map(ranks).fillna(0)

    def _relative_price(column, base_label):
        # Mean price per category, expressed relative to the base category.
        category_avg = df.groupby(column)['price'].mean()
        baseline = category_avg.get(base_label, category_avg.mean())
        multipliers = (category_avg / baseline).to_dict()
        return df[column].map(multipliers).fillna(1.0)

    df['treatment_code'] = _ordinal('treatment', treatment_ranks)
    df['treatment_mult'] = _relative_price('treatment', 'Classic Paper')
    df['rarity_code'] = _ordinal('rarity', rarity_ranks)
    df['rarity_mult'] = _relative_price('rarity', 'Common')

    return df

# Encode treatment/rarity, then show the multiplier assigned to each
# category (every row of a category carries the same value, hence first()).
sales_df = encode_categorical_features(sales_df)
print(f"Categorical features encoded")
print(f"\nTreatment multipliers:")
print(sales_df.groupby('treatment')['treatment_mult'].first().sort_values())
print(f"\nRarity multipliers:")
print(sales_df.groupby('rarity')['rarity_mult'].first().sort_values())

In [None]:
def add_temporal_features(df: pd.DataFrame) -> pd.DataFrame:
    """Derive calendar features from ``sale_date``.

    Adds ``day_of_week`` (Monday=0), ``is_weekend`` (1 for Saturday/Sunday),
    ``hour`` and ``days_since_start`` (whole days since the earliest sale in
    ``df``). The columns are written onto ``df`` itself, which is returned.
    """
    stamps = df['sale_date'].dt
    first_sale = df['sale_date'].min()

    df['day_of_week'] = stamps.weekday
    # Saturday is 5 and Sunday is 6, the only values at or above 5.
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
    df['hour'] = stamps.hour
    df['days_since_start'] = (df['sale_date'] - first_sale).dt.days

    return df

# Add calendar features and list every column now present on sales_df.
sales_df = add_temporal_features(sales_df)
print(f"Temporal features added")
print(f"\nFinal feature count: {len(sales_df.columns)}")
print(f"Columns: {list(sales_df.columns)}")

## 3. Prepare Training Data

Filter to rows with sufficient history and split train/test.

In [None]:
# Feature columns for model
# Feature columns for model
# Order matters: the feature matrices are built by selecting these columns
# in this order, and the same list is written out next to the saved model.
FEATURE_COLS = [
    # AR features
    'price_lag1', 'price_lag2', 'price_lag3',
    'price_roll_mean_5', 'price_roll_std_5', 'price_roll_min_4',
    'price_momentum', 'price_pct_change',
    
    # Liquidity
    'days_since_last_sale', 'sales_last_30d', 'card_total_sales', 'sale_seq',
    
    # Related cards
    'rarity_mean', 'rarity_median', 'set_mean',
    
    # Order book
    'listing_count', 'lowest_ask', 'avg_ask', 'ask_spread',
    
    # Categorical
    'treatment_code', 'treatment_mult', 'rarity_code', 'rarity_mult',
    
    # Temporal
    'day_of_week', 'is_weekend',
]

TARGET_COL = 'price'

# Filter rows with at least one previous sale (need lag features)
model_df = sales_df[sales_df['sale_seq'] >= 1].copy()
print(f"Rows with history: {len(model_df):,} (dropped {len(sales_df) - len(model_df):,} first sales)")

# Drop rows with NaN in features
# price_lag3 is only defined from the fourth sale of a card/treatment
# onwards, so this step also removes the second and third sale of each group.
model_df = model_df.dropna(subset=FEATURE_COLS + [TARGET_COL])
print(f"After dropping NaN: {len(model_df):,}")

# Prepare X and y
# Built from the full (unsplit) model_df; in this cell they are only used
# for the shape printout below.
X = model_df[FEATURE_COLS].values
y = model_df[TARGET_COL].values

print(f"\nFeature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")

In [None]:
# Time-based split (use recent data for testing)
# Sort by date and use last 20% as test
# NOTE(review): the aggregate features (rarity_mean, rarity_median, set_mean,
# treatment_mult, rarity_mult, card_total_sales) were computed over all rows
# before this split, so test-period prices feed into training features.
# Confirm whether that lookahead is acceptable for the reported metrics.
model_df = model_df.sort_values('sale_date')
split_idx = int(len(model_df) * 0.8)

train_df = model_df.iloc[:split_idx]
test_df = model_df.iloc[split_idx:]

X_train = train_df[FEATURE_COLS].values
y_train = train_df[TARGET_COL].values
X_test = test_df[FEATURE_COLS].values
y_test = test_df[TARGET_COL].values

print(f"Train: {len(train_df):,} samples ({train_df['sale_date'].min()} to {train_df['sale_date'].max()})")
print(f"Test:  {len(test_df):,} samples ({test_df['sale_date'].min()} to {test_df['sale_date'].max()})")

# Scale features
# The scaler is fitted on the training rows only and then applied to the
# test rows, so test statistics do not influence the scaling.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 4. Train Models

Compare multiple model types.

In [None]:
def evaluate_model(name, model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and score it on both splits.

    Args:
        name: Label stored under the 'Model' key of the results.
        model: Estimator exposing ``fit`` and ``predict``; fitted in place.
        X_train, y_train: Training features and targets.
        X_test, y_test: Held-out features and targets.

    Returns:
        Tuple ``(results, model, y_pred_test)`` where ``results`` maps
        'Model' plus Train/Test MAE, RMSE and R² to their values.
    """
    model.fit(X_train, y_train)

    fitted_train = model.predict(X_train)
    fitted_test = model.predict(X_test)

    def _rmse(truth, pred):
        return np.sqrt(mean_squared_error(truth, pred))

    scorers = (('MAE', mean_absolute_error), ('RMSE', _rmse), ('R²', r2_score))
    splits = (('Train', y_train, fitted_train), ('Test', y_test, fitted_test))

    # Metric-major, Train-before-Test insertion order fixes the column order
    # of any table built from these dicts.
    results = {'Model': name}
    for label, scorer in scorers:
        for split, truth, pred in splits:
            results[f'{split} {label}'] = scorer(truth, pred)

    return results, model, fitted_test

# Candidate estimators, keyed by the display name used in every later table.
models = {
    'Ridge': Ridge(alpha=1.0),
    'Lasso': Lasso(alpha=0.1),
    'Random Forest': RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42),
}

results = []          # one metrics dict per model
trained_models = {}   # name -> fitted estimator
predictions = {}      # name -> test-set predictions

for name, model in models.items():
    print(f"Training {name}...")
    # Only the linear models are given standardized inputs; the tree
    # ensembles are trained and evaluated on the raw feature values.
    X_tr = X_train_scaled if 'Ridge' in name or 'Lasso' in name else X_train
    X_te = X_test_scaled if 'Ridge' in name or 'Lasso' in name else X_test
    
    res, trained_model, y_pred = evaluate_model(name, model, X_tr, y_train, X_te, y_test)
    results.append(res)
    trained_models[name] = trained_model
    predictions[name] = y_pred

results_df = pd.DataFrame(results)
print("\n" + "="*80)
print("MODEL COMPARISON")
print("="*80)
print(results_df.to_string(index=False))

In [None]:
# Visualize model comparison
# Two side-by-side grouped bar charts (train vs. test) with one group per
# model: MAE on the left, R² on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# MAE comparison
ax1 = axes[0]
x = range(len(results_df))
width = 0.35
# Train and test bars are offset by half a bar width around each tick.
ax1.bar([i - width/2 for i in x], results_df['Train MAE'], width, label='Train', alpha=0.8)
ax1.bar([i + width/2 for i in x], results_df['Test MAE'], width, label='Test', alpha=0.8)
ax1.set_xticks(x)
ax1.set_xticklabels(results_df['Model'], rotation=45, ha='right')
ax1.set_ylabel('MAE ($)')
ax1.set_title('Mean Absolute Error by Model')
ax1.legend()

# R² comparison
ax2 = axes[1]
ax2.bar([i - width/2 for i in x], results_df['Train R²'], width, label='Train', alpha=0.8)
ax2.bar([i + width/2 for i in x], results_df['Test R²'], width, label='Test', alpha=0.8)
ax2.set_xticks(x)
ax2.set_xticklabels(results_df['Model'], rotation=45, ha='right')
ax2.set_ylabel('R² Score')
ax2.set_title('R² Score by Model')
ax2.legend()

plt.tight_layout()
plt.show()

## 5. Feature Importance

Which features matter most for price prediction?

In [None]:
# Get feature importance from best tree-based model
# This always prefers Gradient Boosting and falls back to Random Forest; it
# does not consult the test metrics, so it may differ from the model with
# the lowest test MAE.
# NOTE(review): the `or` fallback depends on the truthiness of the estimator
# object rather than on an explicit None check.
best_model = trained_models.get('Gradient Boosting') or trained_models.get('Random Forest')

if hasattr(best_model, 'feature_importances_'):
    # Ascending sort puts the most important feature at the top of the
    # horizontal bar chart and at the end of the frame (hence tail(10)).
    importance_df = pd.DataFrame({
        'Feature': FEATURE_COLS,
        'Importance': best_model.feature_importances_
    }).sort_values('Importance', ascending=True)
    
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.barh(importance_df['Feature'], importance_df['Importance'], color='#4ecdc4')
    ax.set_xlabel('Importance')
    # The title is fixed text; it still says Gradient Boosting if the
    # Random Forest fallback was used.
    ax.set_title('Feature Importance (Gradient Boosting)')
    plt.tight_layout()
    plt.show()
    
    print("\nTop 10 Features:")
    print(importance_df.tail(10).to_string(index=False))

## 6. Compare to Current Floor Algorithm

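How does the ML model compare to the current floor algorithm (simple avg-of-4-lowest)? The floor is approximated below by `price_roll_min_4`, the minimum of the previous four sale prices, and a naive last-sale baseline is included for reference.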

In [None]:
# Baseline: Use price_roll_min_4 (rolling min of last 4) as prediction
# This is the minimum of the previous four sale prices of the same
# card/treatment, not an average of listing prices.
baseline_pred = test_df['price_roll_min_4'].values

# Also try price_lag1 as naive baseline
naive_pred = test_df['price_lag1'].values

# Best ML model
# Selected by lowest MAE on the test split.
best_model_name = results_df.loc[results_df['Test MAE'].idxmin(), 'Model']
ml_pred = predictions[best_model_name]

# All three predictors are scored against the same test targets.
comparison = {
    'Naive (Last Sale)': {
        'MAE': mean_absolute_error(y_test, naive_pred),
        'RMSE': np.sqrt(mean_squared_error(y_test, naive_pred)),
        'R²': r2_score(y_test, naive_pred),
    },
    'Floor (Min of 4)': {
        'MAE': mean_absolute_error(y_test, baseline_pred),
        'RMSE': np.sqrt(mean_squared_error(y_test, baseline_pred)),
        'R²': r2_score(y_test, baseline_pred),
    },
    f'ML ({best_model_name})': {
        'MAE': mean_absolute_error(y_test, ml_pred),
        'RMSE': np.sqrt(mean_squared_error(y_test, ml_pred)),
        'R²': r2_score(y_test, ml_pred),
    },
}

# Transpose so each predictor is a row and each metric a column.
comp_df = pd.DataFrame(comparison).T
print("="*60)
print("COMPARISON: ML vs Current Floor Algorithm")
print("="*60)
print(comp_df.round(2).to_string())

# Calculate improvement
# Relative MAE reduction versus the floor baseline, in percent; negative
# when the ML model is worse than the floor.
floor_mae = comparison['Floor (Min of 4)']['MAE']
ml_mae = comparison[f'ML ({best_model_name})']['MAE']
improvement = (floor_mae - ml_mae) / floor_mae * 100

print(f"\nML improves MAE by {improvement:.1f}% over floor algorithm")

In [None]:
# Scatter plot: Predicted vs Actual
# One panel per predictor (naive last sale, floor, best ML model).
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for ax, (name, pred) in zip(axes, [('Naive', naive_pred), ('Floor', baseline_pred), (f'ML', ml_pred)]):
    ax.scatter(y_test, pred, alpha=0.3, s=10)
    # Dashed identity line: points on it are perfect predictions.
    ax.plot([0, y_test.max()], [0, y_test.max()], 'r--', linewidth=2)
    ax.set_xlabel('Actual Price')
    ax.set_ylabel('Predicted Price')
    ax.set_title(f'{name}')
    # Both axes stop at the 95th percentile of actual prices, so the most
    # expensive sales are outside the visible area.
    ax.set_xlim(0, np.percentile(y_test, 95))
    ax.set_ylim(0, np.percentile(y_test, 95))

plt.tight_layout()
plt.show()

## 7. Error Analysis by Category

In [None]:
# Add predictions to test dataframe
# Copy first: test_df was taken as an iloc slice of model_df.
test_df = test_df.copy()
test_df['ml_pred'] = ml_pred
test_df['ml_error'] = test_df['ml_pred'] - test_df['price']  # signed: positive = over-prediction
test_df['ml_abs_error'] = np.abs(test_df['ml_error'])
test_df['ml_pct_error'] = test_df['ml_abs_error'] / test_df['price'] * 100

# Error by treatment
# Columns after renaming: mean and median absolute error, median percentage
# error, and the number of test sales in the category.
print("Error by Treatment:")
treatment_errors = test_df.groupby('treatment').agg({
    'ml_abs_error': ['mean', 'median'],
    'ml_pct_error': 'median',
    'price': 'count'
}).round(2)
treatment_errors.columns = ['MAE', 'Median AE', 'Median %Err', 'Count']
print(treatment_errors.to_string())

# Same breakdown, grouped by rarity.
print("\nError by Rarity:")
rarity_errors = test_df.groupby('rarity').agg({
    'ml_abs_error': ['mean', 'median'],
    'ml_pct_error': 'median',
    'price': 'count'
}).round(2)
rarity_errors.columns = ['MAE', 'Median AE', 'Median %Err', 'Count']
print(rarity_errors.to_string())

In [None]:
# Error by price range
# Buckets are right-closed by pd.cut's default, e.g. a $10.00 sale falls
# into '$0-10' and a $10.01 sale into '$10-25'.
test_df['price_bucket'] = pd.cut(
    test_df['price'], 
    bins=[0, 10, 25, 50, 100, float('inf')],
    labels=['$0-10', '$10-25', '$25-50', '$50-100', '$100+']
)

print("Error by Price Range:")
# observed=True keeps only buckets that actually contain test sales.
price_errors = test_df.groupby('price_bucket', observed=True).agg({
    'ml_abs_error': ['mean', 'median'],
    'ml_pct_error': 'median',
    'price': 'count'
}).round(2)
price_errors.columns = ['MAE', 'Median AE', 'Median %Err', 'Count']
print(price_errors.to_string())

## 8. Save Model for Production

In [None]:
import json
import joblib
from pathlib import Path

# Save best model
# Artifacts written: the fitted estimator, the fitted StandardScaler, the
# ordered feature list, and a small metadata file describing how to use them.
model_dir = Path('../models')
model_dir.mkdir(parents=True, exist_ok=True)

best_model = trained_models[best_model_name]
model_path = model_dir / 'ar_pricing_model.joblib'
scaler_path = model_dir / 'ar_pricing_scaler.joblib'

joblib.dump(best_model, model_path)
joblib.dump(scaler, scaler_path)

# Save feature list
# One name per line, in the column order the model was trained with.
feature_path = model_dir / 'ar_pricing_features.txt'
with open(feature_path, 'w', encoding='utf-8') as f:
    f.write('\n'.join(FEATURE_COLS))

# Save usage metadata
# Only the Ridge/Lasso models were fitted on scaler-transformed inputs; the
# tree ensembles saw raw features. Record which case applies so a consumer
# does not apply the saved scaler to a model that was trained without it.
requires_scaling = 'Ridge' in best_model_name or 'Lasso' in best_model_name
metadata_path = model_dir / 'ar_pricing_metadata.json'
with open(metadata_path, 'w', encoding='utf-8') as f:
    json.dump(
        {'model_name': best_model_name, 'requires_scaling': requires_scaling},
        f,
        indent=2,
    )

print(f"Model saved to: {model_path}")
print(f"Scaler saved to: {scaler_path}")
print(f"Features saved to: {feature_path}")
print(f"Metadata saved to: {metadata_path}")

## 9. Summary & Recommendations

In [None]:
# Final report: data sizes, best model metrics, comparison with the floor
# baseline, the best model's top features, and a recommendation.
print("="*70)
print("AUTO-REGRESSIVE PRICING MODEL SUMMARY")
print("="*70)

print(f"\n1. DATA")
print(f"   - Training samples: {len(train_df):,}")
print(f"   - Test samples: {len(test_df):,}")
print(f"   - Features: {len(FEATURE_COLS)}")

print(f"\n2. BEST MODEL: {best_model_name}")
print(f"   - Test MAE: ${results_df[results_df['Model']==best_model_name]['Test MAE'].values[0]:.2f}")
print(f"   - Test R²: {results_df[results_df['Model']==best_model_name]['Test R²'].values[0]:.3f}")

print(f"\n3. VS CURRENT FLOOR ALGORITHM")
print(f"   - Floor MAE: ${floor_mae:.2f}")
print(f"   - ML MAE: ${ml_mae:.2f}")
print(f"   - Improvement: {improvement:.1f}%")

print(f"\n4. TOP FEATURES")
# Rank the importances of the model reported above. importance_df from the
# feature-importance cell is built from a fixed Gradient Boosting / Random
# Forest choice and may describe a different estimator than the best one.
summary_model = trained_models[best_model_name]
if hasattr(summary_model, 'feature_importances_'):
    top_features = (
        pd.Series(summary_model.feature_importances_, index=FEATURE_COLS)
        .nlargest(5)
        .index.tolist()
    )
    for i, f in enumerate(top_features, 1):
        print(f"   {i}. {f}")
else:
    print("   (not available: best model exposes no feature_importances_)")

print(f"\n5. RECOMMENDATIONS")
if improvement > 10:
    print("   ✅ ML model significantly outperforms floor algorithm")
    print("   ✅ Consider deploying AR model for FMP calculation")
elif improvement > 0:
    print("   ⚠️ ML model only marginally better than floor algorithm")
    print("   ⚠️ Floor algorithm may be sufficient for most use cases")
else:
    # A non-positive improvement means the ML model's test MAE is not lower
    # than the floor's; do not describe it as "marginally better".
    print("   ⚠️ ML model does not beat the floor algorithm on test MAE")
    print("   ⚠️ Keep the floor algorithm until the model improves")

print("   - Use treatment/rarity multipliers from training data")
print("   - Retrain weekly with new sales data")
print("   - Monitor for drift in prediction accuracy")