# In [None]:
# ============================================================================
# 🎯 SIMPLIFIED POST-PROCESSING PIPELINE
# ============================================================================

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

print("🎯 SIMPLIFIED POST-PROCESSING PIPELINE")
print("=" * 50)

# ============================================================================
# 1️⃣ LOAD AND VALIDATE INPUT DATA
# ============================================================================
print("1️⃣ LOADING AND VALIDATING INPUT DATA")
print("-" * 40)

# Load training data for reference stats. Only the 'price' column is used; it
# serves as the reference distribution for the clipping and mean-alignment
# steps later in this script.
try:
    train = pd.read_csv('/content/drive/MyDrive/train_cleaned.csv')
    # Pull the column out inside the try: a file that loads but has no
    # 'price' column must take the same fallback path as an unreadable file
    # rather than abort the script with an uncaught KeyError.
    train_price = train['price']
except Exception as e:
    print(f"❌ Error loading training data: {e}")
    print("⚠️  Using default price range [1, 100]")
    # Clear the reference series explicitly. When this runs as a notebook
    # cell, a `train_price` left over from an earlier execution would
    # otherwise be picked up silently by the later steps; with None they hit
    # their own error handling and use their defaults instead.
    train_price = None
    train_price_min, train_price_max = 1, 100
else:
    print("✅ Training data loaded successfully")
    print(f"   Training shape: {train.shape}")
    train_price_min = train_price.min()
    train_price_max = train_price.max()
    print(f"   Training price range: ${train_price_min:.2f} - ${train_price_max:.2f}")

# Read the predictions to post-process. If the file cannot be read, build a
# synthetic stand-in (uniform random prices) so the remaining steps can still
# be exercised.
try:
    sub = pd.read_csv('safe_bert_ensemble_submission.csv')
except Exception as e:
    print(f"❌ Error loading submission: {e}")
    print("🔄 Creating dummy submission for testing...")
    dummy_rows = 75000
    sub = pd.DataFrame({
        'sample_id': range(dummy_rows),
        'price': np.random.uniform(10, 50, size=dummy_rows),
    })
else:
    print("✅ Submission data loaded successfully")
    print(f"   Submission shape: {sub.shape}")

# Report how many predictions are unusable before any repair is attempted
incoming_prices = sub['price']
print("\n🔍 VALIDATING SUBMISSION DATA:")
print(f"   Total samples: {len(sub):,}")
print(f"   Missing values: {incoming_prices.isna().sum()}")
print(f"   Infinite values: {np.isinf(incoming_prices).sum()}")
print(f"   Zero/negative values: {incoming_prices.le(0).sum()}")

# Keep an untouched copy of the incoming predictions
original_price = sub['price'].copy()

# The repairs below run in sequence: each check looks at the column as left
# by the previous repair, so their order matters.

# Missing values -> column median
if sub['price'].isna().any():
    print("⚠️  Fixing missing values...")
    median_price = sub['price'].median()
    sub['price'] = sub['price'].fillna(median_price)

# Infinite values -> fixed substitutes (+inf -> 100, -inf -> 1)
if np.isinf(sub['price']).any():
    print("⚠️  Fixing infinite values...")
    sub['price'] = np.nan_to_num(sub['price'], nan=10.0, posinf=100.0, neginf=1.0)

# Zero/negative values -> raised to a 0.01 floor
if sub['price'].le(0).any():
    print("⚠️  Fixing zero/negative values...")
    sub['price'] = sub['price'].clip(lower=0.01)

# Summary of the column after the repairs above
print("\n📊 ORIGINAL SUBMISSION STATS:")
for stat_label, stat_value in (
    ("Min", sub['price'].min()),
    ("Max", sub['price'].max()),
    ("Mean", sub['price'].mean()),
    ("Median", sub['price'].median()),
):
    print(f"   {stat_label}: ${stat_value:.2f}")

# ============================================================================
# 2️⃣ SIMPLE PERCENTILE-BASED CLIPPING
# ============================================================================
print("\n2️⃣ APPLYING PERCENTILE-BASED CLIPPING")
print("-" * 40)

# Derive clipping bounds from the training price distribution. Any failure
# (training prices unavailable, non-numeric, or yielding unusable bounds)
# falls back to the fixed range [1, 100].
try:
    # NaN-aware percentiles, computed once. A plain np.percentile returns NaN
    # as soon as the data contains a single NaN, and a NaN upper bound makes
    # Series.clip silently skip the upper clipping.
    train_p_low, train_p_high = np.nanpercentile(
        np.asarray(train_price, dtype=float), [0.1, 99.5]
    )

    # Widen the band a little: 20% below the 0.1th percentile (but never
    # under 0.1) and 10% above the 99.5th percentile.
    lower_bound = max(0.1, train_p_low * 0.8)
    upper_bound = train_p_high * 1.1

    # Reject bounds that cannot be used for clipping so the defaults apply.
    if not (np.isfinite(lower_bound) and np.isfinite(upper_bound)):
        raise ValueError("training prices produced non-finite clipping bounds")
    if lower_bound > upper_bound:
        raise ValueError("training prices produced an empty clipping range")

    print(f"   Training 0.1th percentile: ${train_p_low:.2f}")
    print(f"   Training 99.5th percentile: ${train_p_high:.2f}")
    print(f"   Final clipping bounds: [${lower_bound:.2f}, ${upper_bound:.2f}]")

except Exception as e:
    print(f"⚠️  Error calculating percentiles: {e}")
    print("🔄 Using safe default bounds [1, 100]")
    lower_bound = 1.0
    upper_bound = 100.0

# Apply clipping on a copy so the unclipped submission stays available
sub_clipped = sub.copy()
sub_clipped['price'] = sub_clipped['price'].clip(lower=lower_bound, upper=upper_bound)

print(f"\n📊 AFTER CLIPPING STATS:")
print(f"   Min: ${sub_clipped['price'].min():.2f}")
print(f"   Max: ${sub_clipped['price'].max():.2f}")
print(f"   Mean: ${sub_clipped['price'].mean():.2f}")
print(f"   Median: ${sub_clipped['price'].median():.2f}")

# ============================================================================
# 3️⃣ BASIC DISTRIBUTION ALIGNMENT
# ============================================================================
print("\n3️⃣ APPLYING BASIC DISTRIBUTION ALIGNMENT")
print("-" * 40)

# Rescale the clipped predictions so their mean matches the training mean.
# Any error leaves the clipped predictions unchanged.
try:
    train_mean = train_price.mean()
    current_mean = sub_clipped['price'].mean()

    # Keep the comparison in this direction: a NaN gap compares False and
    # therefore lands on the "no alignment" branch.
    needs_alignment = abs(current_mean - train_mean) > 0.1

    if not needs_alignment:
        print("   No mean alignment needed (close enough)")
        sub_aligned = sub_clipped.copy()
    else:
        correction_factor = train_mean / current_mean
        print(f"   Mean correction factor: {correction_factor:.3f}")

        # Scale, then clip again because scaling can push values out of range
        rescaled_prices = sub_clipped['price'] * correction_factor
        sub_aligned = sub_clipped.assign(
            price=rescaled_prices.clip(lower=lower_bound, upper=upper_bound)
        )

        print(f"   After mean alignment: ${sub_aligned['price'].mean():.2f}")

except Exception as e:
    print(f"⚠️  Mean alignment failed: {e}")
    sub_aligned = sub_clipped.copy()

# ============================================================================
# 4️⃣ SMART ROUNDING
# ============================================================================
print("\n4️⃣ APPLYING SMART ROUNDING")
print("-" * 40)

def smart_round_prices(prices):
    """Round prices to a precision that depends on their magnitude.

    Precision tiers:
        * below 10          -> 2 decimal places (e.g. 4.99)
        * from 10 up to 100 -> 1 decimal place  (e.g. 29.9)
        * 100 and above     -> whole numbers    (e.g. 150.0)

    The rounding is done with vectorised NumPy operations instead of a
    per-element Python loop, and the result is always a float array
    regardless of the input's element type.

    Args:
        prices: Iterable of numeric prices (list, NumPy array, pandas
            Series, generator, ...).

    Returns:
        A one-dimensional float NumPy array with the rounded prices in input
        order. NaN entries stay NaN.
    """
    # Generators and other unsized iterables must be materialised first;
    # np.asarray cannot consume them directly.
    if not hasattr(prices, '__len__'):
        prices = list(prices)
    values = np.asarray(prices, dtype=float)

    # NaN fails both comparisons and so falls through to the last tier,
    # where rounding leaves it as NaN.
    return np.where(
        values < 10,
        np.round(values, 2),
        np.where(values < 100, np.round(values, 1), np.round(values, 0)),
    )

# Build the final frame: aligned predictions with magnitude-aware rounding
rounded_prices = smart_round_prices(sub_aligned['price'].values)
sub_final = sub_aligned.assign(price=rounded_prices)

print("📊 AFTER ROUNDING STATS:")
for stat_label, stat_value in (
    ("Min", sub_final['price'].min()),
    ("Max", sub_final['price'].max()),
    ("Mean", sub_final['price'].mean()),
    ("Median", sub_final['price'].median()),
):
    print(f"   {stat_label}: ${stat_value:.2f}")

# ============================================================================
# 5️⃣ FINAL VALIDATION
# ============================================================================
print("\n5️⃣ FINAL VALIDATION")
print("-" * 40)

# Number of rows a valid submission is expected to contain
expected_samples = 75000

# Check for any remaining issues. Each count is computed once and reused for
# both the test and the message.
final_prices = sub_final['price']
n_missing = final_prices.isna().sum()
n_nonpositive = (final_prices <= 0).sum()
n_infinite = np.isinf(final_prices).sum()

issues_found = 0
if n_missing > 0:
    print(f"❌ Still has {n_missing} missing values")
    issues_found += 1

if n_nonpositive > 0:
    print(f"❌ Still has {n_nonpositive} zero/negative values")
    issues_found += 1

if n_infinite > 0:
    print(f"❌ Still has {n_infinite} infinite values")
    issues_found += 1

if len(final_prices) != expected_samples:
    print(f"❌ Wrong number of samples: {len(final_prices)} instead of {expected_samples:,}")
    issues_found += 1

if issues_found == 0:
    print("✅ All validation checks passed!")
else:
    print(f"⚠️  {issues_found} issues found, applying emergency fixes...")
    # Emergency fix: drop the mean-alignment step and use the clipped
    # predictions, rounded the same way.
    # NOTE: this fallback is not re-validated and cannot repair a wrong row
    # count, since the clipped frame has the same rows.
    sub_final = sub_clipped.copy()
    sub_final['price'] = smart_round_prices(sub_clipped['price'].values)

# Compare with training distribution. This is informational only, so a
# failure (for example, training prices unavailable) just skips the report.
# `except Exception` rather than a bare `except` lets KeyboardInterrupt and
# SystemExit propagate.
try:
    print(f"\n📈 DISTRIBUTION COMPARISON:")
    print(f"   Training mean: ${train_price.mean():.2f}")
    print(f"   Final mean:    ${sub_final['price'].mean():.2f}")
    print(f"   Difference:    ${sub_final['price'].mean() - train_price.mean():.3f}")

    print(f"   Training median: ${train_price.median():.2f}")
    print(f"   Final median:    ${sub_final['price'].median():.2f}")
    print(f"   Difference:      ${sub_final['price'].median() - train_price.median():.3f}")
except Exception:
    print("   (Training comparison skipped)")

# ============================================================================
# 6️⃣ SAVE FINAL SUBMISSION
# ============================================================================
print("\n6️⃣ SAVING FINAL SUBMISSION")
print("-" * 40)

# Write both variants (clipping only, and the full pipeline) so they can be
# compared against each other
for out_frame, out_name in (
    (sub_clipped, 'submission_simple_clipped.csv'),
    (sub_final, 'safe_bert_ensemble_submission_postprocessed.csv'),
):
    out_frame.to_csv(out_name, index=False)
    print(f"✅ Saved: {out_name}")

# Short summary of the final frame
final_col = sub_final['price']
n_final = len(sub_final)
print("\n🎯 FINAL SUBMISSION READY!")
print(f"   Samples: {n_final:,}")
print(f"   Price range: ${final_col.min():.2f} - ${final_col.max():.2f}")
print(f"   Mean: ${final_col.mean():.2f}")
print(f"   Valid prices: {final_col.gt(0).sum()}/{n_final}")

banner_rule = "=" * 50
print("\n" + banner_rule)
print("✅ POST-PROCESSING COMPLETE - SAFE & RELIABLE!")
print(banner_rule)

# ============================================================================
# 7️⃣ QUICK COMPARISON REPORT
# ============================================================================
print("\n📊 COMPARISON REPORT")
print("-" * 40)

print("Method                | Min      | Max      | Mean     | Median")
print("-" * 60)

# One row per pipeline stage, all rendered with the same format. `sub` is the
# loaded submission after the in-place repairs made while validating it.
for stage_label, stage_prices in (
    ('Original', sub['price']),
    ('Clipped', sub_clipped['price']),
    ('Final', sub_final['price']),
):
    print(f"{stage_label:20} | ${stage_prices.min():7.2f} | ${stage_prices.max():7.2f} | ${stage_prices.mean():7.2f} | ${stage_prices.median():7.2f}")

# The training row is best-effort: the training prices may be unavailable.
# Say so instead of dropping the row silently, and catch only Exception so
# KeyboardInterrupt and SystemExit still propagate.
try:
    print(f"{'Training':20} | ${train_price.min():7.2f} | ${train_price.max():7.2f} | ${train_price.mean():7.2f} | ${train_price.median():7.2f}")
except Exception as e:
    print(f"{'Training':20} | (unavailable: {e})")

print(f"\n💡 Recommendation: Test both files on leaderboard:")
print(f"   - submission_simple_clipped.csv (just clipping)")
print(f"   - safe_bert_ensemble_submission_postprocessed.csv (full processing)")