# In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.linear_model import Ridge, ElasticNet
import xgboost as xgb
import lightgbm as lgb
import torch
from transformers import AutoTokenizer, AutoModel
import gc
import warnings
warnings.filterwarnings('ignore')

# Startup banner.
print("🚀 SAFE BERT-ENHANCED PIPELINE - RAM OPTIMIZED")
print("=" * 60)

# Choose the torch device once; later code reads the module-level `device`.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"✅ Device: {device}")

# ============================================================================
# 1️⃣ MEMORY-OPTIMIZED DATA LOADING
# ============================================================================
print("\n📥 Loading data with memory optimization...")

def optimize_dtypes(df):
    """Downcast 64-bit numeric columns of ``df`` to 32-bit to save memory.

    ``float64`` columns are converted to ``float32``. ``int64`` columns are
    converted to ``int32`` only when every value fits in the int32 range;
    otherwise the column is left as ``int64``, because ``astype('int32')``
    would silently wrap out-of-range values (e.g. large identifiers).

    Args:
        df: DataFrame to optimize. It is modified in place.

    Returns:
        The same DataFrame object, for call chaining.
    """
    int32_limits = np.iinfo(np.int32)
    for col in df.columns:
        col_dtype = df[col].dtype
        if col_dtype == 'float64':
            df[col] = df[col].astype('float32')
        elif col_dtype == 'int64':
            values = df[col]
            # An empty column has no min/max to check and is always safe.
            fits_int32 = values.empty or (
                values.min() >= int32_limits.min
                and values.max() <= int32_limits.max
            )
            if fits_int32:
                df[col] = values.astype('int32')
    return df

# Read the cleaned training CSV (all columns) and downcast it immediately.
train = optimize_dtypes(pd.read_csv('/content/drive/MyDrive/train_cleaned.csv'))

# Read the test CSV; capture the IDs before the dtype downcast touches them.
test = pd.read_csv('/content/drive/MyDrive/test_cleaned.csv')
test_ids = test['sample_id'].values.copy()
test = optimize_dtypes(test)

# Report shapes and the deep memory footprint of the training frame in MB.
train_mb = train.memory_usage(deep=True).sum() / 1024**2
print(f"✅ Train: {train.shape}, Test: {test.shape}")
print(f"✅ Memory: Train={train_mb:.1f}MB")

# ============================================================================
# 2️⃣ SAFE BERT EMBEDDINGS (MEMORY CONTROLLED)
# ============================================================================
print("\n🧠 Extracting BERT embeddings (safe mode)...")

# Name of the free-text column used by the BERT and TF-IDF feature sections.
text_col = 'catalog_content_clean'

class SafeBERTEmbedder:
    """Lazily loaded transformer encoder that embeds texts in small batches.

    The tokenizer and model are only loaded on the first ``embed`` call (or an
    explicit ``load``). Each batch is moved to ``self.device``, encoded under
    ``torch.no_grad()`` and copied back to the CPU as ``float32`` so that GPU
    memory holds one batch at a time.
    """

    def __init__(self, model_name='sentence-transformers/all-MiniLM-L6-v2'):
        """Store the checkpoint name; nothing is downloaded or loaded here."""
        self.model_name = model_name
        self.tokenizer = None
        self.model = None
        self.device = device

    def load(self):
        """Load tokenizer and model, move the model to the device, set eval mode."""
        print(f"   Loading {self.model_name}...")
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModel.from_pretrained(self.model_name).to(self.device)
        self.model.eval()

    @staticmethod
    def _mean_pool(token_embeddings, attention_mask):
        """Average token embeddings over positions where the mask is non-zero."""
        mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        summed = torch.sum(token_embeddings * mask, 1)
        # Clamp so a fully masked row cannot divide by zero.
        counts = torch.clamp(mask.sum(1), min=1e-9)
        return summed / counts

    def embed(self, texts, batch_size=16, max_len=64, pooling='auto'):
        """Embed ``texts`` and return one ``float32`` row per text.

        Args:
            texts: Sliceable sequence of strings (pandas Series, numpy array
                or plain list).
            batch_size: Number of texts encoded per forward pass.
            max_len: Tokenizer truncation length.
            pooling: ``'auto'`` (default, original behaviour) uses the model's
                ``pooler_output`` when it is present and otherwise falls back
                to attention-mask mean pooling; ``'pooler'`` requires
                ``pooler_output``; ``'mean'`` always mean-pools.
                NOTE(review): the model card of the default checkpoint is
                believed to recommend mean pooling — confirm and consider
                passing ``pooling='mean'``.

        Returns:
            2-D ``np.float32`` array with ``len(texts)`` rows. For empty input
            the array has zero rows (and ``model.config.hidden_size`` columns
            when that attribute is available, else zero columns).

        Raises:
            ValueError: If ``pooling`` or ``batch_size`` is invalid, or
                ``pooling='pooler'`` is requested but the model returns no
                ``pooler_output``.
        """
        if pooling not in ('auto', 'pooler', 'mean'):
            raise ValueError(f"Unknown pooling mode: {pooling!r}")
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        if self.tokenizer is None or self.model is None:
            self.load()

        total = len(texts)
        if total == 0:
            # np.vstack([]) would raise; return a well-formed empty matrix.
            hidden = getattr(getattr(self.model, 'config', None), 'hidden_size', 0)
            return np.zeros((0, hidden), dtype=np.float32)

        embeddings = []
        for batch_idx, start in enumerate(range(0, total, batch_size)):
            chunk = texts[start:start + batch_size]
            # Series/ndarray expose tolist(); plain lists are used as-is.
            batch = chunk.tolist() if hasattr(chunk, 'tolist') else list(chunk)

            inputs = self.tokenizer(
                batch,
                padding=True,
                truncation=True,
                max_length=max_len,
                return_tensors='pt'
            ).to(self.device)

            with torch.no_grad():
                outputs = self.model(**inputs)
                pooled = getattr(outputs, 'pooler_output', None)
                if pooling == 'pooler' and pooled is None:
                    raise ValueError("Model output has no pooler_output")

                if pooling != 'mean' and pooled is not None:
                    emb = pooled
                else:
                    emb = self._mean_pool(
                        outputs.last_hidden_state, inputs['attention_mask']
                    )

                embeddings.append(emb.cpu().numpy().astype(np.float32))

            # Drop tensor references before asking CUDA to release cached blocks.
            del inputs, outputs, pooled, emb
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            if batch_idx % 20 == 0:
                print(f"      {min(start+batch_size, total)}/{total} processed")

            # Extra RAM safety: run the garbage collector every 50 batches.
            if batch_idx % 50 == 0:
                gc.collect()

        return np.vstack(embeddings)

# Build the PCA-reduced BERT feature matrices for train and test.
if text_col not in train.columns:
    # No text column: zero-width placeholders keep the later hstack valid.
    bert_train_reduced = np.zeros((len(train), 0), dtype=np.float32)
    bert_test_reduced = np.zeros((len(test), 0), dtype=np.float32)
else:
    # Missing texts become empty strings so the tokenizer always gets str.
    train_texts = train[text_col].fillna('').astype(str)
    test_texts = test[text_col].fillna('').astype(str)

    embedder = SafeBERTEmbedder()

    print("   Training set embeddings...")
    bert_train = embedder.embed(train_texts, batch_size=16)

    # Fit the PCA on the training embeddings and project them right away.
    print("   Reducing train dimensions...")
    pca = PCA(n_components=48, random_state=42)
    bert_train_reduced = pca.fit_transform(bert_train).astype(np.float32)

    # Release the full-width training embeddings before embedding the test set.
    del bert_train
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    print("   Test set embeddings...")
    bert_test = embedder.embed(test_texts, batch_size=16)

    # Project the test embeddings with the PCA fitted on train.
    print("   Reducing test dimensions...")
    bert_test_reduced = pca.transform(bert_test).astype(np.float32)

    bert_var = pca.explained_variance_ratio_.sum()
    print(f"✅ BERT: {bert_train_reduced.shape[1]} dims (var: {bert_var:.2%})")

    # The raw test embeddings and the embedder are no longer needed.
    del bert_test, embedder
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# ============================================================================
# 3️⃣ ENHANCED TF-IDF FEATURES
# ============================================================================
print("\n📝 Extracting TF-IDF features...")

if text_col not in train.columns:
    # No text column: zero-width placeholders keep the later hstack valid.
    tfidf_train_reduced = np.zeros((len(train), 0), dtype=np.float32)
    tfidf_test_reduced = np.zeros((len(test), 0), dtype=np.float32)
else:
    # Uni- and bi-gram TF-IDF, vocabulary capped at 100 terms.
    tfidf = TfidfVectorizer(
        ngram_range=(1, 2),
        stop_words='english',
        min_df=3,
        max_df=0.9,
        max_features=100,
        dtype=np.float32,
    )

    # Vocabulary is learned on train only; test is transformed with it.
    tfidf_train = tfidf.fit_transform(train_texts)
    tfidf_test = tfidf.transform(test_texts)

    # Compress the sparse TF-IDF matrices to 25 dense SVD components.
    svd = TruncatedSVD(n_components=25, random_state=42)
    tfidf_train_reduced = svd.fit_transform(tfidf_train).astype(np.float32)
    tfidf_test_reduced = svd.transform(tfidf_test).astype(np.float32)

    tfidf_var = svd.explained_variance_ratio_.sum()
    print(f"✅ TF-IDF: {tfidf_train_reduced.shape[1]} dims (var: {tfidf_var:.2%})")

    # The sparse matrices are not needed once reduced.
    del tfidf_train, tfidf_test
    gc.collect()

# ============================================================================
# 4️⃣ BASIC NUMERIC FEATURES (SAFE)
# ============================================================================
print("\n🔢 Processing basic features...")

# Candidate numeric columns; only those present in the train frame are used.
basic_cols = ['item_pack_qty', 'catalog_len']
available_cols = list(filter(lambda name: name in train.columns, basic_cols))

if not available_cols:
    # Nothing usable: zero-width placeholders keep the later hstack valid.
    train_basic_all = np.zeros((len(train), 0), dtype=np.float32)
    test_basic_all = np.zeros((len(test), 0), dtype=np.float32)
else:
    # Raw values, with missing entries replaced by 0.
    train_basic = train[available_cols].fillna(0).astype(np.float32).values
    test_basic = test[available_cols].fillna(0).astype(np.float32).values

    # log1p and sqrt are applied to magnitudes so negative inputs cannot
    # produce NaN.
    train_abs = np.abs(train_basic)
    test_abs = np.abs(test_basic)

    train_basic_log = np.log1p(train_abs)
    test_basic_log = np.log1p(test_abs)

    train_basic_sqrt = np.sqrt(train_abs)
    test_basic_sqrt = np.sqrt(test_abs)

    # Column layout: [raw | log1p | sqrt].
    train_basic_all = np.hstack([train_basic, train_basic_log, train_basic_sqrt])
    test_basic_all = np.hstack([test_basic, test_basic_log, test_basic_sqrt])

    print(f"✅ Basic: {train_basic_all.shape[1]} dims")

# ============================================================================
# 5️⃣ ENHANCED IMAGE FEATURES (EXPLICIT NaN HANDLING)
# ============================================================================
print("\n🖼️ Extracting image features...")

def extract_image_features(df):
    """Derive 8 numeric features from each row's ``image_link`` URL string.

    Column order of the result:
        0. URL starts with ``https``
        1. URL contains ``amazon``
        2. URL contains ``cloudfront``
        3. URL contains ``.jpg``
        4. URL contains ``.png``
        5. URL length
        6. number of ``/`` characters
        7. number of ``-`` characters

    All checks run on the lower-cased URL. Rows whose URL is NaN/None, the
    empty string, or the literal text ``nan`` get an all-zero feature row.

    Args:
        df: DataFrame, normally containing an ``image_link`` column. If the
            column is absent every row gets all-zero features, mirroring how
            the other feature groups in this file tolerate missing columns.

    Returns:
        ``np.float32`` array of shape ``(len(df), 8)``. The shape is 2-D even
        for an empty frame, so callers can rely on ``.shape[1]`` and stacking.
    """
    n_features = 8
    # Pre-filled with zeros, so missing URLs need no explicit assignment.
    features = np.zeros((len(df), n_features), dtype=np.float32)

    if 'image_link' not in df.columns:
        return features

    for row, url in enumerate(df['image_link']):
        if pd.isna(url) or url == '' or str(url).lower() == 'nan':
            continue

        url_str = str(url).lower()
        features[row] = [
            1 if url_str.startswith('https') else 0,
            1 if 'amazon' in url_str else 0,
            1 if 'cloudfront' in url_str else 0,
            1 if '.jpg' in url_str else 0,
            1 if '.png' in url_str else 0,
            len(url_str),
            url_str.count('/'),
            url_str.count('-'),
        ]
    return features

# URL-derived features for both frames.
train_image = extract_image_features(train)
test_image = extract_image_features(test)

print(f"✅ Image: {train_image.shape[1]} dims")

# ============================================================================
# 6️⃣ COMBINE ALL FEATURES
# ============================================================================
print("\n🧩 Combining features...")

# Column order is identical for train and test: basic | BERT | TF-IDF | image.
X_train_raw = np.concatenate(
    [train_basic_all, bert_train_reduced, tfidf_train_reduced, train_image],
    axis=1,
)
X_test_raw = np.concatenate(
    [test_basic_all, bert_test_reduced, tfidf_test_reduced, test_image],
    axis=1,
)

print(f"✅ Total features: {X_train_raw.shape[1]}")

# Regression target: price, modelled as log1p(price).
y = train['price'].values.astype(np.float32)
y_log = np.log1p(y)

# Everything needed is now in X_train_raw / X_test_raw / y_log; drop the
# source frames and the per-group feature blocks.
del (
    train, test, train_basic_all, test_basic_all,
    bert_train_reduced, bert_test_reduced,
    tfidf_train_reduced, tfidf_test_reduced,
    train_image, test_image,
)
gc.collect()

# ============================================================================
# 7️⃣ TRAIN/VAL SPLIT
# ============================================================================
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train_raw, y_log, test_size=0.2, random_state=42
)

print(f"\n📚 Train: {X_tr.shape[0]}, Val: {X_val.shape[0]}")

# ============================================================================
# 8️⃣ SMART FEATURE SELECTION (BOOST ACCURACY)
# ============================================================================
# Selection runs before scaling so the model inputs are built exactly once.
# Previously the tree/linear matrices were created, discarded and rebuilt
# after selection, costing three extra matrix copies and a wasted scaler fit.
print("\n🎯 Feature selection for better performance...")

from sklearn.feature_selection import SelectKBest, f_regression, VarianceThreshold

# Step 1: Remove low-variance features (training-split variance below the
# 0.01 threshold). The filter is fitted on the training split only.
# NOTE(review): features are still unscaled here, so this threshold depends
# on each feature's scale — confirm 0.01 is intended for the PCA/SVD columns.
var_threshold = VarianceThreshold(threshold=0.01)
X_tr_var = var_threshold.fit_transform(X_tr)
X_val_var = var_threshold.transform(X_val)
X_test_var = var_threshold.transform(X_test_raw)

print(f"   After variance filter: {X_tr.shape[1]} → {X_tr_var.shape[1]} features")

# Step 2: Keep the top-K features by univariate F-score against the log
# target (at most 80), again fitted on the training split only.
n_features_to_keep = min(80, X_tr_var.shape[1])
selector = SelectKBest(f_regression, k=n_features_to_keep)
X_tr_selected = selector.fit_transform(X_tr_var, y_tr)
X_val_selected = selector.transform(X_val_var)
X_test_selected = selector.transform(X_test_var)

print(f"   Selected top {n_features_to_keep} features for modeling")

# Update feature matrices
X_tr = X_tr_selected.copy()
X_val = X_val_selected.copy()
X_test_raw = X_test_selected.copy()

del X_tr_var, X_val_var, X_test_var, X_tr_selected, X_val_selected, X_test_selected
gc.collect()

print("✅ Feature selection complete")

# ============================================================================
# 8.5️⃣ CRITICAL: SEPARATE SCALING FOR DIFFERENT MODELS
# ============================================================================
print("\n⚖️ Scaling features...")

# For tree models (XGB, LGB) - NO SCALING NEEDED
X_tr_tree = X_tr.copy()
X_val_tree = X_val.copy()
X_test_tree = X_test_raw.copy()

# For linear models (Ridge, ElasticNet) - MUST SCALE.
# The scaler is fitted on the training split and reused for val/test.
scaler_linear = RobustScaler()
X_tr_linear = scaler_linear.fit_transform(X_tr).astype(np.float32)
X_val_linear = scaler_linear.transform(X_val).astype(np.float32)
X_test_linear = scaler_linear.transform(X_test_raw).astype(np.float32)

print("✅ Scaled separately for tree and linear models")

# ============================================================================
# 9️⃣ ENHANCED MODEL TRAINING (TUNED HYPERPARAMS)
# ============================================================================
print("\n🚀 Training enhanced models...")

def rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``."""
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Model 1: XGBoost (ENHANCED PARAMS + EARLY STOPPING)
print("🌳 Training XGBoost with early stopping...")

# Hyperparameters shared by the primary and the fallback estimator.
xgb_params = dict(
    n_estimators=800,
    learning_rate=0.03,
    max_depth=7,
    subsample=0.85,
    colsample_bytree=0.85,
    reg_alpha=0.1,
    reg_lambda=0.2,
    min_child_weight=3,
    gamma=0.05,
    random_state=42,
    n_jobs=-1,
    tree_method='hist',
)

# Primary path: early stopping configured on the estimator itself.
xgb_model = xgb.XGBRegressor(**xgb_params, early_stopping_rounds=50)

try:
    xgb_model.fit(
        X_tr_tree, y_tr,
        eval_set=[(X_val_tree, y_val)],
        verbose=False
    )
except TypeError:
    # Fallback path: rebuild without the constructor argument and request
    # early stopping through a fit-time callback instead.
    xgb_model = xgb.XGBRegressor(**xgb_params)
    xgb_model.fit(
        X_tr_tree, y_tr,
        eval_set=[(X_val_tree, y_val)],
        callbacks=[xgb.callback.EarlyStopping(rounds=50, save_best=True)],
        verbose=False
    )

# Validation RMSE is measured on the original price scale (expm1 of logs).
y_pred_xgb_val_log = xgb_model.predict(X_val_tree)
y_val_orig = np.expm1(y_val)
y_pred_xgb_val = np.expm1(y_pred_xgb_val_log)
xgb_rmse = rmse(y_val_orig, y_pred_xgb_val)

# Mention the stopping point only when the model exposes a positive one.
xgb_note = ""
if hasattr(xgb_model, 'best_iteration') and xgb_model.best_iteration > 0:
    xgb_note = f" (stopped at {xgb_model.best_iteration} trees)"
print(f"   ✅ XGBoost RMSE: {xgb_rmse:.4f}{xgb_note}")

# Model 2: LightGBM (ENHANCED PARAMS + EARLY STOPPING)
print("💡 Training LightGBM with early stopping...")
lgb_model = lgb.LGBMRegressor(
    n_estimators=800,
    learning_rate=0.03,
    num_leaves=63,
    subsample=0.85,
    colsample_bytree=0.85,
    reg_alpha=0.15,
    reg_lambda=0.15,
    min_child_samples=20,
    random_state=42,
    n_jobs=-1,
    verbose=-1
)

try:
    # Preferred path: early stopping through the callback API.
    lgb_model.fit(
        X_tr_tree, y_tr,
        eval_set=[(X_val_tree, y_val)],
        callbacks=[lgb.early_stopping(50, verbose=False)]
    )
except Exception as exc:
    # Best-effort fallback kept from the original, but no longer silent:
    # this path trains WITHOUT early stopping, so say so and show the cause.
    print(f"   ⚠️  LightGBM early stopping unavailable "
          f"({type(exc).__name__}: {exc}); training without it")
    lgb_model.fit(
        X_tr_tree, y_tr,
        eval_set=[(X_val_tree, y_val)],
        eval_metric='rmse'
    )

# Validation RMSE on the original price scale.
y_pred_lgb_val_log = lgb_model.predict(X_val_tree)
y_pred_lgb_val = np.expm1(y_pred_lgb_val_log)
lgb_rmse = rmse(y_val_orig, y_pred_lgb_val)

# best_iteration_ may be missing or None (presumably depending on the LightGBM
# version and on whether early stopping ran), so guard before comparing.
lgb_best_iter = getattr(lgb_model, 'best_iteration_', None)
if lgb_best_iter is not None and lgb_best_iter > 0:
    print(f"   ✅ LightGBM RMSE: {lgb_rmse:.4f} (stopped at {lgb_best_iter} trees)")
else:
    print(f"   ✅ LightGBM RMSE: {lgb_rmse:.4f}")

# Model 3: Ridge (TUNED ALPHA)
print("📐 Training Ridge with optimized alpha...")

# Running best over the alpha sweep; the first strictly lower RMSE wins.
best_ridge_rmse = float('inf')
best_ridge_model = None
best_alpha = 10.0

for alpha in (1.0, 5.0, 10.0, 20.0, 50.0):
    ridge_temp = Ridge(alpha=alpha, random_state=42)
    ridge_temp.fit(X_tr_linear, y_tr)

    # Score on the original price scale, with predictions clipped to [0, 1e6].
    y_pred_temp_log = ridge_temp.predict(X_val_linear)
    y_pred_temp = np.expm1(y_pred_temp_log).clip(0, 1e6)
    temp_rmse = rmse(y_val_orig, y_pred_temp)

    if not temp_rmse < best_ridge_rmse:
        continue
    best_ridge_rmse, best_ridge_model, best_alpha = temp_rmse, ridge_temp, alpha

# Publish the winner and its validation predictions for the ensemble.
ridge_model = best_ridge_model
ridge_rmse = best_ridge_rmse
y_pred_ridge_val_log = ridge_model.predict(X_val_linear)
y_pred_ridge_val = np.expm1(y_pred_ridge_val_log).clip(0, 1e6)
print(f"   ✅ Ridge RMSE: {ridge_rmse:.4f} (alpha={best_alpha})")

# Model 4: ElasticNet (TUNED PARAMS + INCREASED MAX_ITER)
print("🎯 Training ElasticNet with optimized params...")

# Running best over the (alpha, l1_ratio) grid; first strictly lower RMSE wins.
best_elastic_rmse = float('inf')
best_elastic_model = None
best_params = (0.01, 0.5)

# Same visiting order as two nested loops: alpha outer, l1_ratio inner.
elastic_grid = [
    (a, r)
    for a in (0.001, 0.01, 0.1)
    for r in (0.2, 0.5, 0.8)
]

for alpha, l1_ratio in elastic_grid:
    elastic_temp = ElasticNet(
        alpha=alpha,
        l1_ratio=l1_ratio,
        random_state=42,
        max_iter=3000,
        tol=1e-4
    )
    elastic_temp.fit(X_tr_linear, y_tr)

    # Score on the original price scale, with predictions clipped to [0, 1e6].
    y_pred_temp_log = elastic_temp.predict(X_val_linear)
    y_pred_temp = np.expm1(y_pred_temp_log).clip(0, 1e6)
    temp_rmse = rmse(y_val_orig, y_pred_temp)

    if not temp_rmse < best_elastic_rmse:
        continue
    best_elastic_rmse = temp_rmse
    best_elastic_model = elastic_temp
    best_params = (alpha, l1_ratio)

# Publish the winner and its validation predictions for the ensemble.
elastic_model = best_elastic_model
elastic_rmse = best_elastic_rmse
y_pred_elastic_val_log = elastic_model.predict(X_val_linear)
y_pred_elastic_val = np.expm1(y_pred_elastic_val_log).clip(0, 1e6)

# Warn when the solver used its whole iteration budget.
hit_iter_cap = elastic_model.n_iter_ >= elastic_model.max_iter
if hit_iter_cap:
    print(f"   ⚠️  ElasticNet RMSE: {elastic_rmse:.4f} (alpha={best_params[0]}, l1={best_params[1]}, converged={elastic_model.n_iter_}/{elastic_model.max_iter})")
else:
    print(f"   ✅ ElasticNet RMSE: {elastic_rmse:.4f} (alpha={best_params[0]}, l1={best_params[1]})")

# ============================================================================
# 🔟 ENHANCED ENSEMBLE (LOG-SPACE BLENDING + REGULARIZATION)
# ============================================================================
print("\n🤝 Building enhanced ensemble with log-space blending...")

# Validation predictions per model, in log space (used for blending) ...
model_predictions_log = {
    'XGB': y_pred_xgb_val_log,
    'LGB': y_pred_lgb_val_log,
    'Ridge': y_pred_ridge_val_log,
    'Elastic': y_pred_elastic_val_log
}

# ... and on the original price scale.
model_predictions = {
    'XGB': y_pred_xgb_val,
    'LGB': y_pred_lgb_val,
    'Ridge': y_pred_ridge_val,
    'Elastic': y_pred_elastic_val
}

model_rmse = {
    'XGB': xgb_rmse,
    'LGB': lgb_rmse,
    'Ridge': ridge_rmse,
    'Elastic': elastic_rmse
}

# Keep models whose validation RMSE is below 1e6. A NaN RMSE fails the
# comparison and is dropped as well.
# Each entry is (name, log-space preds, original-scale preds, rmse).
valid_models = [
    (name, model_predictions_log[name], model_predictions[name], model_rmse[name])
    for name in model_predictions
    if model_rmse[name] < 1e6
]

print(f"   Valid models: {len(valid_models)}")

# Without this guard an empty list leads to a scalar "ensemble" and an
# obscure shape error inside the metric call.
if not valid_models:
    raise RuntimeError(
        "No model has a validation RMSE below 1e6; cannot build the ensemble."
    )

# Inverse-RMSE weights, normalised to sum to 1 (eps avoids division by zero).
eps = 1e-6
total_inv_rmse = sum(1 / (m[3] + eps) for m in valid_models)
weights = {m[0]: (1 / (m[3] + eps)) / total_inv_rmse for m in valid_models}

print(f"   Regularized weights: {weights}")

# CRITICAL: Blend in log space for numerical stability
y_ensemble_val_log = sum(weights[m[0]] * m[1] for m in valid_models)
y_ensemble_val = np.expm1(y_ensemble_val_log)
y_ensemble_val = np.clip(y_ensemble_val, 0, 1e6)  # Safety

ensemble_rmse = rmse(y_val_orig, y_ensemble_val)
ensemble_mae = mean_absolute_error(y_val_orig, y_ensemble_val)

print(f"\n📊 ENHANCED VALIDATION RESULTS:")
print(f"   XGBoost:    {xgb_rmse:.4f}")
print(f"   LightGBM:   {lgb_rmse:.4f}")
print(f"   Ridge:      {ridge_rmse:.4f}")
print(f"   ElasticNet: {elastic_rmse:.4f}")
print(f"   ─────────────────────")
print(f"   ENSEMBLE:   {ensemble_rmse:.4f} ⭐")
print(f"   MAE:        {ensemble_mae:.4f}")

# Relative gain over the best valid single model. Only valid models are
# compared, so a NaN RMSE cannot leak into min().
best_individual = min(m[3] for m in valid_models)
improvement = ((best_individual - ensemble_rmse) / best_individual) * 100
if improvement >= 0:
    print(f"   Improvement: {improvement:.2f}% better than best model")
else:
    # Do not report a negative number as "better".
    print(f"   Improvement: {improvement:.2f}% (ensemble is worse than the best single model)")

# ============================================================================
# 1️⃣1️⃣ ENHANCED FINAL PREDICTIONS (LOG-SPACE BLENDING)
# ============================================================================
print("\n🎯 Generating enhanced final predictions...")

# Predict in chunks to avoid RAM issues
def safe_predict_log(model, X, chunk_size=10000):
    """Run ``model.predict`` over ``X`` in row chunks and join the results.

    The values are whatever ``model.predict`` returns (the callers in this
    file use models trained on a log1p target); no transformation is applied.

    Args:
        model: Object exposing ``predict(rows)``.
        X: Row-sliceable feature matrix supporting ``len``.
        chunk_size: Maximum number of rows per ``predict`` call.

    Returns:
        ``np.ndarray`` with the chunk predictions concatenated in row order.
        Chunks are kept as arrays rather than expanded element by element
        into a Python list. An empty ``X`` yields an empty array.

    Raises:
        ValueError: If ``chunk_size`` is not positive. A negative value used
            to return an empty result silently.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    total = len(X)
    if total == 0:
        return np.array([])

    chunks = []
    for start in range(0, total, chunk_size):
        chunks.append(np.asarray(model.predict(X[start:start + chunk_size])))

        # Report whenever a 50,000-row boundary has been crossed. This matches
        # the old output for the default chunk size and also works for chunk
        # sizes that do not divide 50,000.
        if start > 0 and start // 50000 > (start - chunk_size) // 50000:
            print(f"      {start}/{total} done")
            gc.collect()

    return np.concatenate(chunks)

# Generate test predictions in LOG SPACE
print("   XGBoost predictions (log-space)...")
xgb_test_log = safe_predict_log(xgb_model, X_test_tree)

print("   LightGBM predictions (log-space)...")
lgb_test_log = safe_predict_log(lgb_model, X_test_tree)

print("   Ridge predictions (log-space)...")
ridge_test_log = safe_predict_log(ridge_model, X_test_linear)

print("   ElasticNet predictions (log-space)...")
elastic_test_log = safe_predict_log(elastic_model, X_test_linear)

# CRITICAL: Blend in log space
print("\n   Blending in log-space for numerical stability...")
test_preds_log = {
    'XGB': xgb_test_log,
    'LGB': lgb_test_log,
    'Ridge': ridge_test_log,
    'Elastic': elastic_test_log
}

# Only models present in `weights` take part in the sum. Multiplying an
# excluded model by a zero weight is NOT a no-op: 0 * nan (or 0 * inf) is nan,
# which would poison every blended prediction and turn the whole submission
# into the NaN fallback value below.
final_preds_log = np.zeros(len(xgb_test_log), dtype=np.float64)
for model_name, model_weight in weights.items():
    final_preds_log += model_weight * test_preds_log[model_name]

# Convert back to original scale
final_preds = np.expm1(final_preds_log)

# Safety checks
n_nan_preds = int(np.isnan(final_preds).sum())
if n_nan_preds:
    # Make the fallback visible instead of silently substituting a constant.
    print(f"   ⚠️  {n_nan_preds} NaN predictions replaced with fallback price 10.0")
final_preds = np.clip(final_preds, 0.01, 1e6)  # Reasonable price range
final_preds = np.nan_to_num(final_preds, nan=10.0)  # Replace any NaNs

print(f"✅ Log-space blending complete")

# ============================================================================
# 1️⃣2️⃣ CREATE SUBMISSION
# ============================================================================
print("\n💾 Creating submission...")

# One row per test sample: the ID captured at load time plus the blended price.
submission = pd.DataFrame({
    'sample_id': test_ids,
    'price': final_preds
})

# Summary statistics of the predicted prices.
prices = submission['price']
n_rows = len(submission)
print(f"\n✅ SUBMISSION STATS:")
print(f"   Samples: {n_rows:,}")
print(f"   Price range: ${prices.min():.2f} - ${prices.max():.2f}")
print(f"   Mean: ${prices.mean():.2f}")
print(f"   Median: ${prices.median():.2f}")
print(f"   Valid prices: {(prices > 0).sum()}/{n_rows}")

# Write the CSV without the DataFrame index.
submission.to_csv('enhanced_bert_ensemble_v2.csv', index=False)
print(f"\n🎉 SAVED: enhanced_bert_ensemble_v2.csv")

# Release every feature matrix and target array still held at module level.
del (
    X_train_raw, X_test_raw, X_tr, X_val, y_tr, y_val,
    X_tr_tree, X_val_tree, X_test_tree,
    X_tr_linear, X_val_linear, X_test_linear,
)
gc.collect()

# Closing summary.
print("\n" + "="*60)
print("✅ ENHANCED PIPELINE COMPLETE!")
print("="*60)
print(f"🎯 Key Improvements Applied:")
print(f"   ✓ Feature selection (top {n_features_to_keep} features)")
print(f"   ✓ Early stopping for tree models")
print(f"   ✓ Hyperparameter tuning (Ridge alpha, ElasticNet)")
print(f"   ✓ Log-space blending (safer ensemble)")
print(f"   ✓ Regularized ensemble weights (eps={eps})")
print(f"   ✓ {improvement:.2f}% improvement over best individual model")
print("="*60)