# In [None]:  (Jupyter cell prompt left over from the notebook export)
import pandas as pd
import numpy as np
import optuna
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error
from scipy.optimize import minimize
import warnings, gc

# Hide library warnings and keep Optuna's logger at WARNING and above.
warnings.filterwarnings('ignore')
optuna.logging.set_verbosity(optuna.logging.WARNING)

# ==========================================
# 1. Data Loading + Quantile Outlier Clipping
# ==========================================
train = pd.read_csv('/kaggle/input/ai-real-camp-3/train_car.csv')
test = pd.read_csv('/kaggle/input/ai-real-camp-3/test_car.csv')

# Mileage readings above 900,000 km are blanked out in both frames so they
# get imputed later (the original author attributes them to an int32 overflow).
for frame in (train, test):
    frame.loc[frame['mileage_km'] > 900_000, 'mileage_km'] = np.nan

# Target: clipped on BOTH tails, to the [Q01, Q99] range of the training price.
q01_price = train['price'].quantile(0.01)
q99_price = train['price'].quantile(0.99)
train['price'] = train['price'].clip(lower=q01_price, upper=q99_price)

# Mileage: upper cap only. The threshold is learned on train (after the
# blank-out above) and applied unchanged to test.
q99_mileage = train['mileage_km'].quantile(0.99)
for frame in (train, test):
    frame['mileage_km'] = frame['mileage_km'].clip(upper=q99_mileage)

# tax_hp: coerce non-numeric entries to NaN first, then apply the same
# train-derived Q99 upper cap to both frames.
for frame in (train, test):
    frame['tax_hp'] = pd.to_numeric(frame['tax_hp'], errors='coerce')
q99_hp = train['tax_hp'].quantile(0.99)
for frame in (train, test):
    frame['tax_hp'] = frame['tax_hp'].clip(upper=q99_hp)

print(f"Train: {train.shape}, Test: {test.shape} — ALL rows kept, Q99 clipped")

# ==========================================
# 2. Feature Engineering
# ==========================================
def preprocess(df, ref_df=None):
    """Return a feature-engineered copy of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw listings. The code reads the columns ``listing_date``, ``year``,
        ``model``, ``transmission``, ``fuel_type``, ``tax_hp``,
        ``mileage_km`` and ``doors``. The input frame is not modified.
    ref_df : pandas.DataFrame, optional
        Frame that supplies every group statistic (imputation medians and
        modes, frequency shares, brand medians). It must already contain the
        ``brand`` and ``submodel`` columns, i.e. be a previous output of this
        function. When omitted, the statistics come from the frame being
        processed itself.

    Returns
    -------
    pandas.DataFrame
        The copy with all engineered columns appended.
    """
    out = df.copy()
    # With no reference frame, the working copy doubles as the statistics
    # source, so columns created below become visible to the group-bys.
    stats_src = out if ref_df is None else ref_df

    # --- Calendar features -------------------------------------------------
    out['listing_date'] = pd.to_datetime(out['listing_date'])
    listed = out['listing_date'].dt
    out['listing_year'] = listed.year
    out['listing_month'] = listed.month
    out['listing_dayofweek'] = listed.dayofweek
    out['listing_quarter'] = listed.quarter
    # Negative ages (model year after the listing year) are floored at 0.
    out['car_age'] = (out['listing_year'] - out['year']).clip(lower=0)

    # --- Brand / submodel --------------------------------------------------
    # Brand is the first space-separated token of the model string; the
    # remainder (if any) is the submodel.
    out['model'] = out['model'].astype(str).replace('nan', 'Missing')
    name_parts = out['model'].str.split(' ', n=1)
    out['brand'] = name_parts.str[0]
    out['submodel'] = name_parts.str[1].fillna('Missing')

    # --- Ordinal codes (unmapped or missing labels fall back to code 0) ----
    transmission_codes = {'Manuelle': 0, 'Automatique': 1}
    fuel_codes = {'Essence': 0, 'Diesel': 1, 'Hybride': 2, 'Electrique': 3}
    out['transmission_enc'] = out['transmission'].map(transmission_codes).fillna(0)
    out['fuel_enc'] = out['fuel_type'].map(fuel_codes).fillna(0)

    # String copies used as CatBoost native categoricals.
    out['transmission_cat'] = out['transmission'].astype(str)
    out['fuel_cat'] = out['fuel_type'].astype(str)

    # --- Numeric imputation: model median -> brand median -> global median -
    out['tax_hp'] = pd.to_numeric(out['tax_hp'], errors='coerce')
    for num_col in ('tax_hp', 'mileage_km'):
        by_model = stats_src.groupby('model')[num_col].median()
        by_brand = stats_src.groupby('brand')[num_col].median()
        filled = out[num_col].fillna(out['model'].map(by_model))
        filled = filled.fillna(out['brand'].map(by_brand))
        out[num_col] = filled.fillna(stats_src[num_col].median())

    # --- Doors: model mode -> brand mode -> 5 ------------------------------
    out['doors'] = pd.to_numeric(out['doors'], errors='coerce')
    # The missing flag is recorded before any value is filled in.
    out['doors_missing'] = out['doors'].isna().astype(int)

    def _most_common(values):
        # First mode of the group, or NaN when the group has no valid value.
        modes = values.mode()
        return modes.iloc[0] if len(modes) > 0 else np.nan

    doors_by_model = stats_src.groupby('model')['doors'].agg(_most_common)
    doors_by_brand = stats_src.groupby('brand')['doors'].agg(_most_common)
    doors = out['doors'].fillna(out['model'].map(doors_by_model))
    doors = doors.fillna(out['brand'].map(doors_by_brand))
    out['doors'] = doors.fillna(5)

    # --- Interaction features ----------------------------------------------
    # The "+ 1" denominators avoid division by zero for age 0 / hp 0.
    age_plus_one = out['car_age'] + 1
    hp_plus_one = out['tax_hp'] + 1
    out['mileage_per_year'] = out['mileage_km'] / age_plus_one
    out['hp_per_age'] = out['tax_hp'] / age_plus_one
    out['log_mileage'] = np.log1p(out['mileage_km'])
    out['log_hp'] = np.log1p(out['tax_hp'])
    out['year_squared'] = out['year'] ** 2
    out['age_squared'] = out['car_age'] ** 2
    out['hp_squared'] = out['tax_hp'] ** 2
    out['mileage_x_hp'] = out['mileage_km'] * out['tax_hp']
    out['mileage_per_hp'] = out['mileage_km'] / hp_plus_one
    out['is_new'] = (out['car_age'] <= 1).astype(int)
    # Same value as mileage / (age + 1) / (hp + 1).
    out['depreciation_rate'] = out['mileage_per_year'] / hp_plus_one
    out['hp_density'] = out['tax_hp'] / (out['doors'] + 1)
    out['age_mileage_ratio'] = out['car_age'] / (out['mileage_km'] / 10000 + 1)
    is_diesel = out['fuel_enc'] == 1
    is_automatic = out['transmission_enc'] == 1
    out['is_diesel_auto'] = (is_diesel & is_automatic).astype(int)
    out['wear_index'] = out['mileage_km'] * out['car_age']

    # --- Luxury flag (exact, case-sensitive match on the brand token) ------
    luxury = [
        'MERCEDES-BENZ', 'BMW', 'AUDI', 'PORSCHE', 'LAND-ROVER',
        'JAGUAR', 'VOLVO', 'MASERATI', 'BENTLEY', 'LEXUS', 'INFINITI',
    ]
    out['is_luxury'] = out['brand'].isin(luxury).astype(int)

    # --- Frequency encoding (share of rows in the statistics source) -------
    for key in ('brand', 'model', 'submodel'):
        shares = stats_src[key].astype(str).value_counts(normalize=True)
        out[f'{key}_freq'] = out[key].astype(str).map(shares).fillna(0)

    # --- Brand-level medians, falling back to the overall median -----------
    for stat_col in ('mileage_km', 'tax_hp'):
        per_brand = stats_src.groupby('brand')[stat_col].median()
        overall = stats_src[stat_col].median()
        out[f'brand_med_{stat_col}'] = out['brand'].map(per_brand).fillna(overall)

    # Final normalisation so the categorical columns are plain strings.
    for key in ('model', 'brand', 'submodel'):
        out[key] = out[key].astype(str).replace('nan', 'Missing')

    return out

# Train supplies its own group statistics; test reuses the statistics of the
# already-processed training frame so nothing is fitted on test rows.
train_clean = preprocess(df=train)
test_clean = preprocess(df=test, ref_df=train_clean)

# ==========================================
# 3. Target Encoding (leak-free)
# ==========================================
def target_encode(tr, va, te, col, target, smooth=20):
    """Smoothed mean-target encoding of ``col``, fitted on ``tr`` only.

    Each category's mean target is shrunk toward the global mean:
    ``(count * mean + smooth * global_mean) / (count + smooth)``.
    Categories not seen in ``tr`` receive the global mean.

    Parameters
    ----------
    tr, va : pandas.DataFrame
        Training and validation frames containing ``col``.
    te : pandas.DataFrame or None
        Optional test frame; when ``None`` the third return value is ``None``.
    col : str
        Name of the categorical column to encode.
    target : pandas.Series
        Target values, grouped by ``tr[col]`` (pandas aligns them on index).
    smooth : int or float, default 20
        Weight of the global mean in the shrinkage formula.

    Returns
    -------
    tuple
        ``(tr_enc, va_enc, te_enc)`` encoded Series.

    Notes
    -----
    ``va`` and ``te`` never see their own targets. Rows of ``tr`` are encoded
    with statistics that include their own target value.
    """
    prior = target.mean()
    per_category = target.groupby(tr[col]).agg(['count', 'mean'])
    n_rows = per_category['count']
    cat_mean = per_category['mean']
    shrunk = (n_rows * cat_mean + smooth * prior) / (n_rows + smooth)

    def _encode(frame):
        # Unseen categories map to NaN and are replaced by the global mean.
        return frame[col].map(shrunk).fillna(prior)

    te_enc = None if te is None else _encode(te)
    return _encode(tr), _encode(va), te_enc

# ==========================================
# Feature Definitions
# ==========================================
# Columns handed to CatBoost as native categoricals.
cb_cat_features = ['model', 'transmission_cat', 'fuel_cat']

# Numeric inputs shared by every model, grouped by theme. The concatenation
# order is the column order the models are trained on.
num_features = (
    # raw / calendar
    ['mileage_km', 'year', 'tax_hp', 'car_age']
    + ['listing_month', 'listing_dayofweek', 'listing_quarter']
    # ratios, logs and polynomial terms
    + ['mileage_per_year', 'hp_per_age', 'log_mileage', 'log_hp']
    + ['year_squared', 'age_squared', 'hp_squared']
    + ['mileage_x_hp', 'mileage_per_hp']
    # body / drivetrain codes and flags
    + ['doors', 'doors_missing', 'transmission_enc', 'fuel_enc']
    + ['is_luxury', 'is_new', 'is_diesel_auto']
    # composite wear indicators
    + ['depreciation_rate', 'hp_density', 'age_mileage_ratio', 'wear_index']
    # frequency encodings and brand-level medians
    + ['brand_freq', 'model_freq', 'submodel_freq']
    + ['brand_med_mileage_km', 'brand_med_tax_hp']
)

# CatBoost sees the raw categoricals; the other models see target encodings
# (the *_enc columns are created later, inside each split).
cb_features = [*num_features, *cb_cat_features]
encoded_features = [*num_features, 'model_enc', 'brand_enc', 'submodel_enc']

# All models are trained on log1p(price).
y_log = np.log1p(train_clean['price']).rename('price_log')

print(f"Features: CB={len(cb_features)}, Encoded={len(encoded_features)}")

# ==========================================
# 4. OPTUNA TUNING (50 trials, all 4 models)
# ==========================================
print(f"\n{'=' * 40}")
print("  OPTUNA TUNING (50 trials each, 4 models)")
print("=" * 40)

N_TRIALS = 50

# A single 80/20 hold-out split is shared by all four tuning studies.
X_tune_tr, X_tune_va, y_tune_tr, y_tune_va = train_test_split(
    train_clean, y_log, test_size=0.2, random_state=42)

# Target-encoded copies for the non-CatBoost models; the encodings are fitted
# on the tuning-train part only (no test frame is needed at this stage).
X_tune_tr_enc = X_tune_tr.copy()
X_tune_va_enc = X_tune_va.copy()
for col in ('model', 'brand', 'submodel'):
    tr_codes, va_codes, _ = target_encode(X_tune_tr, X_tune_va, None, col, y_tune_tr)
    X_tune_tr_enc[f'{col}_enc'] = tr_codes
    X_tune_va_enc[f'{col}_enc'] = va_codes

# --- CatBoost ---
def obj_cb(trial):
    """Optuna objective for CatBoost: RMSE on the tuning hold-out split.

    Fits on ``X_tune_tr[cb_features]`` with early stopping (150 rounds)
    monitored on the hold-out split, then scores that same split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Root mean squared error against ``y_tune_va``.
    """
    # Sampling order is kept fixed so a seeded sampler reproduces the search.
    # NOTE(review): confirm `min_data_in_leaf` and `bagging_temperature` take
    # effect with CatBoost's default grow policy / bootstrap type.
    sampled = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'depth': trial.suggest_int('depth', 3, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.1, 20, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 2),
        'random_strength': trial.suggest_float('random_strength', 0, 3),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 50),
        'border_count': trial.suggest_int('border_count', 32, 255),
    }
    fixed = {
        'iterations': 3000,
        'loss_function': 'RMSE',
        'verbose': 0,
        'cat_features': cb_cat_features,
        'random_seed': 42,
    }
    model = CatBoostRegressor(**fixed, **sampled)

    model.fit(
        X_tune_tr[cb_features], y_tune_tr,
        eval_set=(X_tune_va[cb_features], y_tune_va),
        early_stopping_rounds=150,
    )
    holdout_pred = model.predict(X_tune_va[cb_features])
    return np.sqrt(mean_squared_error(y_tune_va, holdout_pred))

print("Tuning CatBoost (50 trials)...")
study_cb = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_cb.optimize(obj_cb, n_trials=N_TRIALS)

# Final-training configuration: the tuned values plus the fixed settings,
# with a larger iteration budget than the 3000 used during tuning.
best_cb = {
    **study_cb.best_params,
    'iterations': 10000,
    'cat_features': cb_cat_features,
    'loss_function': 'RMSE',
    'verbose': 0,
    'random_seed': 42,
}
print(f"  Best CB: {study_cb.best_value:.5f}")

# --- LightGBM ---
def obj_lgb(trial):
    """Optuna objective for LightGBM: RMSE on the tuning hold-out split.

    Fits on the target-encoded tuning-train frame with early stopping
    (150 rounds) monitored on the hold-out split, then scores that split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Root mean squared error against ``y_tune_va``.
    """
    # Function-scope import: the file header only imports LGBMRegressor, and
    # this replaces the former __import__('lightgbm') call.
    from lightgbm import early_stopping

    params = {
        'n_estimators': 3000,
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.3, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 15, 255),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 100, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 100, log=True),
        'min_split_gain': trial.suggest_float('min_split_gain', 0.0, 1.0),
        # LightGBM only performs row bagging when the bagging frequency is
        # non-zero; with the default of 0 the tuned `subsample` is ignored.
        'subsample_freq': 1,
        'verbose': -1, 'random_state': 42, 'n_jobs': -1,
    }
    m = LGBMRegressor(**params)
    m.fit(X_tune_tr_enc[encoded_features], y_tune_tr,
          eval_set=[(X_tune_va_enc[encoded_features], y_tune_va)],
          callbacks=[early_stopping(150, verbose=False)])
    return np.sqrt(mean_squared_error(y_tune_va, m.predict(X_tune_va_enc[encoded_features])))

print("Tuning LightGBM (50 trials)...")
study_lgb = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study_lgb.optimize(obj_lgb, n_trials=N_TRIALS)
best_lgb = study_lgb.best_params
# `subsample_freq` is a fixed (non-sampled) setting, so it is absent from
# best_params and must be carried over explicitly; otherwise the final models
# would train without the bagging the search was run with.
best_lgb.update({'n_estimators': 10000, 'subsample_freq': 1,
                 'verbose': -1, 'random_state': 42, 'n_jobs': -1})
print(f"  Best LGB: {study_lgb.best_value:.5f}")

# --- XGBoost ---
def obj_xgb(trial):
    """Optuna objective for XGBoost: RMSE on the tuning hold-out split.

    Fits on the target-encoded tuning-train frame with early stopping
    (150 rounds) monitored on the hold-out split, then scores that split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Root mean squared error against ``y_tune_va``.
    """
    params = {
        'n_estimators': 3000,
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 50),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 100, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 100, log=True),
        'gamma': trial.suggest_float('gamma', 0, 5),
        # Early stopping is configured on the estimator: the installed
        # XGBoost rejects `early_stopping_rounds` as a fit() argument.
        'early_stopping_rounds': 150,
        # XGBoost's logging parameter is `verbosity`; the former `verbose`
        # key is not a model parameter and was passed through unused.
        'n_jobs': -1, 'verbosity': 0, 'random_state': 42,
    }
    m = XGBRegressor(**params)
    # `verbose=False` here is fit()'s own flag for per-round eval printing.
    m.fit(X_tune_tr_enc[encoded_features], y_tune_tr,
          eval_set=[(X_tune_va_enc[encoded_features], y_tune_va)], verbose=False)
    return np.sqrt(mean_squared_error(y_tune_va, m.predict(X_tune_va_enc[encoded_features])))

print("Tuning XGBoost (50 trials)...")
study_xgb = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study_xgb.optimize(obj_xgb, n_trials=N_TRIALS)
best_xgb = study_xgb.best_params
# Final-training configuration: tuned values plus fixed settings and a larger
# tree budget. `verbosity` (not `verbose`) is XGBoost's logging parameter; the
# former `verbose` key was forwarded to the booster as an unused parameter.
best_xgb.update({'n_estimators': 10000, 'n_jobs': -1, 'verbosity': 0, 'random_state': 42})
print(f"  Best XGB: {study_xgb.best_value:.5f}")

# --- HistGradientBoosting (NEW: Optuna tuned) ---
def obj_hgb(trial):
    """Optuna objective for HistGradientBoosting: RMSE on the tuning hold-out.

    The estimator early-stops on its own internal 15% validation fraction of
    the tuning-train frame (the hold-out split is not passed to ``fit``); the
    hold-out split is used only for the returned score.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Root mean squared error against ``y_tune_va``.
    """
    # Sampling order is kept fixed so a seeded sampler reproduces the search.
    search_space = {
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.2, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 2, 60),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 15, 255),
        'l2_regularization': trial.suggest_float('l2_regularization', 0.01, 50, log=True),
        'max_bins': trial.suggest_int('max_bins', 64, 255),
    }
    regressor = HistGradientBoostingRegressor(
        max_iter=3000,
        early_stopping=True,
        validation_fraction=0.15,
        n_iter_no_change=150,
        random_state=42,
        **search_space,
    )
    regressor.fit(X_tune_tr_enc[encoded_features], y_tune_tr)

    holdout_pred = regressor.predict(X_tune_va_enc[encoded_features])
    return np.sqrt(mean_squared_error(y_tune_va, holdout_pred))

print("Tuning HistGradientBoosting (50 trials)...")
study_hgb = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_hgb.optimize(obj_hgb, n_trials=N_TRIALS)

# Final-training configuration. Compared with tuning it uses a larger
# iteration budget (8000 vs 3000), a smaller internal validation fraction
# (0.1 vs 0.15) and more patience (200 vs 150 rounds).
best_hgb = {
    **study_hgb.best_params,
    'max_iter': 8000,
    'early_stopping': True,
    'validation_fraction': 0.1,
    'n_iter_no_change': 200,
    'random_state': 42,
}
print(f"  Best HGB: {study_hgb.best_value:.5f}")

# ==========================================
# 5. MULTI-SEED STACKING (5 seeds x 7 folds x 4 models)
# ==========================================
print("\n" + "=" * 40)
print("  MULTI-SEED STACKING (5 seeds x 7 folds)")
print("=" * 40)

# Imported by name once, instead of evaluating __import__('lightgbm') inside
# every fold; the file header only imports LGBMRegressor.
from lightgbm import early_stopping as lgb_early_stopping

SEEDS = [42, 123, 2024, 7, 999]
N_FOLDS = 7
# Key order fixes the meaning of the blend weights printed below.
MODEL_KEYS = ['cb', 'lgb', 'xgb', 'hgb']
all_seed_preds = []  # one price-space test prediction vector per seed

for seed_idx, SEED in enumerate(SEEDS):
    print(f"\n--- Seed {SEED} ({seed_idx+1}/{len(SEEDS)}) ---")
    # The seed drives both the fold assignment and each model's RNG.
    kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

    # Out-of-fold predictions on train and fold-averaged predictions on test,
    # both in log1p(price) space.
    oof = {k: np.zeros(len(train_clean)) for k in MODEL_KEYS}
    tst = {k: np.zeros(len(test_clean)) for k in MODEL_KEYS}

    for fold, (tr_idx, va_idx) in enumerate(kf.split(train_clean)):
        X_tr = train_clean.iloc[tr_idx].copy()
        X_va = train_clean.iloc[va_idx].copy()
        # Fresh copy per fold: the *_enc columns are re-fitted for every fold.
        X_te = test_clean.copy()
        y_tr, y_va = y_log.iloc[tr_idx], y_log.iloc[va_idx]

        # Target encode inside CV: statistics come from the fold's training
        # rows only, so validation and test rows never see their own target.
        for col in ['model', 'brand', 'submodel']:
            X_tr[col+'_enc'], X_va[col+'_enc'], X_te[col+'_enc'] = target_encode(
                X_tr, X_va, X_te, col, y_tr)

        # NOTE(review): CatBoost, LightGBM and XGBoost early-stop on the same
        # validation fold whose predictions are stored as OOF, so their OOF
        # scores may be optimistic.

        # CatBoost (raw categoricals)
        cb = CatBoostRegressor(**{**best_cb, 'random_seed': SEED})
        cb.fit(X_tr[cb_features], y_tr,
               eval_set=(X_va[cb_features], y_va), early_stopping_rounds=500)
        oof['cb'][va_idx] = cb.predict(X_va[cb_features])
        tst['cb'] += cb.predict(X_te[cb_features]) / N_FOLDS

        # LightGBM (target-encoded categoricals)
        lgb = LGBMRegressor(**{**best_lgb, 'random_state': SEED})
        lgb.fit(X_tr[encoded_features], y_tr,
                eval_set=[(X_va[encoded_features], y_va)],
                callbacks=[lgb_early_stopping(500, verbose=False)])
        oof['lgb'][va_idx] = lgb.predict(X_va[encoded_features])
        tst['lgb'] += lgb.predict(X_te[encoded_features]) / N_FOLDS

        # XGBoost (early stopping is configured on the estimator)
        xgb = XGBRegressor(**{**best_xgb, 'random_state': SEED, 'early_stopping_rounds': 500})
        xgb.fit(X_tr[encoded_features], y_tr,
                eval_set=[(X_va[encoded_features], y_va)], verbose=False)
        oof['xgb'][va_idx] = xgb.predict(X_va[encoded_features])
        tst['xgb'] += xgb.predict(X_te[encoded_features]) / N_FOLDS

        # HistGradientBoosting (early-stops on its own internal validation
        # fraction; the CV fold is not passed to fit)
        hgb = HistGradientBoostingRegressor(**{**best_hgb, 'random_state': SEED})
        hgb.fit(X_tr[encoded_features], y_tr)
        oof['hgb'][va_idx] = hgb.predict(X_va[encoded_features])
        tst['hgb'] += hgb.predict(X_te[encoded_features]) / N_FOLDS

        print(f"  Fold {fold+1}/{N_FOLDS}")
        gc.collect()

    # OOF scores (RMSE in log space), one per base model
    for k in oof:
        print(f"  {k.upper()}: {np.sqrt(mean_squared_error(y_log, oof[k])):.5f}", end="")
    print()

    # Ridge stacking on the OOF matrix.
    # NOTE(review): meta_rmse is measured on the same OOF rows the Ridge model
    # was fitted on, i.e. it is an in-sample score.
    X_s_tr = pd.DataFrame(oof)
    X_s_te = pd.DataFrame(tst)
    meta = Ridge(alpha=10)
    meta.fit(X_s_tr, y_log)
    meta_rmse = np.sqrt(mean_squared_error(y_log, meta.predict(X_s_tr)))

    # Nelder-Mead blend: weights are made non-negative and normalised to sum
    # to 1 inside the objective, so the optimiser itself runs unconstrained.
    def opt_w(w):
        w = np.abs(w) / np.abs(w).sum()
        blend = sum(w[i] * oof[k] for i, k in enumerate(oof))
        return np.sqrt(mean_squared_error(y_log, blend))

    res = minimize(opt_w, [0.25]*4, method='Nelder-Mead')
    ww = np.abs(res.x) / np.abs(res.x).sum()
    blend_rmse = res.fun

    print(f"  Ridge: {meta_rmse:.5f} | Blend: {blend_rmse:.5f}")
    print(f"  Weights: CB={ww[0]:.3f} LGB={ww[1]:.3f} XGB={ww[2]:.3f} HGB={ww[3]:.3f}")

    # Keep whichever combiner scored lower on the OOF predictions.
    if blend_rmse < meta_rmse:
        seed_pred = sum(ww[i] * tst[k] for i, k in enumerate(tst))
    else:
        seed_pred = meta.predict(X_s_te)

    # Back-transform from log1p space to prices before averaging across seeds.
    all_seed_preds.append(np.expm1(seed_pred))

# ==========================================
# 6. FINAL SUBMISSION
# ==========================================
print(f"\n{'=' * 40}")
print("  FINAL SUBMISSION")
print("=" * 40)

# Average the per-seed price predictions, then bound them to a fixed
# [5,000, 5,000,000] range.
seed_average = np.mean(all_seed_preds, axis=0)
final_preds = np.clip(seed_average, 5000, 5_000_000)

# Ids come from the raw test frame.
submission = pd.DataFrame({'id': test['id'], 'price': final_preds})
submission.to_csv('submission_stacking_fast.csv', index=False)

pred_min, pred_max, pred_mean = final_preds.min(), final_preds.max(), final_preds.mean()
print(f"Saved! Range: {pred_min:.0f} - {pred_max:.0f}, Mean: {pred_mean:.0f}")

# ---------------------------------------------------------------------------
# Captured notebook output, with ANSI colour codes removed and commented out
# so the file stays valid Python. It comes from an earlier revision of the
# cell: the traceback shows `early_stopping_rounds` being passed to
# XGBModel.fit(), whereas the code above passes it to the XGBRegressor
# constructor.
# ---------------------------------------------------------------------------
# Loading data...
#
#  ⚡ FAST HYPERPARAMETER TUNING
# Tuning CatBoost...
# Tuning LightGBM...
#
#
# Loading data...
#
#  ⚡ FAST HYPERPARAMETER TUNING
# Tuning CatBoost...
# Tuning LightGBM...
#
#
# [W 2026-02-13 08:56:57,278] Trial 0 failed with parameters: {'learning_rate': 0.05631111873741038, 'max_depth': 7, 'subsample': 0.6966448372594408, 'colsample_bytree': 0.6595334871773919} because of the following error: TypeError("XGBModel.fit() got an unexpected keyword argument 'early_stopping_rounds'").
# Traceback (most recent call last):
#   File "c:\Users\DELL\Desktop\DL__SOAI\venv\Lib\site-packages\optuna\study\_optimize.py", line 206, in _run_trial
#     value_or_values = func(trial)
#   File "C:\Users\DELL\AppData\Local\Temp\ipykernel_14652\3973061318.py", line 175, in obj_xgb
#     model.fit(X_tune_tr_enc[encoded_features], y_tune_tr, eval_set=[(X_tune_va_enc[encoded_features], y_tune_va)], early_stopping_rounds=20, verbose=False)
#     ~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# Loading data...
#
#  ⚡ FAST HYPERPARAMETER TUNING
# Tuning CatBoost...
# Tuning LightGBM...
#
#
# [W 2026-02-13 08:56:57,278] Trial 0 failed with parameters: {'learning_rate': 0.05631111873741038, 'max_depth': 7, 'subsample': 0.6966448372594408, 'colsample_bytree': 0.6595334871773919} because of the following error: TypeError("XGBModel.fit() got an unexpected keyword argument 'early_stopping_rounds'").
# Traceback (most recent call last):
#   File "c:\Users\DELL\Desktop\DL__SOAI\venv\Lib\site-packages\optuna\study\_optimize.py", line 206, in _run_trial
#     value_or_values = func(trial)
#   File "C:\Users\DELL\AppData\Local\Temp\ipykernel_14652\3973061318.py", line 175, in obj_xgb
#     model.fit(X_tune_tr_enc[encoded_features], y_tune_tr, eval_set=[(X_tune_va_enc[encoded_features], y_tune_va)], early_stopping_rounds=20, verbose=False)
#     ~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# Tuning XGBoost...
#
#
# TypeError: XGBModel.fit() got an unexpected keyword argument 'early_stopping_rounds'