# Ensemble Quantile Regression with Weight Search
*Generated: 2025-06-02 17:51 UTC*

In [1]:
import pandas as pd, numpy as np
from pathlib import Path
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import lightgbm as lgb
from catboost import CatBoostRegressor

SEEDS = [0,1,2,3,4]
RANDOM_STATE = 42

NB_DIR = Path.cwd()
ROOT_DIR = NB_DIR.parent
DATA_DIR = ROOT_DIR / 'dataset'
ASSETS = ROOT_DIR / 'assets'
ASSETS.mkdir(exist_ok=True)


In [2]:
ID = 'id'; TARGET = 'sale_price'
train_df = pd.read_csv(DATA_DIR/'dataset.csv')
test_df  = pd.read_csv(DATA_DIR/'test.csv')
print(train_df.shape, test_df.shape)


(200000, 47) (200000, 46)


In [3]:
def enrich(df):
    df['log_area'] = np.log1p(df['area'])
    lat0, lon0 = 47.6097, -122.3331
    df['dist_cbd_km'] = np.sqrt((111*(df['latitude']-lat0))**2 +
                                (85*(df['longitude']-lon0))**2)
    df['sale_warning'] = df['sale_warning'].astype(str).fillna('missing')
    df['sale_nbr'] = pd.to_numeric(df['sale_nbr'], errors='coerce')
    df['sale_nbr'].fillna(df['sale_nbr'].median(), inplace=True)
    return df

train_df = enrich(train_df)
test_df  = enrich(test_df)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['sale_nbr'].fillna(df['sale_nbr'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['sale_nbr'].fillna(df['sale_nbr'].median(), inplace=True)


In [4]:
def build_preprocessor(df):
    num_cols = df.select_dtypes(['int64','float64']).columns.drop([ID,TARGET], errors='ignore')
    cat_cols = df.select_dtypes(['object','category']).columns
    num_scaled = num_cols.drop(['log_area','dist_cbd_km'], errors='ignore')
    numeric_pipe = Pipeline([('imp', SimpleImputer(strategy='median')),
                             ('sc', StandardScaler())])
    cat_pipe = Pipeline([('imp', SimpleImputer(strategy='most_frequent')),
                         ('enc', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))])
    pre = ColumnTransformer([
        ('num', numeric_pipe, num_scaled),
        ('cat', cat_pipe, cat_cols),
        ('pas', 'passthrough', ['log_area','dist_cbd_km'])
    ])
    return pre

pre = build_preprocessor(train_df)


In [5]:
cv = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
train_idx, val_idx = next(cv.split(train_df))
X_train = train_df.iloc[train_idx].copy()
X_val   = train_df.iloc[val_idx].copy()
y_train = X_train.pop(TARGET)
y_val   = X_val.pop(TARGET)

X_train_t = pre.fit_transform(X_train)
X_val_t   = pre.transform(X_val)
X_test_t  = pre.transform(test_df)


In [6]:
def fit_lgb_quantile(X, y, a_lo=0.05, a_hi=0.95, random_state=0):
    params = dict(
        n_estimators=1200, learning_rate=0.03, max_depth=-1,
        num_leaves=256, subsample=0.9, colsample_bytree=0.9,
        random_state=random_state
    )
    lo = lgb.LGBMRegressor(objective='quantile', alpha=a_lo, **params)
    hi = lgb.LGBMRegressor(objective='quantile', alpha=a_hi, **params)
    lo.fit(X, y); hi.fit(X, y)
    return lo, hi


In [7]:
preds_lo_test, preds_hi_test = [], []
preds_lo_val,  preds_hi_val  = [], []

for seed in SEEDS:
    lo, hi = fit_lgb_quantile(X_train_t, y_train, 0.05, 0.95, seed)
    preds_lo_test.append(lo.predict(X_test_t))
    preds_hi_test.append(hi.predict(X_test_t))
    preds_lo_val.append(lo.predict(X_val_t))
    preds_hi_val.append(hi.predict(X_val_t))

lgb_lo_test = np.mean(preds_lo_test, axis=0)
lgb_hi_test = np.mean(preds_hi_test, axis=0)
lgb_lo_val  = np.mean(preds_lo_val,  axis=0)
lgb_hi_val  = np.mean(preds_hi_val,  axis=0)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007080 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4154
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 47
[LightGBM] [Info] Start training from score 185000.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007168 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4154
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 47
[LightGBM] [Info] Start training from score 1435000.000000




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007307 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4154
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 47
[LightGBM] [Info] Start training from score 185000.000000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.036583 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4154
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 47
[LightGBM] [Info] Start training from score 1435000.000000




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006733 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4154
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 47
[LightGBM] [Info] Start training from score 185000.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007195 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4154
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 47
[LightGBM] [Info] Start training from score 1435000.000000




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005307 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4154
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 47
[LightGBM] [Info] Start training from score 185000.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007460 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4154
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 47
[LightGBM] [Info] Start training from score 1435000.000000




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007917 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4154
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 47
[LightGBM] [Info] Start training from score 185000.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007606 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4154
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 47
[LightGBM] [Info] Start training from score 1435000.000000




In [8]:
cat_lo = CatBoostRegressor(loss_function='Quantile:alpha=0.05',
                            iterations=1300, depth=8, learning_rate=0.03,
                            random_seed=RANDOM_STATE, verbose=False)
cat_hi = CatBoostRegressor(loss_function='Quantile:alpha=0.95',
                            iterations=1300, depth=8, learning_rate=0.03,
                            random_seed=RANDOM_STATE, verbose=False)
cat_lo.fit(X_train_t, y_train)
cat_hi.fit(X_train_t, y_train)

cat_lo_test = cat_lo.predict(X_test_t)
cat_hi_test = cat_hi.predict(X_test_t)
cat_lo_val  = cat_lo.predict(X_val_t)
cat_hi_val  = cat_hi.predict(X_val_t)


In [9]:
def winkler(y, lo, hi, alpha=0.10):
    y, lo, hi = map(np.asarray, (y, lo, hi))
    width = hi - lo
    penalty = (2/alpha)*(np.clip(lo - y, 0, None) + np.clip(y - hi, 0, None))
    return width + penalty

best_w, best_wink, best_q = None, 1e12, None
for w in np.linspace(0,1,21):
    lo_val = w*lgb_lo_val + (1-w)*cat_lo_val
    hi_val = w*lgb_hi_val + (1-w)*cat_hi_val
    scores = np.maximum(y_val - hi_val, lo_val - y_val)
    q_hat_tmp = np.quantile(scores, 0.90)
    wink = winkler(y_val, lo_val - q_hat_tmp, hi_val + q_hat_tmp).mean()
    if wink < best_wink:
        best_w, best_wink, best_q = w, wink, q_hat_tmp

print(f'Best weight: {best_w:.2f}, Winkler: {best_wink:,.0f}')

# Apply best weight
pi_lower_raw_val = best_w*lgb_lo_val + (1-best_w)*cat_lo_val
pi_upper_raw_val = best_w*lgb_hi_val + (1-best_w)*cat_hi_val
pi_lower_raw_test = best_w*lgb_lo_test + (1-best_w)*cat_lo_test
pi_upper_raw_test = best_w*lgb_hi_test + (1-best_w)*cat_hi_test
q_hat = best_q


Best weight: 0.50, Winkler: 337,219


In [10]:
alpha = 0.10
pi_lower = (pi_lower_raw_test - q_hat).clip(min=0)
pi_upper = np.maximum(pi_upper_raw_test + q_hat, pi_lower)

coverage_val = ((y_val >= pi_lower_raw_val - q_hat) &
                (y_val <= pi_upper_raw_val + q_hat)).mean()
val_wink = winkler(y_val, pi_lower_raw_val - q_hat, pi_upper_raw_val + q_hat).mean()
print(f'Validation coverage: {coverage_val:.3%}, Winkler: {val_wink:,.0f}')


Validation coverage: 90.000%, Winkler: 337,219


In [11]:
sub = pd.DataFrame({ID: test_df[ID],
                    'pi_lower': pi_lower,
                    'pi_upper': pi_upper})
csv_path = ASSETS/'ensemble_weightsearch_cqr_june2.csv'
sub.to_csv(csv_path, index=False)
print('Saved submission:', csv_path)


Saved submission: e:\Hackathons\Kaggle Prediction interval competition II House price\assets\ensemble_weightsearch_cqr_june2.csv
