# Ensemble Quantile Regression Notebook
*Generated: 2025-06-02 17:40 UTC*

This notebook trains LightGBM + CatBoost quantile regressors, bags multiple seeds, applies conformal calibration, and writes a submission CSV in **assets/**.

In [2]:
import pandas as pd, numpy as np
from pathlib import Path
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import lightgbm as lgb
from catboost import CatBoostRegressor

# reproducibility
SEEDS = [0, 1, 2, 3, 4]
RANDOM_STATE = 42

# Paths
NB_DIR   = Path.cwd()          # assumes notebook lives in code files/
ROOT_DIR = NB_DIR.parent
DATA_DIR = ROOT_DIR / 'dataset'
ASSETS   = ROOT_DIR / 'assets'
ASSETS.mkdir(exist_ok=True)


In [3]:
ID = 'id'
TARGET = 'sale_price'

train_df = pd.read_csv(DATA_DIR / 'dataset.csv')
test_df  = pd.read_csv(DATA_DIR / 'test.csv')
print(train_df.shape, test_df.shape)


(200000, 47) (200000, 46)


In [4]:
def enrich(df):
    # log area
    df['log_area'] = np.log1p(df['area'])
    # distance to Seattle CBD
    lat0, lon0 = 47.6097, -122.3331
    df['dist_cbd_km'] = np.sqrt((111*(df['latitude']-lat0))**2 +
                                (85*(df['longitude']-lon0))**2)
    # sale_warning as category
    df['sale_warning'] = df['sale_warning'].astype(str).fillna('missing')
    # sale_nbr numeric
    df['sale_nbr'] = pd.to_numeric(df['sale_nbr'], errors='coerce')
    df['sale_nbr'].fillna(df['sale_nbr'].median(), inplace=True)
    return df

train_df = enrich(train_df)
test_df  = enrich(test_df)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['sale_nbr'].fillna(df['sale_nbr'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['sale_nbr'].fillna(df['sale_nbr'].median(), inplace=True)


In [5]:
def build_preprocessor(df):
    num_cols = df.select_dtypes(['int64','float64']).columns.drop([ID, TARGET], errors='ignore')
    cat_cols = df.select_dtypes(['object','category']).columns

    # remove engineered cols from scaling
    num_scaled = num_cols.drop(['log_area','dist_cbd_km'], errors='ignore')

    numeric_pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    cat_pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    pre = ColumnTransformer([
        ('num', numeric_pipe, num_scaled),
        ('cat', cat_pipe, cat_cols),
        ('direct', 'passthrough', ['log_area','dist_cbd_km'])
    ])

    return pre

pre = build_preprocessor(train_df)


In [6]:
FOLDS = 5
cv = KFold(n_splits=FOLDS, shuffle=True, random_state=RANDOM_STATE)
train_idx, val_idx = next(cv.split(train_df))
X_train = train_df.iloc[train_idx].copy()
X_val   = train_df.iloc[val_idx].copy()
y_train = X_train.pop(TARGET)
y_val   = X_val.pop(TARGET)

X_train_t = pre.fit_transform(X_train)
X_val_t   = pre.transform(X_val)
X_test_t  = pre.transform(test_df)
print('Matrices:', X_train_t.shape, X_test_t.shape)


Matrices: (160000, 47) (200000, 47)


In [7]:
def fit_lgb_quantile(X, y, a_lo=0.05, a_hi=0.95, random_state=0):
    params = dict(
        n_estimators=1200,
        learning_rate=0.03,
        max_depth=-1,
        num_leaves=256,
        subsample=0.9,
        colsample_bytree=0.9,
        random_state=random_state
    )
    lo = lgb.LGBMRegressor(objective='quantile', alpha=a_lo, **params)
    hi = lgb.LGBMRegressor(objective='quantile', alpha=a_hi, **params)
    lo.fit(X, y)
    hi.fit(X, y)
    return lo, hi


In [8]:
preds_lo_test, preds_hi_test = [], []
preds_lo_val,  preds_hi_val  = [], []

for seed in SEEDS:
    lo, hi = fit_lgb_quantile(X_train_t, y_train, 0.05, 0.95, seed)
    preds_lo_test.append(lo.predict(X_test_t))
    preds_hi_test.append(hi.predict(X_test_t))
    preds_lo_val.append(lo.predict(X_val_t))
    preds_hi_val.append(hi.predict(X_val_t))

lgb_lo_test = np.mean(preds_lo_test, axis=0)
lgb_hi_test = np.mean(preds_hi_test, axis=0)
lgb_lo_val  = np.mean(preds_lo_val,  axis=0)
lgb_hi_val  = np.mean(preds_hi_val,  axis=0)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007024 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4154
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 47
[LightGBM] [Info] Start training from score 185000.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006184 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4154
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 47
[LightGBM] [Info] Start training from score 1435000.000000




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007028 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4154
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 47
[LightGBM] [Info] Start training from score 185000.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008202 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4154
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 47
[LightGBM] [Info] Start training from score 1435000.000000




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007151 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4154
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 47
[LightGBM] [Info] Start training from score 185000.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006369 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4154
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 47
[LightGBM] [Info] Start training from score 1435000.000000




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006935 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4154
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 47
[LightGBM] [Info] Start training from score 185000.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006351 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4154
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 47
[LightGBM] [Info] Start training from score 1435000.000000




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005967 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4154
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 47
[LightGBM] [Info] Start training from score 185000.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006741 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4154
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 47
[LightGBM] [Info] Start training from score 1435000.000000




In [9]:
cat_lo = CatBoostRegressor(loss_function='Quantile:alpha=0.05',
                            iterations=1300, depth=8,
                            learning_rate=0.03,
                            random_seed=RANDOM_STATE,
                            verbose=False)
cat_hi = CatBoostRegressor(loss_function='Quantile:alpha=0.95',
                            iterations=1300, depth=8,
                            learning_rate=0.03,
                            random_seed=RANDOM_STATE,
                            verbose=False)

cat_lo.fit(X_train_t, y_train)
cat_hi.fit(X_train_t, y_train)

cat_lo_test = cat_lo.predict(X_test_t)
cat_hi_test = cat_hi.predict(X_test_t)
cat_lo_val  = cat_lo.predict(X_val_t)
cat_hi_val  = cat_hi.predict(X_val_t)


In [10]:
pi_lower_raw_val = 0.5*lgb_lo_val + 0.5*cat_lo_val
pi_upper_raw_val = 0.5*lgb_hi_val + 0.5*cat_hi_val

pi_lower_raw_test = 0.5*lgb_lo_test + 0.5*cat_lo_test
pi_upper_raw_test = 0.5*lgb_hi_test + 0.5*cat_hi_test


In [11]:
alpha = 0.10  # target 90% coverage
scores = np.maximum(y_val - pi_upper_raw_val,
                    pi_lower_raw_val - y_val)
q_hat = np.quantile(scores, 1 - alpha)
print('q_hat:', q_hat)

pi_lower = (pi_lower_raw_test - q_hat).clip(min=0)
pi_upper = np.maximum(pi_upper_raw_test + q_hat, pi_lower)

coverage_val = ((y_val >= pi_lower_raw_val - q_hat) &
                (y_val <= pi_upper_raw_val + q_hat)).mean()
print(f'Post-CQR validation coverage: {coverage_val:.3%}')


q_hat: 8437.936817926668
Post-CQR validation coverage: 90.000%


In [13]:
import numpy as np

def winkler(y, lo, hi, alpha=0.10):
    """
    Vectorised Winkler interval score.
    Accepts pandas Series or NumPy arrays of the same length.
    """
    y  = np.asarray(y)
    lo = np.asarray(lo)
    hi = np.asarray(hi)

    width   = hi - lo
    over_lo = np.clip(lo - y, 0, None)   # only the part where y < lo
    over_hi = np.clip(y - hi, 0, None)   # only the part where y > hi
    penalty = (2 / alpha) * (over_lo + over_hi)

    return width + penalty



In [14]:
val_winkler = winkler(y_val, 
                      pi_lower_raw_val - q_hat, 
                      pi_upper_raw_val + q_hat).mean()

print(f"Mean Winkler (val): {val_winkler:,.0f}")


Mean Winkler (val): 337,219


In [None]:
sub = pd.DataFrame({
    ID: test_df[ID],
    'pi_lower': pi_lower,
    'pi_upper': pi_upper
})
csv_path = ASSETS / 'ensemble_cqr_lgb_cat_v1.csv'
sub.to_csv(csv_path, index=False)
print('Saved submission to', csv_path)
