In [2]:
# 1️⃣ Imports
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import KFold
from sklearn.preprocessing  import OrdinalEncoder, StandardScaler
from sklearn.compose        import ColumnTransformer
from sklearn.pipeline       import Pipeline
from sklearn.impute         import SimpleImputer

# Quantile regressors we’ll try in Phase 2
import lightgbm as lgb
from catboost import CatBoostRegressor


In [3]:
# 2️⃣ Path helpers (same as before)
# Directory layout: the project root is taken to be the parent of the
# current working directory, and the CSV files live in <root>/dataset.
NB_DIR = Path.cwd()
ROOT_DIR, DATA_DIR = NB_DIR.parent, NB_DIR.parent.joinpath("dataset")


In [4]:
# Column names that receive special treatment in later cells
TARGET = "sale_price"
ID = "id"

# Raw CSVs; only dataset.csv carries the sale_price column
train_df, test_df = (
    pd.read_csv(DATA_DIR / csv_name) for csv_name in ("dataset.csv", "test.csv")
)

print(train_df.shape, test_df.shape)
train_df.head(3)


(200000, 47) (200000, 46)


Unnamed: 0,id,sale_date,sale_price,sale_nbr,sale_warning,join_status,join_year,latitude,longitude,area,...,view_olympics,view_cascades,view_territorial,view_skyline,view_sound,view_lakewash,view_lakesamm,view_otherwater,view_other,submarket
0,0,2014-11-15,236000,2.0,,nochg,2025,47.2917,-122.3658,53,...,0,0,0,0,0,0,0,0,0,I
1,1,1999-01-15,313300,,26.0,nochg,2025,47.6531,-122.1996,74,...,0,0,0,0,0,1,0,0,0,Q
2,2,2006-08-15,341000,1.0,,nochg,2025,47.4733,-122.1901,30,...,0,0,0,0,0,0,0,0,0,K


In [5]:
def _engineer_sale_features(frame):
    """Return a copy of ``frame`` with date parts added and numeric-looking columns coerced."""
    out = frame.copy()

    # ----------  date parsing  ----------
    sale_date = pd.to_datetime(out["sale_date"])
    out["sale_year"] = sale_date.dt.year.astype("int16")
    out["sale_month"] = sale_date.dt.month.astype("int8")
    out["sale_quarter"] = sale_date.dt.quarter.astype("int8")
    out = out.drop(columns="sale_date")

    # ----------  numeric coercion ----------
    # (sale_nbr / sale_warning look numeric but may come in as object);
    # values that cannot be parsed become NaN.
    for col in ("sale_nbr", "sale_warning"):
        if col in out.columns:
            out[col] = pd.to_numeric(out[col], errors="coerce")

    return out


def load_and_clean(data_dir=None):
    """Read ``dataset.csv`` and ``test.csv`` and apply the shared cleaning steps.

    Steps applied to both frames:
      * ``sale_date`` is parsed and replaced by ``sale_year`` (int16),
        ``sale_month`` (int8) and ``sale_quarter`` (int8);
      * ``sale_nbr`` / ``sale_warning`` (when present) are coerced to numeric;
      * columns holding at most one distinct value (NaN counted as a value)
        in the *training* frame are dropped from both frames.

    The constant-column decision is made on the training frame only, so
    train and test always keep the same feature columns. Deciding per frame
    could drop a column from the test frame only and break a preprocessor
    that was fitted on the training columns.

    Parameters
    ----------
    data_dir : str or pathlib.Path, optional
        Directory containing the two CSV files. Defaults to the
        notebook-level ``DATA_DIR``.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
    """
    base = DATA_DIR if data_dir is None else Path(data_dir)

    train = _engineer_sale_features(pd.read_csv(base / "dataset.csv"))
    test = _engineer_sale_features(pd.read_csv(base / "test.csv"))

    # drop super-low-variance columns (e.g., join_year if constant 2025)
    constant_cols = [c for c in train.columns if train[c].nunique(dropna=False) <= 1]
    train = train.drop(columns=constant_cols)
    # errors="ignore": a column may exist only in train (e.g. the target).
    test = test.drop(columns=constant_cols, errors="ignore")

    return train, test

# Replace the raw frames with their cleaned counterparts and report both shapes
train_df, test_df = load_and_clean()
print(*(frame.shape for frame in (train_df, test_df)))


(200000, 49) (200000, 48)


In [6]:
def build_preprocessor(df):
    """Build an (unfitted) ColumnTransformer for the numeric and categorical columns of ``df``.

    Numeric columns are median-imputed and standardised; object / category /
    bool columns are imputed with the most frequent value and ordinal-encoded,
    with categories unseen at fit time mapped to -1. Columns that match
    neither group are dropped by the transformer.

    ``ID`` and ``TARGET`` are excluded from *both* groups, so neither can end
    up as a model feature regardless of its dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose dtypes decide the column grouping; it is not modified.

    Returns
    -------
    (preprocessor, num_cols, cat_cols)
        The unfitted ColumnTransformer and the two column Indexes it uses.
    """
    reserved = [ID, TARGET]

    # "number" covers every numeric dtype (including unsigned and nullable
    # ones) instead of a hand-maintained list, so no numeric column is
    # silently discarded by remainder="drop".
    num_cols = df.select_dtypes(include="number").columns.drop(reserved, errors="ignore")

    cat_cols = (
        df.select_dtypes(include=["object", "category", "bool"])
        .columns
        .drop(reserved, errors="ignore")
    )

    numeric_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler",  StandardScaler())
    ])

    cat_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1))
    ])

    preprocessor = ColumnTransformer(
        [("num", numeric_pipe, num_cols),
         ("cat", cat_pipe,   cat_cols)],
        remainder="drop"
    )
    return preprocessor, num_cols, cat_cols


In [7]:
FOLDS = 5
cv = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Only the first of the FOLDS splits is used here as a single hold-out.
train_idx, val_idx = next(cv.split(train_df))

# Independent copies per split, so popping the target leaves train_df intact.
X_train, X_val = (train_df.iloc[rows].copy() for rows in (train_idx, val_idx))
y_train, y_val = X_train.pop(TARGET), X_val.pop(TARGET)

# Column lists come from the full training frame; fitting uses the
# training part of the fold only.
prep, num_cols, cat_cols = build_preprocessor(train_df)

X_train_t = prep.fit_transform(X_train)
X_val_t, X_test_t = prep.transform(X_val), prep.transform(test_df)


In [8]:
# Hyper-parameters shared by the lower- and upper-quantile models.
# NOTE(review): LightGBM applies `subsample` only when `subsample_freq` > 0,
# so row subsampling is presumably inactive here — confirm the intent.
params = {
    "n_estimators": 1500,
    "learning_rate": 0.03,
    "num_leaves": 256,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "max_depth": -1,
}

# One quantile regressor per interval bound (2.5 % and 97.5 %).
lo_m = lgb.LGBMRegressor(objective="quantile", alpha=0.025, **params).fit(X_train_t, y_train)
hi_m = lgb.LGBMRegressor(objective="quantile", alpha=0.975, **params).fit(X_train_t, y_train)

lo_pred, hi_pred = lo_m.predict(X_val_t), hi_m.predict(X_val_t)

# Share of validation targets falling inside [lo_pred, hi_pred], bounds included.
coverage = (y_val.ge(lo_pred) & y_val.le(hi_pred)).mean()
print(f"Coverage @2.5–97.5 %: {coverage:.3%}")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007678 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3546
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 47
[LightGBM] [Info] Start training from score 161998.750000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007827 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3546
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 47
[LightGBM] [Info] Start training from score 1789000.000000




Coverage @2.5–97.5 %: 87.790%


In [9]:
def fit_lgb_quantile(X_tr, y_tr, a_lo=0.025, a_hi=0.975, random_state=0, **lgb_overrides):
    """Fit a pair of LightGBM quantile regressors bounding a prediction interval.

    Parameters
    ----------
    X_tr, y_tr
        Training features and target, passed unchanged to ``LGBMRegressor.fit``.
    a_lo, a_hi : float
        Quantile levels of the lower and upper model; must satisfy
        ``0 < a_lo < a_hi < 1``.
    random_state : int
        Seed forwarded to both models.
    **lgb_overrides
        Optional ``LGBMRegressor`` keyword arguments that override the
        defaults below (for example ``n_estimators=300`` or
        ``subsample_freq=0``). ``alpha`` cannot be overridden because it is
        set per model from ``a_lo`` / ``a_hi``.

    Returns
    -------
    (lo, hi)
        The fitted lower- and upper-quantile ``LGBMRegressor`` instances.

    Raises
    ------
    ValueError
        If the quantile levels are out of range or not strictly ordered.
    TypeError
        If ``alpha`` is supplied through ``lgb_overrides``.
    """
    # Validate before touching LightGBM so misuse fails fast and cheaply.
    if not 0.0 < a_lo < a_hi < 1.0:
        raise ValueError(
            f"expected 0 < a_lo < a_hi < 1, got a_lo={a_lo!r}, a_hi={a_hi!r}"
        )
    if "alpha" in lgb_overrides:
        raise TypeError("alpha is set through a_lo / a_hi, not through overrides")

    common = dict(
        objective="quantile",
        n_estimators=1200,
        learning_rate=0.03,
        num_leaves=256,
        subsample=0.9,
        # LightGBM performs row subsampling only when subsample_freq > 0;
        # without it the subsample=0.9 setting above is silently ignored.
        subsample_freq=1,
        colsample_bytree=0.9,
        max_depth=-1,
        random_state=random_state,
    )
    common.update(lgb_overrides)

    lo = lgb.LGBMRegressor(alpha=a_lo, **common)
    hi = lgb.LGBMRegressor(alpha=a_hi, **common)
    lo.fit(X_tr, y_tr)
    hi.fit(X_tr, y_tr)
    return lo, hi
