In [11]:
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
import lightgbm as lgb

In [12]:

# 1️⃣  Resolve paths ------------------------------------------------------------
NB_DIR   = Path.cwd()                  # .../code files
ROOT_DIR = NB_DIR.parent               # project root
DATA_DIR = ROOT_DIR / "dataset"

# 2️⃣  Load the data ------------------------------------------------------------
train_path = DATA_DIR / "dataset.csv"
test_path  = DATA_DIR / "test.csv"

df   = pd.read_csv(train_path)
test = pd.read_csv(test_path)

print(f"Train shape : {df.shape}")
print(f"Test  shape : {test.shape}")


Train shape : (200000, 47)
Test  shape : (200000, 46)


In [13]:
y = df.pop("sale_price")
X = df.copy()
X_test = test.copy()

# 2. minimal preprocessing
cat_cols = X.select_dtypes("object").columns
enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
X[cat_cols]      = enc.fit_transform(X[cat_cols])
X_test[cat_cols] = enc.transform(X_test[cat_cols])

In [16]:
# 3. train two quantile models
params = dict(
    boosting_type="gbdt",
    learning_rate=0.05,
    n_estimators=1200,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
)

train, val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model_lo = lgb.LGBMRegressor(objective="quantile", alpha=0.05, **params)
model_hi = lgb.LGBMRegressor(objective="quantile", alpha=0.95, **params)

model_lo.fit(train, y_train, eval_set=[(val, y_val)])
model_hi.fit(train, y_train, eval_set=[(val, y_val)])


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008461 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4037
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 46
[LightGBM] [Info] Start training from score 185000.000000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.051188 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4037
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 46
[LightGBM] [Info] Start training from score 1435000.000000


In [17]:
# 4. predict
pi_lower = model_lo.predict(X_test)
pi_upper = model_hi.predict(X_test)

# 5. clip so lower<=upper
pi_lower, pi_upper = pi_lower.clip(min=0), pi_upper.clip(min=pi_lower)

In [18]:
OUT_DIR = ROOT_DIR / "assets"
OUT_DIR.mkdir(exist_ok=True)

sub = pd.DataFrame({"id": test.id, "pi_lower": pi_lower, "pi_upper": pi_upper})
sub.to_csv(OUT_DIR / "lgb_quantile_baseline_june2.csv", index=False)

In [19]:
# quick 90 % coverage sanity-check on the validation fold
y_hat_lo = model_lo.predict(val)
y_hat_hi = model_hi.predict(val)
coverage = ((y_val >= y_hat_lo) & (y_val <= y_hat_hi)).mean()
print(f"Validation coverage: {coverage:.3%}")   # you want ≥ 0.88-0.92


Validation coverage: 86.472%
