# BusDelay Starter — Baseline (RMSE)

**Что делает ноутбук:**
1. Загружает `train.csv`/`test.csv`.
2. Добавляет простые временные фичи.
3. Делает GroupKFold-валидацию по `trip_id`.
4. Обучает `RandomForestRegressor` (можно заменить).
5. Сохраняет `submission.csv`.


In [4]:
import os, numpy as np, pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, root_mean_squared_error

COMPETITION_SLUG = "bus-delay-starter"

# Use the Kaggle input directory when it exists; otherwise fall back to a
# local folder (put the CSV files there, or change the path if needed).
DATA = (
    f"/kaggle/input/{COMPETITION_SLUG}"
    if os.path.exists(f"/kaggle/input/{COMPETITION_SLUG}")
    else "data"
)

# Read both splits from the resolved directory and report their shapes.
train = pd.read_csv(f"{DATA}/train.csv")
test = pd.read_csv(f"{DATA}/test.csv")
print(train.shape, test.shape)

(86671, 14) (38577, 13)


In [2]:
def add_time_feats(df):
    """Return a new frame with cyclic time features derived from ``planned_arrival_min``.

    Three columns are added (or overwritten if already present):

    * ``time_sin`` / ``time_cos`` -- sine and cosine of the value mapped onto
      a circle, treating 1440 as one full period.
    * ``hour`` -- the value floor-divided by 60, taken modulo 24, cast to int.

    The input frame is left unmodified.
    """
    planned = df["planned_arrival_min"]
    angle = (2 * np.pi) * (planned / 1440.0)
    return df.assign(
        time_sin=np.sin(angle),
        time_cos=np.cos(angle),
        hour=planned.floordiv(60).mod(24).astype(int),
    )

# Attach the derived time columns to both splits.
train, test = add_time_feats(train), add_time_feats(test)

target = "delay_minutes"
# Target values below zero are floored at 0 before training.
y = train[target].clip(lower=0)

# Columns fed to the model. Note that the `hour` column produced by
# add_time_feats is not included in either list.
num = [
    "stop_sequence",
    "distance_run_km",
    "headway_min",
    "temp_c",
    "precip_mm",
    "wind_mps",
    "traffic_index",
    "time_sin",
    "time_cos",
]
cat = ["route_id", "weekday", "is_holiday"]

X = train[num + cat]
X_test = test[num + cat]

# Group labels (one per training row) taken from the trip identifier column.
groups = train["trip_id"]

In [6]:
# One-hot encode the categorical columns; all remaining columns pass through
# unchanged. Categories unseen during fit are ignored at transform time.
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
pre = ColumnTransformer(
    transformers=[("ohe", encoder, cat)],
    remainder="passthrough",
)

forest = RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)
model = Pipeline(steps=[("pre", pre), ("rf", forest)])

# Group-aware 5-fold CV: splits are made on `groups`, and out-of-fold
# predictions are collected so an overall score can be printed at the end.
cv = GroupKFold(n_splits=5)
oof = np.zeros(len(X))
for fold, (tr_idx, va_idx) in enumerate(cv.split(X, y, groups=groups), start=1):
    X_tr, y_tr = X.iloc[tr_idx], y.iloc[tr_idx]
    X_va, y_va = X.iloc[va_idx], y.iloc[va_idx]

    model.fit(X_tr, y_tr)
    oof[va_idx] = model.predict(X_va)

    rmse = root_mean_squared_error(y_va, oof[va_idx])
    print(f"Fold {fold} RMSE: {rmse:.4f}")

print("CV RMSE:", root_mean_squared_error(y, oof))

Fold 1 RMSE: 1.5088
Fold 2 RMSE: 1.5276
Fold 3 RMSE: 1.5166
Fold 4 RMSE: 1.5090
Fold 5 RMSE: 1.5128
CV RMSE: 1.5149500127820297


In [None]:
# Refit the pipeline on the full training set before predicting the test set.
model.fit(X, y)

# Predictions are floored at 0 before being written out.
pred = model.predict(X_test).clip(min=0)

sub = pd.DataFrame({"id": test["id"], "delay_minutes": pred})
sub_path = "submission.csv"
sub.to_csv(sub_path, index=False)
print("Saved:", sub_path)

# Preview the first rows of the submission as the cell output.
sub.head()