In [4]:
!pip install pandas numpy catboost xgboost lightgbm scikit-learn tqdm holidays matplotlib seaborn


Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl.metadata (17 kB)
Downloading xgboost-3.0.2-py3-none-manylinux_2_28_x86_64.whl (253.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.9/253.9 MB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m62.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: xgboost, lightgbm
Successfully installed lightgbm-4.6.0 xgboost-3.0.2


In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import holidays

def create_features():
    # Load datasets
    train = pd.read_csv('train.csv', parse_dates=['doj'])
    test = pd.read_csv('test.csv', parse_dates=['doj'])
    transactions = pd.read_csv('transactions.csv', parse_dates=['doj', 'doi'])

    # 1. Booking Curve Features
    booking_features = []
    for (doj, srcid, destid), group in transactions.groupby(['doj', 'srcid', 'destid']):
        feature_row = {'doj': doj, 'srcid': srcid, 'destid': destid}
        for dbd in [15, 20, 25, 30]:
            dbd_data = group[group['dbd'] == dbd]
            feature_row[f'seats_dbd{dbd}'] = dbd_data['cumsum_seatcount'].iloc[0] if not dbd_data.empty else 0
            feature_row[f'searches_dbd{dbd}'] = dbd_data['cumsum_searchcount'].iloc[0] if not dbd_data.empty else 0
        booking_features.append(feature_row)
    booking_df = pd.DataFrame(booking_features)

    # Merge with main data
    train = train.merge(booking_df, on=['doj', 'srcid', 'destid'], how='left')
    test = test.merge(booking_df, on=['doj', 'srcid', 'destid'], how='left')

    # 2. Temporal Features
    for df in [train, test]:
        df['day_of_week'] = df['doj'].dt.dayofweek
        df['month'] = df['doj'].dt.month
        df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
        df['doy_sin'] = np.sin(2 * np.pi * df['doj'].dt.dayofyear / 365)
        df['doy_cos'] = np.cos(2 * np.pi * df['doj'].dt.dayofyear / 365)

    # 3. Holiday Features
    india_holidays = holidays.India(years=[2023, 2024, 2025])
    holiday_dates = [pd.Timestamp(date) for date in india_holidays.keys()]

    for df in [train, test]:
        df['is_national_holiday'] = df['doj'].dt.date.isin(india_holidays.keys()).astype(int)
        df['days_to_next_holiday'] = df['doj'].apply(
            lambda x: min([(pd.Timestamp(h) - x).days for h in india_holidays.keys() if pd.Timestamp(h) > x], default=30)
        )
        df['days_since_last_holiday'] = df['doj'].apply(
            lambda x: min([(x - pd.Timestamp(h)).days for h in india_holidays.keys() if pd.Timestamp(h) < x], default=30)
        )
        df['holiday_proximity'] = 1 / (1 + df[['days_to_next_holiday', 'days_since_last_holiday']].min(axis=1))

    # 4. Route Features - FIXED: Encode tier columns
    route_meta = transactions.groupby(['srcid', 'destid']).first().reset_index()[
        ['srcid', 'destid', 'srcid_region', 'destid_region', 'srcid_tier', 'destid_tier']
    ]

    # Merge route metadata
    train = train.merge(route_meta, on=['srcid', 'destid'], how='left')
    test = test.merge(route_meta, on=['srcid', 'destid'], how='left')

    # Convert tier columns to numerical categories
    for df in [train, test]:
        # Extract tier number from strings like "Tier 1"
        df['src_tier_num'] = df['srcid_tier'].str.extract('(\d+)').astype(float)
        df['dest_tier_num'] = df['destid_tier'].str.extract('(\d+)').astype(float)

        df['same_region'] = (df['srcid_region'] == df['destid_region']).astype(int)
        df['tier_combination'] = df['src_tier_num'].astype(str) + '_' + df['dest_tier_num'].astype(str)
        df['is_metro_route'] = ((df['src_tier_num'] == 1) & (df['dest_tier_num'] == 1)).astype(int)

    # 5. Derived Features
    for df in [train, test]:
        # Booking metrics
        df['velocity_30_15'] = (df['seats_dbd15'] - df['seats_dbd30']) / 15
        df['velocity_25_15'] = (df['seats_dbd15'] - df['seats_dbd25']) / 10
        df['conversion_rate_15'] = df['seats_dbd15'] / (df['searches_dbd15'] + 1)
        df['conversion_delta'] = df['conversion_rate_15'] - (df['seats_dbd30'] / (df['searches_dbd30'] + 1))

        # Booking stability
        seats_cols = ['seats_dbd15', 'seats_dbd20', 'seats_dbd25', 'seats_dbd30']
        df['booking_stability'] = df[seats_cols].std(axis=1) / (df[seats_cols].mean(axis=1) + 1e-5)

        # Interactions
        df['metro_holiday'] = df['is_metro_route'] * df['is_national_holiday']

        # Booking curve shape
        X = np.array([30, 25, 20, 15])
        def curve_slope(row):
            y = [row[f'seats_dbd{dbd}'] for dbd in [30, 25, 20, 15]]
            return np.polyfit(X, y, 1)[0]
        df['booking_slope'] = df.apply(curve_slope, axis=1)

    # 6. Historical Features
    train_sorted = train.sort_values('doj').copy()
    train_sorted['hist_avg'] = train_sorted.groupby(['srcid', 'destid'])['final_seatcount'].transform(
        lambda x: x.expanding().mean().shift(1)
    )
    train_sorted['hist_max'] = train_sorted.groupby(['srcid', 'destid'])['final_seatcount'].transform(
        lambda x: x.expanding().max().shift(1)
    )

    # For test data
    full_hist = train_sorted.groupby(['srcid', 'destid']).agg(
        hist_avg=('final_seatcount', 'mean'),
        hist_max=('final_seatcount', 'max')
    ).reset_index()

    test = test.merge(full_hist, on=['srcid', 'destid'], how='left')
    train = train_sorted

    # Early fill ratio
    train['early_fill_ratio'] = train['seats_dbd15'] / (train['hist_max'] + 1)
    test['early_fill_ratio'] = test['seats_dbd15'] / (test['hist_max'] + 1)

    # Handle missing values
    seat_search_cols = [c for c in train.columns if c.startswith('seats_') or c.startswith('searches_')]
    for col in seat_search_cols:
        train[col].fillna(0, inplace=True)
        test[col].fillna(0, inplace=True)

    for col in ['hist_avg', 'hist_max']:
        global_avg = train[col].mean()
        train[col].fillna(global_avg, inplace=True)
        test[col].fillna(global_avg, inplace=True)

    # Encode categoricals
    cat_cols = ['srcid_region', 'destid_region', 'tier_combination']
    for col in cat_cols:
        le = LabelEncoder()
        le.fit(pd.concat([train[col], test[col]], axis=0))
        train[col] = le.transform(train[col])
        test[col] = le.transform(test[col])

    # Drop original tier columns
    for df in [train, test]:
        df.drop(['srcid_tier', 'destid_tier'], axis=1, inplace=True, errors='ignore')

    # Save processed data
    train.to_csv('train_processed.csv', index=False)
    test.to_csv('test_processed.csv', index=False)

    print("Feature engineering complete! Processed data saved.")
    return train, test

# Run feature engineering
train_proc, test_proc = create_features()

  df['src_tier_num'] = df['srcid_tier'].str.extract('(\d+)').astype(float)
  df['dest_tier_num'] = df['destid_tier'].str.extract('(\d+)').astype(float)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[col].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test[col].fillna(0, inplace=True)
The behavio

Feature engineering complete! Processed data saved.


In [1]:
# xgb_fixedparams_with_cv.py
# ------------------------------------------------------------------
# ⋅ Leak-free date split
# ⋅ One-hot for categoricals
# ⋅ Uses fixed best parameters from tuning
# ⋅ Rolling time-series CV
# ⋅ Trains on full training data
# ⋅ Generates xgb.csv
# ------------------------------------------------------------------
import os
import random
import numpy as np
import pandas as pd
import warnings
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit

# ─── Reproducibility ─────────────────────────────────────────────
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)

warnings.filterwarnings("ignore")

# ─── GPU Availability Check ──────────────────────────────────────
def check_gpu_support():
    try:
        X = np.array([[0,0],[1,1]], dtype=np.float32)
        y = np.array([0,1], dtype=np.float32)
        params = {'tree_method': 'gpu_hist', 'gpu_id': 0, 'objective': 'reg:squarederror'}
        xgb.train(params, xgb.DMatrix(X, y), num_boost_round=1)
        return True
    except Exception:
        return False

USE_GPU = check_gpu_support()
print(f"★ GPU acceleration available: {USE_GPU}")

TRAIN_CSV = "train_processed.csv"
TEST_CSV  = "test_processed.csv"

# ─── 1. Load & Preprocess Data ───────────────────────────────────
df_train = pd.read_csv(TRAIN_CSV, parse_dates=["doj"]).sort_values("doj")
df_test  = pd.read_csv(TEST_CSV,  parse_dates=["doj"]).sort_values("doj")
test_keys = df_test["route_key"].copy()

def add_date(df):
    out = df.copy()
    out["day"]   = out["doj"].dt.day
    out["month"] = out["doj"].dt.month
    out["year"]  = out["doj"].dt.year
    return out.drop(columns="doj")

df_train = add_date(df_train)
df_test  = add_date(df_test)
df_train.drop(columns="route_key", inplace=True, errors="ignore")
df_test .drop(columns="route_key", inplace=True, errors="ignore")

TARGET  = "final_seatcount"
FEATS   = [c for c in df_train.columns if c != TARGET]
cat_cols = df_train[FEATS].select_dtypes("object").columns.tolist()

# ─── 2. One-Hot Encoding (Combined Train+Test) ──────────────────
df_comb = pd.concat([df_train[FEATS], df_test[FEATS]], axis=0)
df_comb = pd.get_dummies(df_comb, columns=cat_cols, drop_first=True)

X_train = df_comb.iloc[:len(df_train)].reset_index(drop=True)
X_test  = df_comb.iloc[len(df_train):].reset_index(drop=True)
y_train = df_train[TARGET]

# ─── 3. Fixed Best Parameters ───────────────────────────────────
BEST_PARAMS = {
    'subsample': 1.0,
    'reg_lambda': 1,
    'reg_alpha': 0.1,
    'n_estimators': 249,
    'min_child_weight': 3,
    'max_depth': 8,
    'learning_rate': 0.1,
    'gamma': 0.3,
    'colsample_bytree': 0.7
}

xgb_params = {
    'eta':               BEST_PARAMS['learning_rate'],
    'max_depth':         BEST_PARAMS['max_depth'],
    'min_child_weight':  BEST_PARAMS['min_child_weight'],
    'subsample':         BEST_PARAMS['subsample'],
    'colsample_bytree':  BEST_PARAMS['colsample_bytree'],
    'gamma':             BEST_PARAMS['gamma'],
    'alpha':             BEST_PARAMS['reg_alpha'],
    'lambda':            BEST_PARAMS['reg_lambda'],
    'objective':         'reg:squarederror',
    'seed':              SEED,
}

if USE_GPU:
    xgb_params.update({'tree_method': 'gpu_hist', 'gpu_id': 0})
else:
    xgb_params['tree_method'] = 'hist'

# ─── 4. Rolling Time-Series CV ───────────────────────────────────
print("\nStarting rolling window (time-series) cross-validation...")
tscv = TimeSeriesSplit(n_splits=5)
rmse_scores = []
for fold, (train_idx, val_idx) in enumerate(tscv.split(X_train), 1):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    dtrain_cv = xgb.DMatrix(X_tr, label=y_tr)
    dval_cv   = xgb.DMatrix(X_val, label=y_val)
    cv_model = xgb.train(
        xgb_params,
        dtrain_cv,
        num_boost_round=BEST_PARAMS['n_estimators']
    )
    preds = cv_model.predict(dval_cv)
    rmse  = np.sqrt(mean_squared_error(y_val, preds))
    print(f"Fold {fold} RMSE: {rmse:.4f}")
    rmse_scores.append(rmse)
print(f"Mean CV RMSE: {np.mean(rmse_scores):.4f}\n")

# ─── 5. Train Final Model & Submit ──────────────────────────────
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest  = xgb.DMatrix(X_test)

print("Training final model on full data...")
final_model = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=BEST_PARAMS['n_estimators']
)

pred_test = final_model.predict(dtest)
pd.DataFrame({
    "route_key": test_keys,
    "final_seatcount": pred_test
}).to_csv("xgb.csv", index=False)

print("★ xgb.csv written ✓")
print("★ Params:", BEST_PARAMS)


★ GPU acceleration available: True

Starting rolling window (time-series) cross-validation...
Fold 1 RMSE: 614.1209
Fold 2 RMSE: 715.8667
Fold 3 RMSE: 473.0987
Fold 4 RMSE: 446.6334
Fold 5 RMSE: 588.1078
Mean CV RMSE: 567.5655

Training final model on full data...
★ xgb.csv written ✓
★ Params: {'subsample': 1.0, 'reg_lambda': 1, 'reg_alpha': 0.1, 'n_estimators': 249, 'min_child_weight': 3, 'max_depth': 8, 'learning_rate': 0.1, 'gamma': 0.3, 'colsample_bytree': 0.7}


In [2]:
# lgbm_final.py
# ------------------------------------------------------------------
# ⋅ Uses provided best parameters
# ⋅ Rolling time-series CV
# ⋅ Trains on full dataset
# ⋅ Predicts test set → lgbm.csv
# ------------------------------------------------------------------
import os
import random
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
import warnings

# ─── Reproducibility ─────────────────────────────────────────────
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)

warnings.filterwarnings("ignore")

TRAIN_CSV = "train_processed.csv"
TEST_CSV  = "test_processed.csv"

# ─── 1. Load & Prepare Data ─────────────────────────────────────
df_train = pd.read_csv(TRAIN_CSV, parse_dates=["doj"]).sort_values("doj")
df_test  = pd.read_csv(TEST_CSV,  parse_dates=["doj"]).sort_values("doj")
test_keys = df_test["route_key"].copy()

def add_date(df):
    out = df.copy()
    out["day"]   = out["doj"].dt.day
    out["month"] = out["doj"].dt.month
    out["year"]  = out["doj"].dt.year
    return out.drop(columns="doj")

df_train = add_date(df_train)
df_test  = add_date(df_test)
for df in (df_train, df_test):
    df.drop(columns="route_key", inplace=True, errors="ignore")

TARGET  = "final_seatcount"
FEATS   = [c for c in df_train.columns if c != TARGET]

# ─── 2. One-Hot Encode ───────────────────────────────────────────
cat_cols = df_train[FEATS].select_dtypes("object").columns.tolist()
df_comb  = pd.concat([df_train[FEATS], df_test[FEATS]], axis=0)
df_comb  = pd.get_dummies(df_comb, columns=cat_cols, drop_first=True)

X_train = df_comb.iloc[:len(df_train)].reset_index(drop=True)
X_test  = df_comb.iloc[len(df_train):].reset_index(drop=True)
y_train = df_train[TARGET]

# ─── 3. Rolling Time-Series CV ──────────────────────────────────
best_params = {
    'subsample':         0.8,
    'reg_lambda':        0.1,
    'reg_alpha':         5,
    'num_leaves':        40,
    'min_split_gain':    0.3,
    'min_child_samples': 35,
    'max_depth':         9,
    'learning_rate':     0.07,
    'colsample_bytree':  0.9,
}

print("Starting rolling window (time-series) CV for LGBM…")
tscv = TimeSeriesSplit(n_splits=5)
cv_scores = []
for fold, (train_idx, val_idx) in enumerate(tscv.split(X_train), 1):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    model = lgb.LGBMRegressor(
        **best_params,
        n_estimators=462,
        random_state=SEED,
        n_jobs=-1,
        verbose=-1
    )
    model.fit(X_tr, y_tr)
    preds = model.predict(X_val)
    rmse  = np.sqrt(mean_squared_error(y_val, preds))
    print(f"Fold {fold} RMSE: {rmse:.4f}")
    cv_scores.append(rmse)

print(f"Mean CV RMSE: {np.mean(cv_scores):.4f}\n")

# ─── 4. Train on Full Data & Submit ─────────────────────────────
print("Training LGBM on full data…")
final_model = lgb.LGBMRegressor(
    **best_params,
    n_estimators=462,
    random_state=SEED,
    n_jobs=-1,
    verbose=-1
)
final_model.fit(X_train, y_train)

pred_test = final_model.predict(X_test)
pd.DataFrame({
    "route_key": test_keys,
    "final_seatcount": pred_test
}).to_csv("lgbm.csv", index=False)

print("✅ lgbm.csv written ✓")


Starting rolling window (time-series) CV for LGBM…
Fold 1 RMSE: 592.7637
Fold 2 RMSE: 712.7851
Fold 3 RMSE: 470.0131
Fold 4 RMSE: 432.7165
Fold 5 RMSE: 579.9082
Mean CV RMSE: 557.6373

Training LGBM on full data…
✅ lgbm.csv written ✓


In [3]:
# rf_final.py
# ------------------------------------------------------------------
# ⋅ RandomForest with fixed best parameters
# ⋅ Rolling time-series CV
# ⋅ Trains on full dataset
# ⋅ Predicts test set → rf.csv
# ------------------------------------------------------------------
import os
import random
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
import warnings

# ─── Reproducibility ─────────────────────────────────────────────
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)

warnings.filterwarnings("ignore")

TRAIN_CSV = "train_processed.csv"
TEST_CSV  = "test_processed.csv"

# ─── 1. Load & Preprocess ───────────────────────────────────────
df_train = pd.read_csv(TRAIN_CSV, parse_dates=["doj"])
df_test  = pd.read_csv(TEST_CSV,  parse_dates=["doj"])
test_keys = df_test["route_key"].copy()

def add_date(df):
    out = df.copy()
    out["day"]   = out["doj"].dt.day
    out["month"] = out["doj"].dt.month
    out["year"]  = out["doj"].dt.year
    return out.drop(columns="doj")

df_train = add_date(df_train)
df_test  = add_date(df_test)
df_train.drop(columns="route_key", inplace=True, errors="ignore")
df_test .drop(columns="route_key", inplace=True, errors="ignore")

TARGET = "final_seatcount"
FEATS  = [c for c in df_train.columns if c != TARGET]

# ─── 2. One-Hot Encode ───────────────────────────────────────────
cat_cols = df_train[FEATS].select_dtypes("object").columns.tolist()
df_comb  = pd.concat([df_train[FEATS], df_test[FEATS]], axis=0)
df_comb  = pd.get_dummies(df_comb, columns=cat_cols, drop_first=True)

X_train = df_comb.iloc[:len(df_train)].reset_index(drop=True)
X_test  = df_comb.iloc[len(df_train):].reset_index(drop=True)
y_train = df_train[TARGET]

# ─── 3. Rolling Time-Series CV ──────────────────────────────────
best_params = {
    'n_estimators':       1000,
    'min_samples_split':  2,
    'min_samples_leaf':   1,
    'max_features':       'sqrt',
    'max_depth':          30,
    'bootstrap':          False
}

print("Starting rolling window (time-series) CV for RF…")
tscv = TimeSeriesSplit(n_splits=5)
cv_scores = []
for fold, (train_idx, val_idx) in enumerate(tscv.split(X_train), 1):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    model = RandomForestRegressor(
        **best_params,
        n_jobs=-1,
        random_state=SEED
    )
    model.fit(X_tr, y_tr)
    preds = model.predict(X_val)
    rmse  = np.sqrt(mean_squared_error(y_val, preds))
    print(f"Fold {fold} RMSE: {rmse:.4f}")
    cv_scores.append(rmse)

print(f"Mean CV RMSE: {np.mean(cv_scores):.4f}\n")

# ─── 4. Train on Full Data & Submit ─────────────────────────────
print("Training RF on full data…")
rf = RandomForestRegressor(
    **best_params,
    n_jobs=-1,
    random_state=SEED
)
rf.fit(X_train, y_train)

pred_test = rf.predict(X_test)
pd.DataFrame({
    "route_key": test_keys,
    "final_seatcount": np.round(pred_test).clip(min=10)
}).to_csv("rf.csv", index=False)

print("✅ rf.csv written ✓")


Starting rolling window (time-series) CV for RF…
Fold 1 RMSE: 654.2618
Fold 2 RMSE: 710.9713
Fold 3 RMSE: 495.8746
Fold 4 RMSE: 486.8650
Fold 5 RMSE: 610.7114
Mean CV RMSE: 591.7368

Training RF on full data…
✅ rf.csv written ✓


In [None]:
import pandas as pd
import numpy as np

# 1) Load each submission
rf  = pd.read_csv(r"rf.csv")
xgb = pd.read_csv(r"xgb.csv")
lgb = pd.read_csv(r"lgbm.csv")

# 2) Merge them on route_key
df = (
    rf[['route_key', 'final_seatcount']]
    .rename(columns={'final_seatcount':'rf'})
    .merge(
        xgb[['route_key', 'final_seatcount']].rename(columns={'final_seatcount':'xgb'}),
        on='route_key',
        how='inner'
    )
    .merge(
        lgb[['route_key', 'final_seatcount']].rename(columns={'final_seatcount':'lgb'}),
        on='route_key',
        how='inner'
    )
)

# 3) Compute median across the three predictions
df['final_seatcount'] = df[['rf','xgb','lgb']].mean(axis=1)

# 4) Clip to at least 10, round to nearest int, and cast
df['final_seatcount'] = np.rint(df['final_seatcount'].clip(lower=10)).astype(int)

# 5) Save the final ensemble
df[['route_key','final_seatcount']].to_csv(r"submission.csv", index=False)
print("submission.csv written ")


submission.csv written 
