In [14]:
import pandas as pd
import numpy as np

In [15]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
interactions = pd.read_csv("interactions.csv")


In [16]:
## Filter interactions to only include those with days_before_service >= 15
interactions_filt = interactions[
    interactions["days_before_service"] >= 15
].copy()


In [17]:
GROUP_COLS = [
    "service_date",
    "origin_hub_id",
    "destination_hub_id"
]


In [18]:
agg_df = interactions_filt.groupby(GROUP_COLS).agg(
    # strongest signals
    max_commitments=("cumulative_commitments", "max"),
    max_interest=("cumulative_interest_signals", "max"),

    # early behaviour
    mean_commitments=("cumulative_commitments", "mean"),
    mean_interest=("cumulative_interest_signals", "mean"),

    # variability
    std_commitments=("cumulative_commitments", "std"),
    std_interest=("cumulative_interest_signals", "std"),

    # categorical (take first, since same per service)
    origin_region=("origin_region", "first"),
    destination_region=("destination_region", "first"),
    origin_hub_tier=("origin_hub_tier", "first"),
    destination_hub_tier=("destination_hub_tier", "first"),
).reset_index()


In [19]:
agg_df["interest_to_commit_ratio"] = (
    agg_df["max_interest"] / (agg_df["max_commitments"] + 1)
)


In [37]:
train_feat = train.merge(
    agg_df,
    on=GROUP_COLS,
    how="left"
)


In [21]:
train_feat.shape
train_feat.head()


Unnamed: 0,service_date,origin_hub_id,destination_hub_id,final_service_units,max_commitments,max_interest,mean_commitments,mean_interest,std_commitments,std_interest,origin_region,destination_region,origin_hub_tier,destination_hub_tier,interest_to_commit_ratio
0,01-03-2023,45,46,2838,16.0,480.0,11.375,255.625,3.324154,128.181577,Karnataka,Tamil Nadu,Tier 1,Tier 1,28.235294
1,01-03-2023,46,45,2298,34.0,352.0,22.875,184.75,8.546929,95.838406,Tamil Nadu,Karnataka,Tier 1,Tier 1,10.057143
2,01-03-2023,45,47,2720,36.0,892.0,15.125,454.75,9.061825,232.324916,Karnataka,Andhra Pradesh,Tier 1,Tier 1,24.108108
3,01-03-2023,47,45,2580,18.0,1130.0,6.375,549.0,6.249,362.001105,Andhra Pradesh,Karnataka,Tier 1,Tier 1,59.473684
4,01-03-2023,46,9,4185,48.0,1023.0,22.5,427.875,13.725888,248.37361,Tamil Nadu,Tamil Nadu,Tier 1,Tier2,20.877551


In [22]:
train_feat.isna().sum()

service_date                    0
origin_hub_id                   0
destination_hub_id              0
final_service_units             0
max_commitments             33300
max_interest                33300
mean_commitments            33300
mean_interest               33300
std_commitments             33300
std_interest                33300
origin_region               33300
destination_region          33300
origin_hub_tier             33300
destination_hub_tier        33300
interest_to_commit_ratio    33300
dtype: int64

In [38]:
# Fill missing values with mean for numerical columns
numerical_cols = [
    'max_commitments', 'max_interest', 'mean_commitments', 'mean_interest',
    'std_commitments', 'std_interest', 'interest_to_commit_ratio'
]

for col in numerical_cols:
    train_feat[col] = train_feat[col].fillna(train_feat[col].mean())

# For categorical columns, fill with mode
categorical_cols = ['origin_region', 'destination_region', 'origin_hub_tier', 'destination_hub_tier']

for col in categorical_cols:
    mode_val = train_feat[col].mode()[0]
    train_feat[col] = train_feat[col].fillna(mode_val)

# Check NaNs again
train_feat.isna().sum()

service_date                0
origin_hub_id               0
destination_hub_id          0
final_service_units         0
max_commitments             0
max_interest                0
mean_commitments            0
mean_interest               0
std_commitments             0
std_interest                0
origin_region               0
destination_region          0
origin_hub_tier             0
destination_hub_tier        0
interest_to_commit_ratio    0
dtype: int64

In [24]:
# Check unique values for categorical columns
print("Origin regions:", train_feat['origin_region'].unique())
print("Destination regions:", train_feat['destination_region'].unique())
print("Origin tiers:", train_feat['origin_hub_tier'].unique())
print("Destination tiers:", train_feat['destination_hub_tier'].unique())

Origin regions: ['Karnataka' 'Tamil Nadu' 'Andhra Pradesh' 'Maharashtra and Goa'
 'Rest of North' 'Madhya Pradesh' 'Delhi' 'Rajasthan' 'East 1' 'Kerala']
Destination regions: ['Tamil Nadu' 'Karnataka' 'Andhra Pradesh' 'Maharashtra and Goa' 'Delhi'
 'Madhya Pradesh' 'Rest of North' 'East 1' 'Rajasthan' 'Kerala']
Origin tiers: ['Tier 1' 'Tier2' 'Tier 3' 'Tier 4']
Destination tiers: ['Tier 1' 'Tier2' 'Tier 3' 'Tier 4']


In [39]:
# Parse service_date
train_feat['service_date'] = pd.to_datetime(train_feat['service_date'], format='%d-%m-%Y')

# Extract date features
train_feat['month'] = train_feat['service_date'].dt.month
train_feat['day'] = train_feat['service_date'].dt.day
train_feat['weekday'] = train_feat['service_date'].dt.weekday  # 0=Monday
train_feat['is_weekend'] = train_feat['weekday'].isin([5,6]).astype(int)
train_feat['day_of_year'] = train_feat['service_date'].dt.dayofyear
train_feat['week_of_year'] = train_feat['service_date'].dt.isocalendar().week

# Add hub-level averages
hub_avg = train.groupby('origin_hub_id')['final_service_units'].mean().reset_index()
hub_avg = hub_avg.rename(columns={'final_service_units': 'origin_avg_demand'})
train_feat = train_feat.merge(hub_avg, on='origin_hub_id', how='left')

hub_avg_dest = train.groupby('destination_hub_id')['final_service_units'].mean().reset_index()
hub_avg_dest = hub_avg_dest.rename(columns={'final_service_units': 'dest_avg_demand'})
train_feat = train_feat.merge(hub_avg_dest, on='destination_hub_id', how='left')

# Drop service_date
train_feat = train_feat.drop('service_date', axis=1)

In [40]:
# One-hot encode categorical columns
train_feat = pd.get_dummies(train_feat, columns=categorical_cols, drop_first=True)

# Check shape
train_feat.shape

(67200, 42)

In [41]:
# Prepare features and target
X = train_feat.drop('final_service_units', axis=1)
y = train_feat['final_service_units']

print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (67200, 41)
y shape: (67200,)


In [None]:
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Split data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter tuning for CatBoost
param_grid = {
    'iterations': [500, 1000],
    'learning_rate': [0.05, 0.1],
    'depth': [4, 6, 8],
    'l2_leaf_reg': [1, 3, 5]
}

model = CatBoostRegressor(verbose=0, early_stopping_rounds=50)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train, eval_set=(X_val, y_val))

best_model = grid_search.best_estimator_
print("Best params:", grid_search.best_params_)

# Predict on val
y_pred = best_model.predict(X_val)

In [33]:
# Evaluate
mae = mean_absolute_error(y_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"MAE: {mae}, RMSE: {rmse}")

MAE: 331.3253321248519, RMSE: 491.2202602110357


In [35]:
# Baseline: predict mean
y_mean = y_train.mean()
y_pred_baseline = np.full_like(y_val, y_mean)
mae_baseline = mean_absolute_error(y_val, y_pred_baseline)
rmse_baseline = np.sqrt(mean_squared_error(y_val, y_pred_baseline))
print(f"Baseline MAE: {mae_baseline}, RMSE: {rmse_baseline}")
print(f"Improvement over baseline: MAE {mae_baseline - mae:.2f}, RMSE {rmse_baseline - rmse:.2f}")

Baseline MAE: 860.4665922619048, RMSE: 1199.9904245339687
Improvement over baseline: MAE 529.14, RMSE 708.77


In [31]:
# Prepare test features
test_feat = test.merge(
    agg_df,
    on=GROUP_COLS,
    how="left"
)

# Fill NaNs same way
for col in numerical_cols:
    test_feat[col] = test_feat[col].fillna(train_feat[col].mean())  # Use train means

# For categoricals, use mode from original train
train_original = train.merge(
    agg_df,
    on=GROUP_COLS,
    how="left"
)
for col in numerical_cols:
    train_original[col] = train_original[col].fillna(train_original[col].mean())
for col in categorical_cols:
    mode_val = train_original[col].mode()[0]
    test_feat[col] = test_feat[col].fillna(mode_val)

# Parse date
test_feat['service_date'] = pd.to_datetime(test_feat['service_date'], format='%d-%m-%Y')

# Extract features
test_feat['month'] = test_feat['service_date'].dt.month
test_feat['day'] = test_feat['service_date'].dt.day
test_feat['weekday'] = test_feat['service_date'].dt.weekday
test_feat['is_weekend'] = test_feat['weekday'].isin([5,6]).astype(int)

# Drop service_date
test_feat = test_feat.drop('service_date', axis=1)

# One-hot encode
test_feat = pd.get_dummies(test_feat, columns=categorical_cols, drop_first=True)

# Ensure same columns as train
missing_cols = set(X.columns) - set(test_feat.columns)
for col in missing_cols:
    test_feat[col] = 0

extra_cols = set(test_feat.columns) - set(X.columns) - {'service_key'}
test_feat = test_feat.drop(extra_cols, axis=1)

# Reorder columns
test_feat = test_feat[X.columns]

# Predict
test_pred = model.predict(test_feat)

# Create submission
submission = pd.DataFrame({
    'service_key': test['service_key'],
    'final_service_units': test_pred
})

submission.to_csv('submission_echobyte.csv', index=False)