In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, files in os.walk('/kaggle/input'):
    for file_path in (os.path.join(folder, file_name) for file_name in files):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/Medical-Equipments-Cost-Prediction-Challenge/sample_submission.csv
/kaggle/input/Medical-Equipments-Cost-Prediction-Challenge/train.csv
/kaggle/input/Medical-Equipments-Cost-Prediction-Challenge/test.csv


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor 
from sklearn.metrics import mean_squared_error



# Lower bound applied to Transport_Cost before the log transform:
# transport costs below 1 are clipped to 1.
CLIP_FLOOR = 1.0
# Categories whose share of the training rows is below this fraction are
# treated as rare and merged into a single 'Other' category.
FREQ_THRESHOLD = 0.01
# XGBoost learning rate; slightly higher than 0.01 to speed up convergence.
LEARNING_RATE = 0.02
# Number of boosted trees; increased for lower bias.
N_ESTIMATORS = 1500


# Read the raw competition files.
train_df = pd.read_csv("/kaggle/input/Medical-Equipments-Cost-Prediction-Challenge/train.csv")
test_df = pd.read_csv("/kaggle/input/Medical-Equipments-Cost-Prediction-Challenge/test.csv")

# Keep the test identifiers for the submission file, then index both frames by Hospital_Id.
test_ids = test_df['Hospital_Id']
train_df = train_df.set_index('Hospital_Id')
test_df = test_df.set_index('Hospital_Id')

# Split the training frame into predictors and target; the test frame holds predictors only.
X_train = train_df.drop("Transport_Cost", axis=1).copy()
y_train = train_df['Transport_Cost'].copy()
X_test = test_df.copy()

# Stack the training rows on top of the test rows so both go through the same preprocessing.
combined_df = pd.concat((X_train, X_test), axis=0)

# The model is trained on the natural log of the target, after flooring it at CLIP_FLOOR.
floored_cost = y_train.clip(lower=CLIP_FLOOR)
y_train_transformed = np.log(floored_cost)

# Feature engineering: derive date-based and equipment-based columns, then drop
# the raw columns that are no longer needed.

# Parse both date columns; values that do not match the format are coerced to NaT.
order_dates = pd.to_datetime(combined_df['Order_Placed_Date'], format='%m/%d/%y', errors='coerce')
delivery_dates = pd.to_datetime(combined_df['Delivery_Date'], format='%m/%d/%y', errors='coerce')

# Days between order and delivery; rows where the lag cannot be computed get 0.
delivery_lag = delivery_dates - order_dates
combined_df['Delivery_Lag_Days'] = delivery_lag.dt.days.fillna(0).astype(int)
combined_df['Order_Day_of_Week'] = order_dates.dt.dayofweek
combined_df['Order_Month'] = order_dates.dt.month

# Equipment features. Note that 'Equipment_Volume' is height * width.
height_by_width = combined_df['Equipment_Height'] * combined_df['Equipment_Width']
combined_df['Equipment_Volume'] = height_by_width
# 1e-6 is added to the denominator to avoid dividing by a zero volume.
combined_df['Equipment_Density'] = combined_df['Equipment_Weight'] / (height_by_width + 1e-6)

# Remove the original date columns and the low-utility columns (absent columns are ignored).
combined_df = combined_df.drop(
    columns=['Order_Placed_Date', 'Delivery_Date', 'Supplier_Name', 'Hospital_Location'],
    errors='ignore',
)

# Encode the flag columns below as 1 ('Yes') / 0 ('No'); any other value,
# including a missing one, ends up as 0.
binary_map = {'Yes': 1, 'No': 0}
binary_cols_to_map = ['CrossBorder_Shipping', 'Installation_Service', 'Rural_Hospital']
for flag_col in binary_cols_to_map:
    if flag_col not in combined_df.columns:
        continue
    combined_df[flag_col] = combined_df[flag_col].map(binary_map).fillna(0)

# Columns regarded as binary; this list is used later to keep them out of the
# scaled numeric features.
# NOTE(review): 'Urgent_Shipping' and 'Fragile_Equipment' are listed here but are
# not mapped to 1/0 above - confirm that this is intentional.
binary_cols = [*binary_cols_to_map, 'Urgent_Shipping', 'Fragile_Equipment']

# Merge rare categories into 'Other'. Frequencies are measured on the training
# rows only (the first len(X_train) rows of combined_df), but the replacement is
# applied to every row.
categorical_cols_to_group = ['Equipment_Type', 'Transport_Method', 'Hospital_Info']
n_train_rows = len(X_train)
for cat_col in categorical_cols_to_group:
    if cat_col not in combined_df.columns:
        continue
    train_share = combined_df[cat_col].iloc[:n_train_rows].value_counts(normalize=True)
    rare_values = train_share[train_share < FREQ_THRESHOLD].index
    is_rare = combined_df[cat_col].isin(rare_values)
    combined_df[cat_col] = np.where(is_rare, 'Other', combined_df[cat_col])


# Split the jointly preprocessed frame back into its training and test parts;
# the training rows come first.
n_train = len(X_train)
X_train_clean = combined_df.iloc[:n_train]
X_test_clean = combined_df.iloc[n_train:]

# Numeric features: every numeric-dtype column except those listed in binary_cols.
binary_col_set = set(binary_cols)
numeric_candidates = X_train_clean.select_dtypes(include=np.number).columns
numeric_cols = [name for name in numeric_candidates if name not in binary_col_set]

# Categorical features: every object-dtype column.
categorical_cols = list(X_train_clean.select_dtypes(include=['object']).columns)
# Preprocessing pipelines.

# Numeric columns: fill missing values with the median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill missing values with the literal 'Missing', then one-hot
# encode; categories not seen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its transformer; columns in neither list are passed
# through unchanged.
column_routes = [
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes, remainder='passthrough')




# Base XGBoost regressor used inside the hyper-parameter search.
# n_jobs is 1 here because GridSearchCV below already runs its fits in parallel
# (n_jobs=-1). Letting every one of those fits also start one thread per core
# oversubscribes the CPU and slows the whole search down.
xgb_base = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=N_ESTIMATORS,
    learning_rate=LEARNING_RATE,
    random_state=42,
    n_jobs=1
)

# Preprocessing and model in one pipeline, so the imputers, scaler and encoder
# are re-fitted on the training part of every CV fold.
tuning_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', xgb_base)
])

# Parameter grid searched by GridSearchCV: 3 * 2 * 2 * 2 * 2 = 48 candidates.
param_grid = {
    'regressor__max_depth': [6, 8, 10],
    'regressor__subsample': [0.75, 0.9],
    'regressor__colsample_bytree': [0.7, 0.9],
    'regressor__reg_alpha': [0.001, 0.1],          # L1 regularization
    'regressor__reg_lambda': [0.1, 1]              # L2 regularization
}

# 4-fold shuffled cross-validation with a fixed seed for reproducible splits.
cv = KFold(n_splits=4, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    tuning_pipeline,
    param_grid,
    cv=cv,
    scoring='neg_root_mean_squared_error',
    verbose=1,
    n_jobs=-1  # parallelise across candidate/fold fits
)

grid_search.fit(X_train_clean, y_train_transformed)
best_params_found = grid_search.best_params_
# The scorer returns the negated RMSE, so flip the sign to report a positive RMSE.
best_score = -grid_search.best_score_
print(f"Best Cross-Validation RMSE (on log-target): {best_score:.4f}")

# Strip the pipeline prefix from the tuned parameter names so they can be passed
# straight to XGBRegressor, then add the settings that were held fixed.
final_xgb_params = {
    param_name.replace('regressor__', ''): param_value
    for param_name, param_value in best_params_found.items()
}
final_xgb_params.update(
    n_estimators=N_ESTIMATORS,
    learning_rate=LEARNING_RATE,
    objective='reg:squarederror',
    random_state=42,
    n_jobs=-1,
)

print("Best XGBoost parameters for final training:")
print(final_xgb_params)






# Build the final model from the tuned hyper-parameters.
xgb_best_model = XGBRegressor(**final_xgb_params)

# Final pipeline: same preprocessing as during tuning, followed by the tuned model.
final_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', xgb_best_model)
])

# Fit on all of the training data (the target is on the log scale).
final_pipeline.fit(X_train_clean, y_train_transformed)

# Predict on the test set; the predictions are on the log scale.
predictions_log = final_pipeline.predict(X_test_clean)

# Undo the log transform with e^x. np.exp never returns a negative value, so no
# additional clamp at 0 is needed afterwards.
predictions_original_scale = np.exp(predictions_log)

# Submission frame: one row per test Hospital_Id with its predicted Transport_Cost.
submission_df = pd.DataFrame({
    'Hospital_Id': test_ids,
    'Transport_Cost': predictions_original_scale
})

submission_df.to_csv("submission.csv", index=False)
print()
print("Submission file 'submission.csv' successfully created.")


Fitting 4 folds for each of 48 candidates, totalling 192 fits
Best Cross-Validation RMSE (on log-target): 2.1466
Best XGBoost parameters for final training:
{'colsample_bytree': 0.7, 'max_depth': 10, 'reg_alpha': 0.1, 'reg_lambda': 1, 'subsample': 0.75, 'n_estimators': 1500, 'learning_rate': 0.02, 'objective': 'reg:squarederror', 'random_state': 42, 'n_jobs': -1}

Submission file 'submission.csv' successfully created.
