# XGBoost Model for Medical Equipments Cost Prediction

This notebook loads the dataset, preprocesses it, trains an XGBoost regressor, evaluates model performance, and generates predictions for the test set.

## 1. Import Libraries

In [22]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
import xgboost as xgb

## 2. Load and Inspect Data

In [23]:
train = pd.read_csv('../data/train.csv', parse_dates=['Order_Placed_Date','Delivery_Date'], infer_datetime_format=True)
test = pd.read_csv('../data/test.csv', parse_dates=['Order_Placed_Date','Delivery_Date'], infer_datetime_format=True)
print('Train shape:', train.shape)
print('Test shape:', test.shape)
train.head()

  train = pd.read_csv('../data/train.csv', parse_dates=['Order_Placed_Date','Delivery_Date'], infer_datetime_format=True)
  train = pd.read_csv('../data/train.csv', parse_dates=['Order_Placed_Date','Delivery_Date'], infer_datetime_format=True)
  train = pd.read_csv('../data/train.csv', parse_dates=['Order_Placed_Date','Delivery_Date'], infer_datetime_format=True)


Train shape: (5000, 20)
Test shape: (500, 19)


  test = pd.read_csv('../data/test.csv', parse_dates=['Order_Placed_Date','Delivery_Date'], infer_datetime_format=True)
  test = pd.read_csv('../data/test.csv', parse_dates=['Order_Placed_Date','Delivery_Date'], infer_datetime_format=True)
  test = pd.read_csv('../data/test.csv', parse_dates=['Order_Placed_Date','Delivery_Date'], infer_datetime_format=True)


Unnamed: 0,Hospital_Id,Supplier_Name,Supplier_Reliability,Equipment_Height,Equipment_Width,Equipment_Weight,Equipment_Type,Equipment_Value,Base_Transport_Fee,CrossBorder_Shipping,Urgent_Shipping,Installation_Service,Transport_Method,Fragile_Equipment,Hospital_Info,Rural_Hospital,Order_Placed_Date,Delivery_Date,Hospital_Location,Transport_Cost
0,fffe3200360030003700,Jo Valencia,0.44,21.0,6.0,,,3.62,17.13,No,No,No,Roadways,No,Working Class,No,2017-10-20,2017-10-20,APO AA 33776,179.5
1,fffe3400380037003400,Wanda Warren,0.58,29.0,20.0,1210684.0,Marble,9703.37,35.42,No,Yes,Yes,Roadways,No,Working Class,No,2016-02-22,2016-02-24,"South Kevin, VT 84493",627732.45
2,fffe3200350036003700,Robert Ackies,0.97,39.0,15.0,3305.0,Aluminium,40.21,18.54,No,No,No,Roadways,No,Working Class,No,2018-01-11,2018-01-10,"Kevinshire, NE 31279",1565.92
3,fffe3800320034003400,Charlotte Membreno,0.7,8.0,5.0,606.0,Brass,4.55,17.48,No,No,No,Roadways,No,Working Class,No,2016-08-06,2016-08-06,DPO AP 61572,257.71
4,fffe3600340033003000,Nena Silva,0.66,27.0,13.0,,Marble,2726.8,30.23,Yes,No,No,Roadways,No,Working Class,,2016-12-15,2016-12-17,"Joshuamouth, AK 01550",8553.52


## 3. Feature Engineering

In [24]:
# Process dates with proper handling of invalid date pairs
def process_dates_with_swap(df):
    df['Order_Placed_Date'] = pd.to_datetime(df['Order_Placed_Date'], errors='coerce')
    df['Delivery_Date'] = pd.to_datetime(df['Delivery_Date'], errors='coerce')

    # Identify and swap invalid date pairs (order date after delivery)
    invalid_dates_mask = df['Order_Placed_Date'] > df['Delivery_Date']
    df.loc[invalid_dates_mask, ['Order_Placed_Date', 'Delivery_Date']] = \
        df.loc[invalid_dates_mask, ['Delivery_Date', 'Order_Placed_Date']].values

    # Calculate delivery duration in days
    df['delivery_delay'] = (df['Delivery_Date'] - df['Order_Placed_Date']).dt.days

    # Replace invalid/missing durations with mean positive duration
    mean_duration = df.loc[df['delivery_delay'] > 0, 'delivery_delay'].mean()
    df.loc[df['delivery_delay'] <= 0, 'delivery_delay'] = mean_duration
    df['delivery_delay'].fillna(mean_duration, inplace=True)

    return df

# Apply improved date processing
for df in [train, test]:
    df = process_dates_with_swap(df)

# Handle Yes/No columns
yesno_cols = ['CrossBorder_Shipping','Urgent_Shipping','Installation_Service','Fragile_Equipment','Rural_Hospital']
for col in yesno_cols:
    if col in train.columns:
        for df in [train, test]:
            df[col] = df[col].map({'Yes':1,'No':0}).fillna(0).astype(int)

# Add month and weekday features from dates
for df in [train, test]:
    df['order_month'] = df['Order_Placed_Date'].dt.month
    df['order_weekday'] = df['Order_Placed_Date'].dt.weekday
    df['delivery_month'] = df['Delivery_Date'].dt.month
    df['delivery_weekday'] = df['Delivery_Date'].dt.weekday

  df.loc[df['delivery_delay'] <= 0, 'delivery_delay'] = mean_duration
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['delivery_delay'].fillna(mean_duration, inplace=True)
  df.loc[df['delivery_delay'] <= 0, 'delivery_delay'] = mean_duration
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['delivery_delay'].fillna(mean_du

## 4. Prepare Features and Target

In [25]:
target_col = 'Transport_Cost'
id_col = 'Hospital_Id'

# Handle negative and zero transport costs in training data
if target_col in train.columns:
    # Replace negative costs with zero
    train.loc[train[target_col] < 0, target_col] = 0
    
    # Replace zeros with mean of positive costs
    mean_positive_cost = train.loc[train[target_col] > 0, target_col].mean()
    train.loc[train[target_col] == 0, target_col] = mean_positive_cost

y = train[target_col].copy()
X = train.drop([target_col], axis=1)

# Drop non-predictive columns
drop_cols = [
    'Hospital_Location',  # Free text location
    'Hospital_Info',      # Additional metadata
    'Order_Placed_Date',  # Raw dates (we have derived features)
    'Delivery_Date'
]
X = X.drop([c for c in drop_cols if c in X.columns], axis=1)
X_test = test.reindex(columns=X.columns).copy()

# Identify numeric and categorical columns
numeric_cols = X.select_dtypes(include=['number']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

print("\nFeature counts:")
print(f"Numeric features: {len(numeric_cols)}")
print(f"Categorical features: {len(categorical_cols)}")
print("\nNumeric features:", numeric_cols)
print("\nCategorical features:", categorical_cols)

# Print basic target stats
print("\nTarget statistics:")
print(f"Mean: {y.mean():.2f}")
print(f"Std: {y.std():.2f}")
print(f"Min: {y.min():.2f}")
print(f"Max: {y.max():.2f}")
print(f"Zeros: {(y == 0).sum()}")
print(f"Negative values: {(y < 0).sum()}")


Feature counts:
Numeric features: 16
Categorical features: 4

Numeric features: ['Supplier_Reliability', 'Equipment_Height', 'Equipment_Width', 'Equipment_Weight', 'Equipment_Value', 'Base_Transport_Fee', 'CrossBorder_Shipping', 'Urgent_Shipping', 'Installation_Service', 'Fragile_Equipment', 'Rural_Hospital', 'delivery_delay', 'order_month', 'order_weekday', 'delivery_month', 'delivery_weekday']

Categorical features: ['Hospital_Id', 'Supplier_Name', 'Equipment_Type', 'Transport_Method']

Target statistics:
Mean: 20528.70
Std: 255062.92
Min: 86.07
Max: 11143428.25
Zeros: 0
Negative values: 0


## 5. Preprocessing Pipeline

In [26]:
numeric_transformer = SimpleImputer(strategy='median')
categorical_transformer = Pipeline([('imputer', SimpleImputer(strategy='most_frequent')), ('onehot', OneHotEncoder(handle_unknown='ignore'))])
preprocessor = ColumnTransformer([('num', numeric_transformer, numeric_cols), ('cat', categorical_transformer, categorical_cols)])

## Hypertuning


In [61]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# Define the model pipeline
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', xgb.XGBRegressor(
        objective='reg:squarederror',
        random_state=42
    ))
])

# Define hyperparameter grid
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__learning_rate': [0.05,0.1,0.2],
    'regressor__max_depth': [3, 5, 6, 7],
    'regressor__subsample': [0.7,1.0]
}

# Setup GridSearchCV
grid_search = GridSearchCV(
    model,
    param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=2,
    refit=True
)

# Fit grid search on training data
grid_search.fit(X_train, y_train)

# Output best parameters
print('Best parameters:', grid_search.best_params_)

# Evaluate best model on validation set
best_model = grid_search.best_estimator_
val_preds = best_model.predict(X_val)
mse = mean_squared_error(y_val, val_preds)
r2 = r2_score(y_val, val_preds)

print(f'Validation MSE: {mse:.4f}')
print(f'Validation R²: {r2:.4f}')

Fitting 3 folds for each of 72 candidates, totalling 216 fits
Best parameters: {'regressor__learning_rate': 0.2, 'regressor__max_depth': 3, 'regressor__n_estimators': 300, 'regressor__subsample': 1.0}
Validation MSE: 1208630987.9850
Validation R²: 0.4425
Best parameters: {'regressor__learning_rate': 0.2, 'regressor__max_depth': 3, 'regressor__n_estimators': 300, 'regressor__subsample': 1.0}
Validation MSE: 1208630987.9850
Validation R²: 0.4425


## 6. Model: XGBoost Regressor

In [63]:
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', xgb.XGBRegressor(
        objective='reg:squarederror',
        random_state=42,
        n_estimators=300,
        learning_rate=0.2,
        max_depth=3,
        subsample= 1
    ))
])

## 7. Train and Validate

In [64]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
model.fit(X_train, y_train)
val_preds = model.predict(X_val)
mse = mean_squared_error(y_val, val_preds)
r2 = r2_score(y_val, val_preds)
print(f'Validation MSE: {mse:.4f}')
print(f'Validation R²: {r2:.4f}')

Validation MSE: 1208630987.9850
Validation R²: 0.4425


## 8. Generate Test Predictions

In [65]:
test_preds = model.predict(X_test)
submission = pd.DataFrame({id_col: test[id_col], target_col: test_preds})
submission.to_csv('xgboost_submission.csv', index=False)
submission.head()

Unnamed: 0,Hospital_Id,Transport_Cost
0,fffe33003400,3108.637451
1,fffe3700330036003600,3108.637451
2,fffe3300390038003400,3320.804932
3,fffe310030003900,2688.662842
4,fffe3700330031003200,3452.259521
