# Medical Equipments Cost Prediction - Boosting Model

This notebook demonstrates data ingestion, preprocessing, and regression modeling using boosting methods.

## 1. Imports & Setup

In [4]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


## 2. Data Loading

In [5]:
# Read CSVs and parse dates where available
train = pd.read_csv('../data/train.csv', parse_dates=['Order_Placed_Date','Delivery_Date'], infer_datetime_format=True)
test = pd.read_csv('../data/test.csv', parse_dates=['Order_Placed_Date','Delivery_Date'], infer_datetime_format=True)
print('Train shape:', train.shape)
print('Test shape:', test.shape)
train.head()

  train = pd.read_csv('../data/train.csv', parse_dates=['Order_Placed_Date','Delivery_Date'], infer_datetime_format=True)
  train = pd.read_csv('../data/train.csv', parse_dates=['Order_Placed_Date','Delivery_Date'], infer_datetime_format=True)
  train = pd.read_csv('../data/train.csv', parse_dates=['Order_Placed_Date','Delivery_Date'], infer_datetime_format=True)
  test = pd.read_csv('../data/test.csv', parse_dates=['Order_Placed_Date','Delivery_Date'], infer_datetime_format=True)


Train shape: (5000, 20)
Test shape: (500, 19)


  test = pd.read_csv('../data/test.csv', parse_dates=['Order_Placed_Date','Delivery_Date'], infer_datetime_format=True)
  test = pd.read_csv('../data/test.csv', parse_dates=['Order_Placed_Date','Delivery_Date'], infer_datetime_format=True)


Unnamed: 0,Hospital_Id,Supplier_Name,Supplier_Reliability,Equipment_Height,Equipment_Width,Equipment_Weight,Equipment_Type,Equipment_Value,Base_Transport_Fee,CrossBorder_Shipping,Urgent_Shipping,Installation_Service,Transport_Method,Fragile_Equipment,Hospital_Info,Rural_Hospital,Order_Placed_Date,Delivery_Date,Hospital_Location,Transport_Cost
0,fffe3200360030003700,Jo Valencia,0.44,21.0,6.0,,,3.62,17.13,No,No,No,Roadways,No,Working Class,No,2017-10-20,2017-10-20,APO AA 33776,179.5
1,fffe3400380037003400,Wanda Warren,0.58,29.0,20.0,1210684.0,Marble,9703.37,35.42,No,Yes,Yes,Roadways,No,Working Class,No,2016-02-22,2016-02-24,"South Kevin, VT 84493",627732.45
2,fffe3200350036003700,Robert Ackies,0.97,39.0,15.0,3305.0,Aluminium,40.21,18.54,No,No,No,Roadways,No,Working Class,No,2018-01-11,2018-01-10,"Kevinshire, NE 31279",1565.92
3,fffe3800320034003400,Charlotte Membreno,0.7,8.0,5.0,606.0,Brass,4.55,17.48,No,No,No,Roadways,No,Working Class,No,2016-08-06,2016-08-06,DPO AP 61572,257.71
4,fffe3600340033003000,Nena Silva,0.66,27.0,13.0,,Marble,2726.8,30.23,Yes,No,No,Roadways,No,Working Class,,2016-12-15,2016-12-17,"Joshuamouth, AK 01550",8553.52


## 3. Preprocessing

In [6]:
# Preprocessing tailored to the provided CSVs
target_col = 'Transport_Cost'
id_col = 'Hospital_Id'
# Create delivery delay feature from dates (if present)
for df in [train, test]:
    if 'Order_Placed_Date' in df.columns:
        df['Order_Placed_Date'] = pd.to_datetime(df['Order_Placed_Date'], errors='coerce')
    if 'Delivery_Date' in df.columns:
        df['Delivery_Date'] = pd.to_datetime(df['Delivery_Date'], errors='coerce')
    if ('Order_Placed_Date' in df.columns) and ('Delivery_Date' in df.columns):
        df['delivery_delay'] = (df['Delivery_Date'] - df['Order_Placed_Date']).dt.days

# Map Yes/No columns to 0/1 where applicable
yesno_cols = ['CrossBorder_Shipping','Urgent_Shipping','Installation_Service','Fragile_Equipment','Rural_Hospital']
for col in yesno_cols:
    if col in train.columns:
        for df in [train, test]:
            if col in df.columns:
                df[col] = df[col].map({'Yes':1,'No':0}).fillna(0).astype(int)

# Prepare X, y and X_test aligned to training features
y = train[target_col].copy()
# Keep Hospital_Id in test for submission; drop from training features
X = train.drop([target_col], axis=1)
# Drop columns that are identifiers or location text which won't be one-hot encoded (we keep Hospital_Id for submission)
drop_cols = ['Hospital_Location','Hospital_Info']
X = X.drop([c for c in drop_cols if c in X.columns], axis=1)
X_test = test.reindex(columns=X.columns).copy()  # align columns; missing ones will be NaN

# Separate numeric and categorical columns based on train features
numeric_cols = X.select_dtypes(include=['number']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object','category']).columns.tolist()
# ensure date columns are not treated as categoricals
for dcol in ['Order_Placed_Date','Delivery_Date']:
    if dcol in numeric_cols: numeric_cols.remove(dcol)
    if dcol in categorical_cols: categorical_cols.remove(dcol)

# Preprocessing pipeline
numeric_transformer = SimpleImputer(strategy='median')
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols)
])

# Full pipeline with Gradient Boosting
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=42))
])

## 4. Train/Validation Split

In [7]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

## 5. Model Training (Gradient Boosting)

In [12]:
# Build a Pipeline that first applies preprocessing then the regressor
from sklearn.pipeline import Pipeline as SKPipeline
regressor = GradientBoostingRegressor(n_estimators=400, learning_rate=0.1, random_state=42)
model = SKPipeline(steps=[('preprocessor', preprocessor), ('regressor', regressor)])
model.set_params(regressor__learning_rate=0.1,
                 regressor__max_depth=3,
                 regressor__min_samples_split=2,
                 regressor__n_estimators=200)

# Fit pipeline on DataFrame X_train / y_train - pipeline will transform X_train internally
model.fit(X_train, y_train)


In [9]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid to search
param_grid = {
    'regressor__n_estimators': [50, 100, 200],
    'regressor__learning_rate': [0.01, 0.1, 0.2],
    'regressor__max_depth': [3, 5, 7],
    'regressor__min_samples_split': [2, 5, 10]
}

# Setup GridSearchCV with your existing pipeline and param grid
grid_search = GridSearchCV(model, param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)

# Fit grid search on training data
grid_search.fit(X_train, y_train)

# Best model after tuning
best_model = grid_search.best_estimator_

# Validation with best model
val_preds = best_model.predict(X_val)
mse = mean_squared_error(y_val, val_preds)
r2 = r2_score(y_val, val_preds)

print("Best parameters:", grid_search.best_params_)
print(f"Validation MSE after tuning: {mse:.4f}")
print(f"Validation R² after tuning: {r2:.4f}")


Fitting 3 folds for each of 81 candidates, totalling 243 fits
Best parameters: {'regressor__learning_rate': 0.1, 'regressor__max_depth': 3, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 200}
Validation MSE after tuning: 1263502879.8257
Validation R² after tuning: 0.4304


## 6. Model Evaluation

In [14]:
# Evaluate on validation set (pipeline will preprocess X_val)
val_preds = model.predict(X_val)
mse = mean_squared_error(y_val, val_preds)
r2 = r2_score(y_val, val_preds)
print(f'Validation MSE: {mse:.4f}')
print(f'Validation R²: {r2:.4f}')


Validation MSE: 1263502879.8257
Validation R²: 0.4304


## 7. Test Prediction and Submission

In [16]:
# Predict on test set and prepare submission (pipeline will preprocess X_test)
test_preds = model.predict(X_test)
submission = pd.DataFrame({id_col: test[id_col].values, 'Transport_Cost': test_preds})
submission.to_csv('boosting_submission.csv', index=False)
print('Submission file saved: boosting_submission.csv')
submission.head()


Submission file saved: boosting_submission.csv


Unnamed: 0,Hospital_Id,Transport_Cost
0,fffe33003400,1081.161315
1,fffe3700330036003600,1081.161315
2,fffe3300390038003400,1277.987123
3,fffe310030003900,1081.161315
4,fffe3700330031003200,1081.161315
