In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/Medical-Equipments-Cost-Prediction-Challenge/sample_submission.csv
/kaggle/input/Medical-Equipments-Cost-Prediction-Challenge/train.csv
/kaggle/input/Medical-Equipments-Cost-Prediction-Challenge/test.csv


In [4]:
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor # The model we are now using
from sklearn.metrics import mean_squared_error


# --- Configuration ---
CLIP_FLOOR = 1.0        # lower bound applied to Transport_Cost before the log transform
FREQ_THRESHOLD = 0.01   # categories below this share of training rows are grouped as rare

# --- Load the competition files ---
TRAIN_CSV = "/kaggle/input/Medical-Equipments-Cost-Prediction-Challenge/train.csv"
TEST_CSV = "/kaggle/input/Medical-Equipments-Cost-Prediction-Challenge/test.csv"
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

# Keep the test identifiers (in file order) before the column is moved into the index.
test_ids = test_df['Hospital_Id']
train_df = train_df.set_index('Hospital_Id')
test_df = test_df.set_index('Hospital_Id')

# --- Split target from predictors ---
# Copies are taken so later edits never write back into the raw frames.
X_train = train_df.drop(columns=["Transport_Cost"]).copy()
y_train = train_df["Transport_Cost"].copy()
X_test = test_df.copy()

# Stack train and test predictors (train rows first) so feature engineering
# is applied identically to both.
combined_df = pd.concat([X_train, X_test])

# Target transform: clip at CLIP_FLOOR, then take the natural log.
y_train_clipped = y_train.clip(lower=CLIP_FLOOR)
y_train_transformed = np.log(y_train_clipped)

# Parse both date columns; values that do not match mm/dd/yy become NaT.
for date_col in ('Order_Placed_Date', 'Delivery_Date'):
    combined_df[date_col] = pd.to_datetime(
        combined_df[date_col], format='%m/%d/%y', errors='coerce'
    )

# Days between order and delivery.
# NOTE(review): a lag that cannot be computed (missing/unparseable date) is
# recorded as 0 days here, so it never reaches the pipeline's median imputer.
delivery_lag = combined_df['Delivery_Date'] - combined_df['Order_Placed_Date']
combined_df['Delivery_Lag_Days'] = delivery_lag.dt.days.fillna(0).astype(int)

# NOTE(review): this is height x width only (no depth term), despite the name.
combined_df['Equipment_Volume'] = combined_df['Equipment_Height'].mul(combined_df['Equipment_Width'])

# Remove the raw dates and the columns judged low-utility; absent columns are skipped.
columns_to_discard = ['Order_Placed_Date', 'Delivery_Date', 'Supplier_Name', 'Hospital_Location']
combined_df = combined_df.drop(columns=columns_to_discard, errors='ignore')

# Encode the Yes/No flag columns as 1/0. Any other value, including missing,
# ends up as 0 because unmapped entries become NaN and are then filled.
binary_map = {'Yes': 1, 'No': 0}
binary_cols_to_map = ['CrossBorder_Shipping', 'Installation_Service', 'Rural_Hospital']
flags_present = [name for name in binary_cols_to_map if name in combined_df.columns]
for flag_col in flags_present:
    combined_df[flag_col] = combined_df[flag_col].map(binary_map).fillna(0)

# Columns that must be kept out of the scaled numeric feature list.
# NOTE(review): 'Urgent_Shipping' and 'Fragile_Equipment' are listed here but are
# not passed through binary_map above -- confirm their raw dtype is as intended.
binary_cols = [*binary_cols_to_map, 'Urgent_Shipping', 'Fragile_Equipment']

# Collapse rare categories into 'Other'. Frequencies are measured on the
# training rows only (the first len(X_train) rows of combined_df).
categorical_cols_to_group = ['Equipment_Type', 'Transport_Method', 'Hospital_Info']
n_train = len(X_train)
for cat_col in categorical_cols_to_group:
    if cat_col not in combined_df.columns:
        continue
    train_share = combined_df[cat_col].iloc[:n_train].value_counts(normalize=True)
    rare_levels = train_share[train_share < FREQ_THRESHOLD].index
    is_rare = combined_df[cat_col].isin(rare_levels)
    combined_df[cat_col] = np.where(is_rare, 'Other', combined_df[cat_col])


# Undo the train/test stacking: the first len(X_train) rows are the training set.
n_train_rows = len(X_train)
X_train_clean = combined_df.iloc[:n_train_rows]
X_test_clean = combined_df.iloc[n_train_rows:]

# Route columns by dtype. Numeric columns named in binary_cols are excluded from
# the numeric list, so they skip scaling and reach the model via the
# ColumnTransformer remainder.
all_numeric = X_train_clean.select_dtypes(include=np.number).columns
numeric_cols = [name for name in all_numeric if name not in binary_cols]
categorical_cols = list(X_train_clean.select_dtypes(include=['object']).columns)

# Numeric branch: median-impute, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the literal 'Missing', then one-hot encode.
# Categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

column_routes = [
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes, remainder='passthrough')





# Untuned decision tree with a fixed seed; the grid search below varies its
# depth and split threshold.
dt_base = DecisionTreeRegressor(random_state=42)

# Preprocessing and model are searched as one estimator.
tuning_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', dt_base),
])

# Candidate hyper-parameters (6 depths x 2 split thresholds = 12 combinations).
param_grid = {
    'regressor__max_depth': [5, 10, 15, 20, 25, 30],
    'regressor__min_samples_split': [2, 5],
}

# Shuffled 3-fold cross-validation with a fixed seed.
cv = KFold(n_splits=3, shuffle=True, random_state=42)

# Exhaustive search scored by (negated) RMSE on the log-transformed target.
grid_search = GridSearchCV(
    estimator=tuning_pipeline,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=cv,
    n_jobs=-1,
    verbose=2,
)

# Run the search on the cleaned training features and transformed target.
grid_search.fit(X_train_clean, y_train_transformed)


# Summarise the search. The scorer is a negated RMSE, so the sign is flipped back.
best_params_found = grid_search.best_params_
best_score = -grid_search.best_score_
print(f"Best Cross Validation RMSE on log-transformed target: {best_score:.4f}")

# Strip the pipeline step prefix so the keys match DecisionTreeRegressor's
# keyword arguments, then pin the seed.
final_dt_params = {}
for param_name, param_value in best_params_found.items():
    final_dt_params[param_name.replace('regressor__', '')] = param_value
final_dt_params['random_state'] = 42

print("Best Decision Tree parameters for final training:", final_dt_params)



# Rebuild the tree with the tuned hyper-parameters and train it, together with
# the shared preprocessor, on the full training set. The explicit fit is kept
# (rather than reusing the search's refit estimator) so that the module-level
# `preprocessor` object is itself left in a fitted state.
dt_best_model = DecisionTreeRegressor(**final_dt_params)
final_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', dt_best_model)
])
final_pipeline.fit(X_train_clean, y_train_transformed)

# Predict on the log scale, then invert the log transform.
predictions_log = final_pipeline.predict(X_test_clean)
predictions_original_scale = np.exp(predictions_log)

# The training target was clipped at CLIP_FLOOR before the log, so CLIP_FLOOR is
# the smallest value the inverse transform should yield. The previous floor of 0
# could never trigger because exp() is strictly positive.
predictions_original_scale = np.maximum(predictions_original_scale, CLIP_FLOOR)

# IDs and predictions are paired purely by position, so verify that the saved
# test IDs line up row-for-row with the frame that was actually predicted on.
assert pd.Index(test_ids).equals(X_test_clean.index), \
    "test_ids are not aligned with the rows of X_test_clean"

submission = pd.DataFrame({
    'Hospital_Id': test_ids,
    'Transport_Cost': predictions_original_scale
})

# Write the submission file without the positional index column.
submission.to_csv("submission.csv", index=False)


Fitting 3 folds for each of 12 candidates, totalling 36 fits
Best Cross Validation RMSE on log-transformed target: 2.1779
Best Decision Tree parameters for final training: {'max_depth': 5, 'min_samples_split': 2, 'random_state': 42}
[CV] END regressor__max_depth=5, regressor__min_samples_split=2; total time=   0.1s
[CV] END regressor__max_depth=5, regressor__min_samples_split=5; total time=   0.1s
[CV] END regressor__max_depth=10, regressor__min_samples_split=2; total time=   0.1s
[CV] END regressor__max_depth=10, regressor__min_samples_split=5; total time=   0.1s
[CV] END regressor__max_depth=15, regressor__min_samples_split=2; total time=   0.1s
[CV] END regressor__max_depth=15, regressor__min_samples_split=5; total time=   0.1s
[CV] END regressor__max_depth=20, regressor__min_samples_split=2; total time=   0.1s
[CV] END regressor__max_depth=25, regressor__min_samples_split=2; total time=   0.1s
[CV] END regressor__max_depth=25, regressor__min_samples_split=5; total time=   0.1s
[CV]