In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the input directory,
# then print the paths one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/Patient-Recovery-Prediction-Challenge/sample_submission.csv
/kaggle/input/Patient-Recovery-Prediction-Challenge/train.csv
/kaggle/input/Patient-Recovery-Prediction-Challenge/test.csv


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor


In [4]:
# Load the training data
train_df = pd.read_csv("/kaggle/input/Patient-Recovery-Prediction-Challenge/train.csv")

# Preprocess in one chained step:
#   1. drop the "Id" column,
#   2. add "Lifestyle Activities_Yes" as a 0/1 flag using a vectorized
#      comparison instead of a row-wise apply; only the exact string "Yes"
#      becomes 1, every other value (including missing ones) becomes 0,
#   3. drop the original text column.
train_df = (
    train_df
    .drop(columns=["Id"])
    .assign(**{
        "Lifestyle Activities_Yes":
            lambda df: df["Lifestyle Activities"].eq("Yes").astype("int64")
    })
    .drop(columns=["Lifestyle Activities"])
)

# Separate the target ("Recovery Index") from the feature columns
y = train_df["Recovery Index"]
X = train_df.drop(columns=["Recovery Index"])


In [5]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [6]:
# Hand-picked settings for the baseline (untuned) regressor
baseline_params = {
    "n_estimators": 200,               # number of trees
    "max_depth": 4,
    "learning_rate": 0.1,
    "objective": "reg:squarederror",   # for regression problems
    "random_state": 42,
}

xgb_model = XGBRegressor(**baseline_params)


In [7]:
# Fit the baseline on the training split, then predict the held-out rows
y_pred = xgb_model.fit(X_train, y_train).predict(X_val)

# Report the mean squared error on the validation split
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)
print("Validation MSE:", mse)


Validation MSE: 4.300182631863591


In [8]:
# Candidate hyperparameter values; the search tries every combination
# (3 * 4 * 3 * 2 * 2 = 144 candidates)
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [3, 4, 5, 6],
    "learning_rate": [0.01, 0.05, 0.1],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0]
}

# Base regressor handed to the search; the grid supplies the remaining settings
search_estimator = XGBRegressor(objective="reg:squarederror", random_state=42)

# 3-fold cross-validated search scored by negative MSE
grid_search = GridSearchCV(
    search_estimator,
    param_grid,
    scoring="neg_mean_squared_error",
    cv=3,
    verbose=1,
)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)


Fitting 3 folds for each of 144 candidates, totalling 432 fits
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}


In [9]:
# Take the best estimator found by the grid search
best_model = grid_search.best_estimator_

# Score it on the same held-out validation rows used for the baseline
y_pred = best_model.predict(X_val)
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)
print("Best Model Validation MSE:", mse)


Best Model Validation MSE: 4.230401591620511


In [10]:
# Load the test data
test_df = pd.read_csv("/kaggle/input/Patient-Recovery-Prediction-Challenge/test.csv")

# Keep the IDs for the submission file before dropping them from the features
ids = test_df["Id"]

# Apply the same preprocessing as for the training data: drop "Id" and replace
# the Yes/No text column with a 0/1 flag. The comparison is vectorized; only
# the exact string "Yes" becomes 1, every other value (including missing ones)
# becomes 0.
test_df = (
    test_df
    .drop(columns=["Id"])
    .assign(**{
        "Lifestyle Activities_Yes":
            lambda df: df["Lifestyle Activities"].eq("Yes").astype("int64")
    })
    .drop(columns=["Lifestyle Activities"])
)

# Select the features in the exact column order used for training so the model
# never sees misordered inputs; raises KeyError if a training feature is
# missing from the test file.
test_df = test_df[X.columns]

# Predict with the tuned model
test_preds = best_model.predict(test_df)

# Write the submission file (one row per test Id)
submission = pd.DataFrame({"Id": ids, "Recovery Index": test_preds})
submission.to_csv("submission_xgboost.csv", index=False)

print("✅ submission_xgboost.csv created successfully!")


✅ submission_xgboost.csv created successfully!
