In [1]:
%pip install catboost ipywidgets nbformat optuna optuna-integration

Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import pandas as pd
import optuna
from optuna.integration import CatBoostPruningCallback
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split

In [3]:
# Column holding the regression label.
TARGET = "metastatic_diagnosis_period"

# Read the already-preprocessed training table and preview its first rows.
train_df = pd.read_csv(filepath_or_buffer="data/train_preprocessed.csv")
train_df.head(n=5)

Unnamed: 0,patient_id,patient_race,payer_type,patient_state,patient_zip3,Region,Division,patient_age,patient_gender,bmi,...,Average of Apr-18,Average of May-18,Average of Jun-18,Average of Jul-18,Average of Aug-18,Average of Sep-18,Average of Oct-18,Average of Nov-18,Average of Dec-18,metastatic_diagnosis_period
0,268700,5,0,2,724,2,7,39,0,29.161171,...,52.55,74.77,79.96,81.69,78.3,74.56,59.98,42.98,41.18,191.0
1,484983,4,3,13,629,0,0,55,0,35.36,...,49.3,72.87,77.4,77.43,75.83,72.64,58.36,39.68,39.71,33.0
2,277055,5,0,4,925,3,4,59,0,29.161171,...,68.5,70.31,78.61,87.24,85.52,80.75,70.81,62.67,55.58,157.0
3,320055,2,1,4,900,3,4,59,0,29.161171,...,63.34,63.1,67.45,75.86,75.24,71.1,68.95,65.46,59.46,146.0
4,190386,5,0,4,934,3,4,71,0,29.161171,...,59.45,60.24,64.77,69.81,70.13,68.1,65.38,60.72,54.08,286.0


In [4]:
# (rows, columns) of the training table, label column included.
n_rows, n_cols = train_df.shape
(n_rows, n_cols)

(13173, 152)

In [5]:
# Number of distinct non-missing label values.
target_column = train_df[TARGET]
target_column.nunique(dropna=True)

366

In [6]:
# Separate the label from the feature columns, then hold out 30% of the rows
# (fixed seed) as the evaluation set used for early stopping and scoring.
features = train_df.drop(columns=[TARGET])
target = train_df[TARGET]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=777,
)

# Wrap both partitions in CatBoost Pools so they can be reused across fits.
train_pool = Pool(data=X_train, label=y_train)
eval_pool = Pool(data=X_test, label=y_test)

In [7]:
def objective(trial):
    """Fit one CatBoost candidate suggested by Optuna and return its hold-out RMSE.

    Args:
        trial: The ``optuna.Trial`` used to sample hyperparameters and to
            report intermediate RMSE values for pruning.

    Returns:
        RMSE of the fitted model's predictions on ``X_test`` against ``y_test``.

    Raises:
        optuna.TrialPruned: If the pruning callback stopped training early.
    """
    params = {
        "iterations": trial.suggest_int("iterations", 100, 2000),
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1, log=True),
        "depth": trial.suggest_int("depth", 5, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-2, 1, log=True),
        "random_strength": trial.suggest_float("random_strength", 1e-8, 10.0, log=True),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 10.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
        "eval_metric": "RMSE",
    }
    # Keep a handle on the callback: its pruning decision has to be read back
    # after training finishes.
    pruning_callback = CatBoostPruningCallback(trial, "RMSE")
    model = CatBoostRegressor(**params, silent=True)
    model.fit(
        train_pool,
        eval_set=eval_pool,
        use_best_model=True,
        early_stopping_rounds=300,
        callbacks=[pruning_callback],
    )
    # The callback can only stop CatBoost's training loop; it cannot raise from
    # inside it. Without this call a pruned trial would be recorded as a normal
    # completed trial scored on a truncated model.
    pruning_callback.check_pruned()

    predictions = model.predict(X_test)
    return root_mean_squared_error(y_test, predictions)

In [8]:
# Seed the sampler (TPE is Optuna's default) so that a fresh kernel reproduces
# the same sequence of suggested hyperparameters; 777 matches the split seed.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=777),
)
study.optimize(objective, n_trials=100)

[I 2024-04-21 16:22:33,454] A new study created in memory with name: no-name-ccf3d93b-0144-430d-a7df-74932509a8be
  callbacks=[CatBoostPruningCallback(trial, "RMSE")],
[I 2024-04-21 16:22:35,286] Trial 0 finished with value: 82.8368947589394 and parameters: {'iterations': 822, 'learning_rate': 0.07580342191541896, 'depth': 6, 'l2_leaf_reg': 0.17547750928129421, 'random_strength': 5.939128600476054, 'subsample': 0.37686761520469114, 'bagging_temperature': 1.2987081611872264, 'colsample_bylevel': 0.177359210503135, 'min_data_in_leaf': 95}. Best is trial 0 with value: 82.8368947589394.
  callbacks=[CatBoostPruningCallback(trial, "RMSE")],
[I 2024-04-21 16:22:42,301] Trial 1 finished with value: 83.77102057350179 and parameters: {'iterations': 753, 'learning_rate': 0.2579603148702219, 'depth': 10, 'l2_leaf_reg': 0.4207959840144661, 'random_strength': 2.488577766018212, 'subsample': 0.8370141570238188, 'bagging_temperature': 3.04562421931767, 'colsample_bylevel': 0.5211081287897354, 'min_da

In [9]:
test_df = pd.read_csv("data/test_preprocessed.csv")
test_pool = Pool(test_df)

# Refit the winning configuration with the same metric and early-stopping rule
# that were in force during tuning, so the submitted model follows the same
# training procedure as the trial behind study.best_value.
model = CatBoostRegressor(**study.best_params, eval_metric="RMSE", silent=True)
model.fit(
    train_pool,
    eval_set=eval_pool,
    use_best_model=True,
    early_stopping_rounds=300,
)
print(f"Best parameters: {study.best_params}")
print(f"Best value: {study.best_value}")

predictions = model.predict(test_pool)
# Bound predictions to [0, largest target seen in training] before rounding.
# An upper bound of infinity would let an out-of-range prediction through and
# could wrap around in the unsigned 16-bit cast.
target_upper_bound = train_df[TARGET].max()
preds = np.around(np.clip(predictions, a_min=0, a_max=target_upper_bound), 0).astype(np.uint16)
submission = pd.DataFrame({"patient_id": test_df["patient_id"], TARGET: preds})
submission.to_csv("submission.csv", index=False)

Best parameters: {'iterations': 582, 'learning_rate': 0.5516634028471802, 'depth': 5, 'l2_leaf_reg': 0.40122372808741513, 'random_strength': 3.482612563612328e-06, 'subsample': 0.39965647309969143, 'bagging_temperature': 8.194105506908897, 'colsample_bylevel': 0.9557236357201065, 'min_data_in_leaf': 41}
Best value: 82.52398595775345
