In [1]:
%pip install catboost ipywidgets nbformat

Note: you may need to restart the kernel to use updated packages.


In [2]:
import json
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, Pool, EFeaturesSelectionAlgorithm, EShapCalcType
from sklearn.model_selection import train_test_split

In [3]:
# Name of the column used as the regression label throughout the notebook.
TARGET = "metastatic_diagnosis_period"

# Read the preprocessed training table and preview its first five rows.
train_df = pd.read_csv(filepath_or_buffer="data/train_preprocessed.csv")
train_df.head(n=5)

Unnamed: 0,patient_id,patient_race,payer_type,patient_state,patient_zip3,Region,Division,patient_age,patient_gender,bmi,...,dummy_metastatic_cancer_diagnosis_code_31,dummy_metastatic_cancer_diagnosis_code_40,dummy_metastatic_cancer_diagnosis_code_21,dummy_metastatic_cancer_diagnosis_code_25,dummy_metastatic_cancer_diagnosis_code_13,dummy_metastatic_first_novel_treatment_2,dummy_metastatic_first_novel_treatment_0,dummy_metastatic_first_novel_treatment_1,dummy_metastatic_first_novel_treatment_type_1,dummy_metastatic_first_novel_treatment_type_0
0,268700,5,0,2,724,2,7,39,0,29.161171,...,0,0,0,0,0,1,0,0,1,0
1,484983,4,3,13,629,0,0,55,0,35.36,...,0,0,0,0,0,1,0,0,1,0
2,277055,5,0,4,925,3,4,59,0,29.161171,...,0,0,0,0,0,1,0,0,1,0
3,320055,2,1,4,900,3,4,59,0,29.161171,...,0,0,0,0,0,1,0,0,1,0
4,190386,5,0,4,934,3,4,71,0,29.161171,...,0,0,0,0,0,1,0,0,1,0


In [4]:
# Dimensions of the training frame as (rows, columns); the target column is included.
train_df.shape

(13173, 365)

In [5]:
# Count of distinct values taken by the target column in the training data.
train_df[TARGET].nunique()

366

In [6]:
# Split the rows 50/50 with a fixed seed: one half fits the model, the other
# half is the labelled evaluation set used during feature selection.
X_train, X_test, y_train, y_test = train_test_split(
    train_df.drop(columns=TARGET),
    train_df[TARGET],
    random_state=777,
    test_size=0.5,
)

# Wrap each half (features + label) in a CatBoost Pool.
train_pool = Pool(data=X_train, label=y_train)
test_pool = Pool(data=X_test, label=y_test)

In [8]:
# RMSE regressor used as the base learner for the feature-selection run.
model = CatBoostRegressor(loss_function="RMSE", depth=6, iterations=1000)

# Recursive SHAP-based elimination: every column of X_train (addressed by its
# positional index) is a candidate, and the run narrows them down to 100
# features, scoring against the held-out pool. With train_final_model=True
# the call also leaves `model` fitted, which the prediction cell relies on.
summary = model.select_features(
    train_pool,
    eval_set=test_pool,
    features_for_select=[*range(len(X_train.columns))],
    num_features_to_select=100,
    algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues,
    shap_calc_type=EShapCalcType.Regular,
    steps=10,
    train_final_model=True,
    plot=True,
    logging_level="Silent",
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [9]:
# Names of the features that survived the selection run.
selected_features = summary['selected_features_names']

# Write the list to disk inside a context manager so the file handle is
# flushed and closed deterministically instead of being left to the GC.
with open('selected_features.json', 'w') as features_file:
    json.dump(selected_features, features_file)

In [10]:
# Load the preprocessed, unlabelled test table.
test_df = pd.read_csv("data/test_preprocessed.csv")

# Use a dedicated name for the unlabelled pool: `test_pool` already holds the
# labelled evaluation split that the feature-selection cell passes as
# `eval_set`, and rebinding it here would make a re-run of that cell pick up
# the wrong (label-less) data.
submission_pool = Pool(test_df)

predictions = model.predict(submission_pool)

# Clamp negative predictions to 0, round to the nearest whole number and
# store the result as unsigned 16-bit integers.
preds = np.around(np.clip(predictions, a_min=0, a_max=None), 0).astype(np.uint16)

# Two-column submission frame: patient identifier and the predicted target.
submission = pd.DataFrame({"patient_id": test_df["patient_id"], TARGET: preds})

In [12]:
# Write the submission table to CSV without the DataFrame index column.
submission.to_csv(path_or_buf="submission.csv", index=False)