In [2]:
import shutil
import os

# Show which datasets are mounted under the Kaggle input directory.
print(os.listdir('/kaggle/input/'))

# Source dataset directory and the destination the files are copied to.
path = "/kaggle/input/insurance-dbs"
dst_path = "/kaggle/working"

# Copy every entry of the dataset directory into the working directory under
# the same file name.
# NOTE(review): presumably these are Optuna sqlite study files that a later
# cell resumes from the working directory -- confirm.
for file in os.listdir(path):
    database = os.path.join(path, file)
    shutil.copyfile(database, os.path.join(dst_path, file))

['insurance-premium-prediction', 'autofe-feature-importance', 'insurance-dbs', 'playground-series-s4e12']


In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import sys
import pandas as pd
from functools import reduce
from lifelines.utils import concordance_index
from ktools.metrics.stratified_concordance_index import stratified_concordance_index
import numpy as np # linear algebra


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory


import os
# Walk the input directory recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/insurance-premium-prediction/README.md
/kaggle/input/insurance-premium-prediction/Insurance Premium Prediction Dataset.csv
/kaggle/input/autofe-feature-importance/robust_feature_importance.csv
/kaggle/input/autofe-feature-importance/combined_df.csv
/kaggle/input/insurance-dbs/cat_lg_with_original.db
/kaggle/input/insurance-dbs/cat_lossguide.db
/kaggle/input/insurance-dbs/cat_gpu.db
/kaggle/input/insurance-dbs/cat_depthwise.db
/kaggle/input/insurance-dbs/cat_symtree.db
/kaggle/input/playground-series-s4e12/sample_submission.csv
/kaggle/input/playground-series-s4e12/train.csv
/kaggle/input/playground-series-s4e12/test.csv


In [None]:
import pandas as pd
import pandas.api.types
import numpy as np
from typing import Union
from lifelines.utils import concordance_index


def stratified_concordance_index(solution : pd.DataFrame, 
                                 predictions : Union[pd.Series, np.ndarray], 
                                 event_binary_col_name : str,
                                 duration_col_name : str,
                                 group_col_name : str) -> float:
    """
    Compute a group-stratified concordance index.

    The concordance index is evaluated separately inside every group of
    ``group_col_name`` and the result is ``mean(scores) - std(scores)`` over
    the per-group scores.

    Parameters
    ----------
    solution : DataFrame holding the duration, event and group columns.
        It is NOT modified; the function works on a copy.
    predictions : risk scores, one per row of ``solution``. They are negated
        before being handed to ``concordance_index``. A Series is aligned on
        ``solution``'s index, an array is assigned positionally.
    event_binary_col_name : name of the event indicator column.
    duration_col_name : name of the duration column.
    group_col_name : name of the column defining the strata.

    Returns
    -------
    float : mean of the per-group scores minus their (population) standard
        deviation.
    """

    # Copy first: the previous implementation added a 'predictions' column and
    # an 'index' column to the caller's frame, which made a second call on the
    # same frame fail.
    scored = solution.copy()
    scored['predictions'] = predictions
    scored = scored.reset_index(drop=True)

    metric_list = []
    # Grouping by the bare column name (not a one-element list) yields the
    # same partition; rows inside each group keep their original order.
    for _, group_df in scored.groupby(group_col_name):
        c_index_group = concordance_index(
                        group_df[duration_col_name],
                        -group_df['predictions'],
                        group_df[event_binary_col_name])
        metric_list.append(c_index_group)

    return float(np.mean(metric_list) - np.std(metric_list))

In [5]:
from functools import reduce
from typing import Any, Dict, List, Tuple, Callable, Union
import numpy as np
import pandas as pd
from copy import deepcopy



class CrossValidationExecutor:
    """
    Run k-fold cross-validation for a model exposing
    ``fit(X, y, validation_set=...)`` and ``predict(X)``.

    Parameters
    ----------
    sklearn_model_instance : model prototype; it is deep-copied for each fold.
    evaluation_metric : callable ``metric(y_true, y_pred)``.
    kfold_object : splitter exposing ``get_n_splits()`` and ``split(X, y)``.
    training_features : columns fed to the model; ``None`` means all columns
        of ``X``.
    use_test_as_valid : when true, the held-out fold is also handed to
        ``fit`` as ``validation_set``.
    num_classes : stored but not used by ``run``.
    verbose : ``> 0`` prints the summary, ``> 1`` also prints per-fold scores.
    """

    def __init__(self,
                 sklearn_model_instance,
                 evaluation_metric : Callable,
                 kfold_object,
                 training_features : Union[List[str], None] = None,
                 use_test_as_valid = True,
                 num_classes = None,
                 verbose=1) -> None:
        
        self.model = sklearn_model_instance
        self._evaluation_metric = evaluation_metric
        self._kf = kfold_object
        self._num_splits = kfold_object.get_n_splits()
        self._training_features = training_features
        self._use_test_as_valid = use_test_as_valid
        self._num_classes = num_classes
        self._verbose = verbose

    @staticmethod
    def _assert_same_target_schema(y, y_add) -> None:
        """Check that the extra target matches ``y`` (DataFrame or Series)."""
        if isinstance(y, pd.DataFrame):
            pd.testing.assert_index_equal(y.columns, y_add.columns, check_exact=True)
            pd.testing.assert_series_equal(y.dtypes, y_add.dtypes, check_exact=True)
        elif not isinstance(y_add, pd.Series) or y.dtype != y_add.dtype:
            raise AssertionError("additional target must be a Series with the same dtype as y")

    def run(self, X : pd.DataFrame, y : Union[pd.DataFrame, pd.Series], additional_data=None, local_transform_list=[lambda x : x], output_transform_list=[lambda x : x[-1]]) -> Tuple[Tuple[float], np.ndarray, List[Any]]:
        """
        Execute the cross-validation loop.

        Parameters
        ----------
        X, y : features and target. Both are re-indexed positionally on a
            local copy; the caller's objects are left untouched.
        additional_data : optional ``(X_add, y_add)`` appended to the training
            part of every fold (never to the validation part). ``X_add`` must
            have the same columns and dtypes as ``X``.
        local_transform_list : callables applied in order to the
            ``(X_train, y_train)`` tuple of each fold.
        output_transform_list : callables applied in order to the
            ``(X_full_test, y_pred)`` tuple; the final result is what the
            metric receives.

        Returns
        -------
        ``((oof_score, mean_cv_score), oof_predictions, model_list)`` where
        ``oof_predictions`` holds the raw model predictions.
        """

        # Positional index is required because the splitter yields positions
        # that are used with .loc below. Rebinding (instead of inplace=True)
        # keeps the caller's frames unchanged.
        X = X.reset_index(drop=True)
        y = y.reset_index(drop=True)
        training_features = X.columns.tolist() if self._training_features is None else self._training_features

        if additional_data is not None:
            X_add, y_add = additional_data
            pd.testing.assert_index_equal(X.columns, X_add.columns, check_exact=True)
            pd.testing.assert_series_equal(X.dtypes, X_add.dtypes, check_exact=True)
            self._assert_same_target_schema(y, y_add)
            # Fold training frames only carry `training_features`; concatenating
            # the full X_add would introduce NaN-filled extra columns.
            X_add = X_add[training_features]

        cv_results = []
        model_list = []
        # Both buffers are allocated lazily, once the prediction shape is known.
        oof_predictions = None
        metric_predictions = None

        for i, (train_index, val_index) in enumerate(self._kf.split(X, y)):
            
            X_full_test = X.loc[val_index, :]
            X_train, X_test = X.loc[train_index, training_features], X.loc[val_index, training_features]
            y_train, y_test = y.loc[train_index], y.loc[val_index]

            if additional_data is not None:
                X_train = pd.concat([X_train, X_add], axis=0)
                y_train = pd.concat([y_train, y_add], axis=0)

            X_train, y_train = reduce(lambda acc, func: func(acc), local_transform_list, (X_train, y_train))
            validation_set = None
            if self._use_test_as_valid:
                validation_set = [X_test, y_test]

            model = deepcopy(self.model).fit(X_train, y_train, validation_set=validation_set)
            model_list += [model]
            y_pred = model.predict(X_test)
            y_pred_processed = reduce(lambda acc, func: func(acc), output_transform_list, (X_full_test.copy(), y_pred))
            
            # The metric gets a copy so it cannot alter the stored predictions.
            cv_results += [self._evaluation_metric(y_test, deepcopy(y_pred_processed))]

            if oof_predictions is None:
                oof_shape = (y.shape[0],) if len(y_pred.shape) == 1 else (y.shape[0], y_pred.shape[-1])
                oof_predictions = np.zeros(oof_shape)
            if metric_predictions is None:
                y_hat_shape = (y.shape[0],) if len(y_pred_processed.shape) == 1 else (y.shape[0], y_pred_processed.shape[-1])
                metric_predictions = np.zeros(y_hat_shape)

            oof_predictions[val_index] = y_pred
            metric_predictions[val_index] = y_pred_processed

            if self._verbose > 1:
                print(f"The CV results of the current fold is {cv_results[-1]}")

        oof_score = self._evaluation_metric(y, metric_predictions)
        mean_cv_score = np.mean(cv_results)
        score_tuple = (oof_score, mean_cv_score)

        if self._verbose > 0:
            print("#"*100)
            print("OOF prediction score : ", oof_score)
            print(f"Mean {self._num_splits}-cv results : {mean_cv_score} +- {np.std(cv_results)}")
            print("#"*100)

        return score_tuple, oof_predictions, model_list

In [None]:
from typing import List
import pandas as pd
import lightgbm as lgb
import sys
# from lightgbm import early_stopping, log_evaluation
from sklearn.model_selection import train_test_split



class LGBMModel():
    """
    Minimal fit/predict wrapper around ``lgb.train``.

    ``verbose``, ``early_stopping_rounds``, ``random_state`` and ``n_jobs`` are
    placed in the parameter dict handed to LightGBM; any extra keyword
    arguments are merged into that dict afterwards and win on key clashes.
    """

    def __init__(self,
                 num_boost_round=100,
                 early_stopping_rounds=20,
                 random_state=129,
                 verbose=-1,
                 n_jobs=1,
                 **lgb_param_grid,) -> None:
        super().__init__()
        self._num_boost_round = num_boost_round

        booster_params = {"verbose" : verbose,
                          "early_stopping_rounds" : early_stopping_rounds,
                          "random_state" : random_state,
                          "n_jobs" : n_jobs}
        booster_params.update(lgb_param_grid)
        self._lgb_param_grid = booster_params

        # No callbacks are registered; the list is still passed to lgb.train.
        self._callbacks = []
        self._random_state = random_state

    def fit(self, X, y, validation_set = None, val_size=0.05):
        """
        Train the booster and return ``self``.

        When ``validation_set`` (an ``(X_valid, y_valid)`` pair) is omitted, a
        ``val_size`` fraction of ``X``/``y`` is split off with
        ``train_test_split`` and used as the validation data.
        """
        if validation_set is not None:
            X_train, y_train = X, y
            X_valid, y_valid = validation_set
        else:
            X_train, X_valid, y_train, y_valid = train_test_split(
                X,
                y,
                test_size=val_size,
                random_state=self._random_state,
            )

        train_data = lgb.Dataset(X_train, label=y_train)
        val_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

        self.model = lgb.train(
            self._lgb_param_grid,
            train_data,
            num_boost_round=self._num_boost_round,
            valid_sets=[train_data, val_data],
            valid_names=['train', 'valid'],
            callbacks=self._callbacks,
        )
        return self

    def predict(self, X):
        """Return the trained booster's predictions for ``X``."""
        return self.model.predict(X)

    
import optuna
import joblib
import pandas as pd
import numpy as np
from optuna.samplers import TPESampler
from typing import *


class OptunaHyperparameterOptimizer():
    """
    Tune a model's hyper-parameters with Optuna's TPE sampler.

    Every trial builds ``model(**params)`` from the parameters produced by
    ``param_grid_getter.get(trial)``, cross-validates it with
    ``CrossValidationExecutor`` and reports the first element of the returned
    score tuple as the trial value. The study is stored in a sqlite file named
    ``<study_name>.db`` and is reloaded when it already exists.
    """

    def __init__(self,
                 X_train : pd.DataFrame,
                 y_train : pd.DataFrame,
                 model,
                 param_grid_getter,
                 kfold_object,
                 metric : callable,
                 training_features : Union[List[str], None] = None,
                 direction : str = 'maximize',
                 n_trials : int = 100,
                 study_name : str = "ml_experiment",
                 explore_fraction : float = 0.1,
                 cross_validation_run_kwargs = {},
                 verbose=False,
                 random_state=42
                 ) -> None:
        
        super().__init__()
        # Data and model factory
        self._X_train = X_train
        self._y_train = y_train
        self.model = model
        # Evaluation setup
        self._metric = metric
        self._training_features = training_features
        self._param_grid_getter = param_grid_getter
        self._kfold_object = kfold_object
        self._cross_validation_run_kwargs = cross_validation_run_kwargs
        # Study configuration
        self._direction = direction
        self._n_trials = n_trials
        self._study_name = study_name
        self._explore_fraction = explore_fraction
        self._verbose = verbose
        self._random_state = random_state

    def optimize(self, 
                 inital_parameters : Dict[str, float] = None,
                 initial_distribution : Dict[str, Any] = None,
                 timeout : int = 3600
                 ):
        """
        Run the study and return its best parameters.

        inital_parameters / initial_distribution : optional starting point.
            When given, the parameters are evaluated once through a
            ``FixedTrial`` and added to the study before optimisation starts.
        timeout : passed to ``study.optimize``.
        """
        if self._verbose:
            banner = "#"*100
            print(banner)
            print("Starting Optuna Optimizer")
            print(banner)

        # A fraction of the trial budget is used as TPE start-up trials.
        n_startup = int(self._n_trials*self._explore_fraction)
        tpe_sampler = TPESampler(n_startup_trials=n_startup,
                                 seed=self._random_state)

        study = optuna.create_study(sampler=tpe_sampler,
                                    study_name=self._study_name, 
                                    direction=self._direction,
                                    storage="sqlite:///{}.db".format(self._study_name),
                                    load_if_exists=True)
        
        if inital_parameters is not None:
            seed_value = self._objective(optuna.trial.FixedTrial(inital_parameters))
            seed_trial = optuna.create_trial(params=inital_parameters,
                                             distributions=initial_distribution,
                                             value=seed_value)
            study.add_trial(seed_trial)

        study.optimize(self._objective, n_trials=self._n_trials, timeout=timeout)
        return study.best_params
    
    def _objective(self, trial : optuna.Trial):
        """Cross-validate one parameter set and return the first CV score."""
        candidate_params = self._param_grid_getter.get(trial)

        executor = CrossValidationExecutor(self.model(**candidate_params),
                                           self._metric,
                                           self._kfold_object,
                                           training_features=self._training_features,
                                           use_test_as_valid=True)
        score_tuple, _oof, _models = executor.run(self._X_train,
                                                  self._y_train,
                                                  **self._cross_validation_run_kwargs)
        return score_tuple[0]

    

from typing import List
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb




class XGBoostModel():
    """
    Minimal fit/predict wrapper around ``xgb.train``.

    ``verbosity``, ``random_state`` and ``n_jobs`` are placed in the parameter
    dict handed to XGBoost; extra keyword arguments are merged in afterwards
    and win on key clashes.
    """

    def __init__(self,
                 eval_verbosity=False,
                 num_boost_round=100,
                 early_stopping_rounds=20,
                 random_state=129,
                 verbosity=0,
                 n_jobs=-1,
                 **xgb_param_grid) -> None:
        super().__init__()
        self._eval_verbosity = eval_verbosity
        self._num_boost_round = num_boost_round
        self._early_stopping_rounds = early_stopping_rounds

        booster_params = {"verbosity" : verbosity,
                          "random_state" : random_state,
                          "n_jobs" : n_jobs}
        booster_params.update(xgb_param_grid)
        self._xgb_param_grid = booster_params

        self._random_state = random_state

    def fit(self, X, y, validation_set = None, val_size=0.05):
        """
        Train the booster and return ``self``.

        When ``validation_set`` (an ``(X_valid, y_valid)`` pair) is omitted, a
        ``val_size`` fraction of ``X``/``y`` is split off with
        ``train_test_split`` and used as the evaluation data.
        """
        if validation_set is not None:
            X_train, y_train = X, y
            X_valid, y_valid = validation_set
        else:
            X_train, X_valid, y_train, y_valid = train_test_split(
                X,
                y,
                test_size=val_size,
                random_state=self._random_state,
            )

        train_data = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
        valid_data = xgb.DMatrix(X_valid, label=y_valid, enable_categorical=True)
        # The validation matrix is listed last in `evals`.
        watchlist = [(train_data, 'train'), (valid_data, 'eval')]

        self.model = xgb.train(
            self._xgb_param_grid,
            train_data,
            evals=watchlist,
            early_stopping_rounds=self._early_stopping_rounds,
            num_boost_round=self._num_boost_round,
            verbose_eval=self._eval_verbosity,
        )
        return self

    def predict(self, X):
        """Wrap ``X`` in a DMatrix and return the booster's predictions."""
        return self.model.predict(xgb.DMatrix(X, enable_categorical=True))

    

    

import catboost as cat

from typing import List

import pandas as pd

from sklearn.model_selection import train_test_split

from catboost import CatBoostRegressor, Pool, train





class CatBoostModel():
    """
    Minimal fit/predict wrapper around ``cat.train``.

    ``random_state`` is forwarded as ``random_seed`` together with ``verbose``;
    extra keyword arguments are merged into the parameter dict afterwards and
    win on key clashes.
    """

    def __init__(self,
                 num_boost_round=100,
                 early_stopping_rounds=20,
                 random_state=129,
                 verbose=False,
                 **catboost_params) -> None:
        super().__init__()
        self._num_boost_round = num_boost_round
        self._stopping_rounds = early_stopping_rounds

        train_params = {"random_seed" : random_state,
                        "verbose" : verbose}
        train_params.update(catboost_params)
        self._catboost_params = train_params

        self._random_state = random_state

    def fit(self, X, y, validation_set = None, val_size=0.05):
        """
        Train the model and return ``self``.

        Columns of ``X`` with ``category`` dtype are remembered in
        ``cat_col_names`` and declared as categorical features for both the
        training and the validation pool. When ``validation_set`` is omitted,
        a ``val_size`` fraction of ``X``/``y`` is split off instead.
        """
        self.cat_col_names = [col_name for col_name in X.columns if X[col_name].dtype == 'category']

        if validation_set is not None:
            X_train, y_train = X, y
            X_valid, y_valid = validation_set
        else:
            X_train, X_valid, y_train, y_valid = train_test_split(
                X,
                y,
                test_size=val_size,
                random_state=self._random_state,
            )

        train_pool = Pool(data=X_train, label=y_train, cat_features=self.cat_col_names)
        val_pool = Pool(data=X_valid, label=y_valid, cat_features=self.cat_col_names)

        self.model = cat.train(
            params=self._catboost_params,
            dtrain=train_pool,
            eval_set=val_pool,
            num_boost_round=self._num_boost_round,
            early_stopping_rounds=self._stopping_rounds,
        )
        return self

    def predict(self, X):
        """Build a Pool with the categorical columns seen in ``fit`` and predict."""
        return self.model.predict(Pool(data=X, cat_features=self.cat_col_names))

    

from sklearn.preprocessing import MinMaxScaler, TargetEncoder
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor

    

class HGBModel():
    """
    Histogram gradient boosting on target-encoded categorical columns.

    Columns with ``category`` dtype are replaced by the output of a
    ``TargetEncoder`` and moved to the end of the frame before it reaches the
    estimator. A regressor is built when ``target_type`` is ``"continuous"``,
    a classifier otherwise.
    """

    def __init__(self,
                 smooth="auto", 
                 target_type="continuous",
                 num_boost_round=100,
                 early_stopping=True,
                 validation_fraction=0.05,
                 early_stopping_rounds=20,
                 verbose=0,
                 random_state=129,
                 **hgb_params) -> None:
        # Named arguments provide the defaults; explicit **hgb_params override them.
        estimator_params = {"max_iter" : num_boost_round,
                            "early_stopping" : early_stopping,
                            "validation_fraction" : validation_fraction,
                            "n_iter_no_change" : early_stopping_rounds,
                            "verbose" : verbose,
                            "random_state" : random_state,
                            "categorical_features" : "from_dtype"}
        estimator_params.update(hgb_params)
        
        self._target_enc = TargetEncoder(target_type=target_type, 
                                         smooth=smooth, 
                                         random_state=random_state)
        self._target_type = target_type
        estimator_cls = (HistGradientBoostingRegressor
                         if target_type == "continuous"
                         else HistGradientBoostingClassifier)
        self.model = estimator_cls(**estimator_params)

    @staticmethod
    def _categorical_columns(X):
        """Names of the columns of ``X`` whose dtype is ``category``."""
        return [col_name for col_name in X.columns if X[col_name].dtype == 'category']

    def fit(self, X, y, validation_set=None, **kwargs):
        """
        Fit the encoder and the estimator; return ``self``.

        ``validation_set`` and ``**kwargs`` are accepted but not used here.
        """
        cat_cols = self._categorical_columns(X)
        encoded = self._target_enc.fit_transform(X[cat_cols], y)
        # drop() returns a new frame, so the caller's X is not modified.
        X = X.drop(columns=cat_cols)
        X[cat_cols] = encoded
        self.model.fit(X, y)
        return self

    def predict(self, X):
        """
        Encode ``X`` with the fitted encoder and predict.

        Returns ``predict`` output for a continuous target and
        ``predict_proba`` output otherwise.
        """
        cat_cols = self._categorical_columns(X)
        encoded = self._target_enc.transform(X[cat_cols])
        X = X.drop(columns=cat_cols)
        X[cat_cols] = encoded
        if self._target_type != "continuous":
            return self.model.predict_proba(X)
        return self.model.predict(X)



from ydf import GradientBoostedTreesLearner, Task
from typing import List, Union
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


class YDFGBoostModel():
    """
    Minimal fit/predict wrapper around YDF's ``GradientBoostedTreesLearner``.

    YDF trains from a single frame that contains the label column, so the
    features and the target are merged into one frame (label stored under
    ``target_col``) before training.

    Attributes
    ----------
    model : the untrained learner until ``fit`` is called, the trained model
        returned by ``learner.train`` afterwards.
    """

    target_col = "target"

    def __init__(self,
                 num_boost_round : int = 100,
                 early_stopping_rounds : int = 20,
                 task : str = "REGRESSION",
                 categorical_algorithm : str = "RANDOM",
                 loss : str = "SQUARED_ERROR",
                 random_state : int = 42,
                 verbose : bool = False,
                 **model_kwargs
                 ) -> None:
        super().__init__()
        self._random_state = random_state
        self._verbose = verbose

        # Anything other than "classification" (case-insensitive) means regression.
        task = Task.CLASSIFICATION if task.upper() == "CLASSIFICATION" else Task.REGRESSION
        # The learner is kept separately so that fit() can be called again
        # after `model` has been replaced by a trained model.
        self._learner = GradientBoostedTreesLearner(label = self.target_col,
                                                    task = task,
                                                    categorical_algorithm = categorical_algorithm,
                                                    loss = loss,
                                                    early_stopping_num_trees_look_ahead = early_stopping_rounds,
                                                    num_trees = num_boost_round,
                                                    **model_kwargs)
        self.model = self._learner

    def _convert_back_to_dataset(self, X, y):
        """
        Return a copy of ``X`` with the target stored under ``target_col``.

        ``y`` may be a Series, a single-column DataFrame or an array; it is
        assigned positionally. ``X`` itself is not modified, so frames owned
        by the caller (e.g. a validation fold later used for prediction) do
        not gain a label column.
        """
        labels = np.asarray(y)
        if labels.ndim == 2 and labels.shape[1] == 1:
            labels = labels[:, 0]
        dataset = X.copy()
        dataset[self.target_col] = labels
        return dataset
    
    def fit(self, X : pd.DataFrame, y : Union[pd.DataFrame, pd.Series, np.ndarray], 
            validation_set = None, val_size=0.05):
        """
        Train the learner and return ``self``.

        When ``validation_set`` (an ``(X_valid, y_valid)`` pair) is omitted, a
        ``val_size`` fraction of ``X``/``y`` is split off with
        ``train_test_split`` and used as the validation data.
        """
        
        if validation_set is None:
            X_train, X_valid, y_train, y_valid = train_test_split(X, 
                                                                  y, 
                                                                  test_size=val_size, 
                                                                  random_state=self._random_state)
        else:
            X_train, y_train = X, y
            X_valid, y_valid = validation_set
        
        train_df = self._convert_back_to_dataset(X_train, y_train)
        valid_df = self._convert_back_to_dataset(X_valid, y_valid)

        self.model = self._learner.train(train_df, valid=valid_df, verbose=self._verbose)
        return self

    def predict(self, X):
        """Return the trained model's predictions for ``X``."""
        y_pred = self.model.predict(X)
        return y_pred

In [6]:
from dataclasses import dataclass
from typing import *
import pandas as pd


def func(x):
    """Identity function; default for ``original_csv_processing``."""
    return x


@dataclass
class DataSciencePipelineSettings:
    """
    Load the train/test CSV files (plus an optional "original" dataset) and
    keep them together with basic column metadata.

    After construction the instance exposes ``train_df``, ``test_df`` and
    ``combined_df`` (train and test stacked under the outer index keys
    ``'train'`` and ``'test'``).

    Fields
    ------
    train_csv_path, test_csv_path : CSV files to load.
    target_col_name : target column, excluded from the inferred feature list.
    original_csv_path : optional extra CSV appended to the training data. When
        given, a ``source`` column is added (0 for train/test rows, 1 for the
        extra rows).
    original_csv_processing : callable applied to the extra frame before it is
        checked against the training frame.
    training_col_names, categorical_col_names : used as given when provided;
        inferred from the loaded training frame when left as ``None``.
    sample_submission_path, training_data_percentage,
    category_occurrence_threshold, logged : stored only; not used in this class.
    """
    train_csv_path : str
    test_csv_path : str
    target_col_name : str
    original_csv_path : str = None
    original_csv_processing : callable = func
    sample_submission_path : str = None
    training_col_names : List[str] = None
    categorical_col_names : List[str] = None
    training_data_percentage : float = 0.8
    category_occurrence_threshold : int = 300
    logged : bool = False

    def __post_init__(self):
        self.train_df, self.test_df = self._load_csv_paths()
        inferred_features, inferred_categoricals = self._get_column_info()
        # Explicit constructor arguments used to be overwritten unconditionally;
        # inference is now only a fallback.
        if self.training_col_names is None:
            self.training_col_names = inferred_features
        if self.categorical_col_names is None:
            self.categorical_col_names = inferred_categoricals
        self.combined_df = self._combine_datasets()

    def _load_csv_paths(self):
        """Read the CSV files and, when configured, append the original data to train."""
        train_df = self._smart_drop_index(pd.read_csv(self.train_csv_path))
        test_df = self._smart_drop_index(pd.read_csv(self.test_csv_path))
        if self.original_csv_path is not None:
            train_df = train_df.assign(source=0)
            test_df = test_df.assign(source=0)
            original_df = self._smart_drop_index(pd.read_csv(self.original_csv_path)).assign(source=1)
            original_df = self.original_csv_processing(original_df)

            # The extra data must match the training schema (column order aside).
            pd.testing.assert_index_equal(train_df.columns.sort_values(), original_df.columns.sort_values(), check_exact=True)
            pd.testing.assert_series_equal(train_df.dtypes.sort_index(), original_df.dtypes.sort_index(), check_exact=True)
            train_df = pd.concat([train_df, original_df], axis=0).reset_index(drop=True)

        return train_df, test_df
    
    def _get_column_info(self):
        """Return ``(feature columns, object-dtype columns)`` of the training frame."""
        cat_col_names = [col_name for col_name in self.train_df.columns if self.train_df[col_name].dtype == 'object']
        training_features = list(self.train_df.drop(columns=self.target_col_name).columns)
        return training_features, cat_col_names
    
    def _combine_datasets(self):
        """Stack train and test under the outer index keys 'train' / 'test'."""
        combined_df = pd.concat([self.train_df, self.test_df], keys=['train', 'test'])
        return combined_df
    
    def update(self):
        """Refresh ``train_df`` / ``test_df`` from ``combined_df`` and return both."""
        self.train_df = self.combined_df.loc['train'].copy()
        self.test_df = self.combined_df.loc['test'].copy()
        return self.train_df, self.test_df        

    @staticmethod
    def _smart_drop_index(df):
        """
        Drop the first column when its consecutive differences are all equal.

        This is a best-effort heuristic: any ordinary failure (for example a
        first column that cannot be differenced, or a frame without columns)
        leaves the frame unchanged.
        """
        try:
            differences = df.iloc[:, 0].diff().dropna()
            if differences.nunique() == 1:
                df = df.drop(columns=df.columns[0])
        except Exception:
            # Still best-effort, but no longer swallows KeyboardInterrupt/SystemExit.
            pass
        return df

In [7]:
class BaseLGBMParamGrid():
    """Shared Optuna search space for the LightGBM parameter grids."""

    @staticmethod
    def get(trial : optuna.Trial):
        """
        Sample one LightGBM parameter set from ``trial``.

        Returns a dict mixing fixed values (``early_stopping_rounds``,
        ``num_boost_round``, ``objective``, ``metric``) with values suggested
        by the trial.
        """
        params = {
            # "boosting_type" : "gbdt",
            # "early_stopping_rounds" : trial.suggest_int("early_stopping_rounds", 1, 200, log=True),
            "early_stopping_rounds" : 20,
            "num_leaves" : trial.suggest_int("num_leaves", 2, 500),
            "max_depth" : trial.suggest_int("max_depth", 0, 50),
            "learning_rate" : trial.suggest_float("learning_rate", 1e-2, 1.0, log=True),
            # "num_boost_round" : trial.suggest_int("num_boost_round", 50, 3000),
            # Fixed, large round budget; early_stopping_rounds above is what ends training.
            "num_boost_round" : 10000,
            "subsample" : trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree" : trial.suggest_float("colsample_bytree", 0.5, 1.0),
            "reg_alpha" : trial.suggest_float("reg_alpha", 1e-6, 10, log=True),
            "reg_lambda" : trial.suggest_float("reg_lambda", 1e-6, 10, log=True),
            "min_data_in_leaf" : trial.suggest_int("min_data_in_leaf", 1, 100),
            # NOTE(review): LightGBM documents colsample_bytree as an alias of
            # feature_fraction; both are sampled here -- confirm which one takes effect.
            "feature_fraction" : trial.suggest_float("feature_fraction", 0.5, 1.0),
            # "bagging_fraction" : trial.suggest_float("bagging_fraction", 0.5, 1.0),
            # "bagging_freq" : trial.suggest_int("bagging_freq", 1, 5),
            "max_bin" : trial.suggest_int("max_bin", 50, 5000, log=True),
            # "data_sample_strategy" : "bagging",
            'min_child_weight': trial.suggest_float('min_child_weight', 1e-4, 100, log=True),
            # "scale_pos_weight" : trial.suggest_float("scale_pos_weight", 1, 1000, log=True),
            'cat_smooth': trial.suggest_float('cat_smooth', 1, 100, log=True),
            'objective' : 'regression',
            'metric' : 'rmse'}
        return params


class LGBMGBDTParamGrid(BaseLGBMParamGrid):
    """Base LightGBM search space with gbdt boosting and bagging sampling."""

    def get(self, trial : optuna.Trial):
        """Return the base parameters plus the gbdt/bagging settings."""
        # The base dict is merged last, so it wins on any key clash.
        return {
            "boosting_type" : "gbdt",
            "data_sample_strategy" : "bagging",
            **super().get(trial),
        }


class LGBMGBDTGossParamGrid(BaseLGBMParamGrid):
    """Base LightGBM search space with gbdt boosting and goss sampling."""

    def get(self, trial : optuna.Trial):
        """Return the base parameters plus the gbdt/goss settings."""
        # The base dict is merged last, so it wins on any key clash.
        return {
            "boosting_type" : "gbdt",
            "data_sample_strategy" : "goss",
            **super().get(trial),
        }

    
class LGBMDARTParamGrid(BaseLGBMParamGrid):
    """Base LightGBM search space with dart boosting."""

    def get(self, trial : optuna.Trial):
        """Return the base parameters plus ``boosting_type='dart'``."""
        # The base dict is merged last, so it wins on any key clash.
        return {
            "boosting_type" : "dart",
            **super().get(trial),
        }

    
class LGBMRFParamGrid(BaseLGBMParamGrid):
    """Base LightGBM search space with rf boosting."""

    def get(self, trial : optuna.Trial):
        """Return the base parameters plus the rf settings."""
        # NOTE(review): LightGBM's rf mode is documented to require bagging, and
        # the base grid leaves bagging_fraction / bagging_freq commented out --
        # confirm this grid actually trains.
        # The base dict is merged last, so its num_boost_round is the one kept.
        return {
            "boosting_type" : "rf",
            "num_boost_round" : 10000,
            **super().get(trial),
        }

    

class BaseXGBoostParamGrid():
    """Shared Optuna search space for the XGBoost parameter grids."""

    @staticmethod
    def get(trial : optuna.Trial):
        """
        Sample one XGBoost parameter set from ``trial``.

        Returns a dict mixing fixed values (bin count, round budget, early
        stopping, objective, metric, device settings) with values suggested
        by the trial.
        """
        params = {
            # "booster": trial.suggest_categorical("booster", ["gbtree", "dart"]),
            # "max_bin" : trial.suggest_int("max_bin", 50, 5000, log=True),
            "max_bin" : 10000,
            "early_stopping_rounds" : 20,
            "learning_rate": trial.suggest_float("learning_rate", 1e-2, 0.3, log=True),
            "max_depth": trial.suggest_int("max_depth", 3, 50),
            # "num_boost_round" : trial.suggest_int("num_boost_round", 50, 3000),
            # Fixed, large round budget; early_stopping_rounds above is what ends training.
            "num_boost_round" : 10000,
            "gamma" : trial.suggest_float("gamma", 0, 10),
            "min_child_weight" : trial.suggest_float("min_child_weight", 0.1, 100, log=True),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.5, 1.0),
            "colsample_bynode": trial.suggest_float("colsample_bynode", 0.5, 1.0),
            "reg_alpha": trial.suggest_float("reg_alpha", 1e-6, 10.0, log=True),
            "reg_lambda": trial.suggest_float("reg_lambda", 1e-6, 10.0, log=True),
            "max_cat_threshold" : trial.suggest_int("max_cat_threshold", 1, 1000, log=True),
            # "scale_pos_weight" : trial.suggest_float("scale_pos_weight", 1, 1000, log=True),
            "sampling_method" : "uniform",
            # "grow_policy" : trial.suggest_categorical("grow_policy", ["lossguide", "depthwise"]),
            "objective": "reg:squarederror",
            "eval_metric": "rmse",
            # NOTE(review): XGBoost 2.x documents tree_method="gpu_hist" as deprecated
            # in favour of tree_method="hist" combined with device="cuda"; both styles
            # are set here -- confirm against the installed XGBoost version.
            "tree_method":"gpu_hist",
            "device" : "cuda"
        }
        return params

class XGBoostGBTreeLossguide(BaseXGBoostParamGrid):
    """Base XGBoost search space with the gbtree booster grown loss-guided."""

    def get(self, trial : optuna.Trial):
        """Return the base parameters plus the gbtree/lossguide settings."""
        # The base dict is merged last, so it wins on any key clash.
        return {
            "booster" : "gbtree",
            "grow_policy" : "lossguide",
            **super().get(trial),
        }

class XGBoostGBTreeDepthwise(BaseXGBoostParamGrid):
    """Base XGBoost search space with the gbtree booster grown depth-wise."""

    def get(self, trial : optuna.Trial):
        """Return the base parameters plus the gbtree/depthwise settings."""
        # The base dict is merged last, so it wins on any key clash.
        return {
            "booster" : "gbtree",
            "grow_policy" : "depthwise",
            **super().get(trial),
        }


class XGBoostGBTreeLinear(BaseXGBoostParamGrid):
    """Base XGBoost search space with the gblinear booster."""

    def get(self, trial : optuna.Trial):
        """Return the base parameters plus ``booster='gblinear'``."""
        # The base dict is merged last, so it wins on any key clash.
        return {
            "booster" : "gblinear",
            **super().get(trial),
        }

    

class XGBoostDART(BaseXGBoostParamGrid):
    """Base XGBoost search space with the dart booster."""

    def get(self, trial : optuna.Trial):
        """Return the base parameters plus ``booster='dart'``."""
        # The base dict is merged last, so it wins on any key clash.
        return {
            "booster" : "dart",
            **super().get(trial),
        }

    

class BaseCatBoostParamGrid():
    """Shared Optuna search space for the CatBoost parameter grids."""

    @staticmethod
    def get(trial : optuna.Trial):
        """
        Sample one CatBoost parameter set from ``trial``.

        Returns a dict mixing fixed values (early stopping, round budget,
        loss function, eval metric) with values suggested by the trial.
        """
        params = {
            # "booster": trial.suggest_categorical("booster", ["gbtree", "dart"]),
            "early_stopping_rounds" : 20, #: trial.suggest_int("early_stopping_rounds", 1, 200, log=True),
            "max_bin" : trial.suggest_int("max_bin", 2, 5000, log=True),
            # "max_bin" : 5000,
            "learning_rate": trial.suggest_float("learning_rate", 0.1, 0.3, log=True),
            "depth": trial.suggest_int("depth", 3, 16),
            # Fixed, large round budget; early_stopping_rounds above is what ends training.
            "num_boost_round" : 10000,
#             trial.suggest_int("num_boost_round", 50, 5000, log=True),
            "bagging_temperature" : trial.suggest_float("bagging_temperature", 0.1, 100, log=True),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bylevel" : trial.suggest_float("colsample_bylevel", 0.5, 1.0),
            # NOTE(review): sampled as a float -- confirm CatBoost accepts a
            # non-integer min_data_in_leaf.
            "min_data_in_leaf" : trial.suggest_float("min_data_in_leaf", 1, 1000, log=True),
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-6, 10.0, log=True),
#             "grow_policy" : trial.suggest_categorical("grow_policy", ["Lossguide", "Depthwise", "SymmetricTree"]),
            "leaf_estimation_iterations" : trial.suggest_int("leaf_estimation_iterations", 1, 5),
            # "scale_pos_weight" : trial.suggest_float("scale_pos_weight", 1, 1000, log=True),
            "random_strength" : trial.suggest_float("random_strength", 0.1, 10),
            "leaf_estimation_method" : trial.suggest_categorical("leaf_estimation_method", ["Newton", "Gradient"]),
            # NOTE(review): Cox is a survival-analysis objective, while the other
            # grids in this file use RMSE / regression -- confirm it is intended.
            "loss_function" : "Cox",
            'eval_metric': 'Cox',
            # 'task_type' : "CPU",
        }
        return params

class GPUCatBoostParamGrid():
    """Optuna search space for CatBoost with RMSE loss, GPU task type and Bayesian bootstrap."""

    @staticmethod
    def get(trial : optuna.Trial):
        """Sample one configuration from ``trial`` and return it as a dict.

        The device list, task type, bootstrap type, loss/metric, early
        stopping and round budget are constants; the rest is sampled.
        """
        draw_int = trial.suggest_int
        draw_float = trial.suggest_float
        draw_choice = trial.suggest_categorical

        return {
            "early_stopping_rounds": 20,
            "max_bin": draw_int("max_bin", 2, 5000, log=True),
            "learning_rate": draw_float("learning_rate", 1e-2, 0.3, log=True),
            "depth": draw_int("depth", 3, 16),
            "num_boost_round": 10000,
            "bagging_temperature": draw_float("bagging_temperature", 0.1, 100, log=True),
            "min_data_in_leaf": draw_float("min_data_in_leaf", 1, 1000, log=True),
            "l2_leaf_reg": draw_float("l2_leaf_reg", 1e-6, 10.0, log=True),
            "leaf_estimation_iterations": draw_int("leaf_estimation_iterations", 1, 5),
            "leaf_estimation_method": draw_choice("leaf_estimation_method", ["Newton", "Gradient"]),
            "loss_function": "RMSE",
            "eval_metric": "RMSE",
            "task_type": "GPU",
            "devices": "0,1",
            "bootstrap_type": "Bayesian",
        }

class CatBoostDepthWise(BaseCatBoostParamGrid):
    """Base CatBoost search space with ``grow_policy`` defaulting to ``"Depthwise"``."""

    def get(self, trial : optuna.Trial):
        """Return the base grid's parameters for ``trial`` plus the grow policy."""
        inherited = super().get(trial)
        # Entries from the base grid win over the default on a key clash.
        return {"grow_policy": "Depthwise", **inherited}

    

class CatBoostLossGuide(BaseCatBoostParamGrid):
    """Base CatBoost search space with ``grow_policy`` defaulting to ``"Lossguide"``."""

    def get(self, trial : optuna.Trial):
        """Return the base grid's parameters for ``trial`` plus the grow policy."""
        inherited = super().get(trial)
        # Entries from the base grid win over the default on a key clash.
        return {"grow_policy": "Lossguide", **inherited}

    
class CatBoostSymmetricTree(BaseCatBoostParamGrid):
    """Base CatBoost search space with ``grow_policy`` defaulting to ``"SymmetricTree"``."""

    def get(self, trial : optuna.Trial):
        """Return the base grid's parameters for ``trial`` plus the grow policy."""
        inherited = super().get(trial)
        # Entries from the base grid win over the default on a key clash.
        return {"grow_policy": "SymmetricTree", **inherited}



class CatBoostRegion(BaseCatBoostParamGrid):
    """Base CatBoost search space with ``grow_policy="Region"`` and ``task_type="GPU"`` defaults."""

    def get(self, trial : optuna.Trial):
        """Return the base grid's parameters for ``trial`` plus the two defaults."""
        inherited = super().get(trial)
        # Entries from the base grid win over these defaults on a key clash.
        return {"grow_policy": "Region", "task_type": "GPU", **inherited}

    
class HGBParamGrid():
    """Optuna search space for the histogram gradient boosting model."""

    @staticmethod
    def get(trial : optuna.Trial):
        """Sample one configuration from ``trial`` and return it as a dict.

        ``num_boost_round`` and ``early_stopping_rounds`` are constants; all
        other entries are drawn from the trial in the order listed.
        """
        grid = {}
        grid["max_bins"] = trial.suggest_int("max_bins", 2, 255)
        grid["learning_rate"] = trial.suggest_float("learning_rate", 1e-2, 0.3, log=True)
        grid["max_depth"] = trial.suggest_int("max_depth", 3, 50)
        grid["max_leaf_nodes"] = trial.suggest_int("max_leaf_nodes", 2, 400, log=True)
        grid["min_samples_leaf"] = trial.suggest_int("min_samples_leaf", 2, 500)
        # Fixed round budget; early stopping decides the effective number of rounds.
        grid["num_boost_round"] = 3000
        grid["early_stopping_rounds"] = 20
        grid["validation_fraction"] = trial.suggest_float("validation_fraction", 0.05, 0.2)
        grid["l2_regularization"] = trial.suggest_float("l2_regularization", 1e-6, 10.0, log=True)
        grid["max_features"] = trial.suggest_float("max_features", 0.5, 1.0)
        grid["interaction_cst"] = trial.suggest_categorical("interaction_cst", ["pairwise", "no_interactions"])
        grid["tol"] = trial.suggest_float("tol", 1e-7, 1e-2, log=True)
        grid["smooth"] = trial.suggest_float("smooth", 1e2, 1e4, log=True)
        return grid


class YDFParamGrid():
    """Optuna search space for a YDF learner configured for regression with squared-error loss."""

    @staticmethod
    def get(trial : optuna.Trial):
        """Sample one configuration from ``trial``.

        ``num_boost_round``, ``task`` and ``loss`` are constants; every other
        entry is drawn from the trial.

        Returns:
            dict: parameter name -> value.
        """
        params = {
            "categorical_algorithm" : trial.suggest_categorical("categorical_algorithm", ["CART", "RANDOM"]),
            "categorical_set_split_min_item_frequency" : trial.suggest_int("categorical_set_split_min_item_frequency", 1, 200),
            # "goss_alpha" : trial.suggest_float("goss_alpha", 0, 1),
            # "goss_beta" : trial.suggest_float("goss_beta", 0, 1),
            # "honest" : trial.suggest_categorical("honest", [True, False]),
            "l1_regularization" : trial.suggest_float("l1_regularization", 1e-4, 1e2, log=True),
            "l2_categorical_regularization" : trial.suggest_float("l2_categorical_regularization", 1e-4, 1e2, log=True),
            "l2_regularization" : trial.suggest_float("l2_regularization", 1e-4, 1e2, log=True),
            # NOTE(review): both ranges below include -1 and 0 — confirm the learner accepts those values.
            "max_depth" : trial.suggest_int("max_depth", -1, 300),
            "max_num_nodes" : trial.suggest_int("max_num_nodes", -1, 200),
            # Integer upper bound (previously the float literal 1e3): suggest_int takes integer limits.
            "min_examples" : trial.suggest_int("min_examples", 1, 1000, log=True),
            "num_boost_round" : 3000,
            "task" : "REGRESSION",
            "loss" : "SQUARED_ERROR"
        }
        return params
    

class YDFLocalParamGrid(YDFParamGrid):
    """YDF search space with ``growing_strategy`` defaulting to ``"LOCAL"``."""

    def get(self, trial : optuna.Trial):
        """Return the parent grid's parameters for ``trial`` plus the growing strategy."""
        inherited = super().get(trial)
        # Entries from the parent grid win over the default on a key clash.
        return {"growing_strategy": "LOCAL", **inherited}
    
class YDFBestGlobalParamGrid(YDFParamGrid):
    """YDF search space using ``"BEST_FIRST_GLOBAL"`` growth with a sampled ``subsample``."""

    def get(self, trial : optuna.Trial):
        """Return the parent grid's parameters plus growing strategy and ``subsample``.

        The parent grid is sampled first; ``subsample`` is drawn afterwards.
        """
        inherited = super().get(trial)
        # Entries from the parent grid win over these two on a key clash.
        return {
            "growing_strategy": "BEST_FIRST_GLOBAL",
            "subsample": trial.suggest_float("subsample", 0.5, 1),
            **inherited,
        }

In [8]:
from copy import deepcopy
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, MinMaxScaler

# Walk the Kaggle input directory, echo every file found, and remember the
# paths whose full path contains "train.csv", "test.csv" or "sample".
# If several files match a pattern, the last one visited wins.
for root, dirs, files in os.walk("/kaggle/input/"):
    for file in files:
        file_path = os.path.join(root, file)
        print(file_path)
        if "train.csv" in file_path:
            train_csv_path = file_path
        elif "test.csv" in file_path:
            test_csv_path = file_path
        elif "sample" in file_path:
            sample_sub_csv_path = file_path
# The target is taken to be the last column of the training CSV. Only the
# header is parsed (nrows=0) so the whole file is not loaded just to read a name.
target_col_name = pd.read_csv(train_csv_path, nrows=0).columns[-1]


class FillNullValues():
    """Pipeline step that replaces missing values in the training columns."""

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings, numeric_fill=-1, category_fill='missing'):
        """Return a deep copy of the settings with nulls filled in ``combined_df``.

        Args:
            original_settings: settings object; left unmodified.
            numeric_fill: replacement used for columns with a numeric dtype.
            category_fill: replacement used for every other column.
        """
        filled = deepcopy(original_settings)
        frame = filled.combined_df
        for column in filled.training_col_names:
            is_numeric = pd.api.types.is_numeric_dtype(frame[column])
            replacement = numeric_fill if is_numeric else category_fill
            frame[column] = frame[column].fillna(replacement)
        return filled

class ConvertAllToCategorical():
    """Pipeline step that turns every training column into a string-valued categorical."""

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        """Return a deep copy of the settings with the training columns cast to str, then category."""
        converted = deepcopy(original_settings)
        columns = converted.training_col_names
        as_text = converted.combined_df[columns].astype(str)
        converted.combined_df[columns] = as_text.astype('category')
        return converted

class ConvertObjectToCategorical():
    """Pipeline step that casts the settings' categorical columns to the pandas category dtype."""

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        """Return a deep copy of the settings with ``categorical_col_names`` cast to category."""
        converted = deepcopy(original_settings)
        columns = converted.categorical_col_names
        as_category = converted.combined_df[columns].astype('category')
        converted.combined_df[columns] = as_category
        return converted

class ConvertObjectToStrCategorical():
    """Pipeline step that casts the settings' categorical columns to str and then to category."""

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        """Return a deep copy of the settings with ``categorical_col_names`` as string categoricals."""
        converted = deepcopy(original_settings)
        columns = converted.categorical_col_names
        as_text = converted.combined_df[columns].astype(str)
        converted.combined_df[columns] = as_text.astype('category')
        return converted

class LogTransformTarget():
    """Pipeline step that replaces the target column with ``log1p`` of itself."""

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        """Return a deep copy of the settings whose target column holds ``np.log1p`` of the original."""
        transformed = deepcopy(original_settings)
        frame = transformed.combined_df
        target_name = transformed.target_col_name
        frame[target_name] = np.log1p(frame[target_name])
        return transformed
    
class OrdinalEncode():
    """Pipeline step that ordinal-encodes the categorical columns, fitting on the train split only."""

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        """Return a deep copy of the settings with integer-coded categorical columns.

        The encoder is fitted on the train frame returned by ``update()`` and
        then applied to the test frame; missing and unseen values are coded
        as -1. ``combined_df`` is rebuilt from the two frames under the keys
        ``'train'`` and ``'test'``.
        """
        encoded = deepcopy(original_settings)
        train_part, test_part = encoded.update()
        encoder = OrdinalEncoder(
            encoded_missing_value=-1,
            handle_unknown="use_encoded_value",
            unknown_value=-1,
        )
        cat_cols = encoded.categorical_col_names
        train_part[cat_cols] = encoder.fit_transform(train_part[cat_cols])
        test_part[cat_cols] = encoder.transform(test_part[cat_cols])
        encoded.combined_df = pd.concat([train_part, test_part], keys=['train', 'test'])
        # The encoder emits floats; store the codes as integers.
        encoded.combined_df[cat_cols] = encoded.combined_df[cat_cols].astype(int)
        return encoded

/kaggle/input/insurance-premium-prediction/README.md
/kaggle/input/insurance-premium-prediction/Insurance Premium Prediction Dataset.csv
/kaggle/input/autofe-feature-importance/robust_feature_importance.csv
/kaggle/input/autofe-feature-importance/combined_df.csv
/kaggle/input/insurance-dbs/cat_lg_with_original.db
/kaggle/input/insurance-dbs/cat_lossguide.db
/kaggle/input/insurance-dbs/cat_gpu.db
/kaggle/input/insurance-dbs/cat_depthwise.db
/kaggle/input/insurance-dbs/cat_symtree.db
/kaggle/input/playground-series-s4e12/sample_submission.csv
/kaggle/input/playground-series-s4e12/train.csv
/kaggle/input/playground-series-s4e12/test.csv


In [None]:
class CreateSurvivalTarget():
    """Pipeline step that adds a signed-time ``survival_target`` column."""

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        """Return a deep copy of the settings with ``survival_target`` added to ``combined_df``.

        The new column equals ``efs_time`` where ``efs`` is truthy and
        ``-efs_time`` otherwise.
        """
        augmented = deepcopy(original_settings)
        frame = augmented.combined_df
        event_flag = frame['efs'].astype(bool)
        duration = frame['efs_time']
        frame["survival_target"] = np.where(event_flag, duration, -duration)
        return augmented

# Build the pipeline settings from the CSV paths and the target column name.
settings = DataSciencePipelineSettings(train_csv_path,
                                       test_csv_path,
                                       target_col_name,
                                       )
# Preprocessing steps, applied in list order; each is called with the current
# settings object and its return value becomes the input of the next step.
transforms = [
             FillNullValues.transform,
             OrdinalEncode.transform,
             ConvertObjectToStrCategorical.transform,
             CreateSurvivalTarget.transform
             ]

# Thread `settings` through every transform in turn.
settings = reduce(lambda acc, func: func(acc), transforms, settings)
# NOTE(review): the return value of this call is discarded and update() is
# called again right below — confirm whether it is needed for a side effect.
settings.update()

# update() is unpacked as a (train, test) pair of frames.
train, test_df = settings.update()
# Remove the target column from the test frame (mutates test_df in place).
test_df.drop(columns=[target_col_name], inplace=True)
# X keeps every train column except "survival_target"; y is a one-column frame holding it.
X, y = train.drop(columns=["survival_target"]), train[["survival_target"]]

In [None]:
# Model inputs: every column of X except the raw target column and "efs".
train_features = [name for name in X.columns.tolist() if name not in (target_col_name, "efs")]
# Integer positions of 'efs', 'efs_time' and 'race_group' among X's columns.
indices = np.array(list(map(X.columns.get_loc, ('efs', 'efs_time', 'race_group'))))

In [None]:
def output_transform(input):
    """Attach predictions to the evaluated frame.

    Args:
        input: a two-item sequence ``(X_test, y_pred)``.

    Returns:
        The same ``X_test`` object, modified in place with a new
        ``'predictions'`` column holding ``y_pred``.
    """
    features, predictions = input
    features['predictions'] = predictions
    return features

def sci_metric(y_test, y_processed):
    """Score predictions with ``stratified_concordance_index``.

    Args:
        y_test: not used by this function.
        y_processed: either a 2-D ``np.ndarray`` whose last column holds the
            predictions and whose columns at the module-level ``indices``
            hold 'efs', 'efs_time' and 'race_group', or an object that is
            passed on as the solution and indexed with ``'predictions'``.

    Returns:
        Whatever ``stratified_concordance_index`` returns.
    """
    if not isinstance(y_processed, np.ndarray):
        solution, predicted = y_processed, y_processed['predictions']
    else:
        # Rebuild a labelled solution frame from the raw array columns.
        solution = pd.DataFrame(columns=['efs', 'efs_time', 'race_group'],
                                data=y_processed[:, indices])
        predicted = y_processed[:, -1]

    return stratified_concordance_index(solution, predicted, 'efs', 'efs_time', 'race_group')

In [13]:
# Model class and cross-validation splitter handed to the optimiser.
model = CatBoostModel
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Hyperparameter search over the depth-wise CatBoost grid, maximising sci_metric.
# A comma was missing after cross_validation_run_kwargs, which made this call a SyntaxError.
optimizer = OptunaHyperparameterOptimizer(X,
                                          y,
                                          model,
                                          CatBoostDepthWise(),
                                          kf,
                                          sci_metric,
                                          training_features=train_features,
                                          direction = 'maximize',
                                          n_trials=100,
                                          cross_validation_run_kwargs={'output_transform_list': [output_transform]},
                                          study_name = 'cat_dw',
                                          random_state=42)

# timeout is 3600*10 — presumably seconds, i.e. a ten-hour budget; confirm against the optimiser.
best_params = optimizer.optimize(timeout=3600*10)

Initial memory usage: 298.08 MB
Reduced memory usage: 230.46 MB
Memory reduced by: 22.7%
Initial memory usage: 214.50 MB
Reduced memory usage: 180.93 MB
Memory reduced by: 15.7%
