In [None]:
# !pip install -U scikit-learn

In [1]:
from functools import reduce
import os
import random
from typing import Any, Dict, List, Union
import torch
from copy import deepcopy
from dataclasses import dataclass
from typing import *
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold

In [2]:
def func(x):
    """Return ``x`` unchanged; the default no-op ``original_csv_processing`` hook."""
    return x


@dataclass
class DataSciencePipelineSettings:
    """Load the train/test CSVs and hold the frames and column metadata for the pipeline.

    Construction reads the CSV files immediately (see ``__post_init__``) and sets
    ``train_df``, ``test_df`` and ``combined_df``. ``combined_df`` is the row-wise
    concatenation of both frames under the outer index keys ``'train'`` and
    ``'test'``; call ``update()`` to split it back after it has been transformed.
    """
    train_csv_path : str
    test_csv_path : str
    target_col_name : str
    # Optional extra training data. When set, its rows are appended to the training
    # frame and a ``source`` column is added (0 for train/test rows, 1 for these rows).
    original_csv_path : Optional[str] = None
    # Applied to the extra frame before it is validated against the training frame.
    original_csv_processing : Callable = func
    # Stored only; not read anywhere in this class.
    sample_submission_path : Optional[str] = None
    # Both lists are recomputed from the loaded training frame in ``__post_init__``,
    # so values passed to the constructor are overwritten.
    training_col_names : Optional[List[str]] = None
    categorical_col_names : Optional[List[str]] = None
    # The three settings below are stored only; not read anywhere in this class.
    training_data_percentage : float = 0.8
    category_occurrence_threshold : int = 300
    logged : bool = False

    def __post_init__(self):
        """Read the CSVs, derive the column lists and build ``combined_df``."""
        self.train_df, self.test_df = self._load_csv_paths()
        self.training_col_names, self.categorical_col_names = self._get_column_info()
        self.combined_df = self._combine_datasets()

    def _load_csv_paths(self):
        """Read the train/test CSVs (plus the optional original CSV).

        Returns:
            ``(train_df, test_df)``. When ``original_csv_path`` is set, the
            processed original frame is appended to ``train_df`` and the index
            is reset.

        Raises:
            AssertionError: if the original frame's columns or dtypes differ
                from the training frame's.
        """
        train_df = self._smart_drop_index(pd.read_csv(self.train_csv_path))
        test_df = self._smart_drop_index(pd.read_csv(self.test_csv_path))
        if self.original_csv_path is not None:
            train_df = train_df.assign(source=0)
            test_df = test_df.assign(source=0)
            original_df = self._smart_drop_index(pd.read_csv(self.original_csv_path)).assign(source=1)
            original_df = self.original_csv_processing(original_df)

            # Both frames must agree on column names and dtypes before they are stacked.
            pd.testing.assert_index_equal(train_df.columns.sort_values(), original_df.columns.sort_values(), check_exact=True)
            pd.testing.assert_series_equal(train_df.dtypes.sort_index(), original_df.dtypes.sort_index(), check_exact=True)
            train_df = pd.concat([train_df, original_df], axis=0).reset_index(drop=True)

        return train_df, test_df

    def _get_column_info(self):
        """Return ``(training_features, categorical_columns)`` from ``train_df``.

        Training features are every column except the target; categorical
        columns are those whose dtype compares equal to ``'object'``.
        """
        cat_col_names = [col_name for col_name in self.train_df.columns if self.train_df[col_name].dtype == 'object']
        training_features = list(self.train_df.drop(columns=self.target_col_name).columns)
        return training_features, cat_col_names

    def _combine_datasets(self):
        """Stack train and test rows under the outer index keys ``'train'`` / ``'test'``."""
        combined_df = pd.concat([self.train_df, self.test_df], keys=['train', 'test'])
        return combined_df

    def update(self):
        """Refresh ``train_df`` / ``test_df`` as copies of the ``combined_df`` partitions and return them."""
        self.train_df = self.combined_df.loc['train'].copy()
        self.test_df = self.combined_df.loc['test'].copy()
        return self.train_df, self.test_df

    @staticmethod
    def _smart_drop_index(df):
        """Drop the first column when it looks like a saved row index.

        The first column is dropped when the differences between consecutive
        values take exactly one distinct value. Note that a constant column
        (all differences zero) also matches this rule.

        This is a best-effort heuristic: if the check cannot be evaluated
        (no columns, non-numeric first column, ...) the frame is returned
        unchanged.
        """
        try:
            differences = df.iloc[:, 0].diff().dropna()
            if differences.nunique() == 1:
                df = df.drop(columns=df.columns[0])
        except Exception:
            # Deliberately best-effort, but narrower than a bare ``except:`` so
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            return df
        return df

In [3]:
class FillNullValues():
    """Pipeline step that replaces missing values in the training feature columns."""

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings, numeric_fill=-1, category_fill='missing'):
        """Return a deep copy of the settings with nulls filled in ``combined_df``.

        Only the columns listed in ``training_col_names`` are touched. Numeric
        columns receive ``numeric_fill``; every other column receives
        ``category_fill``. The settings object passed in is left unmodified.
        """
        settings = deepcopy(original_settings)
        frame = settings.combined_df
        for feature in settings.training_col_names:
            column = frame[feature]
            # Pick the placeholder by dtype, then fill in a single statement.
            placeholder = numeric_fill if pd.api.types.is_numeric_dtype(column) else category_fill
            frame[feature] = column.fillna(placeholder)
        return settings
    

class ConvertObjectToCategorical():
    """Pipeline step that casts the registered categorical columns to ``category`` dtype."""

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        """Return a deep copy of the settings whose ``categorical_col_names`` columns are categoricals."""
        settings = deepcopy(original_settings)
        frame = settings.combined_df
        categorical_columns = settings.categorical_col_names
        as_category = frame[categorical_columns].astype('category')
        frame[categorical_columns] = as_category
        return settings
    
class LogTransformTarget():
    """Pipeline step that replaces the target column with ``log1p`` of itself."""

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        """Return a deep copy of the settings with the target column log1p-transformed in ``combined_df``."""
        settings = deepcopy(original_settings)
        target = settings.target_col_name
        frame = settings.combined_df
        frame[target] = np.log1p(frame[target])
        return settings

class CreateYuweiFeatures():
    """Pipeline step that adds date, frequency-count and score-derived features."""

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        """Return a deep copy of the settings with engineered columns added to ``combined_df``.

        The frame must contain the columns ``'Policy Start Date'``,
        ``'Insurance Duration'``, ``'Annual Income'``, ``'Health Score'``,
        ``'Credit Score'`` and ``'Previous Claims'``. Statement order matters:
        the object-column snapshot is taken before the date column is parsed,
        and the category/float casts run before the score-derived columns are
        created.

        ``'catHealth vs Claims'`` and ``'catInt Credit Score'`` are appended to
        ``settings.categorical_col_names``; every column in that list is cast
        to ``str`` at the end.
        """
        settings = deepcopy(original_settings)

        df = settings.combined_df

        # Snapshot of the object-dtype columns, taken before any column is added or converted.
        cat_cols = [col for col in df.columns if df[col].dtype == 'object']

        df['Policy Start Date'] = pd.to_datetime(df['Policy Start Date'])
        df['Days Passed'] = (df['Policy Start Date'].max() - df['Policy Start Date']).dt.days
        df['numYear'] = df['Policy Start Date'].dt.year
        num_day = df['Policy Start Date'].dt.day
        df['numMonth'] = df['Policy Start Date'].dt.month

        df['catMonth_name'] = df['Policy Start Date'].dt.month_name()
        df['catDay_of_week'] = df['Policy Start Date'].dt.day_name()

        df['numWeek'] = df['Policy Start Date'].dt.isocalendar().week

        # NOTE(review): the year is an integer, so sin(2*pi*year) is ~0 up to
        # floating-point error -- confirm this feature is intended as written.
        df['numYear_sin'] = np.sin(2 * np.pi * df['numYear'])
        df['numMonth_sin'] = np.sin(2 * np.pi * df['numMonth'] / 12)
        df['numMonth_cos'] = np.cos(2 * np.pi * df['numMonth'] / 12)
        df['numGroup']=(df['numYear']-2020)*48+df['numMonth']*4+num_day//7
        # Three duration buckets: <= 1, (1, 3], > 3; missing durations count as 99.
        df['contract length'] = pd.cut(df["Insurance Duration"].fillna(99),
        bins=[-float('inf'), 1, 3, float('inf')], labels=[0, 1, 2]).astype(int)

        print("done initial time features")

        cat_cols += ['catMonth_name', 'catDay_of_week']
        # Categorical columns that do NOT get a frequency-count companion column.
        skip_freq_encoding = ('Location',
                              'Education Level',
                              'Policy Type',
                              'Smoking Status',
                              'Marital Status',
                              'Exercise Frequency',
                              'Gender',
                              'Occupation',
                              'catMonth_name',
                              'Property Type')
        temp = [x for x in cat_cols if x not in skip_freq_encoding]
        for col in temp:
            print(f"frequency encoding col: {col}")
            freq_encoding = df[col].value_counts().to_dict()

            new_col_name = f"{col}_freq"
            # Assigned once; the original repeated this identical statement.
            df[new_col_name] = df[col].map(freq_encoding)

            # The count column is treated as categorical by the cast below.
            cat_cols.append(new_col_name)


        num_cols = [col for col in df.columns if col not in cat_cols]
        df[cat_cols] = df[cat_cols].fillna('None').astype(str).astype('category')
        df[num_cols] = df[num_cols].fillna(0).astype(float)


        df['cat_annual_income'] = df['Annual Income'].astype(str).astype('category')
        df['cat_health_score'] = df['Health Score'].astype(str).astype('category')
        df['cat_credit_score'] = df['Credit Score'].astype(str).astype('category')

        df['catHealth vs Claims'] = df['Health Score'] / (df['Previous Claims'] + 2)
        # df['catClaims v Duration'] = df['Previous Claims'] / df['Insurance Duration']
        # df['Cat Credit Score'] = df['Credit Score'].copy()
        df['catInt Credit Score'] = df['Credit Score'].apply(lambda x: int(x) if pd.notna(x) else x)
        df['HealthScore'] = df['Health Score'].apply(lambda x: int(x) if pd.notna(x) else x)

        settings.categorical_col_names += [
                                            'catHealth vs Claims',
                                            'catInt Credit Score'
                                        ]
        # df = df.drop(columns='Policy Start Date')
        # settings.categorical_col_names.remove('Policy Start Date')

        df[settings.categorical_col_names] = df[settings.categorical_col_names].astype(str)
        settings.combined_df = df
        return settings

In [4]:
# Alternative for the Kaggle runtime: discover the CSVs under /kaggle/input/.
# for root, dirs, files in os.walk("/kaggle/input/"):
#     for file in files:
#         file_path = os.path.join(root, file)
#         print(file_path)
#         if "train.csv" in file_path:
#             train_csv_path = file_path
#         elif "test.csv" in file_path:
#             test_csv_path = file_path
# target_col_name = pd.read_csv(train_csv_path).columns[-1]

# Single place to point the notebook at the data set. Set the KTOOLS_DATA_DIR
# environment variable to run on another machine; the default keeps the
# original local location.
DATA_DIR = os.environ.get(
    "KTOOLS_DATA_DIR",
    "/Users/yuwei-1/Documents/projects/Kaggle-tools/data/insurance",
).rstrip("/")

train_csv_path = f"{DATA_DIR}/train.csv"
original_csv_path = f"{DATA_DIR}/original.csv"
test_csv_path = f"{DATA_DIR}/test.csv"
sample_sub_csv_path = f"{DATA_DIR}/sample_submission.csv"
target_col_name = "Premium Amount"

In [5]:
from functools import reduce
from typing import Any, Dict, List, Tuple
import numpy as np
import pandas as pd
from copy import deepcopy
from sklearn.metrics import roc_auc_score, accuracy_score, root_mean_squared_error
from sklearn.model_selection import StratifiedKFold, KFold


class CrossValidationExecutor:
    """Run k-fold cross-validation for a model wrapper and collect out-of-fold predictions.

    The model must expose ``fit(X, y, validation_set=...)`` (returning the
    fitted object) and ``predict(X)``; a fresh deep copy is fitted per fold.
    """

    def __init__(self,
                 sklearn_model_instance,
                 evaluation_metric : callable,
                 kfold_object,
                 use_test_as_valid=True,
                 num_classes=None,
                 verbose=1) -> None:
        """Store the model, metric and splitter.

        Args:
            sklearn_model_instance: unfitted model wrapper; deep-copied for every fold.
            evaluation_metric: called as ``metric(y_true, y_pred)``.
            kfold_object: splitter providing ``split(X, y)`` and ``get_n_splits()``.
            use_test_as_valid: when true, each fold's held-out part is passed to
                ``fit`` as ``validation_set``; otherwise ``None`` is passed.
            num_classes: when set, prediction buffers have shape
                ``(n_rows, num_classes)`` instead of ``(n_rows,)``.
            verbose: ``> 0`` prints the summary, ``> 1`` also prints every fold score.
        """
        self.model = sklearn_model_instance
        self._evaluation_metric = evaluation_metric
        self._kf = kfold_object
        self._num_splits = kfold_object.get_n_splits()
        self._use_test_as_valid = use_test_as_valid
        self._num_classes = num_classes
        self._verbose = verbose

    def run(self, X, y, additional_data=None, local_transform_list=None, output_transform_list=None) -> Tuple[Tuple[float, float], np.ndarray, List[Any]]:
        """Fit one model per fold and score the held-out predictions.

        Args:
            X: feature frame, indexed positionally with ``.iloc``.
            y: target with ``.iloc`` and ``.shape``; must be a DataFrame when
                ``additional_data`` is given (its ``columns`` are compared).
            additional_data: optional ``(X_add, y_add)`` appended to the
                training part of every fold (never to the held-out part).
            local_transform_list: callables applied in order to the
                ``(X_train, y_train)`` tuple of each fold; ``None`` means no
                transformation.
            output_transform_list: callables applied in order to the raw
                predictions before scoring; ``None`` means no transformation.

        Returns:
            ``((oof_score, mean_cv_score), oof_predictions, model_list)`` where
            ``oof_predictions`` holds the raw (untransformed) predictions.

        Raises:
            AssertionError: if ``additional_data`` does not match ``X`` / ``y``
                in columns or dtypes.
        """
        # ``None`` sentinels instead of mutable list defaults; an empty list makes
        # ``reduce`` return its initial value, i.e. the identity transform.
        local_transforms = [] if local_transform_list is None else local_transform_list
        output_transforms = [] if output_transform_list is None else output_transform_list

        if additional_data is not None:
            X_add, y_add = additional_data
            pd.testing.assert_index_equal(X.columns, X_add.columns, check_exact=True)
            pd.testing.assert_series_equal(X.dtypes, X_add.dtypes, check_exact=True)
            pd.testing.assert_index_equal(y.columns, y_add.columns, check_exact=True)
            pd.testing.assert_series_equal(y.dtypes, y_add.dtypes, check_exact=True)

        cv_results = []
        model_list = []
        buffer_shape = y.shape[0] if self._num_classes is None else (y.shape[0], self._num_classes)
        oof_predictions = np.zeros(buffer_shape)      # raw model output
        metric_predictions = np.zeros(buffer_shape)   # output after output transforms

        for train_index, val_index in self._kf.split(X, y):

            X_train, X_test = X.iloc[train_index], X.iloc[val_index]
            y_train, y_test = y.iloc[train_index], y.iloc[val_index]

            if additional_data is not None:
                X_train = pd.concat([X_train, X_add], axis=0)
                y_train = pd.concat([y_train, y_add], axis=0)

            X_train, y_train = reduce(lambda acc, func: func(acc), local_transforms, (X_train, y_train))
            validation_set = [X_test, y_test] if self._use_test_as_valid else None

            model = deepcopy(self.model).fit(X_train, y_train, validation_set=validation_set)
            model_list.append(model)
            y_pred = model.predict(X_test)
            y_pred_processed = reduce(lambda acc, func: func(acc), output_transforms, y_pred)

            cv_results.append(self._evaluation_metric(y_test, y_pred_processed))
            oof_predictions[val_index] = y_pred
            metric_predictions[val_index] = y_pred_processed

            if self._verbose > 1:
                print(f"The CV results of the current fold is {cv_results[-1]}")

        oof_score = self._evaluation_metric(y, metric_predictions)
        mean_cv_score = np.mean(cv_results)
        score_tuple = (oof_score, mean_cv_score)

        if self._verbose > 0:
            print("#"*100)
            print("OOF prediction score : ", oof_score)
            print(f"Mean {self._num_splits}-cv results : {mean_cv_score} +- {np.std(cv_results)}")
            print("#"*100)

        return score_tuple, oof_predictions, model_list

In [7]:
from typing import List
import pandas as pd
import lightgbm as lgb
import sys
from lightgbm import early_stopping, log_evaluation
from sklearn.model_selection import train_test_split


class LGBMModel():
    """Wrapper exposing ``fit`` / ``predict`` around ``lgb.train``."""

    def __init__(self,
                 num_boost_round=100,
                 early_stopping_rounds=20,
                 random_state=129,
                 verbose=-1,
                 n_jobs=1,
                 **lgb_param_grid,) -> None:
        """Collect the LightGBM parameters.

        The named settings and any extra keyword arguments are merged into the
        single parameter dict handed to ``lgb.train``; ``num_boost_round`` is
        passed to ``lgb.train`` separately.
        """
        super().__init__()
        merged_params = dict(verbose=verbose,
                             early_stopping_rounds=early_stopping_rounds,
                             random_state=random_state,
                             n_jobs=n_jobs)
        merged_params.update(lgb_param_grid)

        self._num_boost_round = num_boost_round
        self._lgb_param_grid = merged_params
        # No callbacks are registered; an empty list is passed to ``lgb.train``.
        self._callbacks = []
        self._random_state = random_state

    def fit(self, X, y, validation_set = None, val_size=0.05):
        """Train a booster and return ``self``.

        When ``validation_set`` (an ``(X_valid, y_valid)`` pair) is given, all of
        ``X`` / ``y`` is used for training; otherwise a ``val_size`` fraction is
        split off with ``train_test_split`` for validation.
        """
        if validation_set is not None:
            X_train, y_train = X, y
            X_valid, y_valid = validation_set
        else:
            X_train, X_valid, y_train, y_valid = train_test_split(
                X, y, test_size=val_size, random_state=self._random_state)

        lgb_train = lgb.Dataset(X_train, label=y_train)
        lgb_valid = lgb.Dataset(X_valid, label=y_valid, reference=lgb_train)
        self.model = lgb.train(
            self._lgb_param_grid,
            lgb_train,
            num_boost_round=self._num_boost_round,
            valid_sets=[lgb_train, lgb_valid],
            valid_names=['train', 'valid'],
            callbacks=self._callbacks,
        )
        return self

    def predict(self, X):
        """Return the fitted booster's predictions for ``X``."""
        return self.model.predict(X)
    

from typing import List
import numpy as np
import pandas as pd
import catboost as cat
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor, Pool, train


class CatBoostModel():
    """Wrapper exposing ``fit`` / ``predict`` around catboost's functional ``train`` API."""

    def __init__(self,
                 num_boost_round=100,
                 early_stopping_rounds=20,
                 random_state=129,
                 predict_type='prob',
                 verbose=False,
                 **catboost_params) -> None:
        """Collect the CatBoost settings.

        Args:
            num_boost_round: passed to ``train`` as ``num_boost_round``.
            early_stopping_rounds: passed to ``train`` as ``early_stopping_rounds``.
            random_state: used as ``random_seed`` and for the fallback validation split.
            predict_type: ``'prob'`` returns column 1 of the ``'Probability'``
                prediction, ``'class'`` returns the ``'Class'`` prediction, any
                other value returns the model's default prediction.
            verbose: forwarded in the parameter dict.
            **catboost_params: extra entries for the parameter dict.
        """
        super().__init__()
        self._num_boost_round = num_boost_round
        self._stopping_rounds = early_stopping_rounds
        self._catboost_params = {"random_seed" : random_state,
                                 "verbose" : verbose,
                                 **catboost_params}
        self._random_state = random_state
        self._predict_type = predict_type

    def fit(self, X, y, validation_set = None, val_size=0.05):
        """Train a CatBoost model and return ``self``.

        Columns of ``X`` with ``category`` dtype are registered as categorical
        features. When ``validation_set`` (an ``(X_valid, y_valid)`` pair) is
        given, all of ``X`` / ``y`` is used for training; otherwise a
        ``val_size`` fraction is split off for evaluation.
        """
        # Imported locally on purpose: the notebook's module-level alias
        # ``cat`` (``import catboost as cat``) is rebound by a later cell
        # (``cat = KToolsTrainer(...)``), which made ``cat.train`` fail with
        # AttributeError at fit time.
        from catboost import train as catboost_train

        self.cat_col_names = [col_name for col_name in X.columns if X[col_name].dtype == 'category']

        if validation_set is None:
            X_train, X_valid, y_train, y_valid = train_test_split(X,
                                                                  y,
                                                                  test_size=val_size,
                                                                  random_state=self._random_state)
        else:
            X_train, y_train = X, y
            X_valid, y_valid = validation_set

        train_pool = Pool(data=X_train, label=y_train, cat_features=self.cat_col_names)
        val_pool = Pool(data=X_valid, label=y_valid, cat_features=self.cat_col_names)
        self.model = catboost_train(
                params=self._catboost_params,
                dtrain=train_pool,
                eval_set=val_pool,
                num_boost_round=self._num_boost_round,
                early_stopping_rounds=self._stopping_rounds
                )
        return self

    def predict(self, X):
        """Predict for ``X`` using the categorical columns recorded during ``fit``."""
        test_pool = Pool(data=X, cat_features=self.cat_col_names)
        if self._predict_type == "prob":
            y_pred = self.model.predict(test_pool, prediction_type='Probability')[:, 1]
        elif self._predict_type == "class":
            y_pred = self.model.predict(test_pool, prediction_type='Class')
        else:
            y_pred = self.model.predict(test_pool)
        return y_pred
    
from typing import List
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb


class XGBoostModel():
    """Wrapper exposing ``fit`` / ``predict`` around ``xgb.train``."""

    def __init__(self,
                 eval_verbosity=False,
                 num_boost_round=100,
                 early_stopping_rounds=20,
                 random_state=129,
                 verbosity=0,
                 n_jobs=1,
                 **xgb_param_grid) -> None:
        """Collect the XGBoost settings.

        ``verbosity``, ``random_state``, ``n_jobs`` and any extra keyword
        arguments form the parameter dict given to ``xgb.train``; the remaining
        arguments are passed to ``xgb.train`` as separate keywords.
        """
        super().__init__()
        booster_params = dict(verbosity=verbosity,
                              random_state=random_state,
                              n_jobs=n_jobs)
        booster_params.update(xgb_param_grid)

        self._eval_verbosity = eval_verbosity
        self._num_boost_round = num_boost_round
        self._early_stopping_rounds = early_stopping_rounds
        self._xgb_param_grid = booster_params
        self._random_state = random_state

    def fit(self, X, y, validation_set = None, val_size=0.05):
        """Train a booster and return ``self``.

        When ``validation_set`` (an ``(X_valid, y_valid)`` pair) is given, all of
        ``X`` / ``y`` is used for training; otherwise a ``val_size`` fraction is
        split off with ``train_test_split`` for evaluation.
        """
        if validation_set is not None:
            X_train, y_train = X, y
            X_valid, y_valid = validation_set
        else:
            X_train, X_valid, y_train, y_valid = train_test_split(
                X, y, test_size=val_size, random_state=self._random_state)

        dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
        dvalid = xgb.DMatrix(X_valid, label=y_valid, enable_categorical=True)

        self.model = xgb.train(
            self._xgb_param_grid,
            dtrain,
            evals=[(dtrain, 'train'), (dvalid, 'eval')],
            early_stopping_rounds=self._early_stopping_rounds,
            num_boost_round=self._num_boost_round,
            verbose_eval=self._eval_verbosity
        )
        return self

    def predict(self, X):
        """Wrap ``X`` in a ``DMatrix`` and return the booster's predictions."""
        return self.model.predict(xgb.DMatrix(X, enable_categorical=True))
    

from sklearn.preprocessing import MinMaxScaler, TargetEncoder
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor


class HGBModel():
    """Histogram gradient boosting wrapper that target-encodes ``category`` columns first."""

    def __init__(self,
                 smooth="auto",
                 target_type="continuous",
                 num_boost_round=100,
                 early_stopping=True,
                 validation_fraction=0.05,
                 early_stopping_rounds=20,
                 verbose=0,
                 random_state=129,
                 **hgb_params) -> None:
        """Build the target encoder and the underlying estimator.

        ``target_type == "continuous"`` selects ``HistGradientBoostingRegressor``;
        any other value selects ``HistGradientBoostingClassifier``. Extra keyword
        arguments override the estimator settings assembled here.
        """
        hgb_params = {"max_iter" : num_boost_round,
                      "early_stopping" : early_stopping,
                      "validation_fraction" : validation_fraction,
                      "n_iter_no_change" : early_stopping_rounds,
                      "verbose" : verbose,
                      "random_state" : random_state,
                      "categorical_features" : "from_dtype",
                      **hgb_params}

        self._target_enc = TargetEncoder(target_type=target_type,
                                         smooth=smooth,
                                         random_state=random_state)
        self._target_type = target_type
        if target_type == "continuous":
            self.model = HistGradientBoostingRegressor(**hgb_params)
        else:
            self.model = HistGradientBoostingClassifier(**hgb_params)

    def _encode_categoricals(self, X, y=None, fit=False):
        """Return ``X`` with its ``category`` columns replaced by target-encoded values.

        The encoded columns are re-appended after the remaining columns. When
        ``X`` has no categorical columns it is returned untouched and the
        encoder is not called (it cannot work on a frame with zero columns).
        """
        categorical_features = [col_name for col_name in X.columns
                                if isinstance(X[col_name].dtype, pd.CategoricalDtype)]
        if not categorical_features:
            return X
        if fit:
            encoded = self._target_enc.fit_transform(X[categorical_features], y)
        else:
            encoded = self._target_enc.transform(X[categorical_features])
        # ``drop`` returns a new frame, so the caller's ``X`` is not modified.
        X = X.drop(columns=categorical_features)
        X[categorical_features] = encoded
        return X

    def fit(self, X, y, validation_set=None, **kwargs):
        """Fit the encoder and the estimator; return ``self``.

        ``validation_set`` and ``**kwargs`` are accepted for interface parity
        with the other model wrappers and are not used.
        """
        X = self._encode_categoricals(X, y, fit=True)
        self.model.fit(X, y)
        return self

    def predict(self, X):
        """Return predictions; for non-continuous targets, column 1 of ``predict_proba``."""
        X = self._encode_categoricals(X)
        if self._target_type == "continuous":
            y_pred = self.model.predict(X)
        else:
            y_pred = self.model.predict_proba(X)[:, 1]
        return y_pred

In [8]:
from enum import Enum
from functools import reduce
from typing import Any, Dict, List
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import mean_squared_error




class SupportedModelTypes(Enum):
    """Maps an upper-case model key to the wrapper class that implements it.

    ``KToolsTrainer._setup_model`` looks the class up by name
    (``SupportedModelTypes[key].value``) and instantiates it.
    """
    LGBM = LGBMModel
    CAT = CatBoostModel
    XGB = XGBoostModel
    HGB = HGBModel
    # Disabled entries: the referenced classes are not defined in this notebook.
    # PYTORCH = PytorchFFNModel
    # KERAS_EMB = KerasEmbeddingModel
    # KERAS_FM = KerasFM
    # TABNET = TabNetModel 

class SupportedClassificationParams(Enum):
    """Per-model keyword arguments merged into the model parameters for the ``BINARY`` task.

    Member names match those of ``SupportedModelTypes``.
    """
    LGBM = {'objective' : 'binary', 'metric' : 'binary_logloss'}
    CAT = {'loss_function':'Logloss', 'eval_metric' : "AUC"}
    XGB = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss'}
    HGB = {'target_type' : 'binary'}
    # Disabled entries, matching the disabled model types.
    # PYTORCH = {}
    # KERAS_EMB = {}
    # KERAS_FM = {}
    # TABNET =  {}

class SupportedRegressionParams(Enum):
    """Per-model keyword arguments merged into the model parameters for the ``REGRESSION`` task.

    Member names match those of ``SupportedModelTypes``.
    """
    LGBM = {'objective': 'regression', 'metric': 'rmse'}
    CAT = {"loss_function" : "RMSE", 'eval_metric': 'RMSE'}
    XGB = {"objective": "reg:squarederror", "eval_metric": "rmse"}
    HGB = {"target_type" : "continuous"}
    # Disabled entries, matching the disabled model types.
    # PYTORCH = {"loss": nn.MSELoss(), "metric_callable": mean_squared_error}
    # KERAS_EMB = {}
    # KERAS_FM = {}
    # TABNET =  {}
    

class KToolsTrainer:
    """End-to-end helper: load the data, cross-validate one model type, write predictions.

    Construction builds the model wrapper and loads and transforms the data
    set; ``fit_predict`` runs the cross-validation and writes the outputs.
    """

    def __init__(self,
                 model_type : str,
                 task : str,
                 model_parameters : Dict[str, Any],
                 kfold_object,
                 train_csv_path : str,
                 test_csv_path : str,
                 sample_csv_path : str,
                 target_col_name : str,
                 model_name : str = None,
                 output_file_path : str = None,
                 data_transforms : List[Any] = None,
                 eval_metric : callable = None,
                 verbose : bool = False
                 ) -> None:
        """Configure the trainer, instantiate the model and load the data.

        Args:
            model_type: member name of ``SupportedModelTypes`` (case-insensitive).
            task: ``'binary'`` or ``'regression'`` (case-insensitive).
            model_parameters: keyword arguments for the model wrapper; the
                task-specific parameters are merged on top of them. The dict
                passed in is not modified.
            kfold_object: splitter handed to ``CrossValidationExecutor``.
            train_csv_path / test_csv_path: CSV files read at construction.
            sample_csv_path: sample submission CSV; its second column is
                overwritten with the test predictions.
            target_col_name: name of the target column.
            model_name: used in output file names; defaults to ``str(model)``.
            output_file_path: string prefix for the OOF/test prediction files;
                when ``None`` nothing is written.
            data_transforms: callables applied in order to the settings object;
                ``None`` selects ``FillNullValues.transform`` followed by
                ``ConvertObjectToCategorical.transform``.
            eval_metric: called as ``metric(y_true, y_pred)``.
            verbose: stored only; ``fit_predict`` currently always runs the
                cross-validation with ``verbose=2``.

        Raises:
            KeyError: if ``model_type`` is not a supported model key.
            NotImplementedError: if ``task`` is neither binary nor regression.
        """
        # ``None`` sentinel instead of a mutable list default shared by all instances.
        if data_transforms is None:
            data_transforms = [FillNullValues.transform,
                               ConvertObjectToCategorical.transform]

        self._model_type = model_type.upper()
        self.model_name = model_name
        self._task = task.upper()
        self._model_parameters = model_parameters
        self._kfold_object = kfold_object
        self._eval_metric = eval_metric
        self._verbose = verbose
        self._data_transforms = list(data_transforms)
        self._train_csv_path = train_csv_path
        self._test_csv_path = test_csv_path
        self._sample_csv_path = sample_csv_path
        self._target_col_name = target_col_name
        self._output_file_path = output_file_path
        self.model = self._setup_model()
        self.train_df, self.test_df = self._setup_dataset()

    def _setup_model(self):
        """Instantiate the model wrapper with user parameters plus the task parameters.

        Task parameters take precedence over user-supplied keys of the same name.
        """
        model_class_obj = SupportedModelTypes[self._model_type].value
        if self._task == "BINARY":
            task_params = SupportedClassificationParams[self._model_type].value
        elif self._task == "REGRESSION":
            task_params = SupportedRegressionParams[self._model_type].value
        else:
            raise NotImplementedError(f"Unsupported task: {self._task}")

        # Merge into a new dict: ``dict.update`` on the caller's dict would leak
        # the task parameters into an object the caller may reuse.
        self._model_parameters = {**self._model_parameters, **task_params}
        return model_class_obj(**self._model_parameters)

    def _setup_dataset(self):
        """Load the CSVs, apply the data transforms and return ``(train_df, test_df)``.

        The target column is removed from the returned test frame.
        """
        settings = DataSciencePipelineSettings(self._train_csv_path,
                                               self._test_csv_path,
                                               self._target_col_name,
                                               )

        settings = reduce(lambda acc, func: func(acc), self._data_transforms, settings)
        train_df, test_df = settings.update()
        test_df = test_df.drop(columns=[self._target_col_name])
        return train_df, test_df

    def fit_predict(self):
        """Cross-validate, average the fold models' test predictions and write the outputs.

        When ``output_file_path`` is set, the OOF and test predictions are
        written to ``<output_file_path><model_name>_oofs.csv`` /
        ``_test.csv`` and a submission file ``<model_name>_submission.csv`` is
        written to the current working directory.
        """
        X, y = self.train_df.drop(columns=self._target_col_name), self.train_df[[self._target_col_name]]
        score_tuple, oof_predictions, model_list = CrossValidationExecutor(self.model,
                                                                           self._eval_metric,
                                                                           self._kfold_object,
                                                                           verbose=2
                                                                           ).run(X, y)

        # Equal-weight average of the per-fold models on the test set.
        num_splits = self._kfold_object.get_n_splits()
        test_predictions = np.zeros(self.test_df.shape[0])
        for model in model_list:
            test_predictions += model.predict(self.test_df)/num_splits

        self.model_name = str(self.model) if self.model_name is None else self.model_name
        if self._output_file_path is not None:
            # Create the target directory up front so a missing folder cannot
            # discard the results after training has finished.
            output_dir = os.path.dirname(self._output_file_path)
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)

            pd.Series(oof_predictions).to_csv(self._output_file_path + self.model_name + "_oofs.csv")
            pd.Series(test_predictions).to_csv(self._output_file_path + self.model_name + "_test.csv")

            sample_sub = pd.read_csv(self._sample_csv_path)
            sample_sub.iloc[:, 1] =  test_predictions
            sample_sub.to_csv(f"{self.model_name}_submission.csv", index=False)

In [None]:
# CatBoost parameter set using the GPU task type and Bayesian bootstrap.
cat_gpu_params = {
    "loss_function": "RMSE",
    "eval_metric": "RMSE",
    "task_type": "GPU",
    "devices": "0,1",
    "bootstrap_type": "Bayesian",
    "num_boost_round": 10000,
    "early_stopping_rounds": 20,
    "max_bin": 119,
    "learning_rate": 0.03544042692533354,
    "depth": 11,
    "bagging_temperature": 0.274090640514847,
    "min_data_in_leaf": 322.15353586384697,
    "l2_leaf_reg": 2.683155590343877e-05,
    "leaf_estimation_iterations": 5,
    "leaf_estimation_method": "Newton",
}

# CatBoost parameter set using the Lossguide grow policy.
cat_lg_params = {
    "loss_function": "RMSE",
    "eval_metric": "RMSE",
    "grow_policy": "Lossguide",
    "num_boost_round": 1500,
    "early_stopping_rounds": 10,
    "max_bin": 429,
    "learning_rate": 0.0444690180044118,
    "depth": 16,
    "bagging_temperature": 46.938242378130056,
    "subsample": 0.9822763758421824,
    "colsample_bylevel": 0.9965797729601968,
    "min_data_in_leaf": 200.85284748997194,
    "l2_leaf_reg": 0.028938997264293807,
    "leaf_estimation_iterations": 1,
    "random_strength": 3.568433370233164,
    "leaf_estimation_method": "Gradient",
}

In [None]:
kf = KFold(5, shuffle=True, random_state=42)

# Applied in order to the pipeline settings object.
transforms = [
            FillNullValues.transform,
            CreateYuweiFeatures.transform,
            ConvertObjectToCategorical.transform,
            LogTransformTarget.transform,
            ]

# Named ``cat_trainer`` rather than ``cat``: ``cat`` is the module alias from
# ``import catboost as cat`` that ``CatBoostModel.fit`` calls as ``cat.train``.
# Rebinding it to the trainer made every fit fail with AttributeError.
# NOTE(review): LogTransformTarget puts the target in log1p space and
# ``fit_predict`` writes the averaged predictions to the submission as-is --
# confirm whether the inverse transform (expm1) is applied elsewhere.
cat_trainer = KToolsTrainer('cat',
                            'regression',
                            {'predict_type' : 'else'},
                            kf,
                            train_csv_path,
                            test_csv_path,
                            sample_sub_csv_path,
                            target_col_name,
                            model_name='cat_with_fe_tuned',
                            output_file_path='data/insurance/oofs/',
                            eval_metric=root_mean_squared_error,
                            data_transforms=transforms,
                            )

cat_trainer.fit_predict()