In [None]:
import os
from copy import deepcopy
from dataclasses import dataclass
from functools import reduce
from typing import Dict, List

import numpy as np
import openfe
import pandas as pd
from scipy.stats import ks_2samp
from sklearn.metrics import roc_auc_score, accuracy_score, root_mean_squared_error, root_mean_squared_log_error, mean_absolute_error
from sklearn.model_selection import StratifiedKFold, KFold

from ktools.fitting.cross_validation_executor import CrossValidationExecutor
from ktools.modelling.create_oof_from_model import create_oofs_from_model
from ktools.modelling.models.catboost_model import CatBoostModel
from ktools.modelling.models.lgbm_model import LGBMModel
from ktools.preprocessing.basic_feature_transformers import *
from ktools.utils.data_science_pipeline_settings import DataSciencePipelineSettings

In [None]:
# Directory holding the CSV files used by this notebook. The default is the
# original author's local checkout; set KTOOLS_INSURANCE_DATA_DIR to run the
# notebook against another location without editing this cell.
DATA_DIR = os.environ.get(
    "KTOOLS_INSURANCE_DATA_DIR",
    "/Users/yuwei-1/Documents/projects/Kaggle-tools/data/insurance",
)

train_csv_path = os.path.join(DATA_DIR, "train.csv")
original_csv_path = os.path.join(DATA_DIR, "original.csv")
test_csv_path = os.path.join(DATA_DIR, "test.csv")
sample_sub_csv_path = os.path.join(DATA_DIR, "sample_submission.csv")

# Column the models in this notebook are trained to predict.
target_col_name = "Premium Amount"

In [None]:
def func(x):
    """Return ``x`` unchanged; used as the default ``original_csv_processing`` hook."""
    return x


@dataclass
class DataSciencePipelineSettings:
    """Load the train/test CSVs and hold them together with derived column metadata.

    On construction the CSVs are read, a leading index-like column is dropped
    from each frame (see ``_smart_drop_index``), the optional "original"
    dataset is appended to the training rows, and both frames are stacked into
    ``combined_df`` under the outer index keys ``'train'`` and ``'test'``.

    Attributes created in ``__post_init__``:
        train_df, test_df: the loaded frames.
        combined_df: ``pd.concat`` of ``train_df`` and ``test_df`` keyed by
            ``'train'`` / ``'test'``.

    Notes:
        * ``training_col_names`` and ``categorical_col_names`` are always
          recomputed from the loaded training frame; values passed to the
          constructor are overwritten.
        * ``sample_submission_path``, ``training_data_percentage``,
          ``category_occurrence_threshold`` and ``logged`` are stored but not
          read by any method of this class.
        * This definition rebinds the name imported from
          ``ktools.utils.data_science_pipeline_settings`` in the import cell.
    """
    train_csv_path : str
    test_csv_path : str
    target_col_name : str
    original_csv_path : str = None
    original_csv_processing : callable = func
    sample_submission_path : str = None
    training_col_names : List[str] = None
    categorical_col_names : List[str] = None
    training_data_percentage : float = 0.8
    category_occurrence_threshold : int = 300
    logged : bool = False

    def __post_init__(self):
        # Order matters: column info is derived from the loaded training frame,
        # and the combined frame is built from both loaded frames.
        self.train_df, self.test_df = self._load_csv_paths()
        self.training_col_names, self.categorical_col_names = self._get_column_info()
        self.combined_df = self._combine_datasets()

    def _load_csv_paths(self):
        """Read the CSVs and, when configured, append the original dataset to train.

        Returns:
            ``(train_df, test_df)``. When ``original_csv_path`` is set, both
            frames gain a ``source`` column and ``train_df`` additionally
            contains the processed original rows.

        Raises:
            AssertionError: if the train frame and the processed original
                frame differ in column names or dtypes.
        """
        train_df = self._smart_drop_index(pd.read_csv(self.train_csv_path))
        test_df = self._smart_drop_index(pd.read_csv(self.test_csv_path))
        if self.original_csv_path is not None:
            # 'source' records provenance: 0 for rows read from the train/test
            # CSVs, 1 for rows read from the original CSV.
            train_df = train_df.assign(source=0)
            test_df = test_df.assign(source=0)
            original_df = self._smart_drop_index(pd.read_csv(self.original_csv_path)).assign(source=1)
            original_df = self.original_csv_processing(original_df)

            # Fail fast if the two training sources disagree on schema.
            pd.testing.assert_index_equal(train_df.columns.sort_values(), original_df.columns.sort_values(), check_exact=True)
            pd.testing.assert_series_equal(train_df.dtypes.sort_index(), original_df.dtypes.sort_index(), check_exact=True)
            train_df = pd.concat([train_df, original_df], axis=0).reset_index(drop=True)

        return train_df, test_df

    @staticmethod
    def _is_text_dtype(dtype):
        """Return True for object dtype and for the pandas string extension dtype."""
        return pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype)

    def _get_column_info(self):
        """Return ``(feature column names, text-typed column names)`` for ``train_df``.

        The text-typed list is computed over every training column, so it
        includes the target column when the target itself is text-typed.
        """
        cat_col_names = [
            col_name
            for col_name in self.train_df.columns
            if self._is_text_dtype(self.train_df[col_name].dtype)
        ]
        training_features = list(self.train_df.drop(columns=self.target_col_name).columns)
        return training_features, cat_col_names

    def _combine_datasets(self):
        """Stack train and test rows into one frame keyed by ``'train'`` / ``'test'``."""
        combined_df = pd.concat([self.train_df, self.test_df], keys=['train', 'test'])
        return combined_df

    def update(self):
        """Refresh ``train_df`` / ``test_df`` from ``combined_df`` and return both."""
        self.train_df = self.combined_df.loc['train'].copy()
        self.test_df = self.combined_df.loc['test'].copy()
        return self.train_df, self.test_df

    @staticmethod
    def _smart_drop_index(df):
        """Drop the first column when consecutive values all differ by one constant.

        This is a best-effort heuristic: if the first column cannot be
        differenced (for example it holds text, or the frame has no columns),
        the frame is returned unchanged.
        """
        try:
            differences = df.iloc[:, 0].diff().dropna()
            if differences.nunique() == 1:
                df = df.drop(columns=df.columns[0])
        except Exception:
            # Deliberately tolerant, but no longer a bare ``except`` so that
            # KeyboardInterrupt / SystemExit still propagate.
            pass
        return df

In [None]:
class LogTransformTarget():
    """Pipeline step that replaces the target column with ``log1p`` of itself."""
    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        """Return a deep copy of the settings whose target column is log1p-transformed.

        The settings object passed in is left untouched.
        """
        transformed = deepcopy(original_settings)
        target = transformed.target_col_name
        frame = transformed.combined_df  # same object as transformed.combined_df
        frame[target] = np.log1p(frame[target])
        return transformed
    
class FillNullValues():
    """Pipeline step that fills missing values in the training feature columns."""
    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings, numeric_fill=-1, category_fill='missing'):
        """Return a deep copy of the settings with missing feature values filled.

        Only the columns listed in ``training_col_names`` are touched; columns
        added to ``combined_df`` without being registered there are skipped.

        Args:
            original_settings: settings object exposing ``training_col_names``
                and ``combined_df``; it is not modified.
            numeric_fill: value used for numeric columns.
            category_fill: value used for every other column.
        """
        settings = deepcopy(original_settings)
        for col_name in settings.training_col_names:
            column = settings.combined_df[col_name]
            if pd.api.types.is_numeric_dtype(column):
                fill_value = numeric_fill
            else:
                fill_value = category_fill
                if isinstance(column.dtype, pd.CategoricalDtype):
                    if not column.isna().any():
                        # Nothing to fill; leave the categories untouched.
                        continue
                    if fill_value not in column.cat.categories:
                        # A categorical can only be filled with an existing
                        # category, so register the fill value first.
                        column = column.cat.add_categories([fill_value])
            settings.combined_df[col_name] = column.fillna(fill_value)
        return settings
    
class ConvertObjectToCategorical():
    """Pipeline step that casts the recorded categorical columns to ``category`` dtype."""
    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        """Return a deep copy of the settings with ``categorical_col_names`` cast to category.

        The settings object passed in is left untouched.
        """
        converted = deepcopy(original_settings)
        frame = converted.combined_df  # same object as converted.combined_df
        names = converted.categorical_col_names
        frame[names] = frame[names].astype('category')
        return converted
    
class CreateDateTimeColumns():
    """Pipeline step that derives ``year`` / ``month`` / ``day`` columns from a date column."""
    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings, date_col_name='Policy Start Date'):
        """Return a deep copy of the settings with year, month and day columns added.

        The new columns are written to ``combined_df`` only; they are not
        appended to ``training_col_names``.

        Args:
            original_settings: settings object exposing ``combined_df``; it is
                not modified.
            date_col_name: column parsed with ``pd.to_datetime``. Defaults to
                the column name this notebook's dataset uses.
        """
        settings = deepcopy(original_settings)
        # Parse once and reuse, instead of re-parsing for each derived column.
        parsed = pd.to_datetime(settings.combined_df[date_col_name])
        settings.combined_df['year'] = parsed.dt.year
        settings.combined_df['month'] = parsed.dt.month
        settings.combined_df['day'] = parsed.dt.day
        return settings

In [None]:
from functools import reduce
from typing import Any, Dict, List, Tuple
import numpy as np
import pandas as pd
from copy import deepcopy


class CrossValidationExecutor:
    """Run k-fold cross-validation for one model and collect out-of-fold predictions.

    The model is called as ``deepcopy(model).fit(X, y, validation_set=...)``,
    whose return value must offer ``predict(X)``. The splitter must offer
    ``get_n_splits()`` and ``split(X, y)``.

    Note: this definition rebinds the ``CrossValidationExecutor`` name imported
    from ``ktools.fitting.cross_validation_executor`` in the import cell.
    """

    def __init__(self,
                 sklearn_model_instance,
                 evaluation_metric : callable,
                 kfold_object,
                 use_test_as_valid=True,
                 num_classes=None,
                 verbose=1) -> None:
        """Store the model, metric and splitter used by ``run``.

        Args:
            sklearn_model_instance: template model; a deep copy is fitted per fold.
            evaluation_metric: called as ``metric(y_true, y_pred)``.
            kfold_object: splitter providing the folds.
            use_test_as_valid: when true, each fold's held-out data is also
                passed to ``fit`` as ``validation_set``.
            num_classes: when set, prediction buffers are 2-D with this many
                columns; otherwise they are 1-D.
            verbose: ``> 0`` prints the summary, ``> 1`` also prints per-fold scores.
        """
        self.model = sklearn_model_instance
        self._evaluation_metric = evaluation_metric
        self._kf = kfold_object
        self._num_splits = kfold_object.get_n_splits()
        self._use_test_as_valid = use_test_as_valid
        self._num_classes = num_classes
        self._verbose = verbose

    @staticmethod
    def _assert_same_schema(reference, other) -> None:
        """Raise AssertionError unless ``other`` matches ``reference`` in columns/dtype.

        DataFrames are compared on column index and per-column dtypes. A Series
        has no ``columns``, so Series are compared on dtype only.
        """
        if isinstance(reference, pd.DataFrame):
            if not isinstance(other, pd.DataFrame):
                raise AssertionError("additional data must be a DataFrame when the main data is a DataFrame")
            pd.testing.assert_index_equal(reference.columns, other.columns, check_exact=True)
            pd.testing.assert_series_equal(reference.dtypes, other.dtypes, check_exact=True)
        elif isinstance(other, pd.DataFrame) or reference.dtype != other.dtype:
            raise AssertionError("additional data must be a Series with the same dtype as the main data")

    def run(self, X, y, additional_data=None, local_transform_list=[lambda x : x], output_transform_list=[lambda x : x]) -> Tuple[Tuple[float], np.ndarray, List[Any]]:
        """Fit one model per fold and score the held-out predictions.

        Args:
            X: feature frame, indexed positionally with ``.iloc``.
            y: target Series or DataFrame aligned with ``X``.
            additional_data: optional ``(X_add, y_add)`` appended to the
                training part of every fold (never to the held-out part).
            local_transform_list: callables applied in order to the
                ``(X_train, y_train)`` tuple of each fold; each must return
                such a tuple. The default list is never mutated here.
            output_transform_list: callables applied in order to the raw
                predictions before they are scored.

        Returns:
            ``((oof_score, mean_cv_score), oof_predictions, model_list)`` where
            ``oof_predictions`` holds the raw (untransformed) predictions.

        Raises:
            AssertionError: if ``additional_data`` does not match ``X`` / ``y``
                in columns or dtypes.
        """
        if additional_data is not None:
            X_add, y_add = additional_data
            self._assert_same_schema(X, X_add)
            self._assert_same_schema(y, y_add)

        cv_results = []
        model_list = []
        if self._num_classes is None:
            buffer_shape = y.shape[0]
        else:
            buffer_shape = (y.shape[0], self._num_classes)
        oof_predictions = np.zeros(buffer_shape)
        metric_predictions = np.zeros(buffer_shape)

        for i, (train_index, val_index) in enumerate(self._kf.split(X, y)):

            X_train, X_test = X.iloc[train_index], X.iloc[val_index]
            y_train, y_test = y.iloc[train_index], y.iloc[val_index]

            if additional_data is not None:
                X_train = pd.concat([X_train, X_add], axis=0)
                y_train = pd.concat([y_train, y_add], axis=0)

            X_train, y_train = reduce(lambda acc, func: func(acc), local_transform_list, (X_train, y_train))
            validation_set = None
            if self._use_test_as_valid:
                validation_set = [X_test, y_test]

            # Fit a fresh copy so folds never share fitted state.
            model = deepcopy(self.model).fit(X_train, y_train, validation_set=validation_set)
            model_list += [model]
            y_pred = model.predict(X_test)
            y_pred_processed = reduce(lambda acc, func: func(acc), output_transform_list, y_pred)

            cv_results += [self._evaluation_metric(y_test, y_pred_processed)]
            # Raw predictions are returned; transformed ones are only scored.
            oof_predictions[val_index] = y_pred
            metric_predictions[val_index] = y_pred_processed

            if self._verbose > 1:
                print(f"The CV results of the current fold is {cv_results[-1]}")

        oof_score = self._evaluation_metric(y, metric_predictions)
        mean_cv_score = np.mean(cv_results)
        score_tuple = (oof_score, mean_cv_score)

        if self._verbose > 0:
            print("#"*100)
            print("OOF prediction score : ", oof_score)
            print(f"Mean {self._num_splits}-cv results : {mean_cv_score} +- {np.std(cv_results)}")
            print("#"*100)

        return score_tuple, oof_predictions, model_list

In [None]:
class RobustFeatureImportanceChecker:
    """Test whether each candidate feature shifts the CV score distribution.

    A baseline population of out-of-fold scores is built from the original
    columns over a grid of (model seed, CV seed) pairs. For every added
    feature a smaller population is built with that feature included, and the
    two populations are compared with ``scipy.stats.ks_2samp``.

    Results are appended to ``robust_feature_importance.csv`` and the baseline
    population is cached in ``initial_score_population.npy``, both inside
    ``result_path``.

    Notes:
        * ``run`` overwrites ``kfold_object.random_state`` on the splitter
          object that was passed in.
        * A later cell imports a class with this same name from
          ``ktools.experimentation.robust_feature_importance_checker``; after
          that import runs, the name refers to the imported class.
    """

    def __init__(self,
                 train_csv_path : str,
                 test_csv_path : str,
                 target_col_name : str,
                 feature_creator,
                 kfold_object,
                 metric_callable : callable,
                 metric_direction : str = "maximize",
                 model_type : str = 'lgbm',
                 model_params : Dict = {},
                 sqrt_population_size : int = 10,
                 result_path : str = None,
                 initial_transform_list : List[callable] = [FillNullValues.transform,
                                                            ConvertObjectToCategorical.transform],
                 original_csv_path : str = None
                 ) -> None:
        """Load the data, apply the initial transforms and create candidate features.

        Args:
            train_csv_path, test_csv_path, target_col_name, original_csv_path:
                forwarded to ``DataSciencePipelineSettings``.
            feature_creator: object whose ``create(df)`` returns
                ``(df_with_features, added_feature_names)``.
            kfold_object: splitter handed to ``CrossValidationExecutor``.
            metric_callable: scoring function handed to ``CrossValidationExecutor``.
            metric_direction: ``"maximize"`` if a larger score is better; any
                other value is treated as smaller-is-better.
            model_type: only ``'lgbm'`` is implemented.
            model_params: extra keyword arguments for the model; never mutated here.
            sqrt_population_size: number of seeds per axis of the baseline
                seed grid (the baseline holds this value squared scores).
            result_path: directory for the cache and the result CSV; required
                by ``run``.
            initial_transform_list: callables applied in order to the settings
                object; never mutated here.
        """
        self._train_csv_path = train_csv_path
        self._test_csv_path = test_csv_path
        self._target_col_name = target_col_name
        self._feature_creator = feature_creator
        self._kfold_object = kfold_object
        self._metric_callable = metric_callable
        self._metric_direction = metric_direction
        self._model_type = model_type
        self._model_params = model_params
        self._sqrt_population_size = sqrt_population_size
        self._result_path = result_path
        self._initial_transform_list = initial_transform_list
        self._original_csv_path = original_csv_path
        combined_df, self._added_feature_names, self._original_col_names = self._setup()
        self._train_df, self._test_df = combined_df.loc['train'], combined_df.loc['test']


    def _setup(self):
        """Build the transformed combined frame and the candidate feature list.

        Returns:
            ``(combined_df, added_feature_names, original_col_names)`` where
            ``original_col_names`` is the settings' ``training_col_names``
            after the initial transforms.
        """
        settings = DataSciencePipelineSettings(self._train_csv_path,
                                                self._test_csv_path,
                                                self._target_col_name,
                                                original_csv_path=self._original_csv_path
                                                )
        settings = reduce(lambda acc, func: func(acc), self._initial_transform_list, settings)
        original_col_names = settings.training_col_names
        settings.update()
        combined_df, added_feature_names = self._feature_creator.create(settings.combined_df)
        return combined_df, added_feature_names, original_col_names

    def _collect_scores(self, X, y, feature_names, first_seed, num_seeds):
        """Return the OOF score for every (model seed, CV seed) pair in the seed range."""
        seeds = range(first_seed, first_seed + num_seeds)
        scores = []
        for model_random_state in seeds:
            for cv_random_state in seeds:
                model = self.get_model_instance(model_random_state)
                self._kfold_object.random_state = cv_random_state
                score_tuple, _, _ = CrossValidationExecutor(model,
                                                            self._metric_callable,
                                                            self._kfold_object,
                                                            ).run(X[feature_names], y)
                scores.append(score_tuple[0])
        return np.array(scores)

    def run(self):
        """Evaluate every added feature against the baseline score population.

        Raises:
            ValueError: if ``result_path`` was not provided.
        """
        feature_col_name = "added_feature"
        score_col_name = "cv_score"
        significance_col_name = "significance"
        importance_col_name = "important_feature"

        if self._result_path is None:
            raise ValueError("result_path must be provided: run() caches scores and writes its results there")
        if self._result_path:
            # Create the directory up front so a missing folder does not make
            # np.save fail after the whole baseline has been computed.
            os.makedirs(self._result_path, exist_ok=True)

        X, y = self._train_df.drop(columns=self._target_col_name), self._train_df[self._target_col_name]
        initial_score_population_path = os.path.join(self._result_path, 'initial_score_population.npy')
        csv_results_path = os.path.join(self._result_path, 'robust_feature_importance.csv')
        if os.path.exists(csv_results_path):
            history = pd.read_csv(csv_results_path)
        else:
            history = pd.DataFrame({feature_col_name : [],
                                    score_col_name : [],
                                    significance_col_name : [],
                                    importance_col_name : []})

        if os.path.exists(initial_score_population_path):
            print("Initial population found")
            initial_score_population = np.load(initial_score_population_path)
        else:
            print("#"*100)
            print("Initial population not found, creating now...")
            print("#"*100)
            initial_score_population = self._collect_scores(X, y,
                                                            self._original_col_names,
                                                            42,
                                                            self._sqrt_population_size)
            np.save(initial_score_population_path, initial_score_population)

        if not (history[feature_col_name] == "original").any():
            new_entry = pd.DataFrame({feature_col_name : ["original"],
                                      score_col_name : [initial_score_population.mean()],
                                      significance_col_name : [0],
                                      importance_col_name : [None]
                                      })
            history = pd.concat([history, new_entry])
            history.to_csv(csv_results_path, index=False)

        # Candidate features use half as many seeds per axis as the baseline,
        # but always at least one so the sample is never empty.
        num_feature_seeds = max(1, int(self._sqrt_population_size*0.5))
        significance = 0.05

        for feature in self._added_feature_names:
            feature_subsample = self._collect_scores(X, y,
                                                     self._original_col_names + [feature],
                                                     1024,
                                                     num_feature_seeds)
            res = ks_2samp(initial_score_population, feature_subsample)

            # The direction must be known before it is combined with the
            # p-value; it was previously read before being assigned.
            if self._metric_direction == "maximize":
                correct_direction = initial_score_population.mean() < feature_subsample.mean()
            else:
                correct_direction = initial_score_population.mean() > feature_subsample.mean()
            important = bool((res.pvalue < significance) and correct_direction)

            print("#"*100)
            print("RESULT: ", res)
            print(f"Original mean: {initial_score_population.mean()}, New mean: {feature_subsample.mean()}")
            print("CHANGE IS USEFUL: ", important)
            print("#"*100)

            new_entry = pd.DataFrame({feature_col_name : [feature],
                                      score_col_name : [feature_subsample.mean()],
                                      significance_col_name : [res.pvalue],
                                      importance_col_name : [important]
                                      })
            history = pd.concat([history, new_entry])
            # Written after every feature so partial progress is kept on disk.
            history.to_csv(csv_results_path, index=False)

    def get_model_instance(self, random_state):
        """Return a new model of the configured type seeded with ``random_state``.

        Raises:
            NotImplementedError: for any ``model_type`` other than ``'lgbm'``.
        """
        if self._model_type == 'lgbm':
            return LGBMModel(random_state=random_state, colsample_bytree=0.9, subsample=0.9, **self._model_params)
        else:
            raise NotImplementedError

In [None]:
from ktools.experimentation.robust_feature_importance_checker import RobustFeatureImportanceChecker


class CategoricalFeatureCreator():
    """Add a ``category``-dtype copy of every non-categorical feature column."""

    def __init__(self, target_col_name : str = "Premium Amount"):
        """Remember which column is the target so it is never turned into a feature.

        Args:
            target_col_name: column excluded from feature creation. Defaults
                to the target used in this notebook.
        """
        self._target_col_name = target_col_name

    def create(self, df : pd.DataFrame):
        """Add ``cat_<column>`` columns to ``df`` and report their names.

        The frame is modified in place and also returned.

        Returns:
            ``(df, added_feats)`` where ``added_feats`` lists the new column
            names in creation order.
        """
        # Snapshot the column list first: new columns are added while looping.
        feature_columns = [x for x in df.columns if x != self._target_col_name]

        added_feats = []
        for col in feature_columns:
            if not isinstance(df[col].dtype, pd.CategoricalDtype):
                new_col = "cat_" + col
                added_feats.append(new_col)
                df[new_col] = df[col].astype('category')

        return df, added_feats


# 5-fold shuffled splitter handed to the checker.
kf = KFold(5, shuffle=True, random_state=42)

# NOTE: `RobustFeatureImportanceChecker` here is the class imported at the top
# of this cell; that import rebinds the name defined earlier in the notebook.
# NOTE(review): `metric_direction` is assumed to be accepted by the imported
# class just as it is by the notebook's own definition - confirm.
checker = RobustFeatureImportanceChecker(train_csv_path,
                                        test_csv_path,
                                        target_col_name,
                                        CategoricalFeatureCreator(),
                                        kf,
                                        root_mean_squared_error,
                                        # RMSE is an error: a feature helps only
                                        # when it lowers the score.
                                        metric_direction="minimize",
                                        model_params={'objective': 'regression', 'metric': 'rmse'},
                                        result_path="ktools/experimentation/Tests/TestData/insurance",
                                        initial_transform_list = [LogTransformTarget.transform,
                                                                  CreateDateTimeColumns.transform,
                                                                  FillNullValues.transform,
                                                                  ConvertObjectToCategorical.transform],
                                        )

checker.run()

Initial population found
####################################################################################################
OOF prediction score :  1.0519621567815338
Mean 5-cv results : 1.0519615523389354 +- 0.0011276972618267532
####################################################################################################
####################################################################################################
OOF prediction score :  1.0519515815271607
Mean 5-cv results : 1.0519487069942672 +- 0.002459213407687471
####################################################################################################
####################################################################################################
OOF prediction score :  1.0519355771339254
Mean 5-cv results : 1.0519339911848267 +- 0.0018266444772135969
####################################################################################################
#################################################