In [None]:
import os
import random
from abc import ABC, abstractmethod
from copy import deepcopy
from dataclasses import dataclass
from functools import reduce
from typing import Any, List, Union

import numpy as np
import pandas as pd
import torch
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task
from sklearn.model_selection import KFold

In [None]:
def func(x):
    """Identity hook: hand the argument back untouched.

    Used as the default ``original_csv_processing`` callable of
    ``DataSciencePipelineSettings``, i.e. "apply no extra processing".
    """
    passthrough = x
    return passthrough


@dataclass
class DataSciencePipelineSettings:
    """Load the train/test CSVs and expose them as one stacked frame.

    Construction reads the CSV files, works out the feature and categorical
    column names, and builds ``combined_df`` (train stacked on test under the
    outer index keys ``'train'`` / ``'test'``). Transforms edit ``combined_df``
    and ``update()`` splits it back into ``train_df`` / ``test_df``.

    Attributes set by ``__post_init__`` (not constructor arguments):
        train_df, test_df: the loaded frames.
        combined_df: ``train_df`` and ``test_df`` concatenated with keys.
    """
    train_csv_path : str
    test_csv_path : str
    target_col_name : str
    # Optional extra CSV appended to the training rows (flagged by a `source` column).
    original_csv_path : Union[str, None] = None
    # Applied to the extra frame before it is checked and concatenated.
    original_csv_processing : callable = func
    # Not read by any method of this class.
    sample_submission_path : Union[str, None] = None
    # Left as None -> inferred from the loaded training frame.
    training_col_names : Union[List[str], None] = None
    categorical_col_names : Union[List[str], None] = None
    # The three fields below are not read by any method of this class.
    training_data_percentage : float = 0.8
    category_occurrence_threshold : int = 300
    logged : bool = False

    def __post_init__(self):
        """Read the data and fill in every derived attribute."""
        self.train_df, self.test_df = self._load_csv_paths()
        inferred_features, inferred_categoricals = self._get_column_info()
        # Only fall back to inference when the caller did not name the columns;
        # previously an explicit list was silently overwritten.
        if self.training_col_names is None:
            self.training_col_names = inferred_features
        if self.categorical_col_names is None:
            self.categorical_col_names = inferred_categoricals
        self.combined_df = self._combine_datasets()

    def _load_csv_paths(self):
        """Read the train and test CSVs, optionally appending the extra dataset.

        When ``original_csv_path`` is set, train and test get ``source=0``, the
        extra frame gets ``source=1`` and is passed through
        ``original_csv_processing``; it must then have exactly the same columns
        and dtypes as the training frame before being appended to it.

        Returns:
            ``(train_df, test_df)``.

        Raises:
            AssertionError: the extra frame's columns or dtypes differ from train.
        """
        train_df = self._smart_drop_index(pd.read_csv(self.train_csv_path))
        test_df = self._smart_drop_index(pd.read_csv(self.test_csv_path))
        if self.original_csv_path is not None:
            train_df = train_df.assign(source=0)
            test_df = test_df.assign(source=0)
            original_df = self._smart_drop_index(pd.read_csv(self.original_csv_path)).assign(source=1)
            original_df = self.original_csv_processing(original_df)

            pd.testing.assert_index_equal(train_df.columns.sort_values(), original_df.columns.sort_values(), check_exact=True)
            pd.testing.assert_series_equal(train_df.dtypes.sort_index(), original_df.dtypes.sort_index(), check_exact=True)
            train_df = pd.concat([train_df, original_df], axis=0).reset_index(drop=True)

        return train_df, test_df

    def _get_column_info(self):
        """Infer ``(feature_names, categorical_names)`` from ``train_df``.

        Features are every column except the target; categoricals are the
        columns whose dtype is ``object``.
        """
        cat_col_names = [col_name for col_name in self.train_df.columns if self.train_df[col_name].dtype == 'object']
        training_features = list(self.train_df.drop(columns=self.target_col_name).columns)
        return training_features, cat_col_names

    def _combine_datasets(self):
        """Stack train on test, keyed ``'train'`` / ``'test'`` in the outer index level."""
        return pd.concat([self.train_df, self.test_df], keys=['train', 'test'])

    def update(self):
        """Refresh ``train_df`` / ``test_df`` from ``combined_df`` and return both."""
        self.train_df = self.combined_df.loc['train'].copy()
        self.test_df = self.combined_df.loc['test'].copy()
        return self.train_df, self.test_df

    @staticmethod
    def _smart_drop_index(df):
        """Drop the first column when it looks like an exported row index.

        "Looks like" means all successive differences of the column are equal
        (note this also matches a constant first column, whose differences are
        all zero). Columns that cannot be differenced are left in place.
        """
        try:
            differences = df.iloc[:, 0].diff().dropna()
            if differences.nunique() == 1:
                df = df.drop(columns=df.columns[0])
        except Exception:
            # Best effort: e.g. a text first column or a frame without columns.
            # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
            pass
        return df

In [None]:
class FillNullValues():
    """Pipeline transform that replaces missing values in the feature columns."""

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings, numeric_fill=-1, category_fill='missing'):
        """Return a deep copy of the settings with nulls filled in ``combined_df``.

        Only the columns listed in ``training_col_names`` are touched: numeric
        columns receive ``numeric_fill``, all others ``category_fill``. The
        settings object passed in is left unmodified.
        """
        settings = deepcopy(original_settings)
        frame = settings.combined_df
        for col_name in settings.training_col_names:
            column = frame[col_name]
            replacement = numeric_fill if pd.api.types.is_numeric_dtype(column) else category_fill
            frame[col_name] = column.fillna(replacement)
        return settings
    

class ConvertObjectToCategorical():
    """Pipeline transform that casts the categorical columns to ``category`` dtype."""

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        """Return a deep copy of the settings with ``categorical_col_names`` cast.

        The cast is done on ``combined_df`` (train and test together); the
        settings object passed in is left unmodified.
        """
        settings = deepcopy(original_settings)
        names = settings.categorical_col_names
        as_category = settings.combined_df[names].astype('category')
        settings.combined_df[names] = as_category
        return settings
    
class LogTransformTarget():
    """Pipeline transform that replaces the target column by ``log1p(target)``."""

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        """Return a deep copy of the settings whose target column is log1p-transformed.

        Applied to the target column of ``combined_df``. The settings object
        passed in is left unmodified, and the ``logged`` attribute of the copy
        is not changed here.
        """
        settings = deepcopy(original_settings)
        target = settings.target_col_name
        raw_target = settings.combined_df[target]
        settings.combined_df[target] = np.log1p(raw_target)
        return settings

In [None]:
# Directory holding the competition CSVs. Set the KTOOLS_DATA_DIR environment
# variable to run the notebook on another machine; the fallback is the original
# author's local checkout, so behaviour is unchanged when the variable is unset.
DATA_DIR = os.environ.get(
    "KTOOLS_DATA_DIR",
    "/Users/yuwei-1/Documents/projects/Kaggle-tools/data/insurance",
)
train_csv_path = os.path.join(DATA_DIR, "train.csv")
test_csv_path = os.path.join(DATA_DIR, "test.csv")
# Name of the column to predict.
target_col_name = "Premium Amount"

In [None]:
class IAutomlWrapper(ABC):
    """Template for AutoML wrappers around the CSV preprocessing pipeline.

    The constructor stores the configuration and then, in this order, seeds the
    random number generators, resolves the model name / save paths, prepares
    ``train_df`` / ``test_df`` and builds ``model``. The name/path, model-setup,
    ``fit`` and ``predict`` steps are hooks subclasses must implement.
    """

    def __init__(self,
                 train_csv_path : str,
                 test_csv_path : str,
                 target_col_name : str,
                 kfold_object,
                 data_transforms : List[Any] = [FillNullValues.transform,
                                                ConvertObjectToCategorical.transform],
                 model_name : Union[str, None] = None,
                 random_state : int = 42,
                 save_predictions : bool = True,
                 save_path : str = ""
                 ) -> None:
        """Store the configuration and run the setup hooks.

        Args:
            train_csv_path: CSV with the training rows (must contain the target).
            test_csv_path: CSV with the rows to predict.
            target_col_name: name of the target column.
            kfold_object: splitter stored for subclasses to use in ``fit``.
            data_transforms: callables applied in order, each taking and
                returning a ``DataSciencePipelineSettings``.
            model_name: label used in the prediction file names.
            random_state: seed for ``numpy``, ``random`` and ``torch``.
            save_predictions: stored for subclasses to decide whether to write CSVs.
            save_path: directory the prediction files are joined onto.
        """
        self._train_csv_path = train_csv_path
        self._test_csv_path = test_csv_path
        self._target_col_name = target_col_name
        self._kfold_object = kfold_object
        # Copy: the default list is created once at definition time and would
        # otherwise be shared by every instance that relies on it.
        self._data_transforms = list(data_transforms)
        self._random_state = random_state
        self._save_predictions = save_predictions
        self._save_path = save_path
        self._set_random_seeds()
        self._set_model_name_and_save_paths(model_name)
        self.train_df, self.test_df = self._data_setup()
        self.model = self._model_setup()

    def _set_random_seeds(self):
        """Seed numpy, the stdlib ``random`` module and torch with ``random_state``."""
        np.random.seed(self._random_state)
        random.seed(self._random_state)
        torch.manual_seed(self._random_state)

    def _data_setup(self):
        """Load the CSVs, run every transform in order and split train/test.

        Returns:
            ``(train_df, test_df)``; the target column is removed from the
            test frame.
        """
        settings = DataSciencePipelineSettings(self._train_csv_path,
                                               self._test_csv_path,
                                               self._target_col_name,
                                               )

        # Thread the settings object through the transforms, left to right.
        settings = reduce(lambda acc, transform: transform(acc), self._data_transforms, settings)
        train_df, test_df = settings.update()
        test_df = test_df.drop(columns=[self._target_col_name])
        return train_df, test_df

    @abstractmethod
    def _set_model_name_and_save_paths(self, model_name):
        """Set ``_model_name``, ``_oof_save_path`` and ``_test_save_path``.

        Abstract, but carries a default implementation that subclasses may
        reuse through ``super()``.
        """
        self._model_name = model_name
        self._oof_save_path = os.path.join(self._save_path, f"{model_name}_oof.csv")
        self._test_save_path = os.path.join(self._save_path, f"{model_name}_test.csv")

    @abstractmethod
    def _model_setup(self):
        """Build and return the underlying model; the result is stored as ``self.model``."""
        pass

    @abstractmethod
    def fit(self):
        """Train the model on ``self.train_df``."""
        pass

    @abstractmethod
    def predict(self, df : Union[pd.DataFrame, None] = None):
        """Return predictions; ``df`` is optional and its absence is handled by the subclass."""
        pass

In [None]:
class KToolsLAMAWrapper(IAutomlWrapper):
    """``IAutomlWrapper`` implementation backed by LightAutoML's ``TabularAutoML``."""

    # LightAutoML names its regression task "reg"; accept the longer spelling
    # used as this wrapper's default and translate it.
    _TASK_ALIASES = {"regression": "reg"}

    def __init__(self,
                 train_csv_path : str,
                 test_csv_path : str,
                 target_col_name : str,
                 kfold_object,
                 task : str = "regression",
                 metric : str = "rmse",
                 time_limit : float = 3600,
                 verbosity : int = 2,
                 lama_models : List[List[str]] = [['lgb', 'lgb_tuned', 'cb', 'cb_tuned']],
                 data_transforms : List[Any] = [FillNullValues.transform,
                                                ConvertObjectToCategorical.transform],
                 model_name : Union[str, None] = None,
                 random_state : int = 42,
                 save_predictions : bool = True,
                 save_path : str = ""
                 ) -> None:
        """Configure the LightAutoML run, then let the base class do the setup.

        Args:
            task: task name handed to ``lightautoml.tasks.Task``.
            metric: metric name handed to ``Task``.
            time_limit: passed to ``TabularAutoML`` as ``timeout``.
            verbosity: passed to ``fit_predict`` as ``verbose``.
            lama_models: passed to ``TabularAutoML`` as ``use_algos``; its first
                entry also provides the fallback model name.
            (remaining arguments: see ``IAutomlWrapper``.)
        """
        # These must exist before super().__init__, which calls the hooks
        # (_set_model_name_and_save_paths, _model_setup) that read them.
        self._task = task
        self._metric = metric
        self._time_limit = time_limit
        self._verbosity = verbosity
        # Copy so the shared default list is never aliased by an instance.
        self._lama_models = deepcopy(lama_models)

        super().__init__(train_csv_path,
                         test_csv_path,
                         target_col_name,
                         kfold_object,
                         list(data_transforms),
                         model_name,
                         random_state,
                         save_predictions,
                         save_path
                         )

    def _set_model_name_and_save_paths(self, model_name):
        """Resolve the model name (default: algorithm names joined by ``_``) and CSV paths."""
        self._model_name = model_name if model_name is not None else '_'.join(self._lama_models[0])
        # Use the resolved name: the raw argument may be None, which used to
        # produce files called "None_lama_oof.csv" / "None_lama_test.csv".
        self._oof_save_path = os.path.join(self._save_path, f"{self._model_name}_lama_oof.csv")
        self._test_save_path = os.path.join(self._save_path, f"{self._model_name}_lama_test.csv")

    def _model_setup(self) -> TabularAutoML:
        """Create the ``TabularAutoML`` predictor from the stored configuration."""
        task_name = self._TASK_ALIASES.get(self._task, self._task)
        # NOTE(review): confirm that the default metric "rmse" is a name
        # LightAutoML's Task accepts for this task; it is passed through unchanged.
        task = Task(task_name, metric=self._metric)
        return TabularAutoML(
            task = task,
            timeout = self._time_limit,
            general_params={"use_algos": self._lama_models})

    @staticmethod
    def _as_series(prediction) -> pd.Series:
        """Turn a LightAutoML prediction object into a ``pd.Series``.

        A single-column 2-D ``.data`` array is flattened first, because
        ``pd.Series`` only accepts 1-dimensional data. Any other shape is
        handed to ``pd.Series`` as is.
        """
        values = np.asarray(prediction.data)
        if values.ndim == 2 and values.shape[1] == 1:
            values = values[:, 0]
        return pd.Series(values)

    def fit(self):
        """Fit on ``train_df`` using the stored splitter and keep the out-of-fold predictions.

        Returns:
            ``self``, with ``oof_pred`` set.
        """
        X, y = self.train_df.drop(columns=self._target_col_name), self.train_df[[self._target_col_name]]
        roles = {'target' : self._target_col_name}
        oof_pred = self.model.fit_predict(self.train_df,
                                          roles = roles,
                                          verbose = self._verbosity,
                                          cv_iter=list(self._kfold_object.split(X, y))
                                          )
        self.oof_pred = self._as_series(oof_pred)
        return self

    def predict(self, df : Union[pd.DataFrame, None] = None):
        """Predict for ``df``, or return the stored out-of-fold predictions when ``df`` is None.

        When ``save_predictions`` is set, the result is also written to the
        test (``df`` given) or out-of-fold (``df`` None) CSV path.
        """
        if df is None:
            all_y_preds, destination = self.oof_pred, self._oof_save_path
        else:
            all_y_preds, destination = self._as_series(self.model.predict(df)), self._test_save_path
        if self._save_predictions:
            all_y_preds.to_csv(destination)
        return all_y_preds

In [None]:
# 5-fold shuffled CV; the fixed seed keeps the folds reproducible.
kf = KFold(5, shuffle=True, random_state=42)

# KToolsLAMAWrapper's keywords are `task` and `metric`; the previous
# `problem_type=` / `eval_metric=` keywords are not parameters of its
# constructor and made this call fail with a TypeError.
ktools_ag_model = KToolsLAMAWrapper(train_csv_path,
                                    test_csv_path,
                                    target_col_name,
                                    kf,
                                    task="regression",
                                    metric="rmse",
                                    time_limit=3600,
                                    # The target is log1p-transformed first, so the
                                    # model is trained on the logged target.
                                    data_transforms=[
                                        LogTransformTarget.transform,
                                        FillNullValues.transform,
                                        ConvertObjectToCategorical.transform],
                                    save_predictions=False,
                                    save_path="/kaggle/working/"
                                    ).fit()