In [None]:
from functools import reduce
import os
import random
from typing import Any, Dict, List, Union
from autogluon.tabular import TabularPredictor
import torch
# from ktools.utils.data_science_pipeline_settings import DataSciencePipelineSettings
# from ktools.preprocessing.basic_feature_transformers import *
from copy import deepcopy
from dataclasses import dataclass
from typing import *
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold

# preamble

In [None]:
def func(x):
    """Return ``x`` unchanged (identity function).

    Serves as the default value of
    ``DataSciencePipelineSettings.original_csv_processing``, i.e. "apply no
    extra processing to the original dataset".
    """
    return x


@dataclass
class DataSciencePipelineSettings:
    """Load the train/test CSVs and hold the frames plus column metadata.

    On construction the CSVs are read, an optional "original" dataset is
    appended to the training rows, the feature / categorical column lists are
    filled in, and train and test are stacked into ``combined_df`` under the
    outer index keys ``'train'`` and ``'test'``.

    Attributes:
        train_csv_path: Path of the training CSV.
        test_csv_path: Path of the test CSV.
        target_col_name: Name of the label column in the training data.
        original_csv_path: Optional path of an extra dataset that is appended
            to the training rows. When given, a ``source`` column is added
            (0 for train/test rows, 1 for the original rows).
        original_csv_processing: Callable applied to the original frame before
            it is validated against, and concatenated with, the training frame.
        sample_submission_path: Stored only; not read by this class.
        training_col_names: Feature columns. Inferred (every training column
            except the target) when left as ``None``.
        categorical_col_names: Text-typed columns. Inferred from the training
            frame's dtypes when left as ``None``.
        training_data_percentage: Stored only; not read by this class.
        category_occurrence_threshold: Stored only; not read by this class.
        logged: Stored only; not read by this class.
    """
    train_csv_path : str
    test_csv_path : str
    target_col_name : str
    original_csv_path : Optional[str] = None
    original_csv_processing : Callable = func
    sample_submission_path : Optional[str] = None
    training_col_names : Optional[List[str]] = None
    categorical_col_names : Optional[List[str]] = None
    training_data_percentage : float = 0.8
    category_occurrence_threshold : int = 300
    logged : bool = False

    def __post_init__(self):
        """Read the data and populate the derived attributes."""
        self.train_df, self.test_df = self._load_csv_paths()
        inferred_features, inferred_categoricals = self._get_column_info()
        # Respect column lists passed by the caller; only infer the missing ones.
        if self.training_col_names is None:
            self.training_col_names = inferred_features
        if self.categorical_col_names is None:
            self.categorical_col_names = inferred_categoricals
        self.combined_df = self._combine_datasets()

    def _load_csv_paths(self):
        """Read train/test (and the optional original data) from disk.

        Returns:
            ``(train_df, test_df)``. When ``original_csv_path`` is set, the
            original rows are appended to ``train_df`` and the index is reset.

        Raises:
            AssertionError: If the processed original frame does not have the
                same columns and dtypes as the training frame.
        """
        train_df = self._smart_drop_index(pd.read_csv(self.train_csv_path))
        test_df = self._smart_drop_index(pd.read_csv(self.test_csv_path))
        if self.original_csv_path is not None:
            # Tag every row with its provenance before mixing the datasets.
            train_df = train_df.assign(source=0)
            test_df = test_df.assign(source=0)
            original_df = self._smart_drop_index(pd.read_csv(self.original_csv_path)).assign(source=1)
            original_df = self.original_csv_processing(original_df)

            # The two frames must be schema-compatible before concatenation.
            pd.testing.assert_index_equal(train_df.columns.sort_values(), original_df.columns.sort_values(), check_exact=True)
            pd.testing.assert_series_equal(train_df.dtypes.sort_index(), original_df.dtypes.sort_index(), check_exact=True)
            train_df = pd.concat([train_df, original_df], axis=0).reset_index(drop=True)

        return train_df, test_df

    @staticmethod
    def _is_text_dtype(dtype):
        """Return True for ``object`` columns and pandas' dedicated string dtype."""
        return dtype == 'object' or isinstance(dtype, pd.StringDtype)

    def _get_column_info(self):
        """Infer ``(training_features, categorical_columns)`` from ``train_df``."""
        cat_col_names = [col_name for col_name in self.train_df.columns
                         if self._is_text_dtype(self.train_df[col_name].dtype)]
        training_features = list(self.train_df.drop(columns=self.target_col_name).columns)
        return training_features, cat_col_names

    def _combine_datasets(self):
        """Stack train and test under the outer index keys 'train' / 'test'."""
        combined_df = pd.concat([self.train_df, self.test_df], keys=['train', 'test'])
        return combined_df

    def update(self):
        """Re-split ``combined_df`` into fresh train/test copies and return them."""
        self.train_df = self.combined_df.loc['train'].copy()
        self.test_df = self.combined_df.loc['test'].copy()
        return self.train_df, self.test_df

    @staticmethod
    def _smart_drop_index(df):
        """Drop the first column when it looks like a serialized row index.

        The first column is treated as an index when consecutive differences
        are all equal to one single, non-zero step. A constant column (step of
        zero) is real data and is kept. Columns that cannot be differenced
        (e.g. strings) are left untouched.
        """
        if df.shape[1] == 0:
            return df
        try:
            differences = df.iloc[:, 0].diff().dropna()
            looks_like_index = differences.nunique() == 1 and bool(differences.iloc[0] != 0)
        except (TypeError, ValueError):
            # Best effort: a column that does not support subtraction is not an index.
            return df
        if looks_like_index:
            df = df.drop(columns=df.columns[0])
        return df

In [None]:
class FillNullValues():
    """Pipeline transform that replaces missing values in the feature columns."""

    @staticmethod
    def transform(original_settings : "DataSciencePipelineSettings", numeric_fill=-1, category_fill='missing'):
        """Return a deep copy of the settings with nulls filled in ``combined_df``.

        Only the columns listed in ``training_col_names`` are touched, so the
        target column is left as it is.

        Args:
            original_settings: Settings object; it is not modified.
            numeric_fill: Replacement for nulls in numeric columns.
            category_fill: Replacement for nulls in all other columns.

        Returns:
            The copied settings object with the filled ``combined_df``.
        """
        settings = deepcopy(original_settings)
        frame = settings.combined_df
        for col_name in settings.training_col_names:
            column = frame[col_name]
            if isinstance(column.dtype, pd.CategoricalDtype):
                # fillna() on a categorical rejects values that are not declared
                # categories, so register the fill value first. Columns without
                # nulls are left untouched to avoid adding an unused category.
                if column.isna().any():
                    if category_fill not in column.cat.categories:
                        column = column.cat.add_categories([category_fill])
                    frame[col_name] = column.fillna(category_fill)
            elif pd.api.types.is_numeric_dtype(column):
                frame[col_name] = column.fillna(numeric_fill)
            else:
                frame[col_name] = column.fillna(category_fill)
        return settings
    

class ConvertObjectToCategorical():
    """Pipeline transform that casts the categorical columns to ``category`` dtype."""

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        """Return a deep copy of the settings whose categorical columns are re-typed.

        The columns named in ``categorical_col_names`` are cast to pandas'
        ``category`` dtype inside ``combined_df``; the input settings object is
        left unmodified.
        """
        converted = deepcopy(original_settings)
        frame = converted.combined_df
        names = converted.categorical_col_names
        frame[names] = frame[names].astype('category')
        return converted
    
class LogTransformTarget():
    """Pipeline transform that applies ``log1p`` to the target column."""

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        """Return a deep copy of the settings with a ``log1p``-transformed target.

        The transform is applied to the target column of ``combined_df``; the
        input settings object is left unmodified.
        NOTE(review): the settings' ``logged`` flag is not updated here —
        confirm whether downstream code expects it to be set.
        """
        transformed = deepcopy(original_settings)
        target = transformed.target_col_name
        frame = transformed.combined_df
        frame[target] = np.log1p(frame[target])
        return transformed

In [None]:
# Directory holding train.csv / test.csv. Set the KTOOLS_DATA_DIR environment
# variable to run the notebook on another machine; the fallback is the
# original author's local checkout.
DATA_DIR = os.environ.get(
    "KTOOLS_DATA_DIR",
    "/Users/yuwei-1/Documents/projects/Kaggle-tools/data/insurance",
)
train_csv_path = os.path.join(DATA_DIR, "train.csv")
test_csv_path = os.path.join(DATA_DIR, "test.csv")
# Label column of the dataset.
target_col_name = "Premium Amount"

In [None]:
class KToolsAutogluonWrapper:
    """Train an AutoGluon ``TabularPredictor`` on user-defined CV folds.

    The constructor loads the CSVs through ``DataSciencePipelineSettings``,
    applies the configured transforms, writes the validation-fold number of
    every training row into a ``fold`` column and creates a predictor that
    uses that column as its ``groups``.
    """

    def __init__(self,
                 train_csv_path : str,
                 test_csv_path : str,
                 target_col_name : str,
                 kfold_object,
                 data_transforms : Optional[List[Any]] = None,
                 included_model_types : Optional[List[str]] = None,
                 ag_name : Optional[str] = None,
                 eval_metric : str = "accuracy",
                 problem_type : str = "binary",
                 time_limit : float = 3600*11,
                 random_state : int = 42,
                 autogluon_kwargs : Optional[Dict[str, Any]] = None,
                 save_predictions : bool = True,
                 save_path : str = ""
                 ) -> None:
        """Store the configuration, seed the RNGs and prepare data and predictor.

        Args:
            train_csv_path: Path of the training CSV.
            test_csv_path: Path of the test CSV.
            target_col_name: Name of the label column.
            kfold_object: Splitter exposing ``split(X, y)`` that yields
                ``(train_index, val_index)`` pairs of positional indices.
            data_transforms: Callables applied in order to the settings
                object. Defaults to ``[FillNullValues.transform,
                ConvertObjectToCategorical.transform]``.
            included_model_types: Model types forwarded to ``fit``. Defaults
                to ``['CAT', 'GBM', 'XGB']``.
            ag_name: Prefix of the prediction file names. Defaults to the
                model types joined by ``'_'``.
            eval_metric: Metric passed to ``TabularPredictor``.
            problem_type: Problem type passed to ``TabularPredictor``.
            time_limit: Time limit passed to ``fit`` (seconds).
            random_state: Seed for ``numpy``, ``random`` and ``torch``.
            autogluon_kwargs: Extra keyword arguments for ``fit``. Defaults to
                ``{"verbosity": 2, "num_cpus": 4, "num_gpus": 2}``.
            save_predictions: Whether ``predict`` writes its result to CSV.
            save_path: Directory in which prediction CSVs are written.
        """
        # Defaults are resolved here (not in the signature) and every container
        # is copied, so instances never share mutable state with each other or
        # with the caller.
        if data_transforms is None:
            data_transforms = [FillNullValues.transform,
                               ConvertObjectToCategorical.transform]
        if included_model_types is None:
            included_model_types = ['CAT', 'GBM', 'XGB']
        if autogluon_kwargs is None:
            autogluon_kwargs = {"verbosity":2,
                                "num_cpus":4,
                                "num_gpus":2}

        self._train_csv_path = train_csv_path
        self._test_csv_path = test_csv_path
        self._target_col_name = target_col_name
        self._kfold_object = kfold_object
        self._data_transforms = list(data_transforms)
        self._included_model_types = list(included_model_types)
        self.ag_name = ag_name if ag_name is not None else '_'.join(self._included_model_types)
        self._eval_metric = eval_metric
        self._problem_type = problem_type
        self._time_limit = time_limit
        self._random_state = random_state
        self._autogluon_kwargs = dict(autogluon_kwargs)
        self._oof_save_path = os.path.join(save_path, f"{self.ag_name}_oof.csv")
        self._test_save_path = os.path.join(save_path, f"{self.ag_name}_test.csv")
        self._save_predictions = save_predictions
        self._set_random_seeds()
        self.train_df, self.test_df, self.model = self._setup()


    def _set_random_seeds(self):
        """Seed numpy, the stdlib ``random`` module and torch with ``random_state``."""
        np.random.seed(self._random_state)
        random.seed(self._random_state)
        torch.manual_seed(self._random_state)

    def _setup(self):
        """Load and transform the data, assign fold ids and build the predictor.

        Returns:
            ``(train_df, test_df, predictor)`` where ``train_df`` carries an
            extra ``fold`` column and ``test_df`` has the target column removed.
        """
        kfold_col_name = "fold"
        settings = DataSciencePipelineSettings(self._train_csv_path,
                                               self._test_csv_path,
                                               self._target_col_name,
                                               )

        # Thread the settings object through every transform, in order.
        settings = reduce(lambda acc, transform: transform(acc), self._data_transforms, settings)
        train_df, test_df = settings.update()
        test_df = test_df.drop(columns=[self._target_col_name])
        X, y = train_df.drop(columns=self._target_col_name), train_df[[self._target_col_name]]

        # The splitter yields positional indices, so fill a positional array
        # instead of using label-based .loc (which is only equivalent for a
        # default RangeIndex). Rows never used for validation stay NaN.
        fold_ids = np.full(len(train_df), np.nan)
        for fold_number, (_, val_index) in enumerate(self._kfold_object.split(X, y)):
            fold_ids[val_index] = fold_number
        train_df[kfold_col_name] = fold_ids

        predictor = TabularPredictor(label=self._target_col_name,
                                     eval_metric=self._eval_metric,
                                     problem_type=self._problem_type,
                                     groups=kfold_col_name
                                     )

        return train_df, test_df, predictor

    def fit(self):
        """Fit the predictor on ``train_df`` with the 'best_quality' preset.

        Returns:
            ``self``, so construction and fitting can be chained.
        """
        self.model = self.model.fit(self.train_df,
                                    presets='best_quality',
                                    time_limit=self._time_limit,
                                    included_model_types=self._included_model_types,
                                    **self._autogluon_kwargs
                                    )
        return self

    def predict(self, df : Union[pd.DataFrame, None] = None):
        """Return per-model predictions as a DataFrame (one column per model).

        Args:
            df: Data to predict on. When ``None``, ``predict_multi()`` is
                called without data (presumably yielding out-of-fold
                predictions — the result is saved under the ``*_oof.csv`` name).

        Returns:
            DataFrame built from the dict returned by ``predict_multi``. When
            ``save_predictions`` is true it is also written to
            ``<ag_name>_test.csv`` (``df`` given) or ``<ag_name>_oof.csv``.
        """
        if df is not None:
            all_y_preds = pd.DataFrame.from_dict(self.model.predict_multi(df))
            # Write to the prediction file, never to the input test CSV.
            destination = self._test_save_path
        else:
            all_y_preds = pd.DataFrame.from_dict(self.model.predict_multi())
            destination = self._oof_save_path

        if self._save_predictions:
            all_y_preds.to_csv(destination)

        return all_y_preds

In [None]:
# Five shuffled folds with a fixed seed so the fold assignment is reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Regression run on the log1p-transformed target: build the wrapper (which
# loads and transforms the data) and immediately fit it for up to one hour.
ktools_ag_model = KToolsAutogluonWrapper(
    train_csv_path,
    test_csv_path,
    target_col_name,
    kf,
    data_transforms=[
        LogTransformTarget.transform,
        FillNullValues.transform,
        ConvertObjectToCategorical.transform,
    ],
    eval_metric="root_mean_squared_error",
    problem_type="regression",
    time_limit=3600,
    # Optional override of the wrapper's default fit kwargs, e.g.:
    #   autogluon_kwargs={"verbosity": 2, "num_cpus": 1},
    save_predictions=False,
    save_path="/kaggle/working/",
).fit()