In [None]:
!pip install pytorch-tabnet

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, accuracy_score, root_mean_squared_error, root_mean_squared_log_error, mean_absolute_error
from sklearn.model_selection import StratifiedKFold, KFold
from functools import reduce
from dataclasses import dataclass
import os
from scipy.stats import ks_2samp
from typing import *
from copy import deepcopy

In [None]:
def func(x):
    """Identity hook: return ``x`` unchanged.

    Used as the default ``original_csv_processing`` of
    ``DataSciencePipelineSettings``.
    """
    return x


@dataclass
class DataSciencePipelineSettings:
    """Load the train/test CSV files and keep them together with column metadata.

    On construction the CSV files are read, an index-like first column is
    dropped from each, and the following attributes are created:

    * ``train_df`` / ``test_df`` -- the loaded frames,
    * ``combined_df`` -- both frames concatenated under the outer index keys
      ``'train'`` and ``'test'``,
    * ``training_col_names`` / ``categorical_col_names`` -- inferred from
      ``train_df`` unless the caller supplied them explicitly.

    When ``original_csv_path`` is given, that file is loaded too, passed through
    ``original_csv_processing`` and appended to the training frame; a ``source``
    column (0 for train/test rows, 1 for the original rows) marks the origin.
    """
    train_csv_path : str
    test_csv_path : str
    target_col_name : str
    # Optional extra CSV appended to the training data.
    original_csv_path : Optional[str] = None
    # Called on the extra frame (after ``source`` was added) before it is appended.
    original_csv_processing : Callable = func
    sample_submission_path : Optional[str] = None
    # Left as None, these two lists are inferred from the training frame.
    training_col_names : Optional[List[str]] = None
    categorical_col_names : Optional[List[str]] = None
    training_data_percentage : float = 0.8
    category_occurrence_threshold : int = 300
    logged : bool = False

    def __post_init__(self):
        """Read the CSV files and derive the column metadata and combined frame."""
        self.train_df, self.test_df = self._load_csv_paths()
        inferred_features, inferred_categoricals = self._get_column_info()
        # Honour column lists passed to the constructor instead of silently
        # overwriting them; only fall back to inference when none were given.
        if self.training_col_names is None:
            self.training_col_names = inferred_features
        if self.categorical_col_names is None:
            self.categorical_col_names = inferred_categoricals
        self.combined_df = self._combine_datasets()

    def _load_csv_paths(self):
        """Read the train and test CSVs (plus the optional original CSV).

        Returns:
            ``(train_df, test_df)``.

        Raises:
            AssertionError: if the processed original frame does not have the
                same columns and dtypes as the training frame.
        """
        train_df = self._smart_drop_index(pd.read_csv(self.train_csv_path))
        test_df = self._smart_drop_index(pd.read_csv(self.test_csv_path))
        if self.original_csv_path is None:
            return train_df, test_df

        train_df = train_df.assign(source=0)
        test_df = test_df.assign(source=0)
        original_df = self._smart_drop_index(pd.read_csv(self.original_csv_path)).assign(source=1)
        original_df = self.original_csv_processing(original_df)

        # The frames are stacked row-wise, so they must agree on columns and dtypes.
        pd.testing.assert_index_equal(train_df.columns.sort_values(), original_df.columns.sort_values(), check_exact=True)
        pd.testing.assert_series_equal(train_df.dtypes.sort_index(), original_df.dtypes.sort_index(), check_exact=True)
        train_df = pd.concat([train_df, original_df], axis=0).reset_index(drop=True)
        return train_df, test_df

    def _get_column_info(self):
        """Infer ``(feature column names, categorical column names)`` from ``train_df``.

        Features are all columns except the target; categorical columns are the
        ones whose dtype compares equal to ``'object'``.
        """
        # NOTE(review): only `object` columns are detected; confirm whether
        # pandas string-dtype columns should count as categorical as well.
        cat_col_names = [col_name for col_name in self.train_df.columns if self.train_df[col_name].dtype == 'object']
        training_features = list(self.train_df.drop(columns=self.target_col_name).columns)
        return training_features, cat_col_names

    def _combine_datasets(self):
        """Stack ``train_df`` and ``test_df`` under the outer index keys 'train' and 'test'."""
        return pd.concat([self.train_df, self.test_df], keys=['train', 'test'])

    def update(self):
        """Re-split ``combined_df`` into ``train_df`` / ``test_df`` and return both.

        Call this after ``combined_df`` was modified so the two frames reflect
        the change.
        """
        self.train_df = self.combined_df.loc['train'].copy()
        self.test_df = self.combined_df.loc['test'].copy()
        return self.train_df, self.test_df

    @staticmethod
    def _smart_drop_index(df):
        """Drop the first column when it looks like a running index.

        The first column is removed when all of its consecutive differences are
        identical (which also matches any other constant-step column). Frames
        whose first column cannot be differenced, or that have no columns, are
        returned unchanged.
        """
        try:
            differences = df.iloc[:, 0].diff().dropna()
            if differences.nunique() == 1:
                df = df.drop(columns=df.columns[0])
        except Exception:
            # Best effort: e.g. a text first column (diff is undefined) or a
            # frame without columns. `Exception` rather than a bare `except`
            # so KeyboardInterrupt / SystemExit still propagate.
            pass
        return df

In [None]:
from typing import List
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor


class TabNetModel:
    """Wrapper giving pytorch-tabnet estimators a DataFrame-based fit/predict API.

    ``task="binary"`` wraps ``TabNetClassifier`` (default metric ``'auc'``,
    ``predict`` returns ``predict_proba`` output); ``task="regression"`` wraps
    ``TabNetRegressor`` (default metric ``'rmse'``).
    """

    _SUPPORTED_TASKS = ("binary", "regression")

    def __init__(self,
                 cat_idcs : List[int],
                 cat_dims : List[int],
                 cat_emb_dims : List[int] = None,
                 eval_metric : List[str] = None,
                 batch_size : int = 1024,
                 virtual_batch_size : int = 128,
                 num_workers : int = 0,
                 drop_last : bool = False,
                 max_epochs : int = 200,
                 patience : int = 50,
                 random_state=129,
                 verbose=0,
                 seed=0,
                 task : str = "regression",
                 **tabnet_params) -> None:
        """Build the underlying TabNet estimator.

        Args:
            cat_idcs: positional indices of the categorical feature columns.
            cat_dims: number of categories of each categorical column.
            cat_emb_dims: embedding size per categorical column; defaults to
                ``int(sqrt(cat_dim))`` (at least 1).
            eval_metric: metric names forwarded to ``fit``; defaults depend on
                ``task``.
            batch_size, virtual_batch_size, num_workers, drop_last, max_epochs,
                patience: forwarded unchanged to the estimator's ``fit``.
            random_state: seed of the internal train/validation split.
            verbose, seed: forwarded to the estimator constructor.
            task: ``"binary"`` or ``"regression"``.
            **tabnet_params: extra keyword arguments for the estimator constructor.

        Raises:
            ValueError: if ``task`` is not supported.
        """
        # Fail fast: an unknown task used to leave `self.model` undefined and
        # only surfaced later as an AttributeError inside fit/predict.
        if task not in self._SUPPORTED_TASKS:
            raise ValueError(f"Unsupported task {task!r}; expected one of {self._SUPPORTED_TASKS}")

        if cat_emb_dims is None:
            # max(1, ...) guards against a zero-sized embedding.
            cat_emb_dims = [max(1, int(math.sqrt(dim))) for dim in cat_dims]

        self._batch_size = batch_size
        self._virtual_batch_size = virtual_batch_size
        self._num_workers = num_workers
        self._drop_last = drop_last
        self._max_epochs = max_epochs
        self._patience = patience
        self._random_state = random_state
        self._verbose = verbose
        self._seed = seed
        self._task = task

        if task == "binary":
            estimator_cls, default_metric = TabNetClassifier, ['auc']
        else:
            estimator_cls, default_metric = TabNetRegressor, ['rmse']

        self.model = estimator_cls(
            cat_idxs=cat_idcs,
            cat_dims=cat_dims,
            cat_emb_dim=cat_emb_dims,
            verbose=verbose,
            seed=seed,
            **tabnet_params
        )
        self._eval_metric = eval_metric if eval_metric is not None else default_metric

    @staticmethod
    def _to_numpy(data):
        """Return the values of a pandas object, or ``data`` coerced to an array."""
        if isinstance(data, (pd.DataFrame, pd.Series)):
            return data.values
        return np.asarray(data)

    def _format_target(self, y):
        """Shape the target the way the wrapped estimator expects it.

        The regressor requires a 2-D ``(n_samples, n_targets)`` target, so a 1-D
        target becomes a single column. For the classifier the target is
        squeezed, turning a one-column frame into a 1-D label vector.
        """
        target = self._to_numpy(y)
        if self._task == "regression":
            return target.reshape(-1, 1) if target.ndim == 1 else target
        return target.squeeze()

    def fit(self, X, y, validation_set = None, val_size=0.05):
        """Train the wrapped estimator with an evaluation set named ``'val'``.

        Args:
            X, y: training features and target.
            validation_set: optional ``(X_valid, y_valid)`` pair. When omitted,
                a fraction ``val_size`` of ``X`` / ``y`` is split off with
                ``train_test_split`` and used as evaluation set instead.
            val_size: size of that internal validation split.

        Returns:
            ``self``.
        """
        if validation_set is None:
            X_train, X_valid, y_train, y_valid = train_test_split(X,
                                                                  y,
                                                                  test_size=val_size,
                                                                  random_state=self._random_state)
        else:
            X_train, y_train = X, y
            X_valid, y_valid = validation_set

        self.model.fit(
            X_train=self._to_numpy(X_train), y_train=self._format_target(y_train),
            eval_set=[(self._to_numpy(X_valid), self._format_target(y_valid))],
            eval_name=['val'],
            eval_metric=self._eval_metric,
            batch_size=self._batch_size,
            virtual_batch_size=self._virtual_batch_size,
            num_workers=self._num_workers,
            drop_last=self._drop_last,
            max_epochs=self._max_epochs,
            patience=self._patience,
        )
        return self

    def predict(self, X : pd.DataFrame):
        """Predict for ``X``.

        Returns:
            For ``"binary"`` the output of ``predict_proba``. For
            ``"regression"`` the output of ``predict``; a single-column
            prediction is flattened to 1-D so it lines up with a 1-D target.
        """
        features = self._to_numpy(X)
        if self._task == "binary":
            return self.model.predict_proba(features)

        predictions = self.model.predict(features)
        if predictions.ndim == 2 and predictions.shape[1] == 1:
            predictions = predictions.ravel()
        return predictions

In [None]:
from functools import reduce
from typing import Any, Dict, List, Tuple, Callable
import numpy as np
import pandas as pd
from copy import deepcopy



class CrossValidationExecutor:
    """Run k-fold cross-validation for a model exposing ``fit`` and ``predict``.

    The model must accept ``fit(X, y, validation_set=...)`` and return itself
    from ``fit``; a fresh deep copy of it is trained on every fold.
    """

    def __init__(self,
                 sklearn_model_instance,
                 evaluation_metric : Callable,
                 kfold_object,
                 use_test_as_valid=True,
                 num_classes=None,
                 verbose=1) -> None:
        """Store the cross-validation configuration.

        Args:
            sklearn_model_instance: template model, deep-copied for each fold.
            evaluation_metric: callable ``metric(y_true, y_pred)``.
            kfold_object: splitter providing ``split(X, y)`` and ``get_n_splits()``.
            use_test_as_valid: when true, the held-out fold is also handed to
                ``fit`` as ``validation_set``.
            num_classes: when set, prediction buffers have shape
                ``(n_samples, num_classes)`` instead of ``(n_samples,)``.
            verbose: ``> 0`` prints the summary, ``> 1`` also each fold's score.
        """
        self.model = sklearn_model_instance
        self._evaluation_metric = evaluation_metric
        self._kf = kfold_object
        self._num_splits = kfold_object.get_n_splits()
        self._use_test_as_valid = use_test_as_valid
        self._num_classes = num_classes
        self._verbose = verbose

    def run(self, X, y, additional_data=None, local_transform_list=None, output_transform_list=None) -> Tuple[Tuple[float, float], np.ndarray, List[Any]]:
        """Cross-validate the model on ``X`` / ``y``.

        Args:
            X: feature DataFrame.
            y: target DataFrame.
            additional_data: optional ``(X_add, y_add)`` appended to the training
                part of every fold; must match ``X`` / ``y`` in columns and dtypes.
            local_transform_list: callables applied in order to the
                ``(X_train, y_train)`` tuple of each fold; each must return such
                a tuple. ``None`` means no transform.
            output_transform_list: callables applied in order to the fold
                predictions before scoring. ``None`` means no transform.

        Returns:
            ``((oof_score, mean_cv_score), oof_predictions, model_list)`` where
            ``oof_predictions`` holds the untransformed out-of-fold predictions
            and ``model_list`` the fitted model of each fold.

        Raises:
            AssertionError: if ``additional_data`` does not match ``X`` / ``y``.
        """
        # None instead of list literals as defaults: default objects are created
        # once at definition time and shared between calls.
        local_transforms = [] if local_transform_list is None else list(local_transform_list)
        output_transforms = [] if output_transform_list is None else list(output_transform_list)

        if additional_data is not None:
            X_add, y_add = additional_data
            pd.testing.assert_index_equal(X.columns, X_add.columns, check_exact=True)
            pd.testing.assert_series_equal(X.dtypes, X_add.dtypes, check_exact=True)
            pd.testing.assert_index_equal(y.columns, y_add.columns, check_exact=True)
            pd.testing.assert_series_equal(y.dtypes, y_add.dtypes, check_exact=True)

        n_samples = y.shape[0]
        buffer_shape = n_samples if self._num_classes is None else (n_samples, self._num_classes)
        oof_predictions = np.zeros(buffer_shape)       # raw model output
        metric_predictions = np.zeros(buffer_shape)    # output after the output transforms

        cv_results = []
        model_list = []
        for train_index, val_index in self._kf.split(X, y):
            X_train, X_test = X.iloc[train_index], X.iloc[val_index]
            y_train, y_test = y.iloc[train_index], y.iloc[val_index]

            if additional_data is not None:
                X_train = pd.concat([X_train, X_add], axis=0)
                y_train = pd.concat([y_train, y_add], axis=0)

            # Only the training part goes through the local transforms.
            X_train, y_train = reduce(lambda acc, step: step(acc), local_transforms, (X_train, y_train))
            validation_set = [X_test, y_test] if self._use_test_as_valid else None

            model = deepcopy(self.model).fit(X_train, y_train, validation_set=validation_set)
            model_list.append(model)

            y_pred = model.predict(X_test)
            y_pred_processed = reduce(lambda acc, step: step(acc), output_transforms, y_pred)

            cv_results.append(self._evaluation_metric(y_test, y_pred_processed))
            oof_predictions[val_index] = y_pred
            metric_predictions[val_index] = y_pred_processed

            if self._verbose > 1:
                print(f"The CV results of the current fold is {cv_results[-1]}")

        oof_score = self._evaluation_metric(y, metric_predictions)
        mean_cv_score = np.mean(cv_results)
        score_tuple = (oof_score, mean_cv_score)

        if self._verbose > 0:
            print("#"*100)
            print("OOF prediction score : ", oof_score)
            print(f"Mean {self._num_splits}-cv results : {mean_cv_score} +- {np.std(cv_results)}")
            print("#"*100)

        return score_tuple, oof_predictions, model_list

In [None]:
class LogTransformTarget():
    """Pipeline step that replaces the target column by its ``log1p``."""

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        """Return a deep copy of the settings with a ``log1p``-transformed target.

        The target column of the copy's ``combined_df`` is overwritten with
        ``np.log1p`` of itself; ``original_settings`` is left untouched.
        """
        settings = deepcopy(original_settings)
        target = settings.target_col_name
        frame = settings.combined_df
        frame[target] = np.log1p(frame[target])
        return settings

class FillNullValues():
    """Pipeline step that fills missing values in the training columns."""

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings, numeric_fill=-1, category_fill='missing'):
        """Return a deep copy of the settings with nulls filled in ``combined_df``.

        Every column listed in ``training_col_names`` is filled with
        ``numeric_fill`` when its dtype is numeric and with ``category_fill``
        otherwise. ``original_settings`` is left untouched.
        """
        settings = deepcopy(original_settings)
        frame = settings.combined_df
        for col_name in settings.training_col_names:
            column_is_numeric = pd.api.types.is_numeric_dtype(frame[col_name])
            fill_value = numeric_fill if column_is_numeric else category_fill
            frame[col_name] = frame[col_name].fillna(fill_value)
        return settings
    
class ConvertObjectToCategorical():
    """Pipeline step that casts the categorical columns to the ``category`` dtype."""

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        """Return a deep copy of the settings with categorical columns converted.

        The columns listed in ``categorical_col_names`` are cast to ``'category'``
        in the copy's ``combined_df``; ``original_settings`` is left untouched.
        """
        settings = deepcopy(original_settings)
        frame = settings.combined_df
        cat_cols = settings.categorical_col_names
        converted = frame[cat_cols].astype('category')
        frame[cat_cols] = converted
        return settings

In [None]:
# NOTE(review): train_csv_path, test_csv_path and target_col_name are not defined
# in any visible cell -- they must be set before this cell is run.
settings = DataSciencePipelineSettings(train_csv_path,
                                       test_csv_path,
                                       target_col_name)

# Each step takes a settings object and returns a modified deep copy of it.
transforms = [
    LogTransformTarget.transform,
    FillNullValues.transform,
    # CreateYuweiFeatures.transform,
    ConvertObjectToCategorical.transform,
]

# Apply the steps in order (parameter named `step` to avoid shadowing `func`).
settings = reduce(lambda acc, step: step(acc), transforms, settings)

# update() re-splits combined_df into train/test; a single call is sufficient.
train, test_df = settings.update()
# The test rows only carry the target column as a by-product of the concat.
test_df = test_df.drop(columns=[target_col_name])
X, y = train.drop(columns=target_col_name), train[[target_col_name]]

In [None]:
# Positional indices of the categorical feature columns (TabNet addresses
# categorical features by position, not by label).
cat_idcs = [idx for idx, col_name in enumerate(X.columns) if X[col_name].dtype == 'category']
# Number of distinct categories per categorical column.
cat_dims = [int(X.iloc[:, idx].cat.categories.size) for idx in cat_idcs]

# TabNet needs numeric input: replace every category by its integer code.
# X itself is left untouched.
X_encoded = X.copy()
for idx in cat_idcs:
    col_name = X.columns[idx]
    X_encoded[col_name] = X.iloc[:, idx].cat.codes

params = {"cat_idcs" : cat_idcs, "cat_dims" : cat_dims, "task" : "binary", "patience" : 5, "n_d" : 16, "n_a" : 16, "verbose" : 1, "eval_metric" : ['accuracy']}

kf = KFold(5, shuffle=True, random_state=42)

# NOTE(review): with task="binary" TabNetModel.predict returns predict_proba
# output (presumably one column per class), which neither accuracy_score nor the
# executor's default 1-D buffers accept as-is, and the target was log1p-transformed
# in the previous cell. Confirm the intended task / metric before relying on this run.
cve = CrossValidationExecutor(TabNetModel(**params),
                              accuracy_score,
                              kf,
                              verbose=2)
# CrossValidationExecutor exposes run(), not fit().
score_tuple, oof_predictions, model_list = cve.run(X_encoded, y)