# Data Preparation utilities

In [9]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer

In [10]:
def show_number_of_missing(df):
    """Print, for every column of ``df``, how many of its values are missing.

    One line is printed per column, in column order, in the form
    ``<column>: <count> value(s) missing``. Nothing is returned.
    """
    # Count the missing cells of all columns in a single pass instead of
    # materialising a filtered copy of the whole frame for each column.
    missing_per_column = df.isna().sum()
    for column, missingvalues in missing_per_column.items():
        # int() keeps the printed count a plain integer.
        print(f'{column}: {int(missingvalues)} value(s) missing')

In [11]:
def x_y_split(df, label_column):
    """Separate ``df`` into a feature frame and the label column.

    Returns ``(x, y)``: ``x`` is ``df`` without ``label_column`` and ``y`` is
    ``df[label_column]``. ``df`` itself is not modified.
    """
    features = df.drop(columns=label_column)
    labels = df[label_column]
    return features, labels

In [12]:
def add_categ_numeric_features_to_list(df):
    """Sort the column names of ``df`` into categorical and numeric features.

    A column is numeric when its dtype kind is signed integer (``'i'``),
    unsigned integer (``'u'``) or float (``'f'``). Every other column is
    treated as categorical. The unique values of each categorical column are
    printed for inspection.

    Returns ``(categorical_features, numeric_features)``, two lists of column
    names, each in the original column order.
    """
    # 'u' is included so unsigned integer columns (e.g. uint8) are not
    # misclassified as categorical.
    numeric_kinds = 'iuf'

    categorical_features = []
    numeric_features = []

    for column in df.columns:
        if df[column].dtype.kind in numeric_kinds:
            numeric_features.append(column)
        else:
            categorical_features.append(column)

    for cat_col in categorical_features:
        print(f"Unique values for '{cat_col}': {df[cat_col].unique()}")

    return categorical_features, numeric_features

# Transformers

In [13]:
# Imputer that replaces every missing entry with the constant string 'unknown'.
# NOTE(review): presumably meant for categorical columns; confirm with callers.
transformer_imp_unknown = SimpleImputer(
    strategy='constant',
    fill_value='unknown',
)

In [14]:
# Imputer that replaces missing entries with the mean of their column.
transformer_imp_mean = SimpleImputer(strategy='mean')

In [15]:
def create_col_trans(name, transformer, columnamelist):
    """Wrap a single transformer in a ColumnTransformer for selected columns.

    ``transformer`` is applied to the columns in ``columnamelist`` under the
    step name ``name``. All remaining columns are passed through unchanged.
    Output feature names are not prefixed with the step name, and the
    transformer is configured to emit pandas DataFrames.

    Returns the configured (unfitted) ColumnTransformer.
    """
    step = (name, transformer, columnamelist)
    col_trans = ColumnTransformer(
        transformers=[step],
        remainder='passthrough',
        verbose_feature_names_out=False,
    )
    # set_output configures the estimator in place and returns that estimator.
    return col_trans.set_output(transform='pandas')

In [16]:
def transformer_ordinal(categorylist):
    """Create an OrdinalEncoder with an explicit category order.

    ``categorylist`` is forwarded as the encoder's ``categories`` argument.
    Values not found in the given categories are encoded as ``-1`` instead
    of raising an error.
    """
    return OrdinalEncoder(
        categories=categorylist,
        handle_unknown='use_encoded_value',
        unknown_value=-1,
    )

In [17]:
def transformer_nominal(known_categories):
    """Create a dense OneHotEncoder restricted to the given categories.

    ``known_categories`` is forwarded as the encoder's ``categories``
    argument. Unknown values are ignored rather than raising an error, and
    the encoder produces dense (non-sparse) output.
    """
    return OneHotEncoder(
        categories=known_categories,
        handle_unknown='ignore',
        sparse_output=False,
    )

In [18]:
# Scaler that rescales each feature to the range [0, 1].
transformer_scale_0_1 = MinMaxScaler(feature_range=(0, 1))