# Understanding types of features

The goal of this notebook is to automatically infer the type (categorical, numerical, or time) of each feature. These type labels are used later when building the KST (`kuberspatiotemporal`) model.

In [None]:
%load_ext nb_black

In [None]:
import os, sys
import pandas as pd
import numpy as np


from ipywidgets import interact, interactive, Layout, IntSlider

In [None]:
from kuberspatiotemporal import CompoundModel, Feature, SpatialModel, KuberModel
from kuberspatiotemporal.tools import make_ellipses

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import FunctionTransformer

In [None]:
import matplotlib.pyplot as plt
from scipy import stats

In [None]:
from sklearn.metrics import confusion_matrix

# 1. Import data

In [None]:
# Absolute, machine-specific location of the CSV file to load.
csv_path = "/Users/adrianai/Desktop/aiml/aiml-dbfp-anomaly/utils/user_dbfp_mfa.csv"
df = pd.read_csv(csv_path)

In [None]:
# Display the column labels of the loaded DataFrame.
df.columns

## How to compute a KST model

1. Feature selection

Reference: [A Feature Selection Tool for Machine Learning in Python](https://towardsdatascience.com/a-feature-selection-tool-for-machine-learning-in-python-b64dd23710f0)

In [None]:
class FeatureSelector:
    """
    Infer the type of every column of a DataFrame and select usable features.

    Each column is labelled as one of:

    * ``"time"`` -- the column holds datetime values,
    * ``"num"``  -- the column is numeric with at least one non-integer value,
    * ``"cat"``  -- everything else (integer-valued or non-numeric columns).

    ``select()`` then discards columns whose missing rate exceeds
    ``missings_threshold`` and categorical columns with a single distinct value.

    Parameters
    ----------
    data : pandas.DataFrame
        The data whose columns are analysed. It is never modified.
    missings_threshold : float, default 0.3
        Maximum tolerated fraction of missing values per column; columns with
        a strictly higher missing rate are reported in ``missings``.

    Attributes
    ----------
    categorical_features, numerical_features, time_feature : numpy.ndarray or None
        Column labels per inferred type, in the column order of ``data``.
        ``None`` until ``get_data_dtypes()`` or ``select()`` has run.
    missings, single_uniques : numpy.ndarray or None
        Column labels flagged for removal by the corresponding ``identify_*``
        method.
    selected_features : numpy.ndarray or None
        Categorical, numerical and time features kept by ``select()``,
        concatenated in that order.
    """

    def __init__(self, data, missings_threshold=0.3):

        self.data = data
        self.base_features = data.columns

        self.categorical_features = None
        self.numerical_features = None
        self.time_feature = None

        self.missings_threshold = missings_threshold
        self.missings = None
        self.single_uniques = None

        self.selected_features = None

    @staticmethod
    def _label_array(labels):
        """Return the labels as a 1-D object array, so label types are kept as-is."""
        return np.array(list(labels), dtype=object)

    def _ensure_dtypes(self):
        """Infer the column types if that has not been done yet."""
        if self.categorical_features is None:
            self.get_data_dtypes()

    def get_col_dtype(self, col):
        """
        Infer the feature type of a pandas column.

        col: a pandas Series representing a df column.

        Returns ``"time"``, ``"num"`` or ``"cat"``.
        """
        # Explicit dtype test: the result must drive the decision instead of
        # relying on whether a dtype comparison happens to raise.
        if pd.api.types.is_datetime64_any_dtype(col.infer_objects().dtype):
            return "time"

        try:
            numeric = pd.to_numeric(col)
            # Missing values carry no information about integrality.
            values = np.asarray(numeric.dropna(), dtype=float)
        except (ValueError, TypeError, OverflowError):
            # Not interpretable as numbers -> treat as categorical.
            return "cat"

        # Integer-valued columns (including bools and all-missing columns)
        # are treated as categorical codes.
        if np.array_equal(values, np.trunc(values)):
            return "cat"
        return "num"

    def get_data_dtypes(self):
        """
        Infer the feature type of every column and store the column labels in
        ``categorical_features``, ``numerical_features`` and ``time_feature``.

        """
        labelled = [
            (name, self.get_col_dtype(self.data.loc[:, name]))
            for name in self.data.columns
        ]

        def of_kind(kind):
            return self._label_array(
                name for name, label in labelled if label == kind
            )

        self.categorical_features = of_kind("cat")
        self.numerical_features = of_kind("num")
        self.time_feature = of_kind("time")

    def identify_missings(self):
        """
        Store in ``missings`` the features whose missing rate is strictly
        higher than ``missings_threshold``.

        """
        missing_rate = self.data.isnull().sum(axis=0) / len(self.data)
        too_sparse = (missing_rate > self.missings_threshold).to_numpy(dtype=bool)

        self.missings = self._label_array(missing_rate.index[too_sparse])

    def identify_single_unique(self):
        """
        Store in ``single_uniques`` the categorical features that have exactly
        one distinct non-missing value.

        Column types are inferred first if they are not available yet.
        """
        self._ensure_dtypes()

        # Per-column check keeps this well defined when there are no
        # categorical features at all.
        self.single_uniques = self._label_array(
            name
            for name in self.categorical_features
            if self.data.loc[:, name].nunique() == 1
        )

    def select(self):
        """
        Selects features on DataFrame based on missing rate and number of categories.

        After the call, the three per-type attributes contain only the kept
        features (original column order preserved) and ``selected_features``
        holds their concatenation.
        """
        self.get_data_dtypes()
        self.identify_missings()
        self.identify_single_unique()

        rejected = set(self.single_uniques) | set(self.missings)

        def keep(features):
            return self._label_array(
                name for name in features if name not in rejected
            )

        self.categorical_features = keep(self.categorical_features)
        self.numerical_features = keep(self.numerical_features)
        self.time_feature = keep(self.time_feature)

        self.selected_features = np.concatenate(
            (self.categorical_features, self.numerical_features, self.time_feature)
        )

    def get_categories(self, col):
        """
        Returns the sorted distinct non-missing values of a categorical feature.

        Prints a message and returns ``None`` when ``col`` is not one of the
        current categorical features.
        """
        self._ensure_dtypes()

        if col not in list(self.categorical_features):
            print("Not a valid column.")
            return None
        return np.sort(self.data.loc[:, col].dropna().unique())

## Test Feature Selector

In [None]:
# Restrict the data set to the rows of a single OAuth application.
app_id = 1
belongs_to_app = df["oauth_application_id"] == app_id
data = df[belongs_to_app]

In [None]:
# Build the selector on the per-application subset.
fs = FeatureSelector(data)

In [None]:
# Sanity check: the selector should expose as many base features as the
# input data has columns (expected to display True).
len(fs.base_features) == len(data.columns)

In [None]:
# Infer the type of every column, then count the features of each type
# before any selection is applied.
fs.get_data_dtypes()
init_cat, init_num, init_time = (
    len(features)
    for features in (
        fs.categorical_features,
        fs.numerical_features,
        fs.time_feature,
    )
)

(init_cat, init_num, init_time)

In [None]:
# Every column must have received exactly one type (expected to display True).
init_cat + init_num + init_time == len(data.columns)

In [None]:
# Flag the features that selection will drop and count them per reason.
fs.identify_missings()
fs.identify_single_unique()

missings_count, single_unique_count = len(fs.missings), len(fs.single_uniques)

In [None]:
# Display how many features were flagged for a high missing rate and for
# having a single distinct value.
(missings_count, single_unique_count)

In [None]:
# Run the full pipeline: re-infer types, flag features, and filter the
# per-type feature lists in place on the selector.
fs.select()

In [None]:
# Count the features of each type that survived the selection.
after_select_cat, after_select_num, after_select_time = (
    len(features)
    for features in (
        fs.categorical_features,
        fs.numerical_features,
        fs.time_feature,
    )
)

(after_select_cat, after_select_num, after_select_time)

In [None]:
# The kept features plus the distinct removed features must account for
# every original column (expected to display True).
removed_features = np.unique(np.concatenate((fs.missings, fs.single_uniques)))
kept_total = after_select_cat + after_select_num + after_select_time
kept_total == len(data.columns) - len(removed_features)

In [None]:
# Keep only the selected columns, ordered as categorical, numerical, time.
selected_columns = np.concatenate(
    (fs.categorical_features, fs.numerical_features, fs.time_feature)
)
data_ = data[selected_columns]

In [None]:
# Preview the first rows.
# NOTE(review): this shows the unfiltered `data`; the selected columns live in
# `data_` -- confirm which of the two was meant to be inspected here.
data.head()