# Feature Engineering and Importance

In [33]:
import pandas as pd
import numpy as np

from typing import Optional

In [34]:
df = pd.DataFrame({
    "Date": pd.to_datetime(["2002/07/22"]),
    "Time": pd.to_datetime(["14:00"]),
})

df.iloc[0, :]

Date   2002-07-22 00:00:00
Time   2023-04-05 14:00:00
Name: 0, dtype: datetime64[ns]

## Outlier Handling

### Custom Transformer

In [35]:
from sklearn.base import TransformerMixin,BaseEstimator
from scipy.stats import iqr, shapiro

class OutlierDetector(TransformerMixin,BaseEstimator):
    """
        Adds a feature IsOutlier for all numeric columns
        Outlier is identified either by using standard deviation or
        IQR
    """
    df = None
    cols = None
    cols_gaussian_info = None # {col: True / False}
    boundaries = {} # {col: [lower,upper]}
    
    def __init__(self,auto_infer_whether_guassian_dist=True, cols_gaussian_info={},gaussian_threshold=2,iqr_threshold=1.5,include=None,exclude=None):
        """
            include : features names to be generated
            exclude : feature names to be excluded while generating all features
            auto_infer_whether_guassian_dist : Automaticaly infer whether the feature follows guassian distribution for each feature (cols_gaussian_info should be empty)
            cols_gaussian_info : Pass column names and boolean value indicating whether the column follows a gaussian distribution or not (auto_infer_whether_guassian_dist should be False if this is not empty)
            Syntax: cols_guassian_info = {'col_A' : True, 'col_B' : False}
            Default value for columns not passed is True (follows gaussian distribution)
        
            Gaussian Threshold: Mean +- Std * threshold
            IQR Threshold: 75th percentile + IQR * threshold
            
            Common Values for Gaussian Threshold: 2, 3
            Common Values for IQR Threshold: 1.5, 3
        """
        self.auto_infer_whether_guassian_dist = auto_infer_whether_guassian_dist
        self.cols_gaussian_info = cols_gaussian_info
        
        if self.auto_infer_whether_guassian_dist and self.cols_gaussian_info != {}:
            raise Exception("Either auto_infer_whether_guassian_dist should be true and cols_gaussian_info should be empty\n or auto_infer_whether_guassian_dist should be false and cols_gaussian_info should be not empty")
        
        if (not self.auto_infer_whether_guassian_dist) and self.cols_gaussian_info == {}:
            raise Exception("Either auto_infer_whether_guassian_dist should be true and cols_gaussian_info should be empty\n or auto_infer_whether_guassian_dist should be false and cols_gaussian_info should be not empty")
        
        self.gaussian_threshold = gaussian_threshold
        self.iqr_threshold = iqr_threshold
        
        self.include = include
        self.exclude = exclude
    
    
    def fit(self,df,y=None):
        self.cols = list(df.columns)  
        
        numeric = df.select_dtypes(include=[int,float],exclude=[bool])
        self.numeric_cols = numeric.columns
        self.included_cols = self.get_cols_needed(self.numeric_cols,self.include,self.exclude)
        
        for col in self.included_cols:
            if self.auto_infer_whether_guassian_dist:
                info = self._is_gaussian(numeric[col])
            else:
                info = self.cols_gaussian_info.get(col,True)
            if info:
                # Follows Gaussian Distribution
                bounds = self._get_gaussian_boundaries(df[col])
            else:
                bounds = self._get_iqr_boundaries(df[col])
            self.boundaries[col] = bounds
        
        return self

    def transform(self,df,*_):
        self.df = df.copy()
        
        for col in self.included_cols:
            bounds = self.boundaries[col]
            self.df[col+"_is_outlier"] = self.df[col].map(lambda x: x < bounds[0] or x > bounds[1])
            
        return self.df
    
    def _is_gaussian(self,col):
        stat, p = shapiro(col)
        if p >= 0.05:
            return True
        return False
    
    def _get_gaussian_boundaries(self,col : pd.Series):
        mean = col.mean()
        three_std = col.std()*self.gaussian_threshold
        lower = mean-three_std
        upper = mean+three_std
        return [lower,upper]
    
    def _get_iqr_boundaries(self,col : pd.Series):
        IQR = iqr(col)
        lower = col.quantile(0.25) - IQR * self.iqr_threshold
        upper = col.quantile(0.75) + IQR * self.iqr_threshold
        return [lower,upper]
            
    def get_feature_names(self):
        return list(self.cols) + [i + "_is_outlier" for i in self.included_cols]

    def get_cols_needed(self,cols,include=None,exclude=None):
        if include == None and exclude == None:
            return cols
        
        if exclude == None:
            return include
        
        if include == None:
            return [col for col in cols if col not in exclude]
        
        raise Exception("Either specify include or exclude or None. Both cannot be specified")
        
    def return_transformed_df(self,df,return_whole_df,drop_original_col,cols,added_cols,processed_cols):
        cols = set(cols)
        cols_to_remove = set(processed_cols)
        
        if return_whole_df:
            
            if drop_original_col:
                cols_to_include = list(cols.difference(processed_cols))
                return df[cols_to_include + added_cols]
            
            return df[list(cols) + added_cols]
        
        if drop_original_col:
            return df[added_cols]
        
        return df[processed_cols + added_cols]

### Using custom transformer

In [36]:
df = pd.read_csv("https://raw.githubusercontent.com/susmitpy/ThakurCollegeTechnicalSeminar/main/titanic_train.csv")

outlier_detector = OutlierDetector(include=["age", "fare"])
outlier_detector.fit(df)
tf_df = outlier_detector.transform(df)

tf_df.head()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,date_time,age_is_outlier,fare_is_outlier
0,1,No,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,2022-01-01 00:00:00,False,False
1,2,Yes,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,2022-01-01 09:48:56,False,True
2,3,Yes,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,2022-01-01 19:37:53,False,False
3,4,Yes,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,2022-01-02 05:26:49,False,False
4,5,No,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,2022-01-02 15:15:46,False,False


In [37]:
df[["age", "fare"]].describe()

Unnamed: 0,age,fare
count,714.0,891.0
mean,29.699118,32.204208
std,14.526497,49.693429
min,0.42,0.0
25%,20.125,7.9104
50%,28.0,14.4542
75%,38.0,31.0
max,80.0,512.3292


In [38]:
pd.concat(
    [
        tf_df[tf_df["age_is_outlier"]][["age"]].head(10).reset_index(drop=True),
        tf_df[tf_df["fare_is_outlier"]][["fare"]].head(10).reset_index(drop=True),
    ],
    axis=1,
)

Unnamed: 0,age,fare
0,66.0,71.2833
1,65.0,263.0
2,59.0,146.5208
3,71.0,82.1708
4,70.5,76.7292
5,61.0,80.0
6,59.0,83.475
7,62.0,73.5
8,63.0,263.0
9,65.0,77.2875


## Encoding Time

### Custom Transformer

In [39]:
from datetime import time

class TimeEncoderProcessor(BaseEstimator, TransformerMixin):
    """
    Encodes times by converting to military time (integer)
    and scaling with max time (2359)
    Encodes missing values by -1
    """

    def fit(self, x: pd.DataFrame, y: Optional[pd.DataFrame] = None):
        return self

    def transform(self, x: pd.DataFrame) -> pd.DataFrame:
        return x.applymap(
            lambda x: (x.hour * 100 + x.minute) / 2359 if not pd.isnull(x) else -1
        )

    def inverse_transform(self, x: pd.DataFrame) -> pd.DataFrame:
        return x.applymap(
            lambda x: time(hour=int(x * 2359 / 100), minute=int(x * 2359 % 100))
            if x != -1
            else None
        )

### Using the transformer

In [40]:
df = pd.DataFrame(
            {
                "A": pd.to_datetime(
                    ["15/10/2019 20:00", "16/10/2019 21:45", "17/10/2019 00:00", None],
                    infer_datetime_format=True,
                ),
                "B": pd.to_datetime(
                    ["15/10/2019 08:00", "16/10/2019 09:30", "17/10/2019 15:20", None],
                    infer_datetime_format=True,
                ),
            }
        )

encoder = TimeEncoderProcessor()
tf_df = encoder.transform(df)

In [41]:
tf_df

Unnamed: 0,A,B
0,0.847817,0.339127
1,0.909284,0.394235
2,0.0,0.644341
3,-1.0,-1.0


In [42]:
2000 / 2359

0.8478168715557439

## Generating features from Date

### Custom Transformer

In [43]:
class DateGenProcessor(BaseEstimator, TransformerMixin):
    """
    Creates and return features in the format {col_name}__{gen_name} where gen_name is,
        - DAY: 1 to 31
        - MONTH: 1 to 12
        - YEAR: YYYY (int)
        - WEEKDAY: 0 to 6
        - QUARTER: 0 to 3
        - IS_WEEKEND: 1 / 0
    """

    gen_cols: list[str]  # [X__DAY, X__MONTH ... , Y__DAY, Y_MONTH ...]

    DAY = "__DAY"
    MONTH = "__MONTH"
    YEAR = "__YEAR"
    WEEKDAY = "__WEEKDAY"
    QUARTER = "__QUARTER"
    IS_WEEKEND = "__IS_WEEKEND"
    ATTRS = [DAY, MONTH, YEAR, WEEKDAY, QUARTER, IS_WEEKEND]

    def fit(self, x: pd.DataFrame, y: Optional[pd.DataFrame] = None):
        cols = x.columns.tolist()
        self.gen_cols = [
            DateGenProcessor.get_gen_col_name(col_name=col, attr=attr)
            for col in cols
            for attr in self.ATTRS
        ]
        return self

    @staticmethod
    def get_gen_col_name(col_name: str, attr: str):
        return f"{col_name}{attr}"

    def transform(self, x: pd.DataFrame) -> pd.DataFrame:
        gen_day = self._gen_day(x=x)
        gen_month = self._gen_month(x=x)
        gen_year = self._gen_year(x=x)
        gen_weekday = self._gen_weekday(x=x)
        gen_quarter = self._gen_quarter(x=x)
        gen_is_weekend = self._gen_is_weekend(x=x)

        df = pd.concat(
            [
                df.T
                for df in [
                    gen_day,
                    gen_month,
                    gen_year,
                    gen_weekday,
                    gen_quarter,
                    gen_is_weekend,
                ]
            ],
            axis=0,
        ).T
        df = df[self.gen_cols]
        return df

    def _gen_day(self, x: pd.DataFrame) -> pd.DataFrame:
        return x.apply(lambda x: x.dt.day).add_suffix(self.DAY)

    def _gen_month(self, x: pd.DataFrame) -> pd.DataFrame:
        return x.apply(lambda x: x.dt.month).add_suffix(self.MONTH)

    def _gen_year(self, x: pd.DataFrame) -> pd.DataFrame:
        return x.apply(lambda x: x.dt.year).add_suffix(self.YEAR)

    def _gen_weekday(self, x: pd.DataFrame) -> pd.DataFrame:
        return x.apply(lambda x: x.dt.day_name()).add_suffix(self.WEEKDAY)

    def _gen_quarter(self, x: pd.DataFrame) -> pd.DataFrame:
        return x.apply(lambda x: x.dt.quarter).add_suffix(self.QUARTER)

    def _gen_is_weekend(self, x: pd.DataFrame) -> pd.DataFrame:
        return (
            x.apply(lambda x: x.dt.weekday > 4).astype(int).add_suffix(self.IS_WEEKEND)
        )

### Using the transformer

In [44]:
df = pd.DataFrame(
            {
                "A": pd.to_datetime(
                    ["15/10/2019 00:00", "16/10/2019 00:00", "17/10/2019 00:00", None],
                    infer_datetime_format=True,
                ),
                "B": pd.to_datetime(
                    ["14/10/2019 00:00", "12/10/2019 00:00", "19/12/2019 00:00", None],
                    infer_datetime_format=True,
                ),
            }
        )

gen = DateGenProcessor()
gen.fit(df)
tf_df = gen.transform(df)

display(df.head())
display(tf_df.head())

Unnamed: 0,A,B
0,2019-10-15,2019-10-14
1,2019-10-16,2019-10-12
2,2019-10-17,2019-12-19
3,NaT,NaT


Unnamed: 0,A__DAY,A__MONTH,A__YEAR,A__WEEKDAY,A__QUARTER,A__IS_WEEKEND,B__DAY,B__MONTH,B__YEAR,B__WEEKDAY,B__QUARTER,B__IS_WEEKEND
0,15.0,10.0,2019.0,Tuesday,4.0,0,14.0,10.0,2019.0,Monday,4.0,0
1,16.0,10.0,2019.0,Wednesday,4.0,0,12.0,10.0,2019.0,Saturday,4.0,1
2,17.0,10.0,2019.0,Thursday,4.0,0,19.0,12.0,2019.0,Thursday,4.0,0
3,,,,,,0,,,,,,0


## Permutation Importance

In [45]:
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

df = pd.read_csv("https://raw.githubusercontent.com/susmitpy/ThakurCollegeTechnicalSeminar/main/titanic_train.csv")
X, y = df[["pclass", "sibsp", "fare", "age", "parch", "sex"]], df["survived"]

In [46]:
X = pd.get_dummies(X, columns=["sex"], drop_first=True)
X.head()

Unnamed: 0,pclass,sibsp,fare,age,parch,sex_male
0,3,1,7.25,22.0,0,1
1,1,1,71.2833,38.0,0,0
2,3,0,7.925,26.0,0,0
3,1,1,53.1,35.0,0,0
4,3,0,8.05,35.0,0,1


In [47]:
X.isnull().sum()

pclass        0
sibsp         0
fare          0
age         177
parch         0
sex_male      0
dtype: int64

In [48]:
X = X.fillna({"age": X["age"].mean()})

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [50]:
model = LogisticRegression()
model.fit(X_train, y_train)

In [51]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          No       0.82      0.86      0.84       139
         Yes       0.74      0.69      0.72        84

    accuracy                           0.79       223
   macro avg       0.78      0.77      0.78       223
weighted avg       0.79      0.79      0.79       223



In [52]:
r = permutation_importance(model, X_test, y_test, n_repeats=30,random_state=0)

for i in r.importances_mean.argsort()[::-1]:
      print(f"{X_test.columns[i]:<8} "
            f"{r.importances_mean[i]:.3f}"
            f" +/- {r.importances_std[i]:.3f}")

sex_male 0.221 +/- 0.027
pclass   0.043 +/- 0.014
age      0.014 +/- 0.016
sibsp    0.007 +/- 0.011
parch    0.004 +/- 0.003
fare     -0.003 +/- 0.006
