# Notebook Demonstrating use of Transformers in common_transformers

### First some imports

In [3]:
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.base import TransformerMixin,BaseEstimator

# To Demonstrate
from utils.common_transformers import DateHandler, TimeHandler, DateDiff, NullPct, OutlierHandler, PassThrough, Dropper, FeatureSelector

pd.set_option("display.max_columns",None)

## Dummy Data

In [32]:
# Toy invoice data: ten rows, with gaps in Age, Fare and Gender
df = pd.DataFrame(
    {
        "ID": list(range(1, 11)),
        "InvoiceDate": [
            "2020/1/1 10:00", "2020/2/2 11:00", "2020/3/3 12:00", "2020/4/4 13:00",
            "2020/5/5 10:00", "2020/6/6 11:00", "2020/7/7 12:15", "2020/8/8 17:24",
            "2020/9/9 18:00", "2020/10/10 19:00",
        ],
        "Age": [24, 26, 27, None, 24, 23, 22, 24, 21, None],
        "Fare": [10, 11, 10, 11, 10, 12, 30, 75, 27, None],
        "Gender": ["M", "F", None, "M", "F", "M", "F", "M", "F", "M"],
    }
)
df

Unnamed: 0,ID,InvoiceDate,Age,Fare,Gender
0,1,2020/1/1 10:00,24.0,10.0,M
1,2,2020/2/2 11:00,26.0,11.0,F
2,3,2020/3/3 12:00,27.0,10.0,
3,4,2020/4/4 13:00,,11.0,M
4,5,2020/5/5 10:00,24.0,10.0,F
5,6,2020/6/6 11:00,23.0,12.0,M
6,7,2020/7/7 12:15,22.0,30.0,F
7,8,2020/8/8 17:24,24.0,75.0,M
8,9,2020/9/9 18:00,21.0,27.0,F
9,10,2020/10/10 19:00,,,M


In [6]:
# Scratch check of the "exclude" idea: keep the items of `a` that do not appear in `b`
a = [1, 2, 3, 4, 5]
b = [1, 3, 5]

list(filter(lambda col: col not in b, a))

[2, 4]

In [33]:
class Common:
    """Mixin holding the column-selection helper shared by the transformers."""

    def get_cols_needed(self,cols,include=None,exclude=None):
        """Resolve which entries of ``cols`` to use given optional filters.

        Parameters
        ----------
        cols : list
            All candidate names.
        include : sequence, optional
            Names to use; returned unchanged when it is the only filter given.
        exclude : sequence, optional
            Names to leave out of ``cols``; the order of ``cols`` is preserved.

        Returns
        -------
        ``cols`` when no filter is given, ``include`` when only ``include`` is
        given, otherwise a new list with the ``exclude`` entries removed.

        Raises
        ------
        ValueError
            When both ``include`` and ``exclude`` are specified.
        """
        # Identity checks (`is None`) instead of `== None`: an equality test against an
        # array-like filter is evaluated element-wise and cannot be used as a condition.
        if include is not None and exclude is not None:
            raise ValueError("Either specify include or exclude or None. Both cannot be specified")

        if include is not None:
            return include

        if exclude is not None:
            return [col for col in cols if col not in exclude]

        return cols
        

class DateHandler(TransformerMixin,BaseEstimator,Common):
    """Derive calendar features from one or more date columns.

    Every column listed in ``date_cols_names`` is parsed with ``date_format`` and the
    selected features are added as ``<column>_<feature>``. Available features:
    day, month, year, day_name, day_num, is_weekend, close_to_month_start_end,
    is_holiday and quater (the quarter of the year; the spelling is kept because it
    is part of the generated column names).
    """
    def __init__(self,date_cols_names:list,return_whole_df = True,drop_original_col = True,date_format = "%Y/%m/%d",close_to_start_month_end_param = 5,include=None,exclude=None):
        """
        date_cols_names: Names of the columns holding the dates
        return_whole_df: If True, transform returns the remaining input columns followed by the added features;
                         if False, only the added features (preceded by the date columns unless they are dropped)
        drop_original_col: If True, the date columns themselves are left out of the result
        date_format: Format string handed to pd.to_datetime
        close_to_start_month_end_param: No. of days specifying how close a given date must be to the start or end of its month
        Example: 2, for January, close dates will be 1st to 3rd and 29th to 31st January
        (a date qualifies when it is at most that many days away from the first or the last day of the month)
        include: Feature names to create (all others are skipped)
        exclude: Feature names to skip (all others are created); include and exclude cannot both be given
        """
        self.return_whole_df = return_whole_df
        self.cols = []
        self.date_cols_names = date_cols_names
        self.added_cols_suffix = ["day","month","year","day_name","day_num","is_weekend","close_to_month_start_end","is_holiday","quater"]
        self.added_cols = []
        self.drop_original_col = drop_original_col
        self.date_format = date_format
        self.close_to_start_month_end_param = close_to_start_month_end_param
        self.include = include
        self.exclude = exclude


    def fit(self,df,y=None):
        """Record the input columns and work out the names of the features to add.

        Raises whatever ``get_cols_needed`` raises when both include and exclude are set,
        and ValueError when a date column is missing from ``df`` while it has to be dropped.
        """
        self.cols = list(df.columns)
        self.included_cols = self.get_cols_needed(self.added_cols_suffix,self.include,self.exclude)
        # Start from an empty list on every fit; extending the list created in __init__
        # duplicated every feature column when the same instance was fitted twice.
        self.added_cols = []
        for date_col_name in self.date_cols_names:
            if self.drop_original_col:
                self.cols.remove(date_col_name)
            self.added_cols.extend([date_col_name + "_" + i for i in self.added_cols_suffix if i in self.included_cols])
        return self

    def transform(self,df,*_):
        """Return a copy of ``df`` with the date columns parsed and the selected features added.

        The columns of the result are those reported by ``get_feature_names``.
        """
        copy = df.copy()
        for date_col_name in self.date_cols_names:
            copy[date_col_name] = pd.to_datetime(copy[date_col_name],format=self.date_format)
            dates = copy[date_col_name]

            if "close_to_month_start_end" in self.included_cols:
                days_to_month_end = dates.dt.days_in_month - dates.dt.day
                days_from_month_start = dates.dt.day - 1
                # Distance to whichever month boundary is nearer
                days_to_nearest_edge = np.minimum(days_to_month_end,days_from_month_start)
                copy[date_col_name+"_close_to_month_start_end"] = np.where(days_to_nearest_edge<=self.close_to_start_month_end_param,1,0)

            if "day" in self.included_cols:
                copy[date_col_name+"_day"] = dates.dt.day

            if "month" in self.included_cols:
                copy[date_col_name+"_month"] = dates.dt.month

            if "year" in self.included_cols:
                copy[date_col_name+"_year"] = dates.dt.year

            if "day_name" in self.included_cols:
                copy[date_col_name+"_day_name"] = dates.dt.day_name()

            if "day_num" in self.included_cols:
                # Monday is 0, Sunday is 6
                copy[date_col_name+"_day_num"] = dates.dt.weekday

            if "is_weekend" in self.included_cols:
                copy[date_col_name+"_is_weekend"] = dates.dt.day_name().isin(["Sunday","Saturday"]).map({True:1,False:0})

            if "is_holiday" in self.included_cols:
                copy[date_col_name+"_is_holiday"] = dates.map(self.is_holiday)

            if "quater" in self.included_cols:
                copy[date_col_name+"_quater"] = dates.map(self.get_quater)

        # Single source of truth for the output layout
        return copy[self.get_feature_names()]

    def is_holiday(self,date):
        """Return True when ``date`` is in the Indian holiday calendar of its year."""
        # Imported here because the notebook never imports `holidays`; the lookup
        # previously failed with NameError as soon as the is_holiday feature was used.
        import holidays
        return date in holidays.India(years=date.year)

    def get_quater(self,date):
        """Return the quarter (1-4) of the year that ``date.month`` falls in."""
        return (date.month - 1) // 3 + 1

    def get_feature_names(self):
        """Return the column names of the frame produced by ``transform``, in order."""
        if self.return_whole_df:
            return self.cols + self.added_cols

        if self.drop_original_col:
            return self.added_cols

        return self.date_cols_names + self.added_cols

In [41]:
# Only the month and quarter features are requested; the parsed InvoiceDate column is kept
dh = DateHandler(
    date_cols_names=["InvoiceDate"],
    drop_original_col=False,
    include=["quater", "month"],
    date_format="%Y/%m/%d %H:%M",
)
dh.fit_transform(df)

Unnamed: 0,ID,InvoiceDate,Age,Fare,Gender,InvoiceDate_month,InvoiceDate_quater
0,1,2020-01-01 10:00:00,24.0,10.0,M,1,1
1,2,2020-02-02 11:00:00,26.0,11.0,F,2,1
2,3,2020-03-03 12:00:00,27.0,10.0,,3,1
3,4,2020-04-04 13:00:00,,11.0,M,4,2
4,5,2020-05-05 10:00:00,24.0,10.0,F,5,2
5,6,2020-06-06 11:00:00,23.0,12.0,M,6,2
6,7,2020-07-07 12:15:00,22.0,30.0,F,7,3
7,8,2020-08-08 17:24:00,24.0,75.0,M,8,3
8,9,2020-09-09 18:00:00,21.0,27.0,F,9,3
9,10,2020-10-10 19:00:00,,,M,10,4


In [18]:
# NOTE(review): `th` is not defined in any cell visible in this notebook (presumably a
# TimeHandler instance from a cell that was since removed) — on a fresh kernel this line
# would raise NameError. TODO: confirm, then define `th` above or drop this cell.
th.included_cols

['hour', 'minute', 'military_time', 'period_name', 'period_num']

### Outlier Handler
 Adds a boolean `<column>_is_outlier` feature for every numeric column
 
 Outlier is identified either by using standard deviation or IQR
 
 scipy.stats.shapiro with alpha = 0.05 is used to decide whether a feature follows gaussian distribution or not
 
 The columns which do not follow gaussian distribution can also be specified.

In [3]:
outlier_handler = OutlierHandler()
df = outlier_handler.fit_transform(df)

# Styling only: within rows flagged as Fare outliers, colour the cells equal to True or 75.0
df.style.apply(
    lambda x: [
        "color: yellow" if (x["Fare_is_outlier"] and (v == True or v == 75.00)) else ""
        for v in x
    ],
    axis=1,
)

Unnamed: 0,ID,InvoiceDate,Age,Fare,Gender,ID_is_outlier,Age_is_outlier,Fare_is_outlier
0,1,2020/1/1 10:00,24.0,10.0,M,False,False,False
1,2,2020/2/2 11:00,26.0,11.0,F,False,False,False
2,3,2020/3/3 12:00,27.0,10.0,,False,False,False
3,4,2020/4/4 13:00,,11.0,M,False,False,False
4,5,2020/5/5 10:00,24.0,10.0,F,False,False,False
5,6,2020/6/6 11:00,23.0,12.0,M,False,False,False
6,7,2020/7/7 12:15,22.0,30.0,F,False,False,False
7,8,2020/8/8 17:24,24.0,75.0,M,False,False,True
8,9,2020/9/9 18:00,21.0,27.0,F,False,False,False
9,10,2020/10/10 19:00,,,M,False,False,False


### Null Pct
 Adds a feature null_pct which is the % of nulls in the given row
 
 Does not include columns ending with _is_outlier when calculating Null %
 
 Set exclude_cols_for_null_pct for excluding other columns if any

In [4]:
null_handler = NullPct()
df = null_handler.fit_transform(df)

# Styling only: highlight the filled-in "NA" cells and the non-zero null percentages
(
    df[["ID", "InvoiceDate", "Age", "Fare", "null_pct"]]
    .fillna("NA")
    .style.apply(
        lambda x: ["color: yellow" if v in ("NA", 20.0, 40.0) else "" for v in x]
    )
)

Unnamed: 0,ID,InvoiceDate,Age,Fare,null_pct
0,1,2020/1/1 10:00,24.0,10.0,0.0
1,2,2020/2/2 11:00,26.0,11.0,0.0
2,3,2020/3/3 12:00,27.0,10.0,20.0
3,4,2020/4/4 13:00,,11.0,20.0
4,5,2020/5/5 10:00,24.0,10.0,0.0
5,6,2020/6/6 11:00,23.0,12.0,0.0
6,7,2020/7/7 12:15,22.0,30.0,0.0
7,8,2020/8/8 17:24,24.0,75.0,0.0
8,9,2020/9/9 18:00,21.0,27.0,0.0
9,10,2020/10/10 19:00,,,40.0


### Date Handler

Splits the date into day, month, year

Creates features such as day_name, day_num,is_weekend,close_to_month_start_end,is_holiday,quater

In [5]:
# The DateHandler defined in this notebook parses with date_format="%Y/%m/%d" by default,
# which cannot consume the " HH:MM" part of the InvoiceDate strings, so the full format
# is passed explicitly (same format as the earlier include=["quater","month"] example).
date_handler = DateHandler(
    date_cols_names=["InvoiceDate"],
    drop_original_col=False,
    date_format="%Y/%m/%d %H:%M",
)
df = date_handler.fit_transform(df)

# Show the parsed date next to every feature derived from it
cols = ["day","month","year","day_name","day_num","is_weekend","close_to_month_start_end","is_holiday","quater"]
df_cols = ["InvoiceDate"] + [f"InvoiceDate_{i}" for i in cols]

df[df_cols]

Unnamed: 0,InvoiceDate,InvoiceDate_day,InvoiceDate_month,InvoiceDate_year,InvoiceDate_day_name,InvoiceDate_day_num,InvoiceDate_is_weekend,InvoiceDate_close_to_month_start_end,InvoiceDate_is_holiday,InvoiceDate_quater
0,2020-01-01 10:00:00,1,1,2020,Wednesday,2,0,1,False,1
1,2020-02-02 11:00:00,2,2,2020,Sunday,6,1,1,False,1
2,2020-03-03 12:00:00,3,3,2020,Tuesday,1,0,1,False,1
3,2020-04-04 13:00:00,4,4,2020,Saturday,5,1,1,False,2
4,2020-05-05 10:00:00,5,5,2020,Tuesday,1,0,1,False,2
5,2020-06-06 11:00:00,6,6,2020,Saturday,5,1,1,False,2
6,2020-07-07 12:15:00,7,7,2020,Tuesday,1,0,0,False,3
7,2020-08-08 17:24:00,8,8,2020,Saturday,5,1,0,False,3
8,2020-09-09 18:00:00,9,9,2020,Wednesday,2,0,0,False,3
9,2020-10-10 19:00:00,10,10,2020,Saturday,5,1,0,False,4


### Date Diff

 Adds a feature holding the number of days between each date and the earliest date in the column, which acts as the reference point (day 0)


In [6]:
date_diff = DateDiff(
    date_cols_names=["InvoiceDate"],
    drop_original_col=False,
)
df = date_diff.fit_transform(df)

# Original date side by side with its derived difference column
df.loc[:, ["InvoiceDate", "InvoiceDate_date_diff"]]

Unnamed: 0,InvoiceDate,InvoiceDate_date_diff
0,2020-01-01 10:00:00,0
1,2020-02-02 11:00:00,32
2,2020-03-03 12:00:00,62
3,2020-04-04 13:00:00,94
4,2020-05-05 10:00:00,125
5,2020-06-06 11:00:00,157
6,2020-07-07 12:15:00,188
7,2020-08-08 17:24:00,220
8,2020-09-09 18:00:00,252
9,2020-10-10 19:00:00,283


### Time Handler

Splits the time into hour, minute

Creates features such as period, military_time

In [7]:
time_handler = TimeHandler(
    time_cols_names=["InvoiceDate"],
    drop_original_col=False,
)
df = time_handler.fit_transform(df)

# NOTE(review): the recorded output shows military_time 100 for 10:00 and period "Night"
# for 10:00-13:00 — looks like minutes are not zero-padded inside TimeHandler; confirm
# against utils.common_transformers.
cols = ["hour", "minute", "military_time", "period_name", "period_num"]
df_cols = ["InvoiceDate", *(f"InvoiceDate_{i}" for i in cols)]

df[df_cols]

Unnamed: 0,InvoiceDate,InvoiceDate_hour,InvoiceDate_minute,InvoiceDate_military_time,InvoiceDate_period_name,InvoiceDate_period_num
0,2020-01-01 10:00:00,10,0,100,Night,4
1,2020-02-02 11:00:00,11,0,110,Night,4
2,2020-03-03 12:00:00,12,0,120,Night,4
3,2020-04-04 13:00:00,13,0,130,Night,4
4,2020-05-05 10:00:00,10,0,100,Night,4
5,2020-06-06 11:00:00,11,0,110,Night,4
6,2020-07-07 12:15:00,12,15,1215,Afternoon,2
7,2020-08-08 17:24:00,17,24,1724,Evening,3
8,2020-09-09 18:00:00,18,0,180,Night,4
9,2020-10-10 19:00:00,19,0,190,Night,4


## PassThrough, Dropper and Feature Selector

#### Another Example

In [8]:
# Smaller five-row frame for the PassThrough / Dropper / FeatureSelector examples
df = pd.DataFrame(
    {
        "ID": list(range(1, 6)),
        "Age": list(range(21, 26)),
        "Fare": list(range(10, 15)),
        "Gender": ["M", "F", "M", "F", None],
        "InvoiceDate": [
            "2020/1/1 10:00",
            "2020/2/2 11:00",
            "2020/3/3 12:00",
            "2020/4/4 13:00",
            "2020/5/5 10:00",
        ],
    }
)
df

Unnamed: 0,ID,Age,Fare,Gender,InvoiceDate
0,1,21,10,M,2020/1/1 10:00
1,2,22,11,F,2020/2/2 11:00
2,3,23,12,M,2020/3/3 12:00
3,4,24,13,F,2020/4/4 13:00
4,5,25,14,,2020/5/5 10:00


### PassThrough

PassThrough can be useful when you want some features to pass through a column transformer unchanged while the remaining features are dropped

In [9]:
col_transformer = make_column_transformer(
    # numeric columns are standardised
    (StandardScaler(), ["Age", "Fare"]),
    # Gender is forwarded as-is
    (PassThrough(), ["Gender"]),
    # every column not listed above is discarded
    remainder="drop",
)

# The transformer returns a bare array, so the column labels are re-attached for display
pd.DataFrame(
    col_transformer.fit_transform(df),
    columns=["Age", "Fare", "Gender"],
)

Unnamed: 0,Age,Fare,Gender
0,-1.41421,-1.41421,M
1,-0.707107,-0.707107,F
2,0.0,0.0,M
3,0.707107,0.707107,F
4,1.41421,1.41421,


Here ID and InvoiceDate were dropped, while Gender passed through unchanged

### Feature Selector and Dropper

Feature Selector can be useful when you want to select some features to continue to flow in the pipeline

Dropper can be useful when you want to drop some columns in a pipeline after some preprocessing

In [10]:
pipe = make_pipeline(
    # keep only the columns the later steps need
    FeatureSelector(["Age","InvoiceDate"]),
    # The DateHandler defined in this notebook defaults to date_format="%Y/%m/%d", which
    # cannot parse the " HH:MM" part of InvoiceDate, so the full format is given explicitly.
    DateHandler(["InvoiceDate"],date_format="%Y/%m/%d %H:%M"),
    # discard some of the derived features again
    Dropper(["InvoiceDate_day","InvoiceDate_year","InvoiceDate_day_num"])
)

pipe.fit_transform(df)

Unnamed: 0,Age,InvoiceDate_month,InvoiceDate_day_name,InvoiceDate_is_weekend,InvoiceDate_close_to_month_start_end,InvoiceDate_is_holiday,InvoiceDate_quater
0,21,1,Wednesday,0,1,False,1
1,22,2,Sunday,1,1,False,1
2,23,3,Tuesday,0,1,False,1
3,24,4,Saturday,1,1,False,2
4,25,5,Tuesday,0,1,False,2
