In [1]:
import re
import pandas as pd

from utils.common_transformers import *
from utils.common_encoders import CommonOrdinalEncoder
from common import get_original_column_names

from datetime import datetime
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer

import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing.

    Installed below in place of ``warnings.warn`` so that every warning
    raised through that function is discarded for the rest of the session.
    """
    return None


# Global monkeypatch: all later calls to warnings.warn become no-ops.
warnings.warn = warn

In [2]:
# Demo: run the date/time feature transformers over a single datetime column.
# DateHandler, TimeHandler and DateDiff are not defined in this notebook;
# presumably they come from the star import of utils.common_transformers.
sample_timestamps = [
    datetime(2021, 2, 1, 8, 0),
    datetime(2021, 5, 10, 13, 0),
    datetime(2021, 7, 10, 13, 0),
    datetime(2021, 10, 2, 18, 22),
]
df = pd.DataFrame({"DT": pd.Series(sample_timestamps)})

# Every transformer targets "DT", returns the whole frame and keeps the
# original column so the derived features can be compared against it.
dh = DateHandler(
    ["DT"],
    date_format="%Y-%m-%d",
    return_whole_df=True,
    drop_original_col=False,
    exclude=["day_num"],
    close_to_start_month_end_param=5,
)
th = TimeHandler(
    ["DT"],
    time_format="%H:%M",
    return_whole_df=True,
    drop_original_col=False,
    exclude=["period_num"],
)
dd = DateDiff(
    ["DT"],
    date_format="%Y-%m-%d",
    return_whole_df=True,
    drop_original_col=False,
)

# Steps run in order: date features, then time features, then date difference.
pipe = make_pipeline(dh, th, dd)
pipe.fit_transform(df)

Unnamed: 0,DT_minute,DT_year,DT,DT_month,DT_close_to_month_start_end,DT_military_time,DT_period_name,DT_day,DT_is_weekend,DT_hour,DT_is_holiday,DT_day_name,DT_quater,DT_date_diff
0,0,2021,2021-02-01 08:00:00,2,True,80,Night,1,False,8,False,Monday,1,0
1,0,2021,2021-05-10 13:00:00,5,False,130,Night,10,False,13,False,Monday,2,98
2,0,2021,2021-07-10 13:00:00,7,False,130,Night,10,True,13,False,Saturday,3,159
3,22,2021,2021-10-02 18:22:00,10,True,1822,Evening,2,True,18,True,Saturday,4,243


In [3]:
# Demo: null-indicator columns plus a per-row null percentage.
# "A" and "B" each contain one missing value; "C" is complete.
df = pd.DataFrame(
    {
        "A": ["M", "F", "M", None],
        "B": [1, 2, None, 4],
        "C": [101, 102, 103, 104],
    }
)

# Keep the source columns and skip "C" when building the *_is_null flags.
null_flagger = IsNull(drop_original_col=False, exclude=["C"])

# Columns ending with _is_null are meant to be excluded by NullPct by default.
# NOTE(review): the recorded output shows 20.0 for rows with a single null,
# i.e. 1 of 5 columns, which suggests the flag columns still count towards
# the denominator -- confirm against the NullPct implementation.
null_percentage = NullPct()

pipe = make_pipeline(null_flagger, null_percentage)

pipe.fit_transform(df)

Unnamed: 0,B,A,C,A_is_null,B_is_null,null_pct
0,1.0,M,101,False,False,0.0
1,2.0,F,102,False,False,0.0
2,,M,103,False,True,20.0
3,4.0,,104,True,False,20.0


In [4]:
# Demo: select columns, route them through a ColumnTransformer, restore the
# column names, cast a dtype and flag outliers.
df = pd.DataFrame(
    {
        "to_drop": [1, 2, 3, 4],
        "to_pass": [11, 12, 13, 14],
        "to_exclude": [21, 22, 23, 24],
        "marks": [3, 4, 3, 20],
        "enc": ["Cat1", "Cat2", "Cat3", "Cat2"],
    }
)

# Per-column routing: "to_drop" goes to Dropper, "to_pass" and "marks" go
# through PassThrough, and "enc" is ordinal-encoded.
column_router = ColumnTransformer(
    [
        ("ColsDropper", Dropper(["to_drop"]), ["to_drop"]),
        ("Pass", PassThrough(), ["to_pass", "marks"]),
        ("Encoding", CommonOrdinalEncoder(), ["enc"]),
    ]
)

pipe = Pipeline(
    [
        # "to_exclude" is left out of the selection, so it never reaches "ct".
        ("subset", ColumnSelector(["to_drop", "to_pass", "enc", "marks"])),
        ("ct", column_router),
        # ColumnTransformer returns a numpy array, so the names are re-applied here.
        ("col_names", ColumnNameApplyer(["to_pass", "marks", "enc"])),
        ("DTypeTrans", DTypeTransformer({"enc": "int64"})),
        # The recorded output gains a marks_is_outlier column from this step.
        ("Outlier", OutlierDetector(include=["marks"])),
    ]
)

pipe.fit_transform(df)

Unnamed: 0,to_pass,marks,enc,marks_is_outlier
0,11.0,3.0,0,False
1,12.0,4.0,1,False
2,13.0,3.0,2,False
3,14.0,20.0,1,True
