In [24]:
import pandas as pd
import numpy as np
from itertools import combinations


In [2]:
# Load the raw train/test splits from CSV files in the local `data/` folder.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [3]:
# Preview the first rows of the raw training frame.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Title: the text between the first comma and the following period of `Name`.
train['title'] = (
    train['Name']
    .str.split(',', expand=True)[1]
    .str.split('.', expand=True)[0]
    .str.strip()
)
# Flag (1/0) for tickets that contain at least one ASCII letter.
train['apha_ticket'] = np.where(train['Ticket'].str.contains('[A-Za-z]'), 1, 0)
# Cabin type: first character of the cabin code.
train['cabin_type'] = train['Cabin'].str.slice(stop=1)
# Pclass is stored as a string from here on.
train['Pclass'] = train['Pclass'].astype(str)

In [5]:
# Same derived columns as for the training frame, applied to the test frame.
test['title'] = (
    test['Name']
    .str.split(',', expand=True)[1]
    .str.split('.', expand=True)[0]
    .str.strip()
)
# Flag (1/0) for tickets that contain at least one ASCII letter.
test['apha_ticket'] = np.where(test['Ticket'].str.contains('[A-Za-z]'), 1, 0)
# Cabin type: first character of the cabin code.
test['cabin_type'] = test['Cabin'].str.slice(stop=1)
# Pclass is stored as a string from here on.
test['Pclass'] = test['Pclass'].astype(str)

In [6]:
# Raw columns that have been replaced by derived features (or are identifiers).
raw_columns = ['Name', 'Ticket', 'Cabin', 'PassengerId']
train = train.drop(columns=raw_columns)
test = test.drop(columns=raw_columns)

In [7]:
# Add an empty target column so `test` has the same columns as `train`.
test['Survived'] = None

In [8]:
from feature_engine.imputation import MeanMedianImputer

# Median imputation for the two numeric columns.
median_imputer = MeanMedianImputer(
                   imputation_method='median',
                   variables=['Age', 'Fare']
                   )

# Fit the imputer on the training frame only; it is applied to both frames in the next cell.
median_imputer.fit(train)


In [9]:
# Apply the train-fitted median imputer to both frames.
train = median_imputer.transform(train)
test = median_imputer.transform(test)

# Any value still missing in a non-target column becomes the literal 'Missing'.
for frame in (train, test):
    feature_cols = frame.columns.drop('Survived')
    frame[feature_cols] = frame[feature_cols].fillna('Missing')


In [11]:
# Preview the test frame after feature derivation, imputation and filling.
test.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,title,apha_ticket,cabin_type
0,,3,male,34.5,0,0,7.8292,Q,Mr,0,Missing
1,,3,female,47.0,1,0,7.0,S,Mrs,0,Missing
2,,2,male,62.0,0,0,9.6875,Q,Mr,0,Missing
3,,3,male,27.0,0,0,8.6625,S,Mr,0,Missing
4,,3,female,22.0,1,1,12.2875,S,Mrs,0,Missing


In [12]:
# Frequency of each `Parch` value in the training frame.
train['Parch'].value_counts()

Parch
0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: count, dtype: int64

In [13]:
from feature_engine.encoding import *
from feature_engine.discretisation import *

In [14]:
# Columns handled as categorical by the encoding / interaction helpers below.
cat_var = ['Sex', 'title', 'cabin_type', 'Embarked', 'Pclass', 'Parch', 'SibSp']

# Numeric columns handled by the discretisation / combination helpers below.
num_var = ['Age', 'Fare']
# Name of the target column.
y_var = 'Survived'
# NOTE(review): this flag is not referenced by any visible cell — confirm it is still needed.
regression = False

In [15]:
!pip install datasets



## Discretisation


In [16]:
import os
import glob

In [17]:

def feat_discretiser(train, test, num_var, cat_var, y = None, method = ['efd', 'ewd', 'dtd', 'gwd'], bins = 10):
    """Discretise the numeric variables and save each result column as parquet.

    Every requested discretiser is fitted on ``train[num_var]`` and applied to
    both ``train`` and ``test``. Each output column is written to
    ``.feataz/train_<method>_<var>.parquet`` and
    ``.feataz/test_<method>_<var>.parquet``.

    WARNING: every file matched by ``.feataz/*`` is deleted before the new
    features are written.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing the columns in ``num_var`` (``train`` must also
        contain ``y`` when ``'dtd'`` is requested).
    num_var : list of str
        Numeric columns to discretise.
    cat_var : list of str
        Not used by this function; kept so the signature matches the other
        feature helpers.
    y : str, optional
        Target column name. Required when ``'dtd'`` is in ``method``.
    method : iterable of str
        Any of ``'efd'`` (equal frequency), ``'ewd'`` (equal width),
        ``'dtd'`` (decision tree) and ``'gwd'`` (geometric width).
    bins : int
        Number of bins for ``'efd'``, ``'ewd'`` and ``'gwd'``.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``'dtd'`` is requested without ``y`` or a method name is unknown.
    """
    # Build discretisers lazily so only the requested ones are instantiated.
    builders = {
        'efd': lambda: EqualFrequencyDiscretiser(q = bins, variables = num_var),
        'ewd': lambda: EqualWidthDiscretiser(bins = bins, variables = num_var),
        # NOTE(review): `scoring` is a regression metric while regression=False — confirm this is intended.
        'dtd': lambda: DecisionTreeDiscretiser(cv = 3,
                                               scoring = 'neg_mean_squared_error',
                                               variables = num_var,
                                               regression = False),
        'gwd': lambda: GeometricWidthDiscretiser(bins = bins, variables = num_var),
    }

    # Validate the request before touching the file system.
    unknown = [m for m in method if m not in builders]
    if unknown:
        raise ValueError(f"Unknown discretisation method(s) {unknown}; expected any of {list(builders)}")
    if 'dtd' in method and y is None:
        raise ValueError("If you are using a dtd (DecisionTreeDiscretiser) please provide the valid target!!")

    # Start from an empty feature folder.
    os.makedirs('.feataz', exist_ok = True)
    for stale in glob.glob('.feataz/*'):
        os.remove(stale)

    for i in method:
        print(i)
        dsc = builders[i]()
        if i == 'dtd':
            dsc.fit(train[num_var], train[y])
        else:
            dsc.fit(train[num_var])

        train_dsc = dsc.transform(train[num_var])
        test_dsc = dsc.transform(test[num_var])

        if i == 'dtd':
            # Convert the tree outputs to ordinal codes. The value -> code
            # mapping is the dense rank of the values seen in train and is
            # reused for test, so a given bin gets the same code in both
            # frames (ranking each frame separately shifts the codes whenever
            # test does not contain every bin). Values absent from train map
            # to NaN.
            codes = {
                col: {value: float(rank)
                      for rank, value in enumerate(sorted(train_dsc[col].dropna().unique()), start = 1)}
                for col in train_dsc.columns
            }
            train_dsc = train_dsc.apply(lambda s: s.map(codes[s.name]))
            test_dsc = test_dsc.apply(lambda s: s.map(codes[s.name]))

        new_names = [f'{i}_{x}' for x in num_var]
        train_dsc.columns = new_names
        test_dsc.columns = new_names

        # One parquet file per feature and per split.
        for j in new_names:
            train_dsc[[j]].to_parquet(f'.feataz/train_{j}.parquet')
            test_dsc[[j]].to_parquet(f'.feataz/test_{j}.parquet')

    print('Done: features saved in ./.feataz directory!')
    return None


In [18]:
# feat_discretiser returns None, so `dsc` is None; the features are written to disk as a side effect.
dsc = feat_discretiser(train, test, num_var, cat_var, y = 'Survived')

efd
ewd
dtd
gwd
Done: features saved in ./.feat directory!


In [19]:
def feat_encoding(train, test, num_var, cat_var, y = None, method = ['ohe', 'cfe', 'me', 'woe', 'dte'], bins = 10):
    """Encode the categorical variables and save each result column as parquet.

    Every requested encoder is fitted on ``train`` and applied to both
    ``train`` and ``test``. Output columns are prefixed with the method name
    and written to ``.feataz/train_<name>.parquet`` /
    ``.feataz/test_<name>.parquet``.

    Side effect: the ``cat_var`` columns of the caller's ``train`` and
    ``test`` frames are cast to ``str`` in place.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing ``cat_var`` (``train`` must also contain ``y`` for
        the target-based encoders).
    num_var : list of str
        Not used by this function; kept for signature symmetry.
    cat_var : list of str
        Categorical columns to encode.
    y : str, optional
        Target column name. Required for ``'me'``, ``'woe'`` and ``'dte'``.
    method : iterable of str
        Any of ``'ohe'`` (one hot, top 3 categories), ``'cfe'`` (frequency),
        ``'me'`` (mean), ``'woe'`` (weight of evidence) and ``'dte'``
        (decision tree). ``'woe'`` and ``'dte'`` are applied after grouping
        rare labels with a RareLabelEncoder.
    bins : int
        Not used by this function.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If a target-based encoder is requested without ``y`` or a method name
        is unknown.
    """
    target_based = ('me', 'woe', 'dte')
    rare_grouped = ('woe', 'dte')

    # Build encoders lazily so only the requested ones are instantiated.
    builders = {
        'ohe': lambda: OneHotEncoder(top_categories = 3,
                                     variables = cat_var,
                                     drop_last_binary = True),
        'cfe': lambda: CountFrequencyEncoder(encoding_method = 'frequency',
                                             variables = cat_var,
                                             ignore_format = True),
        'me': lambda: MeanEncoder(variables = cat_var, ignore_format = True),
        'woe': lambda: WoEEncoder(variables = cat_var, ignore_format = True),
        'dte': lambda: DecisionTreeEncoder(variables = cat_var,
                                           regression = False,
                                           scoring = 'roc_auc',
                                           cv = 3,
                                           random_state = 0,
                                           ignore_format = True),
    }

    unknown = [m for m in method if m not in builders]
    if unknown:
        raise ValueError(f"Unknown encoding method(s) {unknown}; expected any of {list(builders)}")
    if y is None and any(m in target_based for m in method):
        raise ValueError("If you are using a target-based encoder ('me', 'woe' or 'dte') please provide the valid target!!")

    # The output folder may not exist if this helper is run on its own.
    os.makedirs('.feataz', exist_ok = True)

    # Treat every categorical column as a string (done in place on the caller's frames).
    for cl in cat_var:
        train[cl] = train[cl].astype(str)
        test[cl] = test[cl].astype(str)

    # Rare-label grouped frames, computed once and shared by 'woe' and 'dte'.
    rare_train = rare_test = None

    for i in method:
        print(i)
        enc = builders[i]()

        if i in rare_grouped:
            if rare_train is None:
                rle = RareLabelEncoder(tol = 0.1,
                                       n_categories = 2,
                                       variables = cat_var,
                                       ignore_format = True)
                rare_train = rle.fit_transform(train)
                rare_test = rle.transform(test)
            src_train, src_test = rare_train, rare_test
        else:
            src_train, src_test = train, test

        if i in target_based:
            enc.fit(src_train[cat_var], src_train[y])
        else:
            enc.fit(src_train[cat_var])

        train_dsc = enc.transform(src_train[cat_var])
        test_dsc = enc.transform(src_test[cat_var])

        train_dsc.columns = [f'{i}_{x}' for x in train_dsc.columns]
        test_dsc.columns = [f'{i}_{x}' for x in test_dsc.columns]

        # One parquet file per feature and per split.
        for j in train_dsc.columns:
            train_dsc[[j]].to_parquet(f'.feataz/train_{j}.parquet')
            test_dsc[[j]].to_parquet(f'.feataz/test_{j}.parquet')

    print('Done: features saved in ./.feataz directory!')
    return None


In [20]:
from datasets import load_dataset

# feat_encoding returns None, so `enc` is None; the features are written to disk as a side effect.
enc = feat_encoding(train, test, num_var, cat_var, y = 'Survived')

ohe
rle
cfe
me




woe
dte
Done: features saved in ./.feat directory!


In [21]:
import warnings
class FeatInteraction():
    """Group-level aggregate features that are merged back onto each row.

    ``fit`` computes, per combination of the ``group`` columns, either the
    requested ``metric`` aggregates of ``value`` or (when ``value`` is None)
    the group size. ``transform`` left-joins those aggregates onto a frame.

    Parameters
    ----------
    group : str or iterable of str
        Column(s) to group by.
    value : str, optional
        Column to aggregate. When None, only a group count is produced.
    metric : str or list
        Aggregation(s) passed to ``groupby(...).agg``.
    date_index, date_feature
        Reserved for date-based features, which are not implemented:
        ``fit`` warns and ignores ``date_index``.
    """
    def __init__(self, group, 
                 value = None
                 , metric = ['sum','min', 'max', 'mean', 'median', 'std'],
                 date_index = None,
                 date_feature = ['15d', '45d' , '1m', '2m', '3m', '6m', '12m', '2y']) -> None:
        if isinstance(group, str):
            group = [group]
        elif not isinstance(group, list):
            group = list(group)

        self.group = group
        self.value = value
        # Copy the lists so instances never share (or mutate) the default objects;
        # a single metric name is wrapped so it is not split into characters.
        self.metric = [metric] if isinstance(metric, str) else list(metric)
        self.date_index = date_index
        self.date_feature = list(date_feature)
    
    def fit(self, df):
        """Compute the per-group aggregates from ``df`` and return ``self``."""
        if not self.value:
            warnings.warn('value are set to None, the metric paramaters will be ignore. The metric will be set to count')

        if self.date_index:
            warnings.warn('The feature creation based using date are not available yet. please staytune for the update')
            self.date_index = None

        suffix = '_'.join(self.group)
        if not self.value:
            # Group size only: one extra column named count_<group columns>.
            grp = df.groupby(self.group).size().reset_index()
            grp.columns = self.group + ['count_' + suffix]
        else:
            grp = df.groupby(self.group)[self.value].agg(self.metric).reset_index()
            stat_cols = [c for c in grp.columns if c not in self.group]
            grp.columns = self.group + [f'{self.value}_{x}_by_' + suffix for x in stat_cols]

        self.grp = grp
        return self
    

    def transform(self, df):
        """Left-join the fitted aggregates onto ``df`` using the group columns.

        Rows whose group was not seen in ``fit`` get NaN aggregates.
        """
        res = df.merge(self.grp, how = 'left', on = self.group)

        return res


In [22]:
def create_fi(train, test, num_var, cat_var, metric = ['sum','min', 'max', 'mean', 'median', 'std']):
    """Create group-aggregate interaction features and save them as parquet.

    For every non-empty combination of ``cat_var`` columns and every column in
    ``num_var``, a FeatInteraction is fitted on ``train`` and applied to both
    frames. Each resulting column is written to
    ``.feataz/train_fi_<name>.parquet`` / ``.feataz/test_fi_<name>.parquet``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing ``num_var`` and ``cat_var``.
    num_var : list of str
        Numeric columns to aggregate.
    cat_var : list of str
        Categorical columns to group by (all combination sizes are used).
    metric : list
        Aggregations forwarded to FeatInteraction.

    Returns
    -------
    None
    """
    # The output folder may not exist if this helper is run on its own.
    os.makedirs('.feataz', exist_ok = True)

    for size in range(1, len(cat_var) + 1):
        for value_col in num_var:
            for combo in combinations(cat_var, size):
                group_cols = list(combo)
                used_cols = group_cols + [value_col]

                # Forward `metric`; otherwise the argument would have no effect.
                fi = FeatInteraction(group_cols, value_col, metric = metric)
                fi.fit(train[used_cols])

                # Keep only the newly created aggregate columns.
                tr1 = fi.transform(train[used_cols]).drop(used_cols, axis = 1)
                ts1 = fi.transform(test[used_cols]).drop(used_cols, axis = 1)
                for feat in tr1.columns:
                    tr1[[feat]].to_parquet(f'.feataz/train_fi_{feat}.parquet')
                    ts1[[feat]].to_parquet(f'.feataz/test_fi_{feat}.parquet')
                    print(f'Done: feature {feat} saved in ./.feataz directory!')
    return None

In [25]:
# Build and save the group-aggregate interaction features for every categorical combination.
create_fi(train, test, num_var, cat_var)

Done: feature Age_sum_by_Sex saved in ./.feat directory!
Done: feature Age_min_by_Sex saved in ./.feat directory!
Done: feature Age_max_by_Sex saved in ./.feat directory!
Done: feature Age_mean_by_Sex saved in ./.feat directory!
Done: feature Age_median_by_Sex saved in ./.feat directory!
Done: feature Age_std_by_Sex saved in ./.feat directory!
Done: feature Age_sum_by_title saved in ./.feat directory!
Done: feature Age_min_by_title saved in ./.feat directory!
Done: feature Age_max_by_title saved in ./.feat directory!
Done: feature Age_mean_by_title saved in ./.feat directory!
Done: feature Age_median_by_title saved in ./.feat directory!
Done: feature Age_std_by_title saved in ./.feat directory!
Done: feature Age_sum_by_cabin_type saved in ./.feat directory!
Done: feature Age_min_by_cabin_type saved in ./.feat directory!
Done: feature Age_max_by_cabin_type saved in ./.feat directory!
Done: feature Age_mean_by_cabin_type saved in ./.feat directory!
Done: feature Age_median_by_cabin_type 

In [26]:
from feature_engine.creation import RelativeFeatures, MathFeatures, CyclicalFeatures
from feature_engine.selection import DropConstantFeatures


def feature_combination_calc(train, test, num_var, method = ['mf', 'rf', 'cf']):
    """Create combinations of the numeric variables and save them as parquet.

    Each requested transformer is fitted on ``train`` and applied to both
    frames. Quasi-constant outputs (DropConstantFeatures, ``tol=0.85``, fitted
    on train) and the original ``num_var`` columns are removed, and every
    remaining column is written to ``.feataz/train_fc_<name>.parquet`` /
    ``.feataz/test_fc_<name>.parquet``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing ``num_var``.
    num_var : list of str
        Numeric columns to combine.
    method : iterable of str
        Any of ``'mf'`` (MathFeatures), ``'rf'`` (RelativeFeatures) and
        ``'cf'`` (CyclicalFeatures). For ``'rf'`` the inputs are shifted by
        the smallest positive integer that leaves no zero in either frame, so
        the division-based functions never divide by zero.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If a method name is unknown.
    """
    num_var = list(num_var)

    # Build transformers lazily so only the requested ones are instantiated.
    builders = {
        'mf': lambda: MathFeatures(
            variables = num_var,
            func = ["sum", "prod", "min", "max", "std"],
        ),
        'rf': lambda: RelativeFeatures(
            variables = num_var,
            reference = num_var,
            func = ["sub", "div", "mod", "add", "truediv", "floordiv", "mul"],
        ),
        'cf': lambda: CyclicalFeatures(variables = num_var, drop_original = False),
    }

    unknown = [m for m in method if m not in builders]
    if unknown:
        raise ValueError(f"Unknown combination method(s) {unknown}; expected any of {list(builders)}")

    # The output folder may not exist if this helper is run on its own.
    os.makedirs('.feataz', exist_ok = True)

    for i in method:
        transformer = builders[i]()

        if i == 'rf':
            # Find the smallest shift k >= 1 that removes every zero. The shift
            # is increased on each pass so the loop always terminates.
            k = 1
            while ((train[num_var] + k == 0).sum().sum()
                   + (test[num_var] + k == 0).sum().sum()) > 0:
                k += 1
            train_inp = train[num_var] + k
            test_inp = test[num_var] + k
        else:
            train_inp = train[num_var]
            test_inp = test[num_var]

        # Fit on train only and reuse the fitted transformer for test, so both
        # frames are transformed with the same learned parameters.
        train_t = transformer.fit_transform(train_inp)
        test_t = transformer.transform(test_inp)

        drp = DropConstantFeatures(tol = 0.85)
        drp.fit(train_t)

        # Drop quasi-constant outputs plus the original inputs (deduplicated, order kept).
        feat_drp = list(dict.fromkeys(list(drp.features_to_drop_) + num_var))
        train_t = train_t.drop(columns = feat_drp)
        test_t = test_t.drop(columns = feat_drp)

        for j in train_t.columns:
            train_t[[j]].to_parquet(f'.feataz/train_fc_{j}.parquet')
            test_t[[j]].to_parquet(f'.feataz/test_fc_{j}.parquet')

            print(f'Done: features {j} saved in ./.feataz directory!')



In [27]:
# Build and save the math / relative / cyclical combinations of the numeric variables.
feature_combination_calc(train, test, num_var)

Done: features sum_Age_Fare saved in ./.feat directory!
Done: features prod_Age_Fare saved in ./.feat directory!
Done: features min_Age_Fare saved in ./.feat directory!
Done: features max_Age_Fare saved in ./.feat directory!
Done: features std_Age_Fare saved in ./.feat directory!
Done: features Fare_sub_Age saved in ./.feat directory!
Done: features Age_sub_Fare saved in ./.feat directory!
Done: features Fare_div_Age saved in ./.feat directory!
Done: features Age_div_Fare saved in ./.feat directory!
Done: features Fare_mod_Age saved in ./.feat directory!
Done: features Age_mod_Fare saved in ./.feat directory!
Done: features Age_add_Age saved in ./.feat directory!
Done: features Fare_add_Age saved in ./.feat directory!
Done: features Age_add_Fare saved in ./.feat directory!
Done: features Fare_add_Fare saved in ./.feat directory!
Done: features Fare_truediv_Age saved in ./.feat directory!
Done: features Age_truediv_Fare saved in ./.feat directory!
Done: features Fare_floordiv_Age saved 

In [28]:
import scorecardpy as sc

def feat_sc_woe(train, test, y, variables):
    """Create scorecardpy WoE features, one variable at a time, and save them.

    For each variable the WoE bins are learned from ``train`` with
    ``sc.woebin`` and applied to both frames with ``sc.woebin_ply``. The
    resulting ``<variable>_woe`` column is written to
    ``.feataz/train_woesc_<variable>.parquet`` /
    ``.feataz/test_woesc_<variable>.parquet``.

    A failure on one variable is reported as a warning (including the
    underlying error) and the remaining variables are still processed.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing ``variables`` and the ``y`` column.
    y : str
        Target column name.
    variables : list of str
        Columns to convert to WoE values.
    """
    # The output folder may not exist if this helper is run on its own.
    os.makedirs('.feataz', exist_ok = True)

    for var in variables:
        try:
            bins = sc.woebin(train[[var, y]], y=y)
            train_woe = sc.woebin_ply(train[[var, y]], bins)
            test_woe = sc.woebin_ply(test[[var, y]], bins)
            train_woe[[var+'_woe']].to_parquet(f'.feataz/train_woesc_{var}.parquet')
            test_woe[[var+'_woe']].to_parquet(f'.feataz/test_woesc_{var}.parquet')
            print(f'Done: feature {var} saved in ./.feataz directory!')
        except Exception as exc:
            # Best effort per variable: keep going, but say what went wrong.
            # `Exception` (not a bare except) lets KeyboardInterrupt propagate.
            warnings.warn(f'error occured while creating WOE for {var}: {exc!r}')

In [29]:
# Build and save scorecardpy WoE features for every numeric and categorical variable.
feat_sc_woe(train, test, y = 'Survived', variables = num_var + cat_var)

[INFO] creating woe binning ...


  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  init_bin = dtm.groupby('bin', group_keys=False)['y'].agg([n0, n1])\
Length: 891
Categories (36, object): ['[-inf,2.0)' < '[2.0,4.0)' < '[4.0,6.0)' < '[6.0,8.0)' ... '[64.0,66.0)' < '[66.0,68.0)' < '[68.0,72.0)' < '[72.0,inf)']' has dtype incompatible with category, please explicitly cast to a compatible dtype first.
  dtm.loc[:,'bin'] = pd.cut(dtm['value'], bstbrks, right=False, labels=labels)
  binning = dtm.groupby(['variable','bin'], group_keys=False)['y'].agg([n0, n1])\
  init_bin = init_bin.groupby('brkp', group_keys=False).agg({
  init_bin = init_bin.groupby('brkp', group_keys=False).agg({
  init_bin = init_bin.groupby('brkp', group_keys=False).agg({
  ini

[INFO] converting into woe values ...
[INFO] converting into woe values ...
Done: feature Age saved in ./.feat directory!
[INFO] creating woe binning ...


  init_bin = init_bin.groupby('brkp', group_keys=False).agg({
  init_bin = init_bin.groupby('brkp', group_keys=False).agg({
  init_bin = init_bin.groupby('brkp', group_keys=False).agg({
  init_bin = init_bin.groupby('brkp', group_keys=False).agg({
  init_bin = init_bin.groupby('brkp', group_keys=False).agg({
  init_bin = init_bin.groupby('brkp', group_keys=False).agg({
  init_bin = init_bin.groupby('brkp', group_keys=False).agg({
  init_bin = init_bin.groupby('brkp', group_keys=False).agg({
  init_bin = init_bin.groupby('brkp', group_keys=False).agg({
  init_bin = init_bin.groupby('brkp', group_keys=False).agg({
  init_bin = init_bin.groupby('brkp', group_keys=False).agg({
  .agg({'good':sum, 'bad':sum}).reset_index()\
  .agg({'good':sum, 'bad':sum}).reset_index()\
  binning_1bst_brk = binning_1bst_brk.groupby(['variable', 'bstbin'], group_keys=False)\
  .agg({'good':sum, 'bad':sum}).reset_index().assign(bin=lambda x: x['bstbin'])\
  .agg({'good':sum, 'bad':sum}).reset_index().assign(b

[INFO] converting into woe values ...
[INFO] converting into woe values ...
Done: feature Fare saved in ./.feat directory!
[INFO] creating woe binning ...
[INFO] converting into woe values ...
[INFO] converting into woe values ...
Done: feature Sex saved in ./.feat directory!
[INFO] creating woe binning ...


  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  init_bin = init_bin.groupby('brkp', group_keys=False).agg({
  init_bin = init_bin.groupby('brkp', group_keys=False).agg({
  init_bin = init_bin.groupby('brkp', group_keys=False).agg({
  init_bin = init_bin.groupby('brkp', group_keys=False).agg({
  init_bin = init_bin.groupby('brkp', group_keys=False).agg({
  init_bin = init_bin.groupby('brkp', group_keys=False).agg({
  init_bin = init_bin.groupby('brkp', group_keys=False).agg({
  init_bin = init_bin.groupby('brkp', group_keys=False).agg({
  init_bin =

[INFO] converting into woe values ...
[INFO] converting into woe values ...
Done: feature title saved in ./.feat directory!
[INFO] creating woe binning ...
[INFO] converting into woe values ...
[INFO] converting into woe values ...
Done: feature cabin_type saved in ./.feat directory!
[INFO] creating woe binning ...


  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  init_bin = init_bin.groupby('brkp', group_keys=False).agg({
  .agg({'good':sum, 'bad':sum}).reset_index()\
  .agg({'good':sum, 'bad':sum}).reset_index()\
  binning_1bst_brk = binning_1bst_brk.groupby(['variable', 'bstbin'], group_keys=False)\
  .agg({'good':sum, 'bad':sum, 'bin':lambda x:'%,%'.join(x)}).reset_index()\
  .agg({'good':sum, 'bad':sum, 'bin':lambda x:'%,%'.join(x)}).reset_index()\
  .agg({'good':sum, 'bad':sum}).reset_index()\
  .agg({'good':sum, 'bad':sum}).reset_index()\
  binning_1bst_

[INFO] converting into woe values ...
[INFO] converting into woe values ...
Done: feature Embarked saved in ./.feat directory!
[INFO] creating woe binning ...
[INFO] converting into woe values ...
[INFO] converting into woe values ...
Done: feature Pclass saved in ./.feat directory!
[INFO] creating woe binning ...


  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  init_bin = init_bin.groupby('brkp', group_keys=False).agg({
  init_bin = init_bin.groupby('brkp', group_keys=False).agg({
  .agg({'good':sum, 'bad':sum}).reset_index()\
  .agg({'good':sum, 'bad':sum}).reset_index()\
  binning_1bst_brk = binning_1bst_brk.groupby(['variable', 'bstbin'], group_keys=False)\
  .agg({'good':sum, 'bad':sum, 'bin':lambda x:'%,%'.join(x)}).reset_index()\
  .agg({'good':sum, 'bad':sum, 'bin':lambda x:'%,%'.join(x)}).reset_index()\
  .agg({'good':sum, 'bad':sum}).reset_index()\
  .agg({'good':sum, 'bad':sum}).reset_index()\
  binning_1bst_brk = binning_1bst_brk.groupby(['variable', 'bstbin'], group_keys=False)\
  .agg({'good':sum, 'bad':su

[INFO] converting into woe values ...
[INFO] converting into woe values ...
Done: feature Parch saved in ./.feat directory!
[INFO] creating woe binning ...
[INFO] converting into woe values ...
[INFO] converting into woe values ...
Done: feature SibSp saved in ./.feat directory!


  init_bin = init_bin.groupby('brkp', group_keys=False).agg({
  .agg({'good':sum, 'bad':sum}).reset_index()\
  .agg({'good':sum, 'bad':sum}).reset_index()\
  binning_1bst_brk = binning_1bst_brk.groupby(['variable', 'bstbin'], group_keys=False)\
  .agg({'good':sum, 'bad':sum, 'bin':lambda x:'%,%'.join(x)}).reset_index()\
  .agg({'good':sum, 'bad':sum, 'bin':lambda x:'%,%'.join(x)}).reset_index()\
  .agg({'good':sum, 'bad':sum}).reset_index()\
  .agg({'good':sum, 'bad':sum}).reset_index()\
  binning_1bst_brk = binning_1bst_brk.groupby(['variable', 'bstbin'], group_keys=False)\
  .agg({'good':sum, 'bad':sum, 'bin':lambda x:'%,%'.join(x)}).reset_index()\
  .agg({'good':sum, 'bad':sum, 'bin':lambda x:'%,%'.join(x)}).reset_index()\
  .agg({'good':sum, 'bad':sum}).reset_index()\
  .agg({'good':sum, 'bad':sum}).reset_index()\
  binning_1bst_brk = binning_1bst_brk.groupby(['variable', 'bstbin'], group_keys=False)\
  .agg({'good':sum, 'bad':sum, 'bin':lambda x:'%,%'.join(x)}).reset_index()\
  .a

In [None]:
# NOTE(review): at notebook scope `train_woe` is only assigned in a later cell (inside
# feat_sc_woe it is a local), so this presumably raises NameError on a fresh run — confirm or remove.
train_woe

In [None]:
# NOTE(review): repeats the identical call made in an earlier cell — presumably a leftover; confirm or remove.
feat_sc_woe(train, test, y = 'Survived', variables = num_var + cat_var)

In [None]:
from feature_engine.creation import RelativeFeatures, MathFeatures, CyclicalFeatures

# NOTE(review): `train_inp` / `test_inp` are only locals of feature_combination_calc and
# `transformer` is first assigned at notebook scope in a later cell, so this scratch cell
# presumably fails on a fresh run — confirm or remove.
train_t = transformer.fit_transform(train_inp)

test_t = transformer.fit_transform(test_inp)


In [None]:
from feature_engine.selection import DropConstantFeatures
# Fit a quasi-constant feature filter (tol=0.9) on `train_t`.
# NOTE(review): `train_t` must already exist at notebook scope for this to run — confirm which cell provides it.
transformer = DropConstantFeatures(tol=0.9)
transformer.fit(train_t)

In [None]:
# Display the current `transformer` object.
transformer

In [None]:
# Features the fitted DropConstantFeatures transformer flagged for removal.
transformer.features_to_drop_


In [None]:


# Cyclical (sine/cosine) encoding of the numeric variables. The transformer is
# created here because no other cell defines `cyclical`; the configuration
# mirrors the 'cf' transformer used in feature_combination_calc.
cyclical = CyclicalFeatures(variables=num_var, drop_original=False)
X = cyclical.fit_transform(train[num_var])

X

In [None]:
# Scratch version of create_fi: group-aggregate features for every categorical
# combination, saved with the `fi_train_` / `fi_test_` file prefixes.
os.makedirs('.feataz', exist_ok=True)  # the output folder may not exist yet
for i in range(1, len(cat_var) + 1):
    for j in num_var:
        for k in combinations(cat_var, i):
            group_cols = list(k)
            # Named `used_cols` so the builtin `vars` is not overwritten at notebook scope.
            used_cols = group_cols + [j]
            fi = FeatInteraction(group_cols, j)
            fi.fit(train[used_cols])
            # Keep only the newly created aggregate columns.
            tr1 = fi.transform(train[used_cols]).drop(used_cols, axis = 1)
            ts1 = fi.transform(test[used_cols]).drop(used_cols, axis = 1)
            for l in tr1.columns:
                tr1[[l]].to_parquet(f'.feataz/fi_train_{l}.parquet')
                ts1[[l]].to_parquet(f'.feataz/fi_test_{l}.parquet')
                print(f'Done: feature {l} saved in ./.feataz directory!')

In [None]:
# Traditional Credit Scoring Using Logistic Regression
# (stand-alone scorecardpy walkthrough on the German credit data).
import scorecardpy as sc

# data prepare ------
# load germancredit data
dat = sc.germancredit()

# filter variable via missing rate, iv, identical value rate
dt_s = sc.var_filter(dat, y="creditability")

# breaking dt into train and test
# Dedicated names are used so the Titanic `train` / `test` frames that the
# other cells rely on are not overwritten by this demo.
credit_train, credit_test = sc.split_df(dt_s, 'creditability').values()

# woe binning ------
bins = sc.woebin(dt_s, y="creditability")
# sc.woebin_plot(bins)

# binning adjustment
# # adjust breaks interactively
# breaks_adj = sc.woebin_adj(dt_s, "creditability", bins) 
# # or specify breaks manually
breaks_adj = {
    'age.in.years': [26, 35, 40],
    'other.debtors.or.guarantors': ["none", "co-applicant%,%guarantor"]
}
bins_adj = sc.woebin(dt_s, y="creditability", breaks_list=breaks_adj)

# converting train and test into woe values
train_woe = sc.woebin_ply(credit_train, bins_adj)
test_woe = sc.woebin_ply(credit_test, bins_adj)

y_train = train_woe.loc[:,'creditability']
X_train = train_woe.loc[:,train_woe.columns != 'creditability']
y_test = test_woe.loc[:,'creditability']
# Build the column mask from test_woe itself so it always matches the frame it selects from.
X_test = test_woe.loc[:,test_woe.columns != 'creditability']

# logistic regression ------
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(penalty='l1', C=0.9, solver='saga', n_jobs=-1)
lr.fit(X_train, y_train)
# lr.coef_
# lr.intercept_

# predicted probability
train_pred = lr.predict_proba(X_train)[:,1]
test_pred = lr.predict_proba(X_test)[:,1]

# performance ks & roc ------
train_perf = sc.perf_eva(y_train, train_pred, title = "train")
test_perf = sc.perf_eva(y_test, test_pred, title = "test")

# score ------
card = sc.scorecard(bins_adj, lr, X_train.columns)
# credit score
train_score = sc.scorecard_ply(credit_train, card, print_step=0)
test_score = sc.scorecard_ply(credit_test, card, print_step=0)

# psi
sc.perf_psi(
  score = {'train':train_score, 'test':test_score},
  label = {'train':y_train, 'test':y_test}
)

In [None]:

# Equal-frequency discretisation (10 quantile bins), fitted on train only.
disc = EqualFrequencyDiscretiser(q=10, variables=num_var).fit(train[num_var])

train_efd, test_efd = (disc.transform(frame[num_var]) for frame in (train, test))


In [None]:
# Equal-width discretisation (10 bins), fitted on train only.
disc = EqualWidthDiscretiser(bins=10, variables=num_var).fit(train[num_var])

train_ewd, test_ewd = (disc.transform(frame[num_var]) for frame in (train, test))


In [None]:
# Display the equal-width discretised test frame.
test_ewd

In [None]:
# Decision-tree discretisation of the numeric variables against the target,
# fitted on train only.
disc = DecisionTreeDiscretiser(
    variables=num_var,
    regression=False,
    cv=3,
    scoring='neg_mean_squared_error',
).fit(train[num_var], train[y_var])

train_dtd, test_dtd = (disc.transform(frame[num_var]) for frame in (train, test))


In [None]:
# Geometric-width discretisation (10 bins), fitted on train only.
disc = GeometricWidthDiscretiser(bins=10, variables=num_var).fit(train[num_var])

train_gwd, test_gwd = (disc.transform(frame[num_var]) for frame in (train, test))


In [None]:
# disc = DecisionTreeDiscretiser(cv = 3, 
#                             scoring='neg_mean_squared_error',
#                             variables = num_var,
#                             regression = False)

# disc.fit(train[num_var], train[y_var])

# train_dtd = disc.transform(train[num_var])
# test_dtd = disc.transform(test[num_var])     

In [None]:
# Keyword arguments shared by every encoder that accepts `ignore_format`.
shared_kwargs = dict(variables=cat_var, ignore_format=True)

# One-hot encoding limited to the 3 most frequent categories per variable.
ohe = OneHotEncoder(top_categories=3, variables=cat_var)

# Relative frequency of each category.
cfe = CountFrequencyEncoder(encoding_method='frequency', **shared_kwargs)

# Target mean per category.
me = MeanEncoder(**shared_kwargs)

# Groups infrequent categories (tol=0.1) before the WoE / tree encoders.
rle = RareLabelEncoder(tol=0.1, n_categories=2, **shared_kwargs)

# Weight of evidence per category.
woe = WoEEncoder(**shared_kwargs)

# Decision-tree based encoding scored with ROC AUC.
dte = DecisionTreeEncoder(
    regression=False,
    scoring='roc_auc',
    cv=3,
    random_state=0,
    **shared_kwargs,
)

## Encoding

In [None]:
# One-hot encode the five string-valued categorical columns (top 3 categories each).
ohe_columns = ['Sex', 'title', 'cabin_type', 'Embarked', 'Pclass']
encoder = OneHotEncoder(top_categories=3, variables=ohe_columns)
# Fit on the training frame only.
encoder.fit(train[ohe_columns])

In [None]:
# Apply the fitted one-hot encoder to both frames.
ohe_input_columns = ['Sex', 'title', 'cabin_type', 'Embarked', 'Pclass']
one_hot_train, one_hot_test = (
    encoder.transform(frame[ohe_input_columns]) for frame in (train, test)
)

In [None]:
# Frequency encoder for all categorical variables.
enc = CountFrequencyEncoder(
    encoding_method = 'frequency',
    variables = cat_var,
    ignore_format = True
)

In [None]:
# Fit the encoder on the training frame's categorical columns.
enc.fit(train[cat_var])


In [None]:
# Apply the fitted frequency encoder to both frames.
cnt_freq_train, cnt_freq_test = (
    enc.transform(frame[cat_var]) for frame in (train, test)
)

In [None]:
# Target-mean encoder for all categorical variables (reuses the name `enc`).
enc = MeanEncoder(
    variables = cat_var,
    ignore_format = True
)

In [None]:
# Fit the mean encoder against the target, then encode both frames.
enc.fit(train[cat_var], train['Survived'])
mean_enc_train, mean_enc_test = (
    enc.transform(frame[cat_var]) for frame in (train, test)
)

In [None]:
# set up a weight of evidence encoder
# set up a rare label encoder
# Group infrequent categories first, fitted on train and applied to test.
rare_encoder = RareLabelEncoder(
    variables=cat_var,
    tol=0.1,
    n_categories=2,
    ignore_format=True,
)
train_t = rare_encoder.fit_transform(train)
test_t = rare_encoder.transform(test)

# Weight-of-evidence encoding fitted on the rare-grouped training frame.
woe_encoder = WoEEncoder(variables=cat_var, ignore_format=True)
woe_encoder.fit(train_t[cat_var], train_t['Survived'])

woe_train, woe_test = (
    woe_encoder.transform(frame[cat_var]) for frame in (train_t, test_t)
)

In [None]:
# Decision-tree encoder scored with ROC AUC (reuses the name `encoder`).
encoder = DecisionTreeEncoder(
    variables=cat_var,
    regression=False,
    scoring='roc_auc',
    cv=3,
    random_state=0,
    ignore_format=True)

# Fit on the rare-grouped training frame `train_t` against the target.
encoder.fit(train_t[cat_var], train_t['Survived'])

In [None]:
# Apply the fitted decision-tree encoder to both rare-grouped frames.
dec_train, dec_test = (
    encoder.transform(frame[cat_var]) for frame in (train_t, test_t)
)

In [None]:
# Dense-rank the tree-discretised columns and count how often each rank occurs for Age.
train_dtd.rank(method = 'dense')['Age'].value_counts()
# Other values tried for `method`: ['average', 'min', 'max', 'first', 'dense']

## Feature Creation

In [None]:
from feature_engine.creation import MathFeatures

# Row-wise sum / product / min / max / std across the listed variables.
# NOTE(review): `df` is not defined in any visible cell and "Marks" is not among the
# columns shown for the Titanic frames — presumably pasted from an external example; confirm or remove.
transformer = MathFeatures(
    variables=["Age", "Marks"],
    func = ["sum", "prod", "min", "max", "std"],
)

df_t = transformer.fit_transform(df)

print(df_t)


In [None]:
import warnings
class FeatInteraction():
    """Group-level aggregate features that are merged back onto each row.

    NOTE(review): a class with the same name is defined in an earlier cell;
    running this cell replaces it. This variant has no 'sum' in the default
    metrics and names its columns ``<value>_<metric>_of_<group>`` — confirm
    which definition should be kept.

    ``fit`` computes, per combination of the ``group`` columns, either the
    requested ``metric`` aggregates of ``value`` or (when ``value`` is None)
    the group size. ``transform`` left-joins those aggregates onto a frame.

    Parameters
    ----------
    group : str or iterable of str
        Column(s) to group by.
    value : str, optional
        Column to aggregate. When None, only a group count is produced.
    metric : str or list
        Aggregation(s) passed to ``groupby(...).agg``.
    date_index, date_feature
        Reserved for date-based features, which are not implemented:
        ``fit`` warns and ignores ``date_index``.
    """
    def __init__(self, group, 
                 value = None
                 , metric = ['min', 'max', 'mean', 'median', 'std'],
                 date_index = None,
                 date_feature = ['15d', '45d' , '1m', '2m', '3m', '6m', '12m', '2y']) -> None:
        if isinstance(group, str):
            group = [group]
        elif not isinstance(group, list):
            group = list(group)

        self.group = group
        self.value = value
        # Copy the lists so instances never share (or mutate) the default objects;
        # a single metric name is wrapped so it is not split into characters.
        self.metric = [metric] if isinstance(metric, str) else list(metric)
        self.date_index = date_index
        self.date_feature = list(date_feature)
    
    def fit(self, df):
        """Compute the per-group aggregates from ``df`` and return ``self``."""
        if not self.value:
            warnings.warn('value are set to None, the metric paramaters will be ignore. The metric will be set to count')

        if self.date_index:
            warnings.warn('The feature creation based using date are not available yet. please staytune for the update')
            self.date_index = None

        suffix = '_'.join(self.group)
        if not self.value:
            # Group size only: one extra column named count_<group columns>.
            grp = df.groupby(self.group).size().reset_index()
            grp.columns = self.group + ['count_' + suffix]
        else:
            grp = df.groupby(self.group)[self.value].agg(self.metric).reset_index()
            stat_cols = [c for c in grp.columns if c not in self.group]
            grp.columns = self.group + [f'{self.value}_{x}_of_' + suffix for x in stat_cols]

        self.grp = grp
        return self
    

    def transform(self, df):
        """Left-join the fitted aggregates onto ``df`` using the group columns.

        Rows whose group was not seen in ``fit`` get NaN aggregates.
        """
        res = df.merge(self.grp, how = 'left', on = self.group)

        return res


In [None]:
# Aggregates of Age for each (Sex, Pclass) combination.
feat_int = FeatInteraction(['Sex', 'Pclass'], 'Age')

In [None]:
# Compute the group aggregates from the training frame.
feat_int.fit(train)

In [None]:
# Left-join the fitted aggregates onto the training frame and display the result.
feat_int.transform(train)

In [None]:
# Left-join the train-fitted aggregates onto the test frame and display the result.
feat_int.transform(test)