# Information Value as Feature Selection for Binary Classification Problems


## Session Setup

In [293]:
import pandas as pd
from pyspark.sql import SparkSession, functions as F
from pyspark.ml import Pipeline

from pyspark_ds_toolbox.ml.data_prep.features_vector import get_features_vector

In [2]:
# Single-core local session used by every cell below.
spark = (
    SparkSession.builder
    .appName('Spark-Toolbox')
    .master('local[1]')
    .config('spark.executor.memory', '3G')
    .config('spark.driver.memory', '3G')
    .config('spark.memory.offHeap.enabled', 'true')
    .config('spark.memory.offHeap.size', '3G')
    .getOrCreate()
)

spark.sparkContext.setLogLevel("ERROR")
# Arrow speeds up the pandas <-> Spark conversions used below. The generic
# `spark.sql.execution.arrow.enabled` key is deprecated since Spark 3.0; the
# PySpark-specific key is its replacement.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/29 14:18:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Loading the base dataset

In [246]:
def read_data(file):
    """Read one Stata file from the `scunning1975/mixtape` GitHub repository.

    Args:
        file: File name appended to the repository's `master` URL,
            e.g. ``'nsw_mixtape.dta'``.

    Returns:
        Whatever ``pd.read_stata`` returns for the remote file.
    """
    base_url = "https://raw.github.com/scunning1975/mixtape/master/"
    remote_path = base_url + file
    return pd.read_stata(remote_path)

# Stack the NSW file on top of the CPS file, then expose the pandas row labels
# as an `index` column.
# NOTE: the concatenation keeps each file's own row labels, so `index` values
# repeat across the two sources (it is not a unique row id).
pdf_mixtape = pd.concat((read_data('nsw_mixtape.dta'), read_data('cps_mixtape.dta')))
pdf_mixtape = pdf_mixtape.reset_index(level=0)

# SQL expressions for the two derived categorical columns.
etnia_expr = 'case when black=1 then "black" when hisp=1 then "hisp" when marr=1 then "marr" else "other" end'
dumb_cat_expr = 'case when index > 10 then "a" else "b" end'

df = (
    spark.createDataFrame(pdf_mixtape)
    .withColumn('etnia', F.expr(etnia_expr))
    .withColumn('treat', F.col('treat').cast('int'))
    .withColumn('dumb_cat', F.expr(dumb_cat_expr))
    .select('index', 'age', 'educ', 'data_id', 'etnia', 'dumb_cat', 'treat')
)



dfs_train, dfs_test = df.randomSplit([0.8, 0.2], seed=4)
dfs_test.show(5)

+-----+----+----+--------------------+-----+--------+-----+
|index| age|educ|             data_id|etnia|dumb_cat|treat|
+-----+----+----+--------------------+-----+--------+-----+
|    0|37.0|11.0|Dehejia-Wahba Sample|black|       b|    1|
|    3|48.0| 6.0|                CPS1| marr|       b|    0|
|    7|18.0|11.0|                CPS1|other|       b|    0|
|   10|19.0| 9.0|Dehejia-Wahba Sample|black|       b|    1|
|   12|18.0| 8.0|Dehejia-Wahba Sample|black|       a|    1|
+-----+----+----+--------------------+-----+--------+-----+
only showing top 5 rows



In [247]:
num_features = ['age', 'educ']
cat_features = ['data_id', 'etnia', 'dumb_cat']

In [248]:
def calculate_woe_iv(dfs, feature, target):
    """Compute the Weight of Evidence (WOE) and Information Value (IV) of one feature.

    For every distinct value ``v`` of ``feature``::

        p0(v)  = count(feature = v, target = 0) / count(target = 0)
        p1(v)  = count(feature = v, target = 1) / count(target = 1)
        woe(v) = ln(p0(v) / p1(v))
        iv(v)  = woe(v) * (p0(v) - p1(v))

    The feature's IV is the sum of ``iv(v)`` over all of its values.

    Args:
        dfs: Spark DataFrame containing ``feature`` and ``target``.
        feature: Name of the discrete column to evaluate.
        target: Name of the binary target column. Its values must show up as
            the crosstab columns ``'0'`` and ``'1'`` (e.g. an integer column).

    Returns:
        tuple: ``(dfs_woe, iv)`` where ``dfs_woe`` is a Spark DataFrame with
        the columns ``feature``, ``feature_value``, ``woe`` and ``iv`` (one row
        per feature value) and ``iv`` is the feature's total Information Value.

    Raises:
        ValueError: If the crosstab of ``feature`` x ``target`` has no ``'0'``
            or no ``'1'`` column, i.e. the target is not encoded as 0/1.
    """
    cross = dfs\
        .crosstab(feature, target)\
        .withColumnRenamed(f'{feature}_{target}', 'feature_value')

    # Fail early with a readable message instead of an unresolved-column error.
    missing_classes = [label for label in ('0', '1') if label not in cross.columns]
    if missing_classes:
        raise ValueError(
            f'Target column `{target}` must contain the values 0 and 1; '
            f'no crosstab column found for: {missing_classes}.'
        )

    # Class totals for both target values, fetched with a single Spark action.
    totals = cross.agg(F.sum('0').alias('sum_0'), F.sum('1').alias('sum_1')).first()
    sum_0, sum_1 = totals['sum_0'], totals['sum_1']

    # Feature values with no rows in one of the classes produce a null
    # WOE/IV; `fillna(0)` turns those into 0.
    cross = cross\
        .withColumn('0', F.col('0')/sum_0)\
        .withColumn('1', F.col('1')/sum_1)\
        .withColumn('woe', F.log(F.col('0')/F.col('1')))\
        .withColumn('iv', F.col('woe')*(F.col('0') - F.col('1')))\
        .withColumn('feature', F.lit(feature))\
        .fillna(0)

    iv = cross.selectExpr('sum(iv) as iv').collect()[0][0]

    return cross.select('feature', 'feature_value', 'woe', 'iv'), iv


In [249]:
a, b = calculate_woe_iv(dfs=dfs_test,feature='dumb_cat',target='treat')

In [250]:
a.show()

+--------+-------------+-------------------+--------------------+
| feature|feature_value|                woe|                  iv|
+--------+-------------+-------------------+--------------------+
|dumb_cat|            a|0.04818133614453751|0.002265024120408...|
|dumb_cat|            b| -4.359756680313845| 0.20495392677358656|
+--------+-------------+-------------------+--------------------+



In [251]:
b

0.20721895089399464

In [299]:
from pyspark.sql.types import StructType, StructField, StringType, FloatType
from pyspark.ml.feature import QuantileDiscretizer


def feature_selection_iv(dfs, target, num_features, cat_features, floor_iv=0.3, floor_bucket_percent=0.1, categorical_as_woe=False):
    """Select features by Information Value (IV) and build the feature-vector stages.

    Numerical features are first discretised with ``QuantileDiscretizer`` into
    ``<name>_bucket`` columns; the IV of every bucketed numerical feature and of
    every categorical feature is then computed with ``calculate_woe_iv``.
    Features whose IV is at least ``floor_iv`` are kept.

    Relies on the notebook-level names ``spark``, ``calculate_woe_iv``,
    ``WeightOfEvidenceComputer`` and ``get_features_vector``.

    Args:
        dfs: Spark DataFrame with the features and the target.
        target: Name of the binary (0/1) target column.
        num_features: List of numerical column names, or ``None``/empty.
        cat_features: List of categorical column names, or ``None``/empty.
        floor_iv: Minimum IV a feature needs in order to be selected.
        floor_bucket_percent: Target share of rows per quantile bucket; the
            number of buckets is ``floor(1 / floor_bucket_percent)``.
            Must be in the interval (0, 0.5] so that at least 2 buckets result.
        categorical_as_woe: If ``True``, selected categorical features enter the
            feature vector as their ``<name>_woe`` columns (produced by a
            ``WeightOfEvidenceComputer`` stage); otherwise they are passed to
            ``get_features_vector`` as categorical features.

    Returns:
        dict: with the keys

        * ``'dfs_woe'``: Spark DataFrame with the WOE/IV of every feature value.
        * ``'dfs_iv'``: Spark DataFrame with one IV per feature, sorted descending.
        * ``'stages_features_vector'``: list of pipeline stages that assemble
          the selected features.

    Raises:
        ValueError: If no feature is given at all, or if numerical features are
            given and ``floor_bucket_percent`` is outside (0, 0.5].
    """
    num_features = list(num_features) if num_features else []
    cat_features = list(cat_features) if cat_features else []
    if not num_features and not cat_features:
        raise ValueError('At least one of `num_features` or `cat_features` must be provided.')

    schema_woe = StructType([
        StructField("feature", StringType(), False),
        StructField("feature_value", StringType(), True),
        StructField("woe", FloatType(), True),
        StructField("iv", FloatType(), True)
    ])
    dfs_woe = spark.createDataFrame([], schema_woe)

    # Bucketed column name -> original numerical column name. Using an explicit
    # mapping (instead of slicing the suffix off) keeps both branches below in
    # agreement and is safe for categorical columns that happen to end in
    # '_bucket'.
    bucket_num_features = [name + '_bucket' for name in num_features]
    bucket_to_num = dict(zip(bucket_num_features, num_features))

    if num_features:
        if not 0 < floor_bucket_percent <= 0.5:
            raise ValueError('`floor_bucket_percent` must be in the interval (0, 0.5].')
        # `numBuckets` must be an integer; the epsilon guards against float
        # noise such as 1 / 0.1 landing just below the exact integer.
        n_buckets = int(1 / floor_bucket_percent + 1e-9)

        qt = QuantileDiscretizer(inputCols=num_features, outputCols=bucket_num_features, numBuckets=n_buckets)
        dfs = qt.fit(dfs).transform(dfs)

    # One (feature, iv) record per evaluated feature, kept on the driver so the
    # IV table is created once instead of being grown by repeated unions.
    iv_records = []
    for feat in bucket_num_features + cat_features:
        df_woe_feature, iv = calculate_woe_iv(dfs=dfs, feature=feat, target=target)

        dfs_woe = dfs_woe.union(df_woe_feature)
        iv_records.append((feat, None if iv is None else float(iv)))

    dfs_iv = spark.createDataFrame(iv_records, schema='feature string, iv double')

    cols_to_keep = [feat for feat, iv in iv_records if iv is not None and iv >= floor_iv]

    if categorical_as_woe:
        # Numerical features go in under their original name, categorical ones
        # under the `<name>_woe` column created by the WOE stage.
        selected_features = [bucket_to_num[s] if s in bucket_to_num else s + '_woe' for s in cols_to_keep]

        stages_features_vector = [WeightOfEvidenceComputer(inputCols=cat_features, target=target)] + get_features_vector(num_features=selected_features)
    else:
        num_selected_features = [bucket_to_num[s] for s in cols_to_keep if s in bucket_to_num]
        cat_selected_features = [s for s in cols_to_keep if s not in bucket_to_num]

        stages_features_vector = get_features_vector(num_features=num_selected_features, cat_features=cat_selected_features)

    out_dict = {
        'dfs_woe': dfs_woe,
        'dfs_iv': dfs_iv.orderBy(F.col('iv').desc()),
        'stages_features_vector': stages_features_vector
    }

    return out_dict

In [300]:
out = feature_selection_iv(dfs=dfs_train, target='treat', num_features=num_features, cat_features=cat_features, categorical_as_woe=True)

In [301]:
out.keys()

dict_keys(['dfs_woe', 'dfs_iv', 'stages_features_vector'])

In [302]:
out['stages_features_vector']

[WeightOfEvidenceComputer_7434319b69e4,
 VectorAssembler_ad01e946afa1,
 VectorAssembler_d5038197cf6e]

In [303]:
out['dfs_woe'].show(5)

+----------+-------------+--------------------+--------------------+
|   feature|feature_value|                 woe|                  iv|
+----------+-------------+--------------------+--------------------+
|age_bucket|          0.0| -0.5972201927175608| 0.03755893994515182|
|age_bucket|          5.0|  0.4674214381624044|0.019477216762649557|
|age_bucket|          1.0|-0.42042489967372465|0.020182294438309992|
|age_bucket|          6.0|  0.9472248535629343| 0.05228097899890407|
|age_bucket|          9.0|                 0.0|                 0.0|
+----------+-------------+--------------------+--------------------+
only showing top 5 rows



In [304]:
out['dfs_iv'].show(5)

+-----------+------------------+
|    feature|                iv|
+-----------+------------------+
|    data_id| 4.081508165850559|
|      etnia| 4.076126036555841|
|educ_bucket|1.0052276924505816|
| age_bucket|0.4417967451554041|
|   dumb_cat|0.2845468150802483|
+-----------+------------------+



In [305]:
p = Pipeline(stages=out['stages_features_vector'])
fitted = p.fit(dfs_train)

In [306]:
test = fitted.transform(dfs_test)
test.show()

+-----+----+----+--------------------+-----+--------+-----+------------------+-------------------+-------------------+--------------------+--------------------+
|index| age|educ|             data_id|etnia|dumb_cat|treat|       data_id_woe|          etnia_woe|       dumb_cat_woe|                 num|            features|
+-----+----+----+--------------------+-----+--------+-----+------------------+-------------------+-------------------+--------------------+--------------------+
|    0|37.0|11.0|Dehejia-Wahba Sample|black|       b|    1|-4.090093113364742|-2.3430534792674322| -4.359756680313845|[37.0,11.0,-4.090...|[37.0,11.0,-4.090...|
|    3|48.0| 6.0|                CPS1| marr|       b|    0|               0.0|  3.234624562237972| -4.359756680313845|[48.0,6.0,0.0,3.2...|[48.0,6.0,0.0,3.2...|
|    7|18.0|11.0|                CPS1|other|       b|    0|               0.0| 1.2022061808577313| -4.359756680313845|[18.0,11.0,0.0,1....|[18.0,11.0,0.0,1....|
|   10|19.0| 9.0|Dehejia-Wahba Sam

In [307]:
type(fitted)

pyspark.ml.pipeline.PipelineModel

In [308]:
# `overwrite()` lets this cell be re-run: a plain `save` raises when the
# target path already exists from a previous execution.
fitted.write().overwrite().save("tmp/food_pipeline.model")

In [309]:
from pyspark.ml import PipelineModel

In [310]:
fitted2 = PipelineModel.read().load("tmp/food_pipeline.model")

In [311]:
test = fitted2.transform(dfs_test)
test.show()

+-----+----+----+--------------------+-----+--------+-----+------------------+-------------------+-------------------+--------------------+--------------------+
|index| age|educ|             data_id|etnia|dumb_cat|treat|       data_id_woe|          etnia_woe|       dumb_cat_woe|                 num|            features|
+-----+----+----+--------------------+-----+--------+-----+------------------+-------------------+-------------------+--------------------+--------------------+
|    0|37.0|11.0|Dehejia-Wahba Sample|black|       b|    1|-4.090093113364742|-2.3430534792674322| -4.359756680313845|[37.0,11.0,-4.090...|[37.0,11.0,-4.090...|
|    3|48.0| 6.0|                CPS1| marr|       b|    0|               0.0|  3.234624562237972| -4.359756680313845|[48.0,6.0,0.0,3.2...|[48.0,6.0,0.0,3.2...|
|    7|18.0|11.0|                CPS1|other|       b|    0|               0.0| 1.2022061808577313| -4.359756680313845|[18.0,11.0,0.0,1....|[18.0,11.0,0.0,1....|
|   10|19.0| 9.0|Dehejia-Wahba Sam

## WIP: Weight of Evidence (WOE) Transformer


In [277]:
def add_woe(dfs, feature, target):
    """Append a ``<feature>_woe`` column with the WOE of each row's ``feature`` value.

    The WOE table is computed from ``dfs`` itself via ``calculate_woe_iv`` and
    left-joined back on ``feature``.

    Args:
        dfs: Spark DataFrame containing ``feature`` and ``target``.
        feature: Name of the column to encode.
        target: Name of the binary target column.

    Returns:
        ``dfs`` with its original columns, in their original order, followed
        by ``<feature>_woe``.
    """
    woe_col = f'{feature}_woe'

    # Only the per-value WOE table is needed here; the total IV is discarded.
    woe_table, _unused_iv = calculate_woe_iv(dfs=dfs, feature=feature, target=target)

    lookup = woe_table.withColumnRenamed('feature_value', feature)
    lookup = lookup.withColumnRenamed('woe', woe_col)
    lookup = lookup.select(feature, woe_col)

    joined = dfs.join(lookup, on=feature, how='left')
    return joined.select([*dfs.columns, woe_col])

In [279]:
add_woe(dfs=dfs_train, feature='data_id', target='treat').show()

+-----+----+----+--------------------+-----+--------+-----+-------------------+
|index| age|educ|             data_id|etnia|dumb_cat|treat|        data_id_woe|
+-----+----+----+--------------------+-----+--------+-----+-------------------+
|    0|45.0|11.0|                CPS1| marr|       b|    0|                0.0|
|    1|21.0|14.0|                CPS1|other|       b|    0|                0.0|
|    1|22.0| 9.0|Dehejia-Wahba Sample| hisp|       b|    1|-4.1470758465965325|
|    2|30.0|12.0|Dehejia-Wahba Sample|black|       b|    1|-4.1470758465965325|
|    2|38.0|12.0|                CPS1| marr|       b|    0|                0.0|
|    3|27.0|11.0|Dehejia-Wahba Sample|black|       b|    1|-4.1470758465965325|
|    4|18.0| 8.0|                CPS1| marr|       b|    0|                0.0|
|    4|33.0| 8.0|Dehejia-Wahba Sample|black|       b|    1|-4.1470758465965325|
|    5|22.0| 9.0|Dehejia-Wahba Sample|black|       b|    1|-4.1470758465965325|
|    5|22.0|11.0|                CPS1| m

In [280]:
from pyspark import keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param import Param, Params, TypeConverters
from pyspark.ml.param.shared import HasInputCol, HasInputCols, HasOutputCol, HasOutputCols
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable

class WeightOfEvidenceComputer(Transformer, HasInputCol, HasOutputCol, HasInputCols, HasOutputCols, DefaultParamsReadable, DefaultParamsWritable):
    """Transformer that appends a ``<col>_woe`` column for each input column.

    For every configured input column, ``add_woe`` is called on the dataset
    being transformed, which adds the Weight of Evidence of that column with
    respect to ``target``.

    NOTE(review): the WOE values are recomputed from whatever dataset is passed
    to ``transform`` (nothing is learned or stored at fit time), so that
    dataset must contain the ``target`` column and the encoding differs between
    e.g. train and test data. Turning this into an Estimator/Model pair would
    avoid that.

    TODO: ``outputCol`` / ``outputCols`` are inherited but not used yet; output
    columns are always named ``<inputCol>_woe``.
    """

    target = Param(
        parent=Params._dummy(),
        name='target',
        doc='Column name of the target. Must be an integer column with values 0 or 1.',
        typeConverter=TypeConverters.toString
    )

    @keyword_only
    def __init__(
        self,
        inputCol=None,
        inputCols=None,
        target=None,
    ):
        """Create the transformer; exactly one of ``inputCol``/``inputCols`` is expected."""
        super().__init__()
        self._setDefault(target=None)
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(
        self,
        inputCol=None,
        inputCols=None,
        target=None,
    ):
        """Set the params that were explicitly passed as keyword arguments."""
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def setTarget(self, new_target):
        """Set the name of the target column."""
        return self.setParams(target=new_target)

    def setInputCol(self, new_inputCol):
        """Set the single input column."""
        return self.setParams(inputCol=new_inputCol)

    def setInputCols(self, new_inputCols):
        """Set the list of input columns."""
        return self.setParams(inputCols=new_inputCols)

    def getTarget(self):
        """Return the name of the target column (``None`` when unset)."""
        return self.getOrDefault(self.target)

    def checkParams(self):
        """Validate the param combination before transforming.

        Raises:
            ValueError: If both or neither of ``inputCol``/``inputCols`` are
                set, or if ``target`` is not set.
        """
        # Test #1: either inputCol or inputCols can be set (but not both).
        if self.isSet("inputCol") and (self.isSet("inputCols")):
            raise ValueError(
                "Only one of `inputCol` and `inputCols` must be set."
            )

        # Test #2: at least one of inputCol or inputCols must be set.
        if not (self.isSet("inputCol") or self.isSet("inputCols")):
            raise ValueError(
                "One of `inputCol` or `inputCols` must be set."
            )

        # Test #3: the target is required to compute the WOE.
        if self.getTarget() is None:
            raise ValueError("`target` must be set.")

    def _transform(self, dataset):
        """Add one ``<col>_woe`` column per input column to ``dataset``."""
        self.checkParams()

        # A single `inputCol` is wrapped into a one-item list so both
        # configurations share the same loop.
        input_columns = (
            [self.getInputCol()]
            if self.isSet("inputCol")
            else self.getInputCols()
        )

        for feat in input_columns:
            dataset = add_woe(dfs=dataset, feature=feat, target=self.getTarget())

        return dataset

In [281]:
test_woe = WeightOfEvidenceComputer(inputCols=cat_features, target='treat')

test_woe.transform(dfs_test).show()

+-----+----+----+--------------------+-----+--------+-----+------------------+-------------------+-------------------+
|index| age|educ|             data_id|etnia|dumb_cat|treat|       data_id_woe|          etnia_woe|       dumb_cat_woe|
+-----+----+----+--------------------+-----+--------+-----+------------------+-------------------+-------------------+
|    0|37.0|11.0|Dehejia-Wahba Sample|black|       b|    1|-4.090093113364742|-2.3430534792674322| -4.359756680313845|
|    3|48.0| 6.0|                CPS1| marr|       b|    0|               0.0|  3.234624562237972| -4.359756680313845|
|    7|18.0|11.0|                CPS1|other|       b|    0|               0.0| 1.2022061808577313| -4.359756680313845|
|   10|19.0| 9.0|Dehejia-Wahba Sample|black|       b|    1|-4.090093113364742|-2.3430534792674322| -4.359756680313845|
|   12|18.0| 8.0|Dehejia-Wahba Sample|black|       a|    1|-4.090093113364742|-2.3430534792674322|0.04818133614453751|
|   13|19.0|12.0|                CPS1|other|    

In [184]:
dfs_woe.show()

+----------+--------------------+--------------------+--------------------+
|   feature|       feature_value|                 woe|                  iv|
+----------+--------------------+--------------------+--------------------+
| age_quant|                 0.0| -0.5972201927175608| 0.03755893994515182|
| age_quant|                 5.0|  0.4674214381624044|0.019477216762649557|
| age_quant|                 1.0|-0.42042489967372465|0.020182294438309992|
| age_quant|                 6.0|  0.9472248535629343| 0.05228097899890407|
| age_quant|                 9.0|                 0.0|                 0.0|
| age_quant|                 2.0| -0.5239091623421057|0.034363226987371216|
| age_quant|                 7.0|  0.5402080717439384| 0.02164954344664573|
| age_quant|                 3.0| -0.8934823718292055|  0.1402662918441354|
| age_quant|                 8.0|  1.3837121700365596|  0.1157164450269391|
| age_quant|                 4.0| 0.05293162458232201|3.018077052971832...|
|educ_quant|

In [185]:
dfs_iv.show()

+----------+------------------+
|   feature|                iv|
+----------+------------------+
| age_quant|0.4417967451554041|
|educ_quant|1.0052276924505816|
|   data_id| 4.081508165850559|
|     etnia| 4.076126036555841|
|  dumb_cat|0.2845468150802483|
+----------+------------------+



In [190]:
cols_to_keep

['age_quant', 'educ_quant', 'data_id', 'etnia']

In [197]:
# categorical as woe True
[s[:-6] if s.endswith('_quant') else s+'_woe' for s in cols_to_keep]

['age', 'educ', 'data_id_woe', 'etnia_woe']

In [203]:
# categorical as woe False
num_selectec_features = list(filter(None, [s[:-6] if s.endswith('_quant') else None for s in cols_to_keep]))

cat_selectec_features = list(filter(None, [None if s.endswith('_quant') else s for s in cols_to_keep]))

print(num_selectec_features)
print(cat_selectec_features)

['age', 'educ']
['data_id', 'etnia']


In [120]:
dfs_test.show(5)

+-----+--------------------+-----+--------+-----+
|index|             data_id|etnia|dumb_cat|treat|
+-----+--------------------+-----+--------+-----+
|    0|                CPS1| marr|       b|    0|
|    3|Dehejia-Wahba Sample|black|       b|    1|
|    7|                CPS1|other|       b|    0|
|   10|                CPS1| marr|       b|    0|
|   12|                CPS1| marr|       a|    0|
+-----+--------------------+-----+--------+-----+
only showing top 5 rows



In [122]:
dfs_woe.show()

+--------+--------------------+--------------------+--------------------+
| feature|       feature_value|                 woe|                  iv|
+--------+--------------------+--------------------+--------------------+
| data_id|Dehejia-Wahba Sample| -4.0928282617041365|   4.024510946258392|
| data_id|                CPS1|                 0.0|                 0.0|
|   etnia|               other|  0.4487629426410833|0.038510203869811445|
|   etnia|               black|  -2.254043853831219|  1.6506230367881973|
|   etnia|                marr|   2.994244065051656|  1.7212600705769427|
|   etnia|                hisp|                 0.0|                 0.0|
|dumb_cat|                   a|0.029860773527106543|8.776845743563007E-4|
|dumb_cat|                   b| -3.5050415968020174| 0.10302214506257643|
+--------+--------------------+--------------------+--------------------+



In [None]:
df.groupBy("A", "B").pivot("C").sum("D")

In [93]:
dfs_woe.groupBy('feature').agg(F.sum('iv').alias('iv')).show()

+--------+-------------------+
| feature|                 iv|
+--------+-------------------+
| data_id|  4.024510946258392|
|   etnia| 3.4103933112349516|
|dumb_cat|0.10389982963693273|
+--------+-------------------+



+-----------+----+---+
|etnia_treat|   0|  1|
+-----------+----+---+
|      other| 782|  5|
|      black| 283| 27|
|       marr|1994|  1|
|       hisp| 236|  0|
+-----------+----+---+



In [15]:
dfs_test.select(target).distinct().count()

2

## Fitting the Baseline Classifiers

In [4]:
# Fit the baseline binary classifiers on the train split and evaluate on the test split.
# NOTE(review): `baseline_binary_classfiers` is not imported or defined in any
# cell visible in this notebook, and the `dfs_train` built in the data-loading
# cell above only keeps the columns index, age, educ, data_id, etnia, dumb_cat
# and treat -- most of the `num_features` listed here (nodegree, re74, ...) are
# not among them. This cell appears to depend on state from an earlier session;
# confirm the import and the input DataFrame before re-running.
base_line_out = baseline_binary_classfiers(
    dfs=dfs_train,
    id_col='index',
    target_col='treat',
    num_features=['age', 'educ', 'nodegree', 're74', 're75', 're78', 'age2', 'age3', 'educ2', 'educ_re74', 'u74', 'u75'],
    cat_features=['data_id', 'etnia'],
    dfs_test=dfs_test,
    weight_on_target=True,
    log_mlflow_run=False,
    artifact_stage_path = None
)

Computing Features Vector


                                                                                

Computing Class Weights


                                                                                

Instanciating Classifiers


                                                                                

Predicting on Test Data and Evaluating


100%|██████████| 4/4 [00:19<00:00,  4.95s/it]                                   


In [6]:
base_line_out.keys()

dict_keys(['LogisticRegression', 'DecisionTreeClassifier', 'RandomForestClassifier', 'GBTClassifier'])

In [None]:
base_line_out['LogisticRegression'].keys()

dict_keys(['model', 'metrics', 'decile_metrics'])

In [7]:
base_line_out['LogisticRegression']['metrics']

{'confusion_matrix':    treat  prediction  count
 0    0.0         0.0   3240
 1    1.0         1.0     33
 2    0.0         1.0     55,
 'accuracy': 0.9834735576923077,
 'f1': 0.5454545454545454,
 'precision': 0.375,
 'recall': 1.0,
 'aucroc': 0.9916540212443096,
 'aucpr': 0.375}