# Information Value as Feature Selection for Binary Classification Problems


## Session Setup

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
# Show every column when a pandas frame is displayed. The original call used
# `get_option`, which only reads the setting and discards it (a no-op).
# NOTE(review): `None` (no column limit) is assumed to be the intended value — confirm.
pd.set_option("display.max_columns", None)

from pyspark.sql import SparkSession, functions as F
from pyspark.ml import Pipeline

from pyspark_ds_toolbox.ml.feature_selection import feature_selection_with_iv



In [2]:
# Create (or reuse) a local Spark session running on a single thread, with
# 3G configured for executor, driver and off-heap memory.
spark = (
    SparkSession.builder
    .appName('Spark-Toolbox')
    .master('local[1]')
    .config('spark.executor.memory', '3G')
    .config('spark.driver.memory', '3G')
    .config('spark.memory.offHeap.enabled', 'true')
    .config('spark.memory.offHeap.size', '3G')
    .getOrCreate()
)

# Only surface errors in the Spark logs.
spark.sparkContext.setLogLevel("ERROR")

# Arrow-backed conversion between Spark and pandas DataFrames.
# NOTE(review): on Spark 3.x this key is deprecated in favour of
# "spark.sql.execution.arrow.pyspark.enabled" — confirm the target Spark version.
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/30 00:21:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Reading the base dataset

In [3]:
def read_data(file):
    """Load a Stata file from the `scunning1975/mixtape` GitHub repository.

    Args:
        file: File name (e.g. ``'nsw_mixtape.dta'``) appended to the repository base URL.

    Returns:
        The object returned by ``pd.read_stata`` for the resulting URL.
    """
    base_url = "https://raw.github.com/scunning1975/mixtape/master/"
    file_url = base_url + file
    return pd.read_stata(file_url)

# Stack the two Stata files. Each file keeps its own row index, and
# `reset_index` moves it into an `index` column, so index values repeat
# across the two sources.
pdf = pd.concat((read_data('nsw_mixtape.dta'), read_data('cps_mixtape.dta')))
pdf = pdf.reset_index(level=0)

# Single categorical column derived from the black / hisp / marr flags.
etnia_expr = F.expr('case when black=1 then "black" when hisp=1 then "hisp" when marr=1 then "marr" else "other" end')
# Categorical column derived only from the row index.
dumb_cat_expr = F.expr('case when index > 10 then "a" else "b" end')

df = (
    spark.createDataFrame(pdf)
    .withColumn('etnia', etnia_expr)
    .withColumn('treat', F.col('treat').cast('int'))
    .withColumn('dumb_cat', dumb_cat_expr)
    .select('index', 'age', 'educ', 'data_id', 'etnia', 'dumb_cat', 'treat')
)

# 80/20 train/test split with a fixed seed.
dfs_train, dfs_test = df.randomSplit([0.8, 0.2], seed=4)
dfs_test.show(5)

[Stage 0:>                                                          (0 + 1) / 1]

+-----+----+----+--------------------+-----+--------+-----+
|index| age|educ|             data_id|etnia|dumb_cat|treat|
+-----+----+----+--------------------+-----+--------+-----+
|    0|37.0|11.0|Dehejia-Wahba Sample|black|       b|    1|
|    3|48.0| 6.0|                CPS1| marr|       b|    0|
|    7|18.0|11.0|                CPS1|other|       b|    0|
|   10|19.0| 9.0|Dehejia-Wahba Sample|black|       b|    1|
|   12|18.0| 8.0|Dehejia-Wahba Sample|black|       a|    1|
+-----+----+----+--------------------+-----+--------+-----+
only showing top 5 rows



                                                                                

## Feature Selection and Encoding 1

Suppose we want to predict the `treat` column using all the other columns except `index`. We will call the `feature_selection_with_iv()` function to select only the features with an information value (IV) greater than or equal to 0.3. We will also encode the categorical features with their weight of evidence (WOE) instead of using one-hot encoding.

By construction, we know that the column `dumb_cat` should not be taken into account, since it adds no information.

In [4]:
# Select features on the training split with an IV floor of 0.3 and request
# WOE encoding for the categorical features.
# NOTE(review): `bucket_fraction` presumably controls how numeric features are
# bucketed before computing IV — confirm against the library documentation.
selection_params = dict(
    dfs=dfs_train,
    col_target='treat',
    cat_features=['data_id', 'etnia', 'dumb_cat'],
    num_features=['age', 'educ'],
    floor_iv=0.3,
    bucket_fraction=0.1,
    categorical_as_woe=True,
)
result = feature_selection_with_iv(**selection_params)

                                                                                

This function call returns a dictionary with the following keys:

In [5]:
# Display the keys of the dictionary returned by feature_selection_with_iv().
result.keys()

dict_keys(['dfs_woe', 'dfs_iv', 'stages_features_vector'])

`result['dfs_woe']` has the weight of evidence and information value for every value of each feature.

In [6]:
# Preview the first rows of the per-feature-value WOE / IV table.
dfs_woe = result['dfs_woe']
dfs_woe.show(5)

+----------+-------------+--------------------+--------------------+
|   feature|feature_value|                 woe|                  iv|
+----------+-------------+--------------------+--------------------+
|age_bucket|          0.0| -0.5972201927175608| 0.03755893994515182|
|age_bucket|          5.0|  0.4674214381624044|0.019477216762649557|
|age_bucket|          1.0|-0.42042489967372465|0.020182294438309992|
|age_bucket|          6.0|  0.9472248535629343| 0.05228097899890407|
|age_bucket|          9.0|                 0.0|                 0.0|
+----------+-------------+--------------------+--------------------+
only showing top 5 rows



`result['dfs_iv']` has the information value of each feature.

In [7]:
# The dictionary key is `dfs_iv` (as listed by `result.keys()`); `df_iv` raises
# a KeyError. `.show()` prints the per-feature IV table, whereas `.head()` on a
# Spark DataFrame would return Row objects instead of a table.
result['dfs_iv'].show()

+-----------+------------------+
|    feature|                iv|
+-----------+------------------+
|    data_id| 4.081508165850559|
|      etnia| 4.076126036555841|
|educ_bucket|1.0052276924505816|
| age_bucket|0.4417967451554041|
|   dumb_cat|0.2845468150802483|
+-----------+------------------+



And finally, `result['stages_features_vector']` has a list of transformers and estimators that create a features vector column with the selected variables and the categorical encoding, based on the `floor_iv` and `categorical_as_woe` params.

In [8]:
# List the pipeline stages that build the features vector.
feature_stages = result['stages_features_vector']
feature_stages

[WeightOfEvidenceComputer_8752fb7b720b,
 VectorAssembler_b64f31fe701a,
 VectorAssembler_c70d931586a2]

In [9]:
# Fit the feature stages on the training split, then apply them to the test
# split and preview the transformed rows as a pandas frame.
feature_stages = result['stages_features_vector']
pipeline = Pipeline(stages=feature_stages)
pipeline_fitted = pipeline.fit(dfs_train)

dfs_test_features = pipeline_fitted.transform(dfs_test)
dfs_test_features.toPandas().head()

Unnamed: 0,index,age,educ,data_id,etnia,dumb_cat,treat,data_id_woe,etnia_woe,num,features
0,0,37.0,11.0,Dehejia-Wahba Sample,black,b,1,-4.090093,-2.343053,"[37.0, 11.0, -4.090093113364742, -2.3430534792...","[37.0, 11.0, -4.090093113364742, -2.3430534792..."
1,3,48.0,6.0,CPS1,marr,b,0,0.0,3.234625,"[48.0, 6.0, 0.0, 3.234624562237972]","[48.0, 6.0, 0.0, 3.234624562237972]"
2,7,18.0,11.0,CPS1,other,b,0,0.0,1.202206,"[18.0, 11.0, 0.0, 1.2022061808577313]","[18.0, 11.0, 0.0, 1.2022061808577313]"
3,10,19.0,9.0,Dehejia-Wahba Sample,black,b,1,-4.090093,-2.343053,"[19.0, 9.0, -4.090093113364742, -2.34305347926...","[19.0, 9.0, -4.090093113364742, -2.34305347926..."
4,12,18.0,8.0,Dehejia-Wahba Sample,black,a,1,-4.090093,-2.343053,"[18.0, 8.0, -4.090093113364742, -2.34305347926...","[18.0, 8.0, -4.090093113364742, -2.34305347926..."


As expected, we only have WOE columns for `data_id` and `etnia`.

## Feature Selection and Encoding 2

If we just change the param `categorical_as_woe` we will have the same result but with the selected categorical features being encoded with one hot encoding, which is the default behavior of `pyspark_ds_toolbox.ml.data_prep.features_vector.get_features_vector()`.

In [13]:
# Same selection as before (IV floor of 0.3 on the training split), but with
# `categorical_as_woe=False`.
# NOTE(review): `bucket_fraction` presumably controls how numeric features are
# bucketed before computing IV — confirm against the library documentation.
selection_params = dict(
    dfs=dfs_train,
    col_target='treat',
    cat_features=['data_id', 'etnia', 'dumb_cat'],
    num_features=['age', 'educ'],
    floor_iv=0.3,
    bucket_fraction=0.1,
    categorical_as_woe=False,
)
result = feature_selection_with_iv(**selection_params)

In [14]:
# List the pipeline stages that build the features vector.
feature_stages = result['stages_features_vector']
feature_stages

[StringIndexer_302f336d0c3c,
 StringIndexer_ea57a18d6a3a,
 OneHotEncoder_f93f5ce803c2,
 OneHotEncoder_de4141bdbc59,
 VectorAssembler_5b681f30defe,
 VectorAssembler_c87edb123b6f,
 VectorAssembler_6383377847e0]

In [15]:
# Fit the feature stages on the training split, then apply them to the test
# split and preview the transformed rows as a pandas frame.
feature_stages = result['stages_features_vector']
pipeline = Pipeline(stages=feature_stages)
pipeline_fitted = pipeline.fit(dfs_train)

dfs_test_features = pipeline_fitted.transform(dfs_test)
dfs_test_features.toPandas().head()

                                                                                

Unnamed: 0,index,age,educ,data_id,etnia,dumb_cat,treat,data_id_indexed,etnia_indexed,data_id_indexed_encoded,etnia_indexed_encoded,cat,num,features
0,0,37.0,11.0,Dehejia-Wahba Sample,black,b,1,1.0,2.0,(0.0),"(0.0, 0.0, 1.0)","(0.0, 0.0, 0.0, 1.0)","[37.0, 11.0]","[37.0, 11.0, 0.0, 0.0, 0.0, 1.0]"
1,3,48.0,6.0,CPS1,marr,b,0,0.0,0.0,(1.0),"(1.0, 0.0, 0.0)","[1.0, 1.0, 0.0, 0.0]","[48.0, 6.0]","[48.0, 6.0, 1.0, 1.0, 0.0, 0.0]"
2,7,18.0,11.0,CPS1,other,b,0,0.0,1.0,(1.0),"(0.0, 1.0, 0.0)","[1.0, 0.0, 1.0, 0.0]","[18.0, 11.0]","[18.0, 11.0, 1.0, 0.0, 1.0, 0.0]"
3,10,19.0,9.0,Dehejia-Wahba Sample,black,b,1,1.0,2.0,(0.0),"(0.0, 0.0, 1.0)","(0.0, 0.0, 0.0, 1.0)","[19.0, 9.0]","[19.0, 9.0, 0.0, 0.0, 0.0, 1.0]"
4,12,18.0,8.0,Dehejia-Wahba Sample,black,a,1,1.0,2.0,(0.0),"(0.0, 0.0, 1.0)","(0.0, 0.0, 0.0, 1.0)","[18.0, 8.0]","[18.0, 8.0, 0.0, 0.0, 0.0, 1.0]"
