# Information Value as Feature Selection for Binary Classification Problems


## Session Setup

In [187]:
import pandas as pd
from pyspark.sql import SparkSession, functions as F


from pyspark_ds_toolbox.ml.data_prep.features_vector import get_features_vector

In [2]:
# Build (or reuse) a local, single-core Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName('Spark-Toolbox')
    .master('local[1]')
    .config('spark.executor.memory', '3G')
    .config('spark.driver.memory', '3G')
    .config('spark.memory.offHeap.enabled', 'true')
    .config('spark.memory.offHeap.size', '3G')
    .getOrCreate()
)

# Keep the driver log quiet: only errors are reported.
spark.sparkContext.setLogLevel("ERROR")
# Enable Arrow for pandas <-> Spark conversions. Spark 3.0 renamed this setting;
# the former key 'spark.sql.execution.arrow.enabled' is deprecated.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/29 14:18:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Loading the base dataset

In [137]:
def read_data(file):
    """Read one Stata file from the mixtape GitHub repository with pandas.

    Args:
        file: File name appended to the repository's raw master-branch URL,
            e.g. ``'nsw_mixtape.dta'``.

    Returns:
        The object returned by ``pd.read_stata`` for the resulting URL.
    """
    base_url = "https://raw.github.com/scunning1975/mixtape/master/"
    url = base_url + file
    return pd.read_stata(url)

# Download the two mixtape files and stack them in pandas.
pdf_nsw = read_data('nsw_mixtape.dta')
pdf_cps = read_data('cps_mixtape.dta')
# reset_index(level=0) turns each file's own row index into an 'index' column.
# The concatenation does not renumber rows, so the same 'index' value can appear
# once per source file (the displayed output shows repeated values).
pdf_all = pd.concat((pdf_nsw, pdf_cps)).reset_index(level=0)

# Derived columns: a single ethnicity label, an integer target and a dummy categorical.
etnia_col = F.expr('case when black=1 then "black" when hisp=1 then "hisp" when marr=1 then "marr" else "other" end')
treat_col = F.col('treat').cast('int')
dumb_cat_col = F.expr('case when index > 10 then "a" else "b" end')

df = (
    spark.createDataFrame(pdf_all)
    .withColumn('etnia', etnia_col)
    .withColumn('treat', treat_col)
    .withColumn('dumb_cat', dumb_cat_col)
    .select('index', 'age', 'educ', 'data_id', 'etnia','dumb_cat', 'treat')
)

# Seeded 80/20 split into train and test.
dfs_train, dfs_test = df.randomSplit([0.8, 0.2], seed=4)
dfs_test.show(5)

+-----+----+----+--------------------+-----+--------+-----+
|index| age|educ|             data_id|etnia|dumb_cat|treat|
+-----+----+----+--------------------+-----+--------+-----+
|    0|37.0|11.0|Dehejia-Wahba Sample|black|       b|    1|
|    3|48.0| 6.0|                CPS1| marr|       b|    0|
|    7|18.0|11.0|                CPS1|other|       b|    0|
|   10|19.0| 9.0|Dehejia-Wahba Sample|black|       b|    1|
|   12|18.0| 8.0|Dehejia-Wahba Sample|black|       a|    1|
+-----+----+----+--------------------+-----+--------+-----+
only showing top 5 rows



In [138]:
# Candidate columns, split by type; passed to feature_selection_iv further down.
num_features = ['age', 'educ']
cat_features = ['data_id', 'etnia', 'dumb_cat']

In [139]:
from pyspark.ml import Pipeline

+-----+----+----+--------------------+-----+--------+-----+---------+----------+
|index| age|educ|             data_id|etnia|dumb_cat|treat|age_quant|educ_quant|
+-----+----+----+--------------------+-----+--------+-----+---------+----------+
|    0|45.0|11.0|                CPS1| marr|       b|    0|      8.0|       3.0|
|    1|21.0|14.0|                CPS1|other|       b|    0|      1.0|       6.0|
|    1|22.0| 9.0|Dehejia-Wahba Sample| hisp|       b|    1|      2.0|       1.0|
|    2|30.0|12.0|Dehejia-Wahba Sample|black|       b|    1|      4.0|       4.0|
|    2|38.0|12.0|                CPS1| marr|       b|    0|      6.0|       4.0|
|    3|27.0|11.0|Dehejia-Wahba Sample|black|       b|    1|      3.0|       3.0|
|    4|18.0| 8.0|                CPS1| marr|       b|    0|      0.0|       1.0|
|    4|33.0| 8.0|Dehejia-Wahba Sample|black|       b|    1|      5.0|       1.0|
|    5|22.0| 9.0|Dehejia-Wahba Sample|black|       b|    1|      2.0|       1.0|
|    5|22.0|11.0|           

In [175]:
def calculate_woe_iv(dfs, feature, target):
    """Compute Weight of Evidence (WoE) and Information Value (IV) for one feature.

    For every distinct value of ``feature`` the share of class-0 rows and the share
    of class-1 rows are computed from a crosstab, then::

        woe = ln(share_0 / share_1)
        iv  = woe * (share_0 - share_1)

    Args:
        dfs: Spark DataFrame containing ``feature`` and ``target``.
        feature: Name of the (categorical or already bucketed) feature column.
        target: Name of the binary target column. Its values must render as the
            strings ``'0'`` and ``'1'`` in the crosstab header, i.e. an integer
            0/1 column.

    Returns:
        A tuple ``(woe_table, iv)`` where ``woe_table`` is a Spark DataFrame with
        columns ``feature``, ``feature_value``, ``woe``, ``iv`` and ``iv`` is the
        sum of the per-value ``iv`` column.

    Raises:
        ValueError: If the crosstab does not have exactly the class columns
            ``'0'`` and ``'1'`` (non-integer target, a single class, or more than
            two classes).

    Note:
        Feature values with zero rows in one of the classes are not special-cased.
        Their ratio/log is undefined; the tables shown later in this notebook
        suggest they come out as null (displayed as 0.0 after ``fillna``).
    """
    cross = (
        dfs
        .crosstab(feature, target)
        .withColumnRenamed(f'{feature}_{target}', 'feature_value')
    )

    # The crosstab header holds the stringified target values; anything other than
    # exactly '0' and '1' means the target is not a proper integer 0/1 column.
    class_columns = set(cross.columns) - {'feature_value'}
    if class_columns != {'0', '1'}:
        raise ValueError(
            f"target column '{target}' must be an integer column with exactly the "
            f"values 0 and 1; found classes {sorted(class_columns)}"
        )

    # Totals per class, fetched with a single Spark action.
    totals = cross.agg(F.sum('0'), F.sum('1')).first()
    sum_0, sum_1 = totals[0], totals[1]

    cross = (
        cross
        .withColumn('0', F.col('0')/sum_0)
        .withColumn('1', F.col('1')/sum_1)
        .withColumn('woe', F.log(F.col('0')/F.col('1')))
        .withColumn('iv', F.col('woe')*(F.col('0') - F.col('1')))
        .withColumn('feature', F.lit(feature))
    )

    iv = cross.selectExpr('sum(iv) as iv').collect()[0][0]

    return cross.select('feature', 'feature_value', 'woe', 'iv'), iv


In [176]:
# a: per-value WoE/IV table for 'dumb_cat'; b: the summed IV of that feature.
a, b = calculate_woe_iv(dfs=dfs_test,feature='dumb_cat',target='treat')

In [177]:
a.show()

+--------+-------------+-------------------+--------------------+
| feature|feature_value|                woe|                  iv|
+--------+-------------+-------------------+--------------------+
|dumb_cat|            a|0.04818133614453751|0.002265024120408...|
|dumb_cat|            b| -4.359756680313845| 0.20495392677358656|
+--------+-------------+-------------------+--------------------+



In [178]:
b

0.20721895089399464

In [210]:
from pyspark.sql.types import StructType, StructField, StringType, FloatType
from pyspark.ml.feature import QuantileDiscretizer


def feature_selection_iv(dfs, target, num_features, cat_features, floor_iv=0.3, floor_bucket_percent=0.1, categorical_as_woe=False):
    """Select features by Information Value and build the features-vector stages.

    Numeric features are first discretised into quantile buckets so that an IV can
    be computed for them; categorical features are used as they are. Every feature
    whose IV is at least ``floor_iv`` is kept and handed to ``get_features_vector``.

    Args:
        dfs: Spark DataFrame with the features and the target.
        target: Name of the binary (integer 0/1) target column.
        num_features: List of numeric column names, or None.
        cat_features: List of categorical column names, or None.
        floor_iv: Minimum IV a feature needs in order to be kept.
        floor_bucket_percent: Approximate minimum share of rows per quantile
            bucket; the number of buckets is ``floor(1 / floor_bucket_percent)``.
            Must be in ``(0, 0.5]`` so that at least two buckets are produced.
        categorical_as_woe: If True, kept categorical features are referenced as
            ``<name>_woe`` and passed together with the numeric ones as numeric
            inputs. NOTE(review): this function does not create those ``_woe``
            columns; they must exist in the data the returned stages are applied to.

    Returns:
        The value returned by ``get_features_vector`` for the selected features.

    Raises:
        ValueError: If no feature is given, or ``floor_bucket_percent`` is outside
            ``(0, 0.5]`` while numeric features are present.
    """
    num_features = list(num_features or [])
    cat_features = list(cat_features or [])
    if not num_features and not cat_features:
        raise ValueError('At least one of num_features or cat_features must be provided.')

    # Maps each bucketed column back to the numeric column it was derived from.
    # Using an explicit mapping avoids recovering the name by slicing off a suffix.
    bucket_to_source = {}
    if num_features:
        if not 0 < floor_bucket_percent <= 0.5:
            raise ValueError('floor_bucket_percent must be in the interval (0, 0.5].')

        # numBuckets has to be an integer; the small epsilon guards against
        # floating-point results such as 9.999999 for an exact ratio.
        n_buckets = int(1 / floor_bucket_percent + 1e-9)

        bucket_to_source = {f'{name}_bucket': name for name in num_features}
        qt = QuantileDiscretizer(
            inputCols=num_features,
            outputCols=list(bucket_to_source),
            numBuckets=n_buckets
        )
        dfs = qt.fit(dfs).transform(dfs)

    candidates = list(bucket_to_source) + cat_features

    # Keep, in the original order, every candidate whose IV reaches the floor.
    cols_to_keep = []
    for candidate in candidates:
        _, iv = calculate_woe_iv(dfs=dfs, feature=candidate, target=target)
        if iv is not None and iv >= floor_iv:
            cols_to_keep.append(candidate)

    if categorical_as_woe:
        selected_features = [bucket_to_source.get(c, c + '_woe') for c in cols_to_keep]

        stages_features_vector = get_features_vector(num_features=selected_features)
    else:
        num_selected_features = [bucket_to_source[c] for c in cols_to_keep if c in bucket_to_source]
        cat_selected_features = [c for c in cols_to_keep if c not in bucket_to_source]

        stages_features_vector = get_features_vector(num_features=num_selected_features, cat_features=cat_selected_features)

    return stages_features_vector

In [215]:
# Screen the candidate columns by IV on the test split; the result is whatever get_features_vector returns.
feature_selection_iv(dfs=dfs_test, target='treat', num_features=num_features, cat_features=cat_features, categorical_as_woe=False)

[StringIndexer_4981e2a02208,
 StringIndexer_d3e376f831d1,
 OneHotEncoder_1c0e8643899c,
 OneHotEncoder_8e196dce3c95,
 VectorAssembler_185b4ea0b1f9,
 VectorAssembler_6836010161ca,
 VectorAssembler_aced8b1fc5ba]

In [189]:
prep_stages

[QuantileDiscretizer_88984347f251,
 VectorAssembler_ef92014b7edf,
 VectorAssembler_89b80508a460]

In [184]:
dfs_woe.show()

+----------+--------------------+--------------------+--------------------+
|   feature|       feature_value|                 woe|                  iv|
+----------+--------------------+--------------------+--------------------+
| age_quant|                 0.0| -0.5972201927175608| 0.03755893994515182|
| age_quant|                 5.0|  0.4674214381624044|0.019477216762649557|
| age_quant|                 1.0|-0.42042489967372465|0.020182294438309992|
| age_quant|                 6.0|  0.9472248535629343| 0.05228097899890407|
| age_quant|                 9.0|                 0.0|                 0.0|
| age_quant|                 2.0| -0.5239091623421057|0.034363226987371216|
| age_quant|                 7.0|  0.5402080717439384| 0.02164954344664573|
| age_quant|                 3.0| -0.8934823718292055|  0.1402662918441354|
| age_quant|                 8.0|  1.3837121700365596|  0.1157164450269391|
| age_quant|                 4.0| 0.05293162458232201|3.018077052971832...|
|educ_quant|

In [185]:
dfs_iv.show()

+----------+------------------+
|   feature|                iv|
+----------+------------------+
| age_quant|0.4417967451554041|
|educ_quant|1.0052276924505816|
|   data_id| 4.081508165850559|
|     etnia| 4.076126036555841|
|  dumb_cat|0.2845468150802483|
+----------+------------------+



In [190]:
cols_to_keep

['age_quant', 'educ_quant', 'data_id', 'etnia']

In [197]:
# categorical as woe True
[s[:-6] if s.endswith('_quant') else s+'_woe' for s in cols_to_keep]

['age', 'educ', 'data_id_woe', 'etnia_woe']

In [203]:
# Scratch check of the categorical_as_woe=False name handling.
# NOTE(review): `cols_to_keep` is not assigned at top level in any visible cell (only inside
# feature_selection_iv) — presumably left over from an earlier kernel state; confirm before re-running.
# Here the bucketed columns end in '_quant' (6 characters), so s[:-6] removes exactly that suffix.
num_selectec_features = list(filter(None, [s[:-6] if s.endswith('_quant') else None for s in cols_to_keep]))

cat_selectec_features = list(filter(None, [None if s.endswith('_quant') else s for s in cols_to_keep]))

print(num_selectec_features)
print(cat_selectec_features)

['age', 'educ']
['data_id', 'etnia']


In [120]:
dfs_test.show(5)

+-----+--------------------+-----+--------+-----+
|index|             data_id|etnia|dumb_cat|treat|
+-----+--------------------+-----+--------+-----+
|    0|                CPS1| marr|       b|    0|
|    3|Dehejia-Wahba Sample|black|       b|    1|
|    7|                CPS1|other|       b|    0|
|   10|                CPS1| marr|       b|    0|
|   12|                CPS1| marr|       a|    0|
+-----+--------------------+-----+--------+-----+
only showing top 5 rows



In [122]:
dfs_woe.show()

+--------+--------------------+--------------------+--------------------+
| feature|       feature_value|                 woe|                  iv|
+--------+--------------------+--------------------+--------------------+
| data_id|Dehejia-Wahba Sample| -4.0928282617041365|   4.024510946258392|
| data_id|                CPS1|                 0.0|                 0.0|
|   etnia|               other|  0.4487629426410833|0.038510203869811445|
|   etnia|               black|  -2.254043853831219|  1.6506230367881973|
|   etnia|                marr|   2.994244065051656|  1.7212600705769427|
|   etnia|                hisp|                 0.0|                 0.0|
|dumb_cat|                   a|0.029860773527106543|8.776845743563007E-4|
|dumb_cat|                   b| -3.5050415968020174| 0.10302214506257643|
+--------+--------------------+--------------------+--------------------+



In [None]:
# NOTE(review): unexecuted cell. `df` as built in this notebook has no columns "A", "B", "C" or "D";
# this looks like a pasted groupBy/pivot example rather than part of the analysis.
df.groupBy("A", "B").pivot("C").sum("D")

In [93]:
dfs_woe.groupBy('feature').agg(F.sum('iv').alias('iv')).show()

+--------+-------------------+
| feature|                 iv|
+--------+-------------------+
| data_id|  4.024510946258392|
|   etnia| 3.4103933112349516|
|dumb_cat|0.10389982963693273|
+--------+-------------------+



+-----------+----+---+
|etnia_treat|   0|  1|
+-----------+----+---+
|      other| 782|  5|
|      black| 283| 27|
|       marr|1994|  1|
|       hisp| 236|  0|
+-----------+----+---+



In [15]:
dfs_test.select(target).distinct().count()

2

## Fitting the Baseline Classifiers

In [4]:
# Fit the baseline classifiers on the train split and evaluate them on the test split.
# NOTE(review): `baseline_binary_classfiers` is not imported in any visible cell — confirm its origin.
# NOTE(review): the data-prep cell selects only index, age, educ, data_id, etnia, dumb_cat and treat,
# so most of the num_features listed here (nodegree, re74, re75, ...) are not present in
# dfs_train/dfs_test as built in this notebook — confirm which data this cell was run against.
base_line_out = baseline_binary_classfiers(
    dfs=dfs_train,
    id_col='index',
    target_col='treat',
    num_features=['age', 'educ', 'nodegree', 're74', 're75', 're78', 'age2', 'age3', 'educ2', 'educ_re74', 'u74', 'u75'],
    cat_features=['data_id', 'etnia'],
    dfs_test=dfs_test,
    weight_on_target=True,
    log_mlflow_run=False,
    artifact_stage_path = None
)

Computing Features Vector


                                                                                

Computing Class Weights


                                                                                

Instanciating Classifiers


                                                                                

Predicting on Test Data and Evaluating


100%|██████████| 4/4 [00:19<00:00,  4.95s/it]                                   


In [6]:
base_line_out.keys()

dict_keys(['LogisticRegression', 'DecisionTreeClassifier', 'RandomForestClassifier', 'GBTClassifier'])

In [None]:
base_line_out['LogisticRegression'].keys()

dict_keys(['model', 'metrics', 'decile_metrics'])

In [7]:
base_line_out['LogisticRegression']['metrics']

{'confusion_matrix':    treat  prediction  count
 0    0.0         0.0   3240
 1    1.0         1.0     33
 2    0.0         1.0     55,
 'accuracy': 0.9834735576923077,
 'f1': 0.5454545454545454,
 'precision': 0.375,
 'recall': 1.0,
 'aucroc': 0.9916540212443096,
 'aucpr': 0.375}