# Feature Selection Using Feature Importance Score - Creating a PySpark Estimator

The code was coming from [here](https://www.timlrx.com/2018/06/19/feature-selection-using-feature-importance-score-creating-a-pyspark-estimator/).

The variable importance can be used in other analysis.

In [1]:
import findspark
findspark.init()
from pyspark import SparkContext
from pyspark import SQLContext

In [2]:
import numpy as np
import pandas as pd
pd.options.display.max_columns = None

In [3]:
from pyspark.sql.functions import *
from pyspark.ml.classification import  RandomForestClassifier
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorAssembler, VectorSlicer
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

In [4]:
sc = SparkContext()
spark = SQLContext(sc)

In [5]:
## data
## where: https://archive.ics.uci.edu/ml/machine-learning-databases/00222/

data_file = './data/bank-additional/bank-additional-full.csv'

In [11]:
df = spark.read.option("delimiter", ";").csv(data_file,header = True, inferSchema = True)

In [12]:
df.count()

41188

In [14]:
df.dtypes

[('age', 'int'),
 ('job', 'string'),
 ('marital', 'string'),
 ('education', 'string'),
 ('default', 'string'),
 ('housing', 'string'),
 ('loan', 'string'),
 ('contact', 'string'),
 ('month', 'string'),
 ('day_of_week', 'string'),
 ('duration', 'int'),
 ('campaign', 'int'),
 ('pdays', 'int'),
 ('previous', 'int'),
 ('poutcome', 'string'),
 ('emp.var.rate', 'double'),
 ('cons.price.idx', 'double'),
 ('cons.conf.idx', 'double'),
 ('euribor3m', 'double'),
 ('nr.employed', 'double'),
 ('y', 'string')]

In [15]:
## rename the column names
df = df.toDF(*(c.replace('.', '_') for c in df.columns))

In [16]:
df.limit(5).toPandas()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [17]:
df.describe().toPandas()

Unnamed: 0,summary,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y
0,count,41188.0,41188,41188,41188,41188,41188,41188,41188,41188,41188,41188.0,41188.0,41188.0,41188.0,41188,41188.0,41188.0,41188.0,41188.0,41188.0,41188
1,mean,40.02406040594348,,,,,,,,,,258.2850101971448,2.567592502670681,962.4754540157328,0.1729629989317276,,0.0818855006317839,93.57566436828918,-40.50260027191787,3.6212908128585366,5167.035910944004,
2,stddev,10.421249980934055,,,,,,,,,,259.27924883646455,2.770013542902331,186.9109073447414,0.4949010798392892,,1.57095974051703,0.5788400489541355,4.628197856174595,1.7344474048512557,72.25152766825924,
3,min,17.0,admin.,divorced,basic.4y,no,no,no,cellular,apr,fri,0.0,1.0,0.0,0.0,failure,-3.4,92.201,-50.8,0.634,4963.6,no
4,max,98.0,unknown,unknown,unknown,yes,yes,yes,telephone,sep,wed,4918.0,56.0,999.0,7.0,success,1.4,94.767,-26.9,5.045,5228.1,yes


#### how many categories in each string variables?

In [23]:
for i in df.dtypes:
    if i[1]=='string':
        df.groupby(i[0]).count().orderBy('count', ascending=False).toPandas()

### Data preprocessing

In [24]:
# one hot encoding and assembling
encoding_var = [i[0] for i in df.dtypes if (i[1]=='string') & (i[0]!='y')]
num_var = [i[0] for i in df.dtypes if ((i[1]=='int') | (i[1]=='double')) & (i[0]!='y')]

'''from string to interger'''
string_indexes = [StringIndexer(inputCol = c, outputCol = 'IDX_' + c, handleInvalid = 'keep') for c in encoding_var]
'''from interger to binary vectors'''
onehot_indexes = [OneHotEncoderEstimator(inputCols = ['IDX_' + c], outputCols = ['OHE_' + c]) for c in encoding_var]
label_indexes = StringIndexer(inputCol = 'y', outputCol = 'label', handleInvalid = 'keep')


In [28]:
## The input for the model should be binary vectors
assembler = VectorAssembler(inputCols = num_var + ['OHE_' + c for c in encoding_var], outputCol = "features")
rf = RandomForestClassifier(labelCol="label", featuresCol="features", seed = 8464,
                            numTrees=10, cacheNodeIds = True, subsamplingRate = 0.7)

pipe = Pipeline(stages = string_indexes + onehot_indexes + [assembler, label_indexes, rf])

In [29]:
mod = pipe.fit(df)

In [30]:
df2 = mod.transform(df)

In [31]:
mod.stages[-1].featureImportances

SparseVector(63, {0: 0.0259, 1: 0.1492, 2: 0.0044, 3: 0.2999, 4: 0.0346, 5: 0.0074, 6: 0.0809, 7: 0.0278, 8: 0.1946, 9: 0.0373, 10: 0.0002, 13: 0.0001, 16: 0.0002, 18: 0.0005, 19: 0.0003, 20: 0.0002, 22: 0.0006, 24: 0.0, 27: 0.0008, 29: 0.0005, 34: 0.0002, 35: 0.0001, 38: 0.0003, 39: 0.0003, 40: 0.0003, 42: 0.0005, 43: 0.0286, 44: 0.0175, 45: 0.0022, 46: 0.0007, 47: 0.0016, 48: 0.0064, 50: 0.001, 51: 0.0178, 52: 0.012, 53: 0.0051, 54: 0.0037, 55: 0.0005, 56: 0.0004, 58: 0.0008, 59: 0.0004, 60: 0.0145, 61: 0.0047, 62: 0.0149})

In [32]:
def ExtractFeatureImp(featureImp, dataset, featuresCol):
    list_extract = []
    for i in dataset.schema[featuresCol].metadata["ml_attr"]["attrs"]:
        list_extract = list_extract + dataset.schema[featuresCol].metadata["ml_attr"]["attrs"][i]
    varlist = pd.DataFrame(list_extract)
    varlist['score'] = varlist['idx'].apply(lambda x: featureImp[x])
    return(varlist.sort_values('score', ascending = False))

In [38]:
ExtractFeatureImp(mod.stages[-1].featureImportances, df2, "features").head(10)


Unnamed: 0,idx,name,score
3,3,pdays,0.2999
8,8,euribor3m,0.194558
1,1,duration,0.149237
6,6,cons_price_idx,0.080941
9,9,nr_employed,0.037304
4,4,previous,0.034562
43,43,OHE_contact_cellular,0.028582
7,7,cons_conf_idx,0.027836
0,0,age,0.025932
51,51,OHE_month_oct,0.017762


### re-ran the model using top-10 variables

In [40]:
varlist = ExtractFeatureImp(mod.stages[-1].featureImportances, df2, "features")

In [41]:
varidx = [x for x in varlist['idx'][0:10]]

### only used these 10 variables to build the model

In [42]:
slicer = VectorSlicer(inputCol="features", outputCol="features2", indices=varidx)
df3 = slicer.transform(df2)

In [48]:
df3 = df3.drop('rawPrediction', 'probability', 'prediction')
rf2 = RandomForestClassifier(labelCol="label", featuresCol="features2", seed = 8464,
                            numTrees=10, cacheNodeIds = True, subsamplingRate = 0.7)
mod2 = rf2.fit(df3)
df4 = mod2.transform(df3)

In [49]:
from FeatureImportanceSelector import ExtractFeatureImp, FeatureImpSelector

In [50]:
# one hot encoding and assembling
encoding_var = [i[0] for i in df.dtypes if (i[1]=='string') & (i[0]!='y')]
num_var = [i[0] for i in df.dtypes if ((i[1]=='int') | (i[1]=='double')) & (i[0]!='y')]

string_indexes = [StringIndexer(inputCol = c, outputCol = 'IDX_' + c, handleInvalid = 'keep') for c in encoding_var]
onehot_indexes = [OneHotEncoderEstimator(inputCols = ['IDX_' + c], outputCols = ['OHE_' + c]) for c in encoding_var]
label_indexes = StringIndexer(inputCol = 'y', outputCol = 'label', handleInvalid = 'keep')
assembler = VectorAssembler(inputCols = num_var + ['OHE_' + c for c in encoding_var], outputCol = "features")

rf = RandomForestClassifier(labelCol="label", featuresCol="features", seed = 8464,
                            numTrees=10, cacheNodeIds = True, subsamplingRate = 0.7)
fis = FeatureImpSelector(estimator = rf, selectorType = "numTopFeatures",
                         numTopFeatures = 10, outputCol = "features_subset")
rf2 = RandomForestClassifier(labelCol="label", featuresCol="features_subset", seed = 8464,
                            numTrees=10, cacheNodeIds = True, subsamplingRate = 0.7)

pipe = Pipeline(stages = string_indexes + onehot_indexes + [assembler, label_indexes, fis, rf2])

In [51]:
pipeline_mod = pipe.fit(df)

In [52]:
df2 = pipeline_mod.transform(df)

In [53]:
ExtractFeatureImp(mod.stages[-1].featureImportances, df2, "features_subset")

Unnamed: 0,idx,name,score
3,3,cons_price_idx,0.2999
7,8,age,0.194558
1,1,euribor3m,0.149237
8,6,OHE_contact_cellular,0.080941
9,9,OHE_month_oct,0.037304
4,4,nr_employed,0.034562
6,7,cons_conf_idx,0.027836
0,0,pdays,0.025932
5,5,previous,0.007404
2,2,duration,0.00436
