# Extract Feature Importance From Spark Models

## Session Setup

In [1]:
import pandas as pd
from pyspark.sql import SparkSession, functions as F
import pyspark.ml.classification as spark_cl
from pyspark.ml.regression import LinearRegression

from pyspark_ds_toolbox.ml.data_prep.features_vector import get_features_vector
from pyspark_ds_toolbox.ml.feature_importance.native_spark import extract_features_score



In [2]:
spark = SparkSession.builder\
                .appName('Spark-Toolbox') \
                .master('local[1]') \
                .config('spark.executor.memory', '3G') \
                .config('spark.driver.memory', '3G') \
                .config('spark.memory.offHeap.enabled', 'true') \
                .config('spark.memory.offHeap.size', '3G') \
                .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/01/07 15:15:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/01/07 15:15:30 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


## Reading the Dataset

In [5]:
def read_data(file): 
    return pd.read_stata("https://raw.github.com/scunning1975/mixtape/master/" + file)

df = read_data('nsw_mixtape.dta')
df = pd.concat((df, read_data('cps_mixtape.dta')))
df.reset_index(level=0, inplace=True)

df = spark.createDataFrame(df.drop(columns=['data_id']))\
    .withColumn('age2', F.col('age')**2)\
    .withColumn('age3', F.col('age')**3)\
    .withColumn('educ2', F.col('educ')**2)\
    .withColumn('educ_re74', F.col('educ')*F.col('re74'))\
    .withColumn('u74', F.when(F.col('re74')==0, 1).otherwise(0))\
    .withColumn('u75', F.when(F.col('re75')==0, 1).otherwise(0))\
    .withColumn('etnia', F.expr('case when black=1 then "black" when hisp=1 then "hisp" when marr=1 then "marr" else "other" end'))\
    .drop('black', 'hisp', 'marr')


num_features = ['age', 'educ', 'nodegree', 're74', 're75', 're78', 'age2', 'age3', 'educ2', 'educ_re74', 'u74', 'u75']
cat_features = ['etnia']

df = get_features_vector(df=df, num_features=num_features, cat_features=cat_features)
df.show(5)

+-----+-----+----+----+--------+----+----+-----------------+------+-------+-----+---------+---+---+-----+--------------------+
|index|treat| age|educ|nodegree|re74|re75|             re78|  age2|   age3|educ2|educ_re74|u74|u75|etnia|            features|
+-----+-----+----+----+--------+----+----+-----------------+------+-------+-----+---------+---+---+-----+--------------------+
|    0|  1.0|37.0|11.0|     1.0| 0.0| 0.0|  9930.0458984375|1369.0|50653.0|121.0|      0.0|  1|  1|black|[37.0,11.0,1.0,0....|
|    1|  1.0|22.0| 9.0|     1.0| 0.0| 0.0| 3595.89404296875| 484.0|10648.0| 81.0|      0.0|  1|  1| hisp|[22.0,9.0,1.0,0.0...|
|    2|  1.0|30.0|12.0|     0.0| 0.0| 0.0|   24909.44921875| 900.0|27000.0|144.0|      0.0|  1|  1|black|[30.0,12.0,0.0,0....|
|    3|  1.0|27.0|11.0|     1.0| 0.0| 0.0| 7506.14599609375| 729.0|19683.0|121.0|      0.0|  1|  1|black|[27.0,11.0,1.0,0....|
|    4|  1.0|33.0| 8.0|     1.0| 0.0| 0.0|289.7898864746094|1089.0|35937.0| 64.0|      0.0|  1|  1|black|[33.0,

## Fitting the Different Models

In [9]:
# Gini Score (Tree Based)
gbt = spark_cl.GBTClassifier(labelCol='treat', featuresCol='features').fit(df)
df_fi_gini = extract_features_score(model=gbt, dfs=df, features_col='features')
df_fi_gini

                                                                                

Unnamed: 0,feat_index,feature,delta_gini
0,0,num_age,0.21835
4,4,num_re75,0.205778
12,12,cat_etnia_indexed_encoded,0.193241
1,1,num_educ,0.136403
5,5,num_re78,0.116129
10,10,num_u74,0.070586
3,3,num_re74,0.024795
2,2,num_nodegree,0.017937
9,9,num_educ_re74,0.011091
11,11,num_u75,0.005689


In [7]:
# Odds Ratio (Logistic Regression)
lr = spark_cl.LogisticRegression(labelCol='treat', featuresCol='features').fit(df)
df_fi_odds_ratio = extract_features_score(model=lr, dfs=df, features_col='features')
df_fi_odds_ratio

                                                                                

Unnamed: 0,feat_index,feature,odds_ratio
10,10,num_u74,3.785102
0,0,num_age,3.710186
12,12,cat_etnia_indexed_encoded,2.649162
1,1,num_educ,1.877319
2,2,num_nodegree,1.748738
7,7,num_age3,1.000279
5,5,num_re78,1.000013
9,9,num_educ_re74,1.000005
3,3,num_re74,0.99996
4,4,num_re75,0.999796


In [8]:
# Coefficients (Linear Regression)
linearr = LinearRegression(labelCol='re78', featuresCol='features').fit(df)
df_fi_coef = extract_features_score(model=linearr, dfs=df, features_col='features')
df_fi_coef

                                                                                

Unnamed: 0,feat_index,feature,coefficients
5,5,num_re78,1.0
2,2,num_nodegree,2.191636e-13
11,11,num_u75,1.662792e-13
12,12,cat_etnia_indexed_encoded,3.73119e-14
0,0,num_age,1.082657e-14
6,6,num_age2,1.492211e-15
9,9,num_educ_re74,2.518182e-18
7,7,num_age3,-1.8940460000000003e-17
3,3,num_re74,-2.6085670000000002e-17
4,4,num_re75,-1.187336e-16
