# Evaluation of Classification Models


## Session Setup

In [1]:
import pandas as pd
from pyspark.sql import SparkSession, functions as F

from pyspark_ds_toolbox.ml.classification import eval as cl_ev 



In [2]:
# Build (or reuse) a local, single-threaded SparkSession for this walkthrough.
# Parentheses replace the backslash continuations, which break silently if a
# trailing space sneaks in after the backslash.
spark = (
    SparkSession.builder
    .appName('Spark-Toolbox')
    .master('local[1]')
    .config('spark.executor.memory', '3G')
    .config('spark.driver.memory', '3G')
    .config('spark.memory.offHeap.enabled', 'true')
    .config('spark.memory.offHeap.size', '3G')
    .getOrCreate()
)

# Keep cell outputs readable: from here on only ERROR-level messages are logged.
spark.sparkContext.setLogLevel("ERROR")

# Enable Arrow for pandas <-> Spark conversions (createDataFrame / toPandas).
# The former key 'spark.sql.execution.arrow.enabled' is deprecated since
# Spark 3.0; this is its replacement.
# NOTE(review): assumes Spark >= 3.0 -- on Spark 2.x the old key is still needed.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/01/18 15:53:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/01/18 15:53:05 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/01/18 15:53:05 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


## Binary Classifiers

### `binary_classificator_evaluator()`

In [15]:
# Ten hand-made label/prediction pairs: small enough to check every metric
# below by hand.
toy_target = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]
toy_predicted = [1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]

# reset_index() turns the pandas row index into an explicit `index` column.
toy_pdf = pd.DataFrame({'target': toy_target, 'predicted': toy_predicted}).reset_index()

df = spark.createDataFrame(toy_pdf)
df.show(5)

+-----+------+---------+
|index|target|predicted|
+-----+------+---------+
|    0|   1.0|      1.0|
|    1|   1.0|      1.0|
|    2|   1.0|      0.0|
|    3|   1.0|      0.0|
|    4|   1.0|      0.0|
+-----+------+---------+
only showing top 5 rows



In [6]:
# Evaluate the hard 0/1 predictions against the labels. The result is a dict
# with one entry per metric; its keys are displayed as this cell's output.
out = cl_ev.binary_classificator_evaluator(
    dfs_prediction=df, col_target='target', col_prediction='predicted',
)
out.keys()

dict_keys(['confusion_matrix', 'accuracy', 'f1', 'precision', 'recall', 'aucroc', 'aucpr'])

In [7]:
# Row counts per (target, predicted) pair. Taking 1.0 as the positive class,
# the table below reads: 2 TP, 1 FP, 4 FN, 3 TN.
out['confusion_matrix']

Unnamed: 0,target,predicted,count
0,1.0,1.0,2
1,0.0,1.0,1
2,1.0,0.0,4
3,0.0,0.0,3


In [8]:
# Share of correct predictions: (TP + TN) / total = (2 + 3) / 10.
out['accuracy']

0.5

In [9]:
# Harmonic mean of precision (2/3) and recall (1/3):
# 2 * (2/3 * 1/3) / (2/3 + 1/3) = 4/9.
out['f1']

0.4444444444444444

In [10]:
# Of the rows predicted 1.0, the share that really are 1.0: TP / (TP + FP) = 2 / 3.
out['precision']

0.6666666666666666

In [11]:
# Of the rows whose target is 1.0, the share predicted 1.0: TP / (TP + FN) = 2 / 6.
out['recall']

0.3333333333333333

In [12]:
# Area under the ROC curve. With hard 0/1 predictions the value shown matches
# (TPR + TNR) / 2 = (1/3 + 3/4) / 2 = 13/24.
out['aucroc']

0.5416666666666666

In [13]:
# Area under the precision-recall curve. How the curve is built and
# interpolated is decided inside the library and is not visible here.
out['aucpr']

0.6444444444444445

### `binary_classifier_decile_analysis()`

In [16]:
# Scored sample shipped with the package tests (columns shown below:
# id_conta, target_value, p1).
# NOTE: this rebinds `df`, replacing the toy frame used in the previous section.
decile_csv_path = '../tests/data/df_test_binary_classifier_decile_analysis.csv'
decile_pdf = pd.read_csv(decile_csv_path)
df = spark.createDataFrame(decile_pdf)
df.show(5)

+---------+------------+----------+
| id_conta|target_value|        p1|
+---------+------------+----------+
|484034448|           0|0.54177165|
|418564110|           0| 0.7748305|
|464339157|           0|0.22917716|
|309485972|           0|0.60101485|
|154315670|           0|0.48498958|
+---------+------------+----------+
only showing top 5 rows



In [17]:
# Build the decile table from the scored rows: `p1` is passed as the
# probability column, `target_value` as the target and `id_conta` as the id.
decile_table = cl_ev.binary_classifier_decile_analysis(
    dfs=df, col_id='id_conta', col_target='target_value', col_probability='p1',
)

In [19]:
# Convert the result to pandas so the whole table renders as one rich output
# (ten rows in the output below, one per percentile bucket).
decile_table.toPandas()

                                                                                

Unnamed: 0,percentile,min_prob,max_prob,avg_prob,count_id,non_events,events,cum_non_events,cum_events,nonevent_rate,event_rate,cum_noneventrate,cum_eventrate,precision_at_percentile,ks
0,1,0.692887,0.859826,0.753371,40,37,3,37,3,0.093671,0.25,0.093671,0.25,0.075,0.1563291
1,2,0.610804,0.690563,0.655217,40,39,1,76,4,0.098734,0.083333,0.192405,0.333333,0.05,0.1409283
2,3,0.501604,0.602958,0.541839,40,37,3,113,7,0.093671,0.25,0.286076,0.583333,0.058333,0.2972574
3,4,0.447533,0.501213,0.476912,41,41,0,154,7,0.103797,0.0,0.389873,0.583333,0.043478,0.1934599
4,5,0.426632,0.447533,0.445726,41,40,1,194,8,0.101266,0.083333,0.491139,0.666667,0.039604,0.1755274
5,6,0.31692,0.42648,0.376,41,40,1,234,9,0.101266,0.083333,0.592405,0.75,0.037037,0.1575949
6,7,0.291473,0.31692,0.297885,41,40,1,274,10,0.101266,0.083333,0.693671,0.833333,0.035211,0.1396624
7,8,0.258163,0.291473,0.280521,41,41,0,315,10,0.103797,0.0,0.797468,0.833333,0.030769,0.03586498
8,9,0.196785,0.256523,0.220618,41,40,1,355,11,0.101266,0.083333,0.898734,0.916667,0.030055,0.01793249
9,10,0.007746,0.194345,0.146185,41,40,1,395,12,0.101266,0.083333,1.0,1.0,0.029484,1.110223e-16
