# Binary Base Line Classificatores


## Session Setup

In [1]:
import pandas as pd
from pyspark.sql import SparkSession, functions as F

from pyspark_ds_toolbox.ml.classification.baseline_classifiers import baseline_binary_classfiers



In [2]:
spark = SparkSession.builder\
                .appName('Spark-Toolbox') \
                .master('local[1]') \
                .config('spark.executor.memory', '3G') \
                .config('spark.driver.memory', '3G') \
                .config('spark.memory.offHeap.enabled', 'true') \
                .config('spark.memory.offHeap.size', '3G') \
                .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/01/06 16:20:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/01/06 16:20:05 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


## lendo o dataset base

In [3]:
def read_data(file): 
    return pd.read_stata("https://raw.github.com/scunning1975/mixtape/master/" + file)

df = read_data('nsw_mixtape.dta')
df = pd.concat((df, read_data('cps_mixtape.dta')))
df.reset_index(level=0, inplace=True)

df = spark.createDataFrame(df)\
    .withColumn('age2', F.col('age')**2)\
    .withColumn('age3', F.col('age')**3)\
    .withColumn('educ2', F.col('educ')**2)\
    .withColumn('educ_re74', F.col('educ')*F.col('re74'))\
    .withColumn('u74', F.when(F.col('re74')==0, 1).otherwise(0))\
    .withColumn('u75', F.when(F.col('re75')==0, 1).otherwise(0))\
    .withColumn('etnia', F.expr('case when black=1 then "black" when hisp=1 then "hisp" when marr=1 then "marr" else "other" end'))\
    .drop('black', 'hisp', 'marr')



dfs_train, dfs_test = df.randomSplit([0.8, 0.2], seed=4)
dfs_test.show(5)

                                                                                

+-----+--------------------+-----+----+----+--------+--------+--------+--------+------+--------+-----+---------+---+---+-----+
|index|             data_id|treat| age|educ|nodegree|    re74|    re75|    re78|  age2|    age3|educ2|educ_re74|u74|u75|etnia|
+-----+--------------------+-----+----+----+--------+--------+--------+--------+------+--------+-----+---------+---+---+-----+
|    0|                CPS1|  0.0|45.0|11.0|     1.0|21516.67|25243.55|25564.67|2025.0| 91125.0|121.0|236683.38|  0|  0| marr|
|    3|Dehejia-Wahba Sample|  1.0|27.0|11.0|     1.0|     0.0|     0.0|7506.146| 729.0| 19683.0|121.0|      0.0|  1|  1|black|
|    7|                CPS1|  0.0|18.0|11.0|     1.0|1144.212|3620.032|15739.27| 324.0|  5832.0|121.0|12586.332|  0|  0|other|
|   10|                CPS1|  0.0|34.0|14.0|     0.0|25862.32|23746.84|25564.67|1156.0| 39304.0|196.0| 362072.5|  0|  0| marr|
|   12|                CPS1|  0.0|53.0|10.0|     1.0|25862.32|25243.55|25564.67|2809.0|148877.0|100.0| 258623.2

## Fitting the Baseline Classifiers

In [4]:
base_line_out = baseline_binary_classfiers(
    dfs=dfs_train,
    id_col='index',
    target_col='treat',
    num_features=['age', 'educ', 'nodegree', 're74', 're75', 're78', 'age2', 'age3', 'educ2', 'educ_re74', 'u74', 'u75'],
    cat_features=['data_id', 'etnia'],
    dfs_test=dfs_test,
    weight_on_target=True,
    log_mlflow_run=False,
    artifact_stage_path = None
)

Computing Features Vector


                                                                                

Computing Class Weights


                                                                                

Instanciating Classifiers


                                                                                

Predicting on Test Data and Evaluating


100%|██████████| 4/4 [00:19<00:00,  4.95s/it]                                   


In [6]:
base_line_out.keys()

dict_keys(['LogisticRegression', 'DecisionTreeClassifier', 'RandomForestClassifier', 'GBTClassifier'])

In [None]:
base_line_out['LogisticRegression'].keys()

dict_keys(['model', 'metrics', 'decile_metrics'])

In [7]:
base_line_out['LogisticRegression']['metrics']

{'confusion_matrix':    treat  prediction  count
 0    0.0         0.0   3240
 1    1.0         1.0     33
 2    0.0         1.0     55,
 'accuracy': 0.9834735576923077,
 'f1': 0.5454545454545454,
 'precision': 0.375,
 'recall': 1.0,
 'aucroc': 0.9916540212443096,
 'aucpr': 0.375}