# Data Preparation


## Session Setup

In [2]:
import pandas as pd
from pyspark.sql import SparkSession

from pyspark_ds_toolbox.ml import data_prep as ml_dp



In [3]:
# Build (or reuse, via getOrCreate) the Spark session shared by every cell below.
# Runs against a `local[1]` master with 3G executor / driver / off-heap settings.
spark = (
    SparkSession.builder
    .appName('Spark-Toolbox')
    .master('local[1]')
    .config('spark.executor.memory', '3G')
    .config('spark.driver.memory', '3G')
    .config('spark.memory.offHeap.enabled', 'true')
    .config('spark.memory.offHeap.size', '3G')
    .getOrCreate()
)

# Only surface ERROR-level log messages so the cell outputs stay readable.
spark.sparkContext.setLogLevel("ERROR")
# Enable Arrow for pandas <-> Spark conversions. Since Spark 3.0 the setting is
# named `spark.sql.execution.arrow.pyspark.enabled`; the former
# `spark.sql.execution.arrow.enabled` key is deprecated.
# NOTE(review): assumes a Spark >= 3.0 runtime -- confirm the project's pinned version.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/01/18 16:23:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/01/18 16:23:33 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/01/18 16:23:33 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/01/18 16:23:33 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
22/01/18 16:23:33 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.


## Class Weights

### `binary_classifier_weights()`

In [4]:
# Read the CSV test fixture with pandas, then convert it to a Spark DataFrame.
df = spark.createDataFrame(
    pd.read_csv('../tests/data/df_test_binary_classifier_decile_analysis.csv')
)
# Preview the first five rows only.
df.show(n=5)

                                                                                

+---------+------------+----------+
| id_conta|target_value|        p1|
+---------+------------+----------+
|484034448|           0|0.54177165|
|418564110|           0| 0.7748305|
|464339157|           0|0.22917716|
|309485972|           0|0.60101485|
|154315670|           0|0.48498958|
+---------+------------+----------+
only showing top 5 rows



In [5]:
# Run the toolbox's binary-classifier weighting helper against the
# `target_value` column and keep the returned DataFrame for inspection.
dfs_weights = ml_dp.class_weights.binary_classifier_weights(
    dfs=df,
    col_target='target_value',
)
# Preview the first five rows only.
dfs_weights.show(n=5)

+---------+------------+----------+-------------------+
| id_conta|target_value|        p1|weight_target_value|
+---------+------------+----------+-------------------+
|484034448|           0|0.54177165| 0.5151898734177215|
|418564110|           0| 0.7748305| 0.5151898734177215|
|464339157|           0|0.22917716| 0.5151898734177215|
|309485972|           0|0.60101485| 0.5151898734177215|
|154315670|           0|0.48498958| 0.5151898734177215|
+---------+------------+----------+-------------------+
only showing top 5 rows



## Features Vector

### `get_features_vector()`

In [6]:
# Toy dataset: an `index` column, two numeric columns and two string columns.
# The pandas frame is built inline so `df` is only ever the Spark DataFrame.
df = spark.createDataFrame(
    pd.DataFrame(
        {
            'index': [1, 2, 3, 4],
            'num1': [0.1, 0.2, 0.3, 0.4],
            'num2': [0.4, 0.3, 0.2, 0.1],
            'cat1': ['a', 'b', 'a', 'b'],
            'cat2': ['c', 'd', 'c', 'd'],
        }
    )
)

df.show()

+-----+----+----+----+----+
|index|num1|num2|cat1|cat2|
+-----+----+----+----+----+
|    1| 0.1| 0.4|   a|   c|
|    2| 0.2| 0.3|   b|   d|
|    3| 0.3| 0.2|   a|   c|
|    4| 0.4| 0.1|   b|   d|
+-----+----+----+----+----+



In [7]:
# Pass the numeric and categorical column names to the toolbox helper; the
# returned DataFrame is displayed in full (the input has only four rows).
d = ml_dp.features_vector.get_features_vector(
    df=df, num_features=['num1', 'num2'], cat_features=['cat1', 'cat2'],
)
d.show()

                                                                                

+-----+----+----+----+----+-----------------+
|index|num1|num2|cat1|cat2|         features|
+-----+----+----+----+----+-----------------+
|    1| 0.1| 0.4|   a|   c|[0.1,0.4,0.0,0.0]|
|    2| 0.2| 0.3|   b|   d|[0.2,0.3,1.0,1.0]|
|    3| 0.3| 0.2|   a|   c|[0.3,0.2,0.0,0.0]|
|    4| 0.4| 0.1|   b|   d|[0.4,0.1,1.0,1.0]|
+-----+----+----+----+----+-----------------+

