# 3.2 - Feature de Pagamento

In [1]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession

# Reuse the active SparkSession when one exists; otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
26/02/06 11:07:57 WARN Utils: Your hostname, MacBook-Air-de-Vitor.local, resolves to a loopback address: 127.0.0.1; using 192.168.3.49 instead (on interface en0)
26/02/06 11:07:57 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/06 11:07:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
26/02/06 11:07:58 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
26/02/06 11:07:58 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
26/02/06 11:07:58 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


In [2]:
# Load the raw payments CSV: the first line is treated as the header and
# column types are inferred by Spark from the data.

path = "data/raw/olist_order_payments_dataset.csv"
payment = spark.read.options(header=True, inferSchema=True).csv(path)

payment.show(5)

+--------------------+------------------+------------+--------------------+-------------+
|            order_id|payment_sequential|payment_type|payment_installments|payment_value|
+--------------------+------------------+------------+--------------------+-------------+
|b81ef226f3fe1789b...|                 1| credit_card|                   8|        99.33|
|a9810da82917af2d9...|                 1| credit_card|                   1|        24.39|
|25e8ea4e93396b6fa...|                 1| credit_card|                   1|        65.71|
|ba78997921bbcdc13...|                 1| credit_card|                   8|       107.78|
|42fdf880ba16b47b5...|                 1| credit_card|                   2|       128.45|
+--------------------+------------------+------------+--------------------+-------------+
only showing top 5 rows


In [3]:
# Add the base-10 logarithm of the paid amount as a new column.
# A log-scaled value may help, depending on the model used.

logic = F.log(10.0, F.col("payment_value"))

payment.withColumn("payment_log_value", logic).show(5)

+--------------------+------------------+------------+--------------------+-------------+------------------+
|            order_id|payment_sequential|payment_type|payment_installments|payment_value| payment_log_value|
+--------------------+------------------+------------+--------------------+-------------+------------------+
|b81ef226f3fe1789b...|                 1| credit_card|                   8|        99.33|1.9970804354717304|
|a9810da82917af2d9...|                 1| credit_card|                   1|        24.39|1.3872118003137304|
|25e8ea4e93396b6fa...|                 1| credit_card|                   1|        65.71|1.8176314671905152|
|ba78997921bbcdc13...|                 1| credit_card|                   8|       107.78|2.0325381792600066|
|42fdf880ba16b47b5...|                 1| credit_card|                   2|       128.45| 2.108734108602365|
+--------------------+------------------+------------+--------------------+-------------+------------------+
only showing top 5 rows

In [4]:
# Dividing by zero is not possible, so before computing a per-installment
# value, check whether the dataset has rows with payment_installments == 0.

payment.filter(F.col("payment_installments") == 0).show()

+--------------------+------------------+------------+--------------------+-------------+
|            order_id|payment_sequential|payment_type|payment_installments|payment_value|
+--------------------+------------------+------------+--------------------+-------------+
|744bade1fcf9ff3f3...|                 2| credit_card|                   0|        58.69|
|1a57108394169c0b4...|                 2| credit_card|                   0|       129.94|
+--------------------+------------------+------------+--------------------+-------------+



In [5]:
# Chosen fix: treat an installment count of 0 as 1 and keep every other
# value as it is.

logic = F.when(F.col("payment_installments") == 0, 1).otherwise(
    F.col("payment_installments")
)

payment.withColumn("new_payment_installments", logic).show(5)

+--------------------+------------------+------------+--------------------+-------------+------------------------+
|            order_id|payment_sequential|payment_type|payment_installments|payment_value|new_payment_installments|
+--------------------+------------------+------------+--------------------+-------------+------------------------+
|b81ef226f3fe1789b...|                 1| credit_card|                   8|        99.33|                       8|
|a9810da82917af2d9...|                 1| credit_card|                   1|        24.39|                       1|
|25e8ea4e93396b6fa...|                 1| credit_card|                   1|        65.71|                       1|
|ba78997921bbcdc13...|                 1| credit_card|                   8|       107.78|                       8|
|42fdf880ba16b47b5...|                 1| credit_card|                   2|       128.45|                       2|
+--------------------+------------------+------------+--------------------+-------------+------------------------+

In [6]:
# Bundle the previous steps into a single reusable transformation.

def create_payment_features(df):
    """Add log-scaled payment features to ``df``.

    Two columns are set with ``withColumn`` (so a column that already has
    one of these names is replaced):

    * ``payment_log_total_value``: base-10 log of ``payment_value``.
    * ``payment_log_installment_value``: base-10 log of ``payment_value``
      divided by the installment count, where a count of 0 is treated as 1.

    Args:
        df: DataFrame exposing ``payment_value`` and
            ``payment_installments`` columns.

    Returns:
        The DataFrame produced by adding the two feature columns to ``df``.
    """
    value = F.col("payment_value")
    installments = F.col("payment_installments")

    # Swap a 0 installment count for 1 so the division below never uses 0.
    safe_installments = F.when(installments == 0, 1).otherwise(installments)
    per_installment = value / safe_installments

    # NOTE(review): a non-positive payment_value presumably yields NULL from
    # Spark's log -- confirm that NULL features are acceptable downstream.
    return (
        df
        .withColumn("payment_log_total_value", F.log(10.0, value))
        .withColumn("payment_log_installment_value", F.log(10.0, per_installment))
    )

# Preview the first five rows with the feature columns added.
create_payment_features(payment).show(5)

+--------------------+------------------+------------+--------------------+-------------+-----------------------+-----------------------------+
|            order_id|payment_sequential|payment_type|payment_installments|payment_value|payment_log_total_value|payment_log_installment_value|
+--------------------+------------------+------------+--------------------+-------------+-----------------------+-----------------------------+
|b81ef226f3fe1789b...|                 1| credit_card|                   8|        99.33|     1.9970804354717304|            1.093990448479787|
|a9810da82917af2d9...|                 1| credit_card|                   1|        24.39|     1.3872118003137304|           1.3872118003137304|
|25e8ea4e93396b6fa...|                 1| credit_card|                   1|        65.71|     1.8176314671905152|           1.8176314671905152|
|ba78997921bbcdc13...|                 1| credit_card|                   8|       107.78|     2.0325381792600066|            1.129448192

## Salvando 

In [7]:
# Persist the payment features as Parquet, overwriting any existing output.
path = "data/processed/feature_payment"

(
    payment
    .transform(create_payment_features)
    .write
    .parquet(path, mode="overwrite")
)

                                                                                

In [8]:
# List the contents of the directory the features were written to.
!tree data/processed/feature_payment

[1;36mdata/processed/feature_payment[0m
├── _SUCCESS
├── part-00000-b6ebaff9-1c60-44aa-ae89-be60015a618f-c000.snappy.parquet
└── part-00001-b6ebaff9-1c60-44aa-ae89-be60015a618f-c000.snappy.parquet

1 directory, 3 files
