# 3.3 - Feature sobre a entrega

In [1]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession

# Reuse the SparkSession that is already active in this kernel, or create a
# new one with the default builder settings if none exists yet.
spark = SparkSession.builder.getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
26/02/06 10:15:11 WARN Utils: Your hostname, MacBook-Air-de-Vitor.local, resolves to a loopback address: 127.0.0.1; using 192.168.3.49 instead (on interface en0)
26/02/06 10:15:11 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/06 10:15:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
26/02/06 10:15:12 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


## Carregando CSV

In [2]:
# Raw Olist orders file; the first row holds the column names and the column
# types are inferred by Spark from the data.
path = "data/raw/olist_orders_dataset.csv"
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(path)
)

# Quick sanity check: a few rows plus the inferred schema.
df.show(5)
df.printSchema()

+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|            order_id|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|e481f51cbdc54678b...|9ef432eb625129730...|   delivered|     2017-10-02 10:56:33|2017-10-02 11:07:15|         2017-10-04 19:55:00|          2017-10-10 21:25:13|          2017-10-18 00:00:00|
|53cdb2fc8bc7dce0b...|b0830fb4747a6c6d2...|   delivered|     2018-07-24 20:41:37|2018-07-26 03:24:27|         2018-07-26 14:31:00|          2018-08-07 15:27:45|          2018-08-13 00:00:00|
|47770eb9100c2d0c4...|41ce2a54c0b03bf34...|  

## Renomeando
As colunas têm nomes muito longos; vamos simplificar um pouco.

In [3]:
# Preview of a single rename with withColumnRenamed. The result is not
# assigned, so `df` itself is left unchanged by this cell.
(
    df
    .withColumnRenamed("order_status", "status")
    .printSchema()
)

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: timestamp (nullable = true)



In [4]:
# Calling withColumnRenamed once per column would take six calls, so the
# renamed projection is built programmatically instead.
#
# Only a *leading* "order_" is stripped, and order_id is kept as-is because it
# is the key of the table. A substring test combined with str.replace would
# also rewrite names that merely contain "order_" somewhere in the middle
# (e.g. "preorder_date" -> "predate"), so the prefix is matched explicitly.
prefix = "order_"
keep_as_is = {"order_id"}

# One Column expression per input column, in the original column order:
# aliased when the prefix has to be removed, untouched otherwise.
columns = [
    F.col(name).alias(name[len(prefix):])
    if name.startswith(prefix) and name not in keep_as_is
    else F.col(name)
    for name in df.columns
]

# From here on `df` refers to the frame with the shortened column names.
df = df.select(columns)
df.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- status: string (nullable = true)
 |-- purchase_timestamp: timestamp (nullable = true)
 |-- approved_at: timestamp (nullable = true)
 |-- delivered_carrier_date: timestamp (nullable = true)
 |-- delivered_customer_date: timestamp (nullable = true)
 |-- estimated_delivery_date: timestamp (nullable = true)



## Criando Variáveis

Queremos criar as seguintes variáveis:
1. Mês da compra
2. Semana da compra
3. Dia da semana da compra
4. Dias entre a compra e a entrega

Vocês podem explorar outras variáveis

In [5]:
# Calendar features extracted from the purchase timestamp.
date_col = "purchase_timestamp"

purchase_features = [
    F.month(date_col).alias("purchase_month"),
    # ISO-8601 week number of the year.
    F.weekofyear(date_col).alias("purchase_week"),
    # Spark numbers the days 1 = Sunday ... 7 = Saturday.
    F.dayofweek(date_col).alias("purchase_day_week"),
]

# Keep the key and the source timestamp next to the derived columns so the
# values can be checked by eye.
df.select(F.col("order_id"), F.col(date_col), *purchase_features).show(5)

+--------------------+-------------------+--------------+-------------+-----------------+
|            order_id| purchase_timestamp|purchase_month|purchase_week|purchase_day_week|
+--------------------+-------------------+--------------+-------------+-----------------+
|e481f51cbdc54678b...|2017-10-02 10:56:33|            10|           40|                2|
|53cdb2fc8bc7dce0b...|2018-07-24 20:41:37|             7|           30|                3|
|47770eb9100c2d0c4...|2018-08-08 08:38:49|             8|           32|                4|
|949d5b44dbf5de918...|2017-11-18 19:28:06|            11|           46|                7|
|ad21c59c0840e6cb8...|2018-02-13 21:18:39|             2|            7|                3|
+--------------------+-------------------+--------------+-------------+-----------------+
only showing top 5 rows


In [6]:
# Number of days between the purchase and the delivery to the customer.
p_ts = "purchase_timestamp"
d_ts = "delivered_customer_date"

# Orders without a delivery date are dropped before computing the difference.
delivered = df.filter(F.col(d_ts).isNotNull())

# date_diff(end, start) counts whole calendar days; the time of day is ignored.
days_between = F.date_diff(d_ts, p_ts).alias("days_between")

delivered.select(F.col(p_ts), F.col(d_ts), days_between).show(5)

+-------------------+-----------------------+------------+
| purchase_timestamp|delivered_customer_date|days_between|
+-------------------+-----------------------+------------+
|2017-10-02 10:56:33|    2017-10-10 21:25:13|           8|
|2018-07-24 20:41:37|    2018-08-07 15:27:45|          14|
|2018-08-08 08:38:49|    2018-08-17 18:06:29|           9|
|2017-11-18 19:28:06|    2017-12-02 00:28:42|          14|
|2018-02-13 21:18:39|    2018-02-16 18:17:02|           3|
+-------------------+-----------------------+------------+
only showing top 5 rows


## Unificando Tudo e Salvando

In [None]:
# Build the final feature table (one row per delivered order) and persist it.
path = "data/processed/feature_delivery"
p_ts = "purchase_timestamp"
d_ts = "delivered_customer_date"

# Order key followed by the calendar features of the purchase and the
# purchase-to-delivery delay in days.
feature_columns = [
    F.col("order_id"),
    F.month(p_ts).alias("purchase_month"),
    F.weekofyear(p_ts).alias("purchase_week"),
    F.dayofweek(p_ts).alias("purchase_day_week"),
    F.date_diff(d_ts, p_ts).alias("days_between"),
]

# Only orders that have a customer delivery date are kept.
features = df.filter(F.col(d_ts).isNotNull()).select(*feature_columns)

# "overwrite" replaces any previous output at this path, so the cell can be
# re-run safely.
features.write.parquet(path, mode="overwrite")

In [None]:
# Shell escape: list the files written under the output directory.
# NOTE(review): relies on the external `tree` command being installed.
!tree data/processed/feature_delivery