# 3.1 - Criando o nosso Target 

## Imports

In [1]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook via the builder's getOrCreate().
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
26/02/05 21:18:54 WARN Utils: Your hostname, MacBook-Air-de-Vitor.local, resolves to a loopback address: 127.0.0.1; using 192.168.3.49 instead (on interface en0)
26/02/05 21:18:54 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/05 21:18:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Load Data

In [2]:
# Load the dataset created in the previous lesson.
path = 'data/processed/olist_order_reviews_dataset'
df = spark.read.parquet(path)

# Preview the first five rows.
df.show(n=5)

+--------------------+--------------------+------------+--------------------+----------------------+--------------------+-----------------------+
|           review_id|            order_id|review_score|review_comment_title|review_comment_message|review_creation_date|review_answer_timestamp|
+--------------------+--------------------+------------+--------------------+----------------------+--------------------+-----------------------+
|e22aad953e113cdcc...|8b2c286fa36b36c81...|           4|                NULL|               Nota 10| 2017-09-19 00:00:00|    2017-09-20 01:07:14|
|5ddb195ab2206a456...|18937b40506fdcbd3...|           4|                NULL|                  NULL| 2017-05-30 00:00:00|    2017-07-06 18:36:02|
|4d3c61768eb47216e...|cb4a79c1e6c9ae443...|           1|                NULL|  Absurdo! Venderam...| 2018-01-05 00:00:00|    2018-01-15 11:04:49|
|5ef9614ed02a28935...|2e1934467537a71d1...|           3|                NULL|                  NULL| 2018-08-01 00:00:00|   

## Select e Describe

In [3]:
# Select only the review score column and print it without truncating values.
df.select("review_score").show(n=5, truncate=False)

+------------+
|review_score|
+------------+
|4           |
|4           |
|1           |
|3           |
|5           |
+------------+
only showing top 5 rows


In [4]:
# Compute summary statistics (count, mean, stddev, min, max) for the score column.
# NOTE(review): the saved output below reports min = 0 — confirm whether a score
# of 0 is valid data or an artifact of the earlier processing step.
df.describe("review_score").show()

+-------+-----------------+
|summary|     review_score|
+-------+-----------------+
|  count|            99225|
|   mean|4.086379440665155|
| stddev|1.347634781943274|
|    min|                0|
|    max|                5|
+-------+-----------------+



## Criando o nosso target

In [5]:
# A common practice is to import the functions module as F.
# This avoids overwriting Python built-in functions such as min, max, ...
from pyspark.sql import functions as F

In [6]:
# Build the binary target (1 or 0) in two steps:
#   1. bom_review     -> boolean flag, true when review_score >= 4
#   2. bom_review_int -> the same flag cast to an integer
is_good_review = F.col("review_score") >= 4

(
    df
    .select("review_id", "review_score")
    .withColumn("bom_review", is_good_review)
    .withColumn("bom_review_int", F.col("bom_review").cast("int"))
    .show(5)
)

+--------------------+------------+----------+--------------+
|           review_id|review_score|bom_review|bom_review_int|
+--------------------+------------+----------+--------------+
|e22aad953e113cdcc...|           4|      true|             1|
|5ddb195ab2206a456...|           4|      true|             1|
|4d3c61768eb47216e...|           1|     false|             0|
|5ef9614ed02a28935...|           3|     false|             0|
|f62709ef754361d3e...|           5|      true|             1|
+--------------------+------------+----------+--------------+
only showing top 5 rows


In [7]:
# The same target can be written in a form closer to a SQL SELECT list:
# every output column, including the derived one, is an expression.
target_col = (F.col("review_score") >= 4).cast("int").alias("target")

df.select(
    F.col("review_id"),
    F.col("review_score"),
    target_col,
).show(5)

+--------------------+------------+------+
|           review_id|review_score|target|
+--------------------+------------+------+
|e22aad953e113cdcc...|           4|     1|
|5ddb195ab2206a456...|           4|     1|
|4d3c61768eb47216e...|           1|     0|
|5ef9614ed02a28935...|           3|     0|
|f62709ef754361d3e...|           5|     1|
+--------------------+------------+------+
only showing top 5 rows


In [None]:
# Persist the target table: review/order identifiers, the raw score, and the
# binary target derived from it.
path = "data/processed/target"

# target is the integer cast of the boolean expression (review_score >= 4).
target_logic = (F.col("review_score") >= 4).cast("int").alias("target")

# Keep the DataFrame itself in `target_df`. DataFrameWriter.parquet() returns
# None, so chaining the write into the assignment would leave `target_df`
# bound to None instead of the data.
target_df = df.select(
    F.col("review_id"),
    F.col("order_id"),
    F.col("review_score"),
    target_logic,
)

# Write as parquet, replacing any output left by a previous run.
target_df.write.mode("overwrite").parquet(path)

In [9]:
# List the files written under the target output directory.
# This shells out to the external `tree` command, which must be available on PATH.
!tree data/processed/target

[1;36mdata/processed/target[0m
├── _SUCCESS
├── part-00000-baab187e-0823-4328-86d9-495b99f71c2e-c000.snappy.parquet
├── part-00001-baab187e-0823-4328-86d9-495b99f71c2e-c000.snappy.parquet
├── part-00002-baab187e-0823-4328-86d9-495b99f71c2e-c000.snappy.parquet
└── part-00003-baab187e-0823-4328-86d9-495b99f71c2e-c000.snappy.parquet

1 directory, 5 files
