# Wrangling Module: Reshape Helpers


## Session Setup

In [2]:
import pandas as pd
from pyspark.sql import SparkSession, functions as F

import pyspark_ds_toolbox.wrangling.reshape as wr_rs 

In [3]:
# Build (or reuse) the session used by every example below.
# `local[1]` runs Spark in-process with a single worker thread.
spark = (
    SparkSession.builder
    .appName('Spark-Toolbox')
    .master('local[1]')
    .config('spark.executor.memory', '3G')
    .config('spark.driver.memory', '3G')
    .config('spark.memory.offHeap.enabled', 'true')
    .config('spark.memory.offHeap.size', '3G')
    .getOrCreate()
)

# Only ERROR-level messages are logged from here on.
spark.sparkContext.setLogLevel("ERROR")

# Enable Arrow for pandas <-> Spark conversions. The former key
# `spark.sql.execution.arrow.enabled` is deprecated since Spark 3.0; the
# PySpark-specific key below replaces it.
# NOTE(review): assumes Spark >= 3.0 - on 2.x the old key is the one that takes effect.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/21 20:46:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/03/21 20:46:22 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/03/21 20:46:22 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


## `pivot_long()`

In [4]:
# Wide-format input: one column per series.
wide_pdf = pd.DataFrame(
    {
        'serie_x': [1, 2, 3],
        'serie_y': [4, 5, 6],
        'serie_w': [7, 8, 9],
    }
)

# reset_index() turns the pandas row index into a regular `index` column,
# which the pivot_long() example below uses as its key column.
df = spark.createDataFrame(wide_pdf.reset_index())
df.show()

[Stage 0:>                                                          (0 + 1) / 1]

+-----+-------+-------+-------+
|index|serie_x|serie_y|serie_w|
+-----+-------+-------+-------+
|    0|      1|      4|      7|
|    1|      2|      5|      8|
|    2|      3|      6|      9|
+-----+-------+-------+-------+



                                                                                

In [7]:
# Reshape the three `serie_*` columns into long format, keeping `index` as the key.
# NOTE(review): `key_column_name` is also 'index', so the result shown below has
# two columns named `index` (the kept key and the one holding the source column
# names). A distinct name would likely be clearer - confirm intent before changing,
# since the recorded output would need regenerating.
df_long = wr_rs.pivot_long(
    df,
    key_columns=['index'],
    key_column_name='index',
    value_columns=['serie_x', 'serie_y', 'serie_w'],
    value_column_name='serie',
)
df_long.show()

+-----+-------+-----+
|index|  index|serie|
+-----+-------+-----+
|    0|serie_x|    1|
|    0|serie_y|    4|
|    0|serie_w|    7|
|    1|serie_x|    2|
|    1|serie_y|    5|
|    1|serie_w|    8|
|    2|serie_x|    3|
|    2|serie_y|    6|
|    2|serie_w|    9|
+-----+-------+-----+



## `with_start_week()`

In [9]:
# Two ISO-formatted date strings, with the pandas row index kept as an `index` column.
dates_pdf = pd.DataFrame({'date': ['2022-02-24', '2002-09-07']}).reset_index()

# Move to Spark, then parse the string column into a proper date column.
df = spark.createDataFrame(dates_pdf)
df = df.withColumn('date', F.to_date('date'))
df.show()

+-----+----------+
|index|      date|
+-----+----------+
|    0|2022-02-24|
|    1|2002-09-07|
+-----+----------+



In [10]:
# Derive the week-start date for each row, with weeks beginning on Sunday.
# Per the output below, the result is added as a `week` column.
df_weeks = wr_rs.with_start_week(
    df=df,
    date_col='date',
    start_day='sunday',
)
df_weeks.show()

+-----+----------+----------+
|index|      date|      week|
+-----+----------+----------+
|    0|2022-02-24|2022-02-20|
|    1|2002-09-07|2002-09-01|
+-----+----------+----------+

