In [1]:
from pyspark.sql import SparkSession

from pyspark.sql.functions import col, to_date, to_timestamp, hour, minute, year, month, day
from pyspark.sql.types import StringType, DoubleType, IntegerType, DateType, TimestampType

In [2]:
# Create (or reuse) a Spark session that runs locally on all available cores.
# The legacy time parser policy selects the pre-Spark-3 datetime parser for the
# date/time parsing done later in this notebook.
# NOTE(review): with master "local[*]" everything runs in a single JVM, so
# "spark.executor.instances" presumably has no effect here - confirm and drop it
# if it is not needed.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.executor.instances", "5")
    .config("spark.sql.legacy.timeParserPolicy", "LEGACY")
    .getOrCreate()
)

In [6]:
# Load the landing CSV, taking column names from its first line. No schema is
# inferred, so every column arrives as a string and is cast explicitly later.
df = spark.read.csv(
    "spark-warehouse/landing/supermarket_sales.csv",
    header=True,
)

In [4]:
# Inspect the raw column names (as they appear in the CSV header) before renaming.
df.columns

['Invoice ID',
 'Branch',
 'City',
 'Customer type',
 'Gender',
 'Product line',
 'Unit price',
 'Quantity',
 'Tax 5%',
 'Total',
 'Date',
 'Time',
 'Payment',
 'cogs',
 'gross margin percentage',
 'gross income',
 'Rating']

In [5]:
# Check the types as loaded: every column is still a string at this point.
df.printSchema()

root
 |-- Invoice ID: string (nullable = true)
 |-- Branch: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Customer type: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Product line: string (nullable = true)
 |-- Unit price: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- Tax 5%: string (nullable = true)
 |-- Total: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- Payment: string (nullable = true)
 |-- cogs: string (nullable = true)
 |-- gross margin percentage: string (nullable = true)
 |-- gross income: string (nullable = true)
 |-- Rating: string (nullable = true)



In [6]:
# Peek at a single row to see the raw value formats (e.g. dates such as 1/5/2019).
df.show(1)

+-----------+------+------+-------------+------+-----------------+----------+--------+-------+--------+--------+-----+-------+------+-----------------------+------------+------+
| Invoice ID|Branch|  City|Customer type|Gender|     Product line|Unit price|Quantity| Tax 5%|   Total|    Date| Time|Payment|  cogs|gross margin percentage|gross income|Rating|
+-----------+------+------+-------------+------+-----------------+----------+--------+-------+--------+--------+-----+-------+------+-----------------------+------------+------+
|750-67-8428|     A|Yangon|       Member|Female|Health and beauty|     74.69|       7|26.1415|548.9715|1/5/2019|13:08|Ewallet|522.83|            4.761904762|     26.1415|   9.1|
+-----------+------+------+-------------+------+-----------------+----------+--------+-------+--------+--------+-----+-------+------+-----------------------+------------+------+
only showing top 1 row



In [7]:
# Maps each raw CSV column name to a (snake_case name, Spark type) pair that is
# used to rename and cast the column.
column_rename_and_cast = {
    # Identifiers and categorical attributes stay as strings.
    "Invoice ID": ("invoice_id", StringType()),
    "Branch": ("branch", StringType()),
    "City": ("city", StringType()),
    "Customer type": ("customer_type", StringType()),
    "Gender": ("gender", StringType()),
    "Product line": ("product_line", StringType()),
    # Numeric measures.
    "Unit price": ("unit_price", DoubleType()),
    "Quantity": ("quantity", IntegerType()),
    "Tax 5%": ("tax_5_percent", DoubleType()),
    "Total": ("total", DoubleType()),
    # Casting dates/timestamps directly can cause problems, so both columns are
    # kept as strings here and parsed with helper functions instead.
    "Date": ("date", StringType()),
    "Time": ("time", StringType()),
    "Payment": ("payment", StringType()),
    "cogs": ("cogs", DoubleType()),
    "gross margin percentage": ("gross_margin_percentage", DoubleType()),
    "gross income": ("gross_income", DoubleType()),
    "Rating": ("rating", DoubleType()),
}

In [8]:
# Rename and cast all mapped columns in ONE projection.
# Chaining withColumnRenamed/withColumn inside a loop adds a projection per call
# and can produce very large plans; the Spark documentation recommends a single
# select() with all columns instead.
# Columns that are not in the mapping are passed through unchanged, so the column
# order is preserved and re-running this cell on the already-renamed frame is a
# no-op, as with the previous loop.
projection = []
for source_name in df.columns:
    if source_name in column_rename_and_cast:
        target_name, target_type = column_rename_and_cast[source_name]
        projection.append(col(source_name).cast(target_type).alias(target_name))
    else:
        projection.append(col(source_name))

df = df.select(*projection)

In [9]:
# Verify the renamed columns and their new types.
df.printSchema()

root
 |-- invoice_id: string (nullable = true)
 |-- branch: string (nullable = true)
 |-- city: string (nullable = true)
 |-- customer_type: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- product_line: string (nullable = true)
 |-- unit_price: double (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- tax_5_percent: double (nullable = true)
 |-- total: double (nullable = true)
 |-- date: string (nullable = true)
 |-- time: string (nullable = true)
 |-- payment: string (nullable = true)
 |-- cogs: double (nullable = true)
 |-- gross_margin_percentage: double (nullable = true)
 |-- gross_income: double (nullable = true)
 |-- rating: double (nullable = true)



In [10]:
# Print the physical plan to see how the rename/cast steps are executed.
df.explain()

== Physical Plan ==
*(1) Project [Invoice ID#17 AS invoice_id#156, Branch#18 AS branch#192, City#19 AS city#228, Customer type#20 AS customer_type#264, Gender#21 AS gender#300, Product line#22 AS product_line#336, cast(Unit price#23 as double) AS unit_price#372, cast(Quantity#24 as int) AS quantity#408, cast(Tax 5%#25 as double) AS tax_5_percent#444, cast(Total#26 as double) AS total#480, Date#27 AS date#516, Time#28 AS time#552, Payment#29 AS payment#588, cast(cogs#30 as double) AS cogs#624, cast(gross margin percentage#31 as double) AS gross_margin_percentage#660, cast(gross income#32 as double) AS gross_income#696, cast(Rating#33 as double) AS rating#732]
+- FileScan csv [Invoice ID#17,Branch#18,City#19,Customer type#20,Gender#21,Product line#22,Unit price#23,Quantity#24,Tax 5%#25,Total#26,Date#27,Time#28,Payment#29,cogs#30,gross margin percentage#31,gross income#32,Rating#33] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/jovyan/notebo

In [11]:
# Preview the first rows (show() prints at most 20 by default) after renaming and casting.
df.show()

+-----------+------+---------+-------------+------+--------------------+----------+--------+-------------+--------+---------+-----+-----------+------+-----------------------+------------+------+
| invoice_id|branch|     city|customer_type|gender|        product_line|unit_price|quantity|tax_5_percent|   total|     date| time|    payment|  cogs|gross_margin_percentage|gross_income|rating|
+-----------+------+---------+-------------+------+--------------------+----------+--------+-------------+--------+---------+-----+-----------+------+-----------------------+------------+------+
|750-67-8428|     A|   Yangon|       Member|Female|   Health and beauty|     74.69|       7|      26.1415|548.9715| 1/5/2019|13:08|    Ewallet|522.83|            4.761904762|     26.1415|   9.1|
|226-31-3081|     C|Naypyitaw|       Normal|Female|Electronic access...|     15.28|       5|         3.82|   80.22| 3/8/2019|10:29|       Cash|  76.4|            4.761904762|        3.82|   9.6|
|631-41-3108|     A|   Ya

In [12]:
# Split the string "date" and "time" columns into integer parts and drop the
# originals.
#
# The source dates are not zero-padded (e.g. 1/5/2019), so the pattern uses single
# letters ("M/d/yyyy"). Single-letter fields accept one or two digits, which parses
# the same values under the legacy parser and also under the default Spark 3
# parser, where "MM/dd/yyyy" would reject unpadded months and days.
parsed_date = to_date(col("date"), "M/d/yyyy")
# The time is parsed as hours:minutes on a 24-hour clock (values look like 13:08).
parsed_time = to_timestamp(col("time"), "HH:mm")

# Each source column is parsed once and the expression is reused for every part.
df = (
    df.withColumn("year", year(parsed_date))
    .withColumn("month", month(parsed_date))
    .withColumn("day", day(parsed_date))
    .withColumn("hour", hour(parsed_time))
    .withColumn("minute", minute(parsed_time))
    .drop("date", "time")
)

In [13]:
# Confirm that date/time were replaced by integer year/month/day/hour/minute columns.
df.printSchema()

root
 |-- invoice_id: string (nullable = true)
 |-- branch: string (nullable = true)
 |-- city: string (nullable = true)
 |-- customer_type: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- product_line: string (nullable = true)
 |-- unit_price: double (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- tax_5_percent: double (nullable = true)
 |-- total: double (nullable = true)
 |-- payment: string (nullable = true)
 |-- cogs: double (nullable = true)
 |-- gross_margin_percentage: double (nullable = true)
 |-- gross_income: double (nullable = true)
 |-- rating: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- minute: integer (nullable = true)



In [14]:
# Preview the rows with the derived date and time parts.
df.show()

+-----------+------+---------+-------------+------+--------------------+----------+--------+-------------+--------+-----------+------+-----------------------+------------+------+----+-----+---+----+------+
| invoice_id|branch|     city|customer_type|gender|        product_line|unit_price|quantity|tax_5_percent|   total|    payment|  cogs|gross_margin_percentage|gross_income|rating|year|month|day|hour|minute|
+-----------+------+---------+-------------+------+--------------------+----------+--------+-------------+--------+-----------+------+-----------------------+------------+------+----+-----+---+----+------+
|750-67-8428|     A|   Yangon|       Member|Female|   Health and beauty|     74.69|       7|      26.1415|548.9715|    Ewallet|522.83|            4.761904762|     26.1415|   9.1|2019|    1|  5|  13|     8|
|226-31-3081|     C|Naypyitaw|       Normal|Female|Electronic access...|     15.28|       5|         3.82|   80.22|       Cash|  76.4|            4.761904762|        3.82|   9.

In [15]:
# Persist the result as the Parquet table "supermarket_sales_bronze" stored under
# bronze/supermarket_sales, replacing any previous contents.
# NOTE(review): partitioning down to the minute may create a very large number of
# tiny partitions/files - confirm this granularity is really required.
(
    df.write
    .format("parquet")
    .mode("overwrite")
    .partitionBy("year", "month", "day", "hour", "minute")
    .option("path", "bronze/supermarket_sales")
    .saveAsTable("supermarket_sales_bronze")
)