In [1]:
from pyspark.sql import SparkSession

In [2]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

In [37]:
spark = SparkSession.builder.appName("SparkSalesDataprep").getOrCreate()

# Data Preparation

In [4]:
schema = StructType([StructField("Order ID", StringType(), True),
                    StructField("Product", StringType(), True),
                    StructField("Quantity Ordered", StringType(), True),
                    StructField("Price Each", StringType(), True),
                    StructField("Order Date", StringType(), True),
                    StructField("Purchase Address", StringType(), True)
                    ])

In [5]:
data = "./data/salesdata/*.csv"

In [6]:
order_df = spark.read.format("csv").option("header", True).schema(schema).load(data)

In [7]:
order_df.printSchema()

root
 |-- Order ID: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity Ordered: string (nullable = true)
 |-- Price Each: string (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Purchase Address: string (nullable = true)



In [8]:
order_df.show(10, False)

+--------+--------------------------+----------------+----------+--------------+-----------------------------------------+
|Order ID|Product                   |Quantity Ordered|Price Each|Order Date    |Purchase Address                         |
+--------+--------------------------+----------------+----------+--------------+-----------------------------------------+
|295665  |Macbook Pro Laptop        |1               |1700      |12/30/19 00:01|136 Church St, New York City, NY 10001   |
|295666  |LG Washing Machine        |1               |600.0     |12/29/19 07:03|562 2nd St, New York City, NY 10001      |
|295667  |USB-C Charging Cable      |1               |11.95     |12/12/19 18:21|277 Main St, New York City, NY 10001     |
|295668  |27in FHD Monitor          |1               |149.99    |12/22/19 15:13|410 6th St, San Francisco, CA 94016      |
|295669  |USB-C Charging Cable      |1               |11.95     |12/18/19 12:38|43 Hill St, Atlanta, GA 30301            |
|295670  |AA Bat

# Project - Cleaning Data on Sales Dataset 

In [9]:
from pyspark.sql.functions import col

In [10]:
order_df.filter(col("Order ID").isNull() == True).show()

+--------+-------+----------------+----------+----------+----------------+
|Order ID|Product|Quantity Ordered|Price Each|Order Date|Purchase Address|
+--------+-------+----------------+----------+----------+----------------+
|    null|   null|            null|      null|      null|            null|
|    null|   null|            null|      null|      null|            null|
|    null|   null|            null|      null|      null|            null|
|    null|   null|            null|      null|      null|            null|
|    null|   null|            null|      null|      null|            null|
|    null|   null|            null|      null|      null|            null|
|    null|   null|            null|      null|      null|            null|
|    null|   null|            null|      null|      null|            null|
|    null|   null|            null|      null|      null|            null|
|    null|   null|            null|      null|      null|            null|
|    null|   null|       

In [11]:
order_df = order_df.na.drop('any')

In [12]:
order_df.describe("Order ID", "Product", "Quantity Ordered", "Price Each", "Order Date", "Purchase Address").show()

+-------+------------------+------------+------------------+------------------+--------------+--------------------+
|summary|          Order ID|     Product|  Quantity Ordered|        Price Each|    Order Date|    Purchase Address|
+-------+------------------+------------+------------------+------------------+--------------+--------------------+
|  count|            186305|      186305|            186305|            186305|        186305|              186305|
|   mean| 230417.5693788653|        null|1.1243828986286637|184.39973476747707|          null|                null|
| stddev|51512.737109995265|        null|0.4427926240286704| 332.7313298843439|          null|                null|
|    min|            141234|20in Monitor|                 1|            109.99|01/01/19 03:07|1 11th St, Atlant...|
|    max|          Order ID|      iPhone|  Quantity Ordered|        Price Each|    Order Date|    Purchase Address|
+-------+------------------+------------+------------------+------------

In [13]:
order_df.filter(col("Order ID")=="Order ID").show()

+--------+-------+----------------+----------+----------+----------------+
|Order ID|Product|Quantity Ordered|Price Each|Order Date|Purchase Address|
+--------+-------+----------------+----------+----------+----------------+
|Order ID|Product|Quantity Ordered|Price Each|Order Date|Purchase Address|
|Order ID|Product|Quantity Ordered|Price Each|Order Date|Purchase Address|
|Order ID|Product|Quantity Ordered|Price Each|Order Date|Purchase Address|
|Order ID|Product|Quantity Ordered|Price Each|Order Date|Purchase Address|
|Order ID|Product|Quantity Ordered|Price Each|Order Date|Purchase Address|
|Order ID|Product|Quantity Ordered|Price Each|Order Date|Purchase Address|
|Order ID|Product|Quantity Ordered|Price Each|Order Date|Purchase Address|
|Order ID|Product|Quantity Ordered|Price Each|Order Date|Purchase Address|
|Order ID|Product|Quantity Ordered|Price Each|Order Date|Purchase Address|
|Order ID|Product|Quantity Ordered|Price Each|Order Date|Purchase Address|
|Order ID|Product|Quantit

In [14]:
order_df_filter = order_df.filter(col("Order ID")!="Order ID")

In [15]:
order_df_filter.show(truncate=False)

+--------+--------------------------+----------------+----------+--------------+-----------------------------------------+
|Order ID|Product                   |Quantity Ordered|Price Each|Order Date    |Purchase Address                         |
+--------+--------------------------+----------------+----------+--------------+-----------------------------------------+
|295665  |Macbook Pro Laptop        |1               |1700      |12/30/19 00:01|136 Church St, New York City, NY 10001   |
|295666  |LG Washing Machine        |1               |600.0     |12/29/19 07:03|562 2nd St, New York City, NY 10001      |
|295667  |USB-C Charging Cable      |1               |11.95     |12/12/19 18:21|277 Main St, New York City, NY 10001     |
|295668  |27in FHD Monitor          |1               |149.99    |12/22/19 15:13|410 6th St, San Francisco, CA 94016      |
|295669  |USB-C Charging Cable      |1               |11.95     |12/18/19 12:38|43 Hill St, Atlanta, GA 30301            |
|295670  |AA Bat

In [16]:
from pyspark.sql.functions import split

In [17]:
order_df_new_city = order_df_filter.select('*', split(col("Purchase Address"), ',').getItem(1).alias("city") )

order_df_new_city.show(truncate=False)

+--------+--------------------------+----------------+----------+--------------+-----------------------------------------+--------------+
|Order ID|Product                   |Quantity Ordered|Price Each|Order Date    |Purchase Address                         |city          |
+--------+--------------------------+----------------+----------+--------------+-----------------------------------------+--------------+
|295665  |Macbook Pro Laptop        |1               |1700      |12/30/19 00:01|136 Church St, New York City, NY 10001   | New York City|
|295666  |LG Washing Machine        |1               |600.0     |12/29/19 07:03|562 2nd St, New York City, NY 10001      | New York City|
|295667  |USB-C Charging Cable      |1               |11.95     |12/12/19 18:21|277 Main St, New York City, NY 10001     | New York City|
|295668  |27in FHD Monitor          |1               |149.99    |12/22/19 15:13|410 6th St, San Francisco, CA 94016      | San Francisco|
|295669  |USB-C Charging Cable    

In [18]:
order_df_new_city_state = order_df_new_city.select('*', split(split(col("Purchase Address"), ',').getItem(2) , ' ').getItem(1).alias("state"))

order_df_new_city_state.show()

+--------+--------------------+----------------+----------+--------------+--------------------+--------------+-----+
|Order ID|             Product|Quantity Ordered|Price Each|    Order Date|    Purchase Address|          city|state|
+--------+--------------------+----------------+----------+--------------+--------------------+--------------+-----+
|  295665|  Macbook Pro Laptop|               1|      1700|12/30/19 00:01|136 Church St, Ne...| New York City|   NY|
|  295666|  LG Washing Machine|               1|     600.0|12/29/19 07:03|562 2nd St, New Y...| New York City|   NY|
|  295667|USB-C Charging Cable|               1|     11.95|12/12/19 18:21|277 Main St, New ...| New York City|   NY|
|  295668|    27in FHD Monitor|               1|    149.99|12/22/19 15:13|410 6th St, San F...| San Francisco|   CA|
|  295669|USB-C Charging Cable|               1|     11.95|12/18/19 12:38|43 Hill St, Atlan...|       Atlanta|   GA|
|  295670|AA Batteries (4-p...|               1|      3.84|12/31

## Applying cast for OrderID Quantity and Price column

In [19]:
order_df_new_city_state_cast_1 = (order_df_new_city_state
                                  .select('*', col("Order ID").cast('int')
                                  .alias("OrderID"), col("Quantity Ordered").cast('int').alias("Quantity"), col("Price Each").cast('float').alias("Price")))
order_df_new_city_state_cast_1.show(truncate=False)


+--------+--------------------------+----------------+----------+--------------+-----------------------------------------+--------------+-----+-------+--------+------+
|Order ID|Product                   |Quantity Ordered|Price Each|Order Date    |Purchase Address                         |city          |state|OrderID|Quantity|Price |
+--------+--------------------------+----------------+----------+--------------+-----------------------------------------+--------------+-----+-------+--------+------+
|295665  |Macbook Pro Laptop        |1               |1700      |12/30/19 00:01|136 Church St, New York City, NY 10001   | New York City|NY   |295665 |1       |1700.0|
|295666  |LG Washing Machine        |1               |600.0     |12/29/19 07:03|562 2nd St, New York City, NY 10001      | New York City|NY   |295666 |1       |600.0 |
|295667  |USB-C Charging Cable      |1               |11.95     |12/12/19 18:21|277 Main St, New York City, NY 10001     | New York City|NY   |295667 |1       |

In [20]:
order_df_new_city_state_cast_1.describe().show()


+-------+------------------+------------+------------------+------------------+--------------+--------------------+--------+------+------------------+------------------+------------------+
|summary|          Order ID|     Product|  Quantity Ordered|        Price Each|    Order Date|    Purchase Address|    city| state|           OrderID|          Quantity|             Price|
+-------+------------------+------------+------------------+------------------+--------------+--------------------+--------+------+------------------+------------------+------------------+
|  count|            185950|      185950|            185950|            185950|        185950|              185950|  185950|185950|            185950|            185950|            185950|
|   mean| 230417.5693788653|        null|1.1243828986286637|184.39973476747707|          null|                null|    null|  null| 230417.5693788653|1.1243828986286637| 184.3997338440329|
| stddev|51512.737109995265|        null|0.442792624028

In [21]:
order_df_new_city_state_cast_1

DataFrame[Order ID: string, Product: string, Quantity Ordered: string, Price Each: string, Order Date: string, Purchase Address: string, city: string, state: string, OrderID: int, Quantity: int, Price: float]

## To Timestamp 

In [22]:
from pyspark.sql.functions import unix_timestamp, to_timestamp
from pyspark.sql.types import TimestampType

In [23]:
order_df_new_city_state_cast_1_time_test = order_df_new_city_state_cast_1.withColumn("Date", unix_timestamp("Order Date", "MM/dd/yy HH:mm:ss").cast(TimestampType()))
order_df_new_city_state_cast_1_time_test.show()

+--------+--------------------+----------------+----------+--------------+--------------------+--------------+-----+-------+--------+------+----+
|Order ID|             Product|Quantity Ordered|Price Each|    Order Date|    Purchase Address|          city|state|OrderID|Quantity| Price|Date|
+--------+--------------------+----------------+----------+--------------+--------------------+--------------+-----+-------+--------+------+----+
|  295665|  Macbook Pro Laptop|               1|      1700|12/30/19 00:01|136 Church St, Ne...| New York City|   NY| 295665|       1|1700.0|null|
|  295666|  LG Washing Machine|               1|     600.0|12/29/19 07:03|562 2nd St, New Y...| New York City|   NY| 295666|       1| 600.0|null|
|  295667|USB-C Charging Cable|               1|     11.95|12/12/19 18:21|277 Main St, New ...| New York City|   NY| 295667|       1| 11.95|null|
|  295668|    27in FHD Monitor|               1|    149.99|12/22/19 15:13|410 6th St, San F...| San Francisco|   CA| 295668|

## trying diff method

In [25]:
order_df_new_city_state_cast_1_time1 = order_df_new_city_state_cast_1.withColumn("Date", to_timestamp(col("Order Date"), 'MM/dd/yy HH:mm'))

In [26]:
order_df_new_city_state_cast_1_time1.show()

+--------+--------------------+----------------+----------+--------------+--------------------+--------------+-----+-------+--------+------+-------------------+
|Order ID|             Product|Quantity Ordered|Price Each|    Order Date|    Purchase Address|          city|state|OrderID|Quantity| Price|               Date|
+--------+--------------------+----------------+----------+--------------+--------------------+--------------+-----+-------+--------+------+-------------------+
|  295665|  Macbook Pro Laptop|               1|      1700|12/30/19 00:01|136 Church St, Ne...| New York City|   NY| 295665|       1|1700.0|2019-12-30 00:01:00|
|  295666|  LG Washing Machine|               1|     600.0|12/29/19 07:03|562 2nd St, New Y...| New York City|   NY| 295666|       1| 600.0|2019-12-29 07:03:00|
|  295667|USB-C Charging Cable|               1|     11.95|12/12/19 18:21|277 Main St, New ...| New York City|   NY| 295667|       1| 11.95|2019-12-12 18:21:00|
|  295668|    27in FHD Monitor|   

In [27]:
order_df_new_city_state_cast_1_time1.printSchema()

root
 |-- Order ID: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity Ordered: string (nullable = true)
 |-- Price Each: string (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Purchase Address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- OrderID: integer (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: float (nullable = true)
 |-- Date: timestamp (nullable = true)



In [28]:
order_df_new_city_state_cast_1_time1 = order_df_new_city_state_cast_1_time1.withColumnRenamed("Purchase Address", "StoreAddress").drop("Order ID").drop("Quantity Ordered").drop("Price Each").drop("Order Date")

In [29]:
order_df_new_city_state_cast_1_time1.show()

+--------------------+--------------------+--------------+-----+-------+--------+------+-------------------+
|             Product|        StoreAddress|          city|state|OrderID|Quantity| Price|               Date|
+--------------------+--------------------+--------------+-----+-------+--------+------+-------------------+
|  Macbook Pro Laptop|136 Church St, Ne...| New York City|   NY| 295665|       1|1700.0|2019-12-30 00:01:00|
|  LG Washing Machine|562 2nd St, New Y...| New York City|   NY| 295666|       1| 600.0|2019-12-29 07:03:00|
|USB-C Charging Cable|277 Main St, New ...| New York City|   NY| 295667|       1| 11.95|2019-12-12 18:21:00|
|    27in FHD Monitor|410 6th St, San F...| San Francisco|   CA| 295668|       1|149.99|2019-12-22 15:13:00|
|USB-C Charging Cable|43 Hill St, Atlan...|       Atlanta|   GA| 295669|       1| 11.95|2019-12-18 12:38:00|
|AA Batteries (4-p...|200 Jefferson St,...| New York City|   NY| 295670|       1|  3.84|2019-12-31 22:58:00|
|USB-C Charging Cab

In [30]:
from pyspark.sql.functions import year, month, col

In [31]:
order_df_new_city_state_cast_1_time1 = order_df_new_city_state_cast_1_time1.withColumn("ReportYear", year(col("Date"))).withColumn("Month", month(col("Date")) )

In [32]:
order_df_new_city_state_cast_1_time1.show()

+--------------------+--------------------+--------------+-----+-------+--------+------+-------------------+----------+-----+
|             Product|        StoreAddress|          city|state|OrderID|Quantity| Price|               Date|ReportYear|Month|
+--------------------+--------------------+--------------+-----+-------+--------+------+-------------------+----------+-----+
|  Macbook Pro Laptop|136 Church St, Ne...| New York City|   NY| 295665|       1|1700.0|2019-12-30 00:01:00|      2019|   12|
|  LG Washing Machine|562 2nd St, New Y...| New York City|   NY| 295666|       1| 600.0|2019-12-29 07:03:00|      2019|   12|
|USB-C Charging Cable|277 Main St, New ...| New York City|   NY| 295667|       1| 11.95|2019-12-12 18:21:00|      2019|   12|
|    27in FHD Monitor|410 6th St, San F...| San Francisco|   CA| 295668|       1|149.99|2019-12-22 15:13:00|      2019|   12|
|USB-C Charging Cable|43 Hill St, Atlan...|       Atlanta|   GA| 295669|       1| 11.95|2019-12-18 12:38:00|      2019

### Write final dataframe to Parquet

Rearrange data into order

In [34]:
order_df_final = order_df_new_city_state_cast_1_time1.select("OrderID", "Product", "Quantity", "Price", "Date", "StoreAddress", "city", "ReportYear", "Month")


In [35]:
order_df_final.printSchema()

root
 |-- OrderID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: float (nullable = true)
 |-- Date: timestamp (nullable = true)
 |-- StoreAddress: string (nullable = true)
 |-- city: string (nullable = true)
 |-- ReportYear: integer (nullable = true)
 |-- Month: integer (nullable = true)



In [36]:
output_path = './data/output/sales'
order_df_final.write.mode("overwrite").partitionBy("ReportYear", "Month").parquet(output_path)