In [0]:
%run "../includes/configurations"

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, TimestampType
from pyspark.sql.functions import to_date

In [0]:
# Explicit schema for the raw orders CSV, declared as (column name, Spark type, nullable).
# Four timestamp columns are parsed as TimestampType; the estimated delivery
# column is parsed as DateType; the identifiers and status stay as strings.
# NOTE(review): Spark's file-based readers may ignore nullable=False on a
# user-supplied schema, so order_id might still be reported as nullable — confirm.
_orders_column_specs = [
    ("order_id", StringType(), False),
    ("customer_id", StringType(), True),
    ("order_status", StringType(), True),
    ("order_purchase_timestamp", TimestampType(), True),
    ("order_approved_at", TimestampType(), True),
    ("order_delivered_carrier_date", TimestampType(), True),
    ("order_delivered_customer_date", TimestampType(), True),
    ("order_estimated_delivery_date", DateType(), True),
]

orders_schema = StructType(
    fields=[
        StructField(column_name, spark_type, is_nullable)
        for column_name, spark_type, is_nullable in _orders_column_specs
    ]
)

In [0]:
# Load the raw orders CSV with the explicit schema above; the first line of
# the file is treated as a header row rather than data.
orders_csv_path = f"{raw_folder_path}/olist_orders_dataset.csv"

orders_df = (
    spark.read
    .schema(orders_schema)
    .option("header", True)
    .csv(orders_csv_path)
)

In [0]:
# Print the column names, types and nullability of the loaded DataFrame
# to check that the explicit schema was applied.
orders_df.printSchema()

In [0]:
# Row count of the raw orders DataFrame (triggers a full read of the CSV).
# VALID — original author's note; presumably the count matched the expected
# number of rows in the source file (confirm).
raw_orders_row_count = orders_df.count()
raw_orders_row_count

In [0]:
#from pyspark.sql.functions import date_format, col, to_timestamp

In [0]:
# Preview: pull the first 10 rows to the driver and render them as a table.
raw_orders_preview = orders_df.take(10)
display(raw_orders_preview)

order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02T10:56:33.000+0000,2017-10-02T11:07:15.000+0000,2017-10-04T19:55:00.000+0000,2017-10-10T21:25:13.000+0000,2017-10-18
53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24T20:41:37.000+0000,2018-07-26T03:24:27.000+0000,2018-07-26T14:31:00.000+0000,2018-08-07T15:27:45.000+0000,2018-08-13
47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08T08:38:49.000+0000,2018-08-08T08:55:23.000+0000,2018-08-08T13:50:00.000+0000,2018-08-17T18:06:29.000+0000,2018-09-04
949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18T19:28:06.000+0000,2017-11-18T19:45:59.000+0000,2017-11-22T13:39:59.000+0000,2017-12-02T00:28:42.000+0000,2017-12-15
ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13T21:18:39.000+0000,2018-02-13T22:20:29.000+0000,2018-02-14T19:46:34.000+0000,2018-02-16T18:17:02.000+0000,2018-02-26
a4591c265e18cb1dcee52889e2d8acc3,503740e9ca751ccdda7ba28e9ab8f608,delivered,2017-07-09T21:57:05.000+0000,2017-07-09T22:10:13.000+0000,2017-07-11T14:58:04.000+0000,2017-07-26T10:57:55.000+0000,2017-08-01
136cce7faa42fdb2cefd53fdc79a6098,ed0271e0b7da060a393796590e7b737a,invoiced,2017-04-11T12:22:08.000+0000,2017-04-13T13:25:17.000+0000,,,2017-05-09
6514b8ad8028c9f2cc2374ded245783f,9bdf08b4b3b52b5526ff42d37d47f222,delivered,2017-05-16T13:10:30.000+0000,2017-05-16T13:22:11.000+0000,2017-05-22T10:07:46.000+0000,2017-05-26T12:55:51.000+0000,2017-06-07
76c6e866289321a7c93b82b54852dc33,f54a9f0e6b351c431402b8461ea51999,delivered,2017-01-23T18:29:09.000+0000,2017-01-25T02:50:47.000+0000,2017-01-26T14:16:31.000+0000,2017-02-02T14:08:10.000+0000,2017-03-06
e69bfb5eb88e0ed6a785585b27e16dbf,31ad1d1b63eb9962463f764d4e6e0c9d,delivered,2017-07-29T11:55:02.000+0000,2017-07-29T12:05:32.000+0000,2017-08-10T19:45:24.000+0000,2017-08-16T17:14:30.000+0000,2017-08-23


#### SELECTING ONLY THE REQUIRED COLUMNS AND CONVERTING TIMESTAMPS TO DATES

In [0]:
# Keep the three identifying/status columns and replace the purchase and
# customer-delivery timestamps with date-only columns. The result has the
# columns: order_id, customer_id, order_status, order_date, order_delivery_date.
final_orders_df = orders_df.select(
    "order_id",
    "customer_id",
    "order_status",
    # Truncate the purchase timestamp to a calendar date.
    to_date(orders_df["order_purchase_timestamp"]).alias("order_date"),
    # Truncate the customer delivery timestamp to a calendar date; rows with
    # no delivery timestamp keep a null here.
    to_date(orders_df["order_delivered_customer_date"]).alias("order_delivery_date"),
)

In [0]:
# Row count after the column selection; no filter is applied above, so this
# is expected to equal the raw row count.
final_orders_row_count = final_orders_df.count()
final_orders_row_count

In [0]:
# Preview: pull the first 7 processed rows to the driver and render them.
final_orders_preview = final_orders_df.take(7)
display(final_orders_preview)

order_id,customer_id,order_status,order_date,order_delivery_date
e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02,2017-10-10
53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24,2018-08-07
47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08,2018-08-17
949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18,2017-12-02
ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13,2018-02-16
a4591c265e18cb1dcee52889e2d8acc3,503740e9ca751ccdda7ba28e9ab8f608,delivered,2017-07-09,2017-07-26
136cce7faa42fdb2cefd53fdc79a6098,ed0271e0b7da060a393796590e7b737a,invoiced,2017-04-11,


In [0]:
# Persist the processed orders as Parquet under the processed folder,
# replacing any output already present at that path.
orders_parquet_path = f'{processed_folder_path}/orders'

(
    final_orders_df.write
    .mode("overwrite")
    .parquet(orders_parquet_path)
)

In [0]:
# Also export the processed orders as CSV with a header row, replacing any
# existing output. coalesce(1) collapses the data to a single partition
# before writing.
# NOTE(review): Spark's CSV writer typically creates a *directory* at the
# target path (here named "orders_processed.csv") containing the part file —
# confirm downstream consumers expect that layout.
orders_csv_output_path = f'{processed_folder_path}/csv/orders_processed.csv'
single_partition_orders_df = final_orders_df.coalesce(1)

(
    single_partition_orders_df.write
    .mode("overwrite")
    .option("header", "true")
    .csv(orders_csv_output_path)
)