In [0]:
from pyspark.sql import SparkSession 

spark = SparkSession.builder \
    .appName('OListData') \
    .getOrCreate()

In [0]:
customers_df = spark.read.csv("/Volumes/workspace/default/data/brazilian_ecommerce/olist_customers_dataset.csv", header=True, inferSchema=True)

orders_df = spark.read.csv("/Volumes/workspace/default/data/brazilian_ecommerce/olist_orders_dataset.csv", header=True, inferSchema=True)

geolocation_df = spark.read.csv("/Volumes/workspace/default/data/brazilian_ecommerce/olist_geolocation_dataset.csv", header=True, inferSchema=True)

order_items_df = spark.read.csv("/Volumes/workspace/default/data/brazilian_ecommerce/olist_order_items_dataset.csv", header=True, inferSchema=True)

order_payments_df = spark.read.csv("/Volumes/workspace/default/data/brazilian_ecommerce/olist_order_payments_dataset.csv", header=True, inferSchema=True)

order_reviews_df = spark.read.csv("/Volumes/workspace/default/data/brazilian_ecommerce/olist_order_reviews_dataset.csv", header=True, inferSchema=True)

products_df = spark.read.csv("/Volumes/workspace/default/data/brazilian_ecommerce/olist_products_dataset.csv", header=True, inferSchema=True)    

sellers_df = spark.read.csv("/Volumes/workspace/default/data/brazilian_ecommerce/olist_sellers_dataset.csv", header=True        , inferSchema=True)

product_category_name_translation_df = spark.read.csv("/Volumes/workspace/default/data/brazilian_ecommerce/product_category_name_translation.csv", header=True , inferSchema=True)


In [0]:
#Identify the missing value

from pyspark.sql.functions import *

def missing_values(df,df_name):
    print (f'Missinf values in {df_name}:')
    df.select([count(when(col(c).isNull(),1)).alias (c) for c in df.columns]).show()

In [0]:
missing_values(customers_df,'customer')
missing_values(orders_df,'orders')
missing_values(geolocation_df,'geolocation')
missing_values(order_items_df,'order_items')
missing_values(order_payments_df,'order_payments')
missing_values(order_reviews_df,'order_reviews')
missing_values(products_df,'products')
missing_values(sellers_df,'sellers')
missing_values(product_category_name_translation_df,'product_category_name_translation')

Missinf values in customer:
+-----------+------------------+------------------------+-------------+--------------+
|customer_id|customer_unique_id|customer_zip_code_prefix|customer_city|customer_state|
+-----------+------------------+------------------------+-------------+--------------+
|          0|                 0|                       0|            0|             0|
+-----------+------------------+------------------------+-------------+--------------+

Missinf values in orders:
+--------+-----------+------------+------------------------+-----------------+----------------------------+-----------------------------+-----------------------------+
|order_id|customer_id|order_status|order_purchase_timestamp|order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|
+--------+-----------+------------+------------------------+-----------------+----------------------------+-----------------------------+-----------------------------+
|     

## Handle missing values
1. Drop missing values (for non-critical columns)
2. Fill missing values (for numerical columns)
3. Impute missing values (for continous data)


In [0]:
orders_df_cleaned = orders_df.na.drop(subset=['order_id','customer_id','order_status'])

In [0]:
orders_df_cleaned.show()

+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|            order_id|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|e481f51cbdc54678b...|9ef432eb625129730...|   delivered|     2017-10-02 10:56:33|2017-10-02 11:07:15|         2017-10-04 19:55:00|          2017-10-10 21:25:13|          2017-10-18 00:00:00|
|53cdb2fc8bc7dce0b...|b0830fb4747a6c6d2...|   delivered|     2018-07-24 20:41:37|2018-07-26 03:24:27|         2018-07-26 14:31:00|          2018-08-07 15:27:45|          2018-08-13 00:00:00|
|47770eb9100c2d0c4...|41ce2a54c0b03bf34...|  

In [0]:
orders_df_cleaned = orders_df.fillna({'order_delivered_customer_date':'9999-12-31'})

In [0]:
#Impute missing values

In [0]:
from pyspark.sql import functions as F

# Compute the mode (most frequent value) of 'payment_value'
mode_value = (
    order_payments_df
    .groupBy('payment_value')
    .count()
    .orderBy(F.desc('count'))
    .first()['payment_value']
)

# Fill missing values in 'payment_value' with the mode
payments_df_cleaned = order_payments_df.fillna({'payment_value': mode_value})

display(payments_df_cleaned)

order_id,payment_sequential,payment_type,payment_installments,payment_value
b81ef226f3fe1789b1e8b2acac839d17,1,credit_card,8,99.33
a9810da82917af2d9aefd1278f1dcfa0,1,credit_card,1,24.39
25e8ea4e93396b6fa0d3dd708e76c1bd,1,credit_card,1,65.71
ba78997921bbcdc1373bb41e913ab953,1,credit_card,8,107.78
42fdf880ba16b47b59251dd489d4441a,1,credit_card,2,128.45
298fcdf1f73eb413e4d26d01b25bc1cd,1,credit_card,2,96.12
771ee386b001f06208a7419e4fc1bbd7,1,credit_card,1,81.16
3d7239c394a212faae122962df514ac7,1,credit_card,3,51.84
1f78449c87a54faf9e96e88ba1491fa9,1,credit_card,6,341.09
0573b5e23cbd798006520e1d5b4c6714,1,boleto,1,51.95


In [0]:
#Standardizing the format

In [0]:
def print_schema(df,df_name):
    print (f'Schema for {df_name}:')
    df.printSchema()

In [0]:
print_schema(orders_df,'orders')

Schema for orders:
root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: timestamp (nullable = true)



In [0]:
print_schema(order_items_df,'order_items')
print_schema(products_df,'products')
print_schema(customers_df,'customers')
print_schema(sellers_df,'sellers')
print_schema(geolocation_df,'geolocation')
print_schema(order_reviews_df,'order_reviews')
print_schema(order_payments_df,'order_payments')
print_schema(product_category_name_translation_df,'product_category_name_translation')

Schema for order_items:
root
 |-- order_id: string (nullable = true)
 |-- order_item_id: integer (nullable = true)
 |-- product_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- shipping_limit_date: timestamp (nullable = true)
 |-- price: double (nullable = true)
 |-- freight_value: double (nullable = true)

Schema for products:
root
 |-- product_id: string (nullable = true)
 |-- product_category_name: string (nullable = true)
 |-- product_name_lenght: integer (nullable = true)
 |-- product_description_lenght: integer (nullable = true)
 |-- product_photos_qty: integer (nullable = true)
 |-- product_weight_g: integer (nullable = true)
 |-- product_length_cm: integer (nullable = true)
 |-- product_height_cm: integer (nullable = true)
 |-- product_width_cm: integer (nullable = true)

Schema for customers:
root
 |-- customer_id: string (nullable = true)
 |-- customer_unique_id: string (nullable = true)
 |-- customer_zip_code_prefix: integer (nullable = true)
 |-- c

In [0]:
orders_df_cleaned = orders_df_cleaned.withColumn('order_purchase_timestamp',to_date(col('order_purchase_timestamp')))

In [0]:
orders_df_cleaned.show()

+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|            order_id|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|e481f51cbdc54678b...|9ef432eb625129730...|   delivered|              2017-10-02|2017-10-02 11:07:15|         2017-10-04 19:55:00|          2017-10-10 21:25:13|          2017-10-18 00:00:00|
|53cdb2fc8bc7dce0b...|b0830fb4747a6c6d2...|   delivered|              2018-07-24|2018-07-26 03:24:27|         2018-07-26 14:31:00|          2018-08-07 15:27:45|          2018-08-13 00:00:00|
|47770eb9100c2d0c4...|41ce2a54c0b03bf34...|  

In [0]:
order_payments_df.groupBy('payment_type').count().orderBy('count',ascending=False).show()


+------------+-----+
|payment_type|count|
+------------+-----+
| credit_card|76795|
|      boleto|19784|
|     voucher| 5775|
|  debit_card| 1529|
| not_defined|    3|
+------------+-----+



In [0]:
payments_df_cleaned= order_payments_df.withColumn('payment_type',
                                                  when(col('payment_type')== 'boleto', 'Bank Transfer')
                                                  .when(col('payment_type') == 'credit_card', 'Credit Card')
                                                  .when(col('payment_type') == 'debit_card', 'Debit Card')
                                                  .otherwise('other'))

In [0]:
payments_df_cleaned.show()

+--------------------+------------------+-------------+--------------------+-------------+
|            order_id|payment_sequential| payment_type|payment_installments|payment_value|
+--------------------+------------------+-------------+--------------------+-------------+
|b81ef226f3fe1789b...|                 1|  Credit Card|                   8|        99.33|
|a9810da82917af2d9...|                 1|  Credit Card|                   1|        24.39|
|25e8ea4e93396b6fa...|                 1|  Credit Card|                   1|        65.71|
|ba78997921bbcdc13...|                 1|  Credit Card|                   8|       107.78|
|42fdf880ba16b47b5...|                 1|  Credit Card|                   2|       128.45|
|298fcdf1f73eb413e...|                 1|  Credit Card|                   2|        96.12|
|771ee386b001f0620...|                 1|  Credit Card|                   1|        81.16|
|3d7239c394a212faa...|                 1|  Credit Card|                   3|        51.84|

In [0]:
customers_df_cleaned = customers_df.withColumn('customer_zip_code_prefix',col('customer_zip_code_prefix').cast('string'))

In [0]:
customers_df_cleaned.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- customer_unique_id: string (nullable = true)
 |-- customer_zip_code_prefix: string (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)



In [0]:
#remove duplicate result

### Data Transformation

In [0]:
customers_df_cleaned = customers_df_cleaned.dropDuplicates()


In [0]:
order_with_details = orders_df_cleaned.join(order_items_df,'order_id','left')\
    .join(payments_df_cleaned,'order_id','left')\
    .join(customers_df_cleaned,'customer_id','left')

In [0]:
order_with_details.show(5)

+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+-------------+--------------------+--------------------+-------------------+-----+-------------+------------------+------------+--------------------+-------------+--------------------+------------------------+--------------------+--------------+
|         customer_id|            order_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|order_item_id|          product_id|           seller_id|shipping_limit_date|price|freight_value|payment_sequential|payment_type|payment_installments|payment_value|  customer_unique_id|customer_zip_code_prefix|       customer_city|customer_state|
+--------------------+--------------------+------------+------------------------+-------------------+---------------------------

In [0]:
order_with_total_value = order_with_details.groupBy('order_id') \
  .agg(sum('payment_value').alias('total_order_value'))

In [0]:
order_with_total_value.show(5)

+--------------------+-----------------+
|            order_id|total_order_value|
+--------------------+-----------------+
|ccbabeb0b02433bd0...|             43.0|
|c6bf92017bd40729c...|            22.29|
|ab87dc5a5f1856a10...|            210.9|
|06ff862a85c2402aa...|            59.53|
|f23155f5fa9b82663...|            35.09|
+--------------------+-----------------+
only showing top 5 rows


In [0]:
#delivery time calculation from order date to delivery date
delivery_df = order_with_details.select('order_id','order_purchase_timestamp','order_delivered_customer_date')
delivery_df.show(5)

+--------------------+------------------------+-----------------------------+
|            order_id|order_purchase_timestamp|order_delivered_customer_date|
+--------------------+------------------------+-----------------------------+
|e481f51cbdc54678b...|              2017-10-02|          2017-10-10 21:25:13|
|53cdb2fc8bc7dce0b...|              2018-07-24|          2018-08-07 15:27:45|
|47770eb9100c2d0c4...|              2018-08-08|          2018-08-17 18:06:29|
|949d5b44dbf5de918...|              2017-11-18|          2017-12-02 00:28:42|
|ad21c59c0840e6cb8...|              2018-02-13|          2018-02-16 18:17:02|
+--------------------+------------------------+-----------------------------+
only showing top 5 rows


Advance transformation

In [0]:
order_items_df.show()

+--------------------+-------------+--------------------+--------------------+-------------------+------+-------------+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_date| price|freight_value|
+--------------------+-------------+--------------------+--------------------+-------------------+------+-------------+
|00010242fe8c5a6d1...|            1|4244733e06e7ecb49...|48436dade18ac8b2b...|2017-09-19 09:45:35|  58.9|        13.29|
|00018f77f2f0320c5...|            1|e5f2d52b802189ee6...|dd7ddc04e1b6c2c61...|2017-05-03 11:05:13| 239.9|        19.93|
|000229ec398224ef6...|            1|c777355d18b72b67a...|5b51032eddd242adc...|2018-01-18 14:48:30| 199.0|        17.87|
|00024acbcdf0a6daa...|            1|7634da152a4610f15...|9d7a1d34a50524090...|2018-08-15 10:10:18| 12.99|        12.79|
|00042b26cf59d7ce6...|            1|ac6c3623068f30de0...|df560393f3a51e745...|2017-02-13 13:57:51| 199.9|        18.14|
|00048cc3ae777c65d...|            1|ef92

In [0]:
quantiles = order_items_df.approxQuantile('price', [0, 0.1, 0.99], 0.0)

low_cutoff, high_cutoff = quantiles[0], quantiles[1]


In [0]:
order_items_df.select('price').summary().show()

+-------+------------------+
|summary|             price|
+-------+------------------+
|  count|            112650|
|   mean|120.65373901463542|
| stddev| 183.6339280502594|
|    min|              0.85|
|    25%|              39.9|
|    50%|             74.99|
|    75%|             134.9|
|    max|            6735.0|
+-------+------------------+



In [0]:
low_cutoff, high_cutoff


(0.85, 23.8)

In [0]:
order_items_df_cleaned = order_items_df.filter((col('price') >= low_cutoff) & (col('price') <= high_cutoff))

In [0]:
order_payments_df.select('payment_installments').summary().show()

+-------+--------------------+
|summary|payment_installments|
+-------+--------------------+
|  count|              103886|
|   mean|   2.853348863176944|
| stddev|  2.6870506738564806|
|    min|                   0|
|    25%|                   1|
|    50%|                   1|
|    75%|                   4|
|    max|                  24|
+-------+--------------------+



In [0]:
products_df_cleaned = products_df.withColumn(
    'product_size_category',
    when(col('product_weight_g') < 500 ,'Small')
    .when(col('product_weight_g').between(500,2000),'Medium')
    .otherwise('Large')
)

In [0]:
products_df_cleaned.show(4)

+--------------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+---------------------+
|          product_id|product_category_name|product_name_lenght|product_description_lenght|product_photos_qty|product_weight_g|product_length_cm|product_height_cm|product_width_cm|product_size_category|
+--------------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+---------------------+
|1e9e8ef04dbcff454...|           perfumaria|                 40|                       287|                 1|             225|               16|               10|              14|                Small|
|3aa071139cb16b67c...|                artes|                 44|                       276|                 1|            1000|               30|               18|              20|        

In [0]:
#total revenue per seller
total_revenue_per_seller = order_items_df_cleaned.groupBy('seller_id').agg(sum('price').alias('total_revenue'))

In [0]:
order_with_details.write.mode('overwrite').parquet('dbfs:/Volumes/workspace/default/data/olist_order_with_details')

In [0]:
order_with_details.display(5)

customer_id,order_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value,payment_sequential,payment_type,payment_installments,payment_value,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
c77ee2d8ba1614a4d489a44166894938,ccbabeb0b02433bd0fcbac46e70339f2,delivered,2018-02-19,2018-02-21T06:15:25.000Z,2018-02-22T21:04:23.000Z,2018-03-09T22:22:25.000Z,2018-03-13T00:00:00.000Z,1.0,89321f94e35fc6d7903d36f74e351d40,16090f2ca825584b5a147ab24aa30c86,2018-02-27T03:31:34.000Z,27.9,15.1,1,Bank Transfer,1,43.0,9c9cef121cb812cb301babddc2d8331e,38067,uberaba,MG
3d3c463710ea6e8dd9a63c1110eeb06b,c6bf92017bd40729c135b58b643f64c2,delivered,2018-08-08,2018-08-09T07:44:53.000Z,2018-08-10T14:43:00.000Z,2018-08-15T00:18:43.000Z,2018-08-16T00:00:00.000Z,1.0,3f1a741cf5591384428c1cbb0ef07ec0,36a968b544695394e4e9d7572688598f,2018-08-13T07:44:53.000Z,14.9,7.39,1,Credit Card,1,22.29,55c00c8a161d2e6d7d731dd87341ad2f,13175,sumare,SP
538a4d02876412846b966a3c057395e5,ab87dc5a5f1856a10640d5f42e4c2fd9,delivered,2018-06-04,2018-06-04T12:50:47.000Z,2018-06-05T12:12:00.000Z,2018-06-11T20:39:49.000Z,2018-07-17T00:00:00.000Z,1.0,f8b624d4e475bb8d1bddf1b65c6a64f6,b410bdd36d5db7a65dcd42b7ead933b8,2018-06-12T12:50:47.000Z,179.0,31.9,1,Credit Card,4,210.9,fd317ae6d8988b7041034edac3f253bf,38040,uberaba,MG
0a978c825ff7d013133ddc7f77566172,06ff862a85c2402aa52dc9edf150bf30,delivered,2017-11-30,2017-12-01T11:30:56.000Z,2017-12-01T17:51:53.000Z,2017-12-28T22:27:57.000Z,2017-12-28T00:00:00.000Z,1.0,4ce9ab528124f89e091b17d11aa2e97c,7e3f87d16fb353f408d467e74fbd8014,2017-12-07T04:49:47.000Z,41.9,17.63,1,Bank Transfer,1,59.53,dc9431e47beadfe1188b67bec3717969,60710,fortaleza,CE
21a99191298d34fb6dd0b088e821591c,f23155f5fa9b826631c5b8e038b38393,delivered,2017-09-20,2017-09-21T02:45:30.000Z,2017-09-26T19:22:57.000Z,2017-10-02T21:43:57.000Z,2017-10-13T00:00:00.000Z,1.0,052b8660ee8a9ee18815d9b276694a10,74c7dec0a384d8a05950e629bd23bde9,2017-09-27T02:45:30.000Z,19.99,15.1,1,Bank Transfer,1,35.09,398fbb883435515de0aefe3887d6ea10,22610,rio de janeiro,RJ
6ad71323c11ba8a83737ccc3ea31fbc3,69fd81b0cd556f5da5000c1ed874ed19,delivered,2017-09-25,2017-09-25T11:05:35.000Z,2017-09-25T18:06:56.000Z,2017-09-28T18:53:14.000Z,2017-10-19T00:00:00.000Z,1.0,28f61ad35fb219e9debd750a73b63985,080102cd0a76b09e0dcf55fcacc60e05,2017-09-29T11:05:35.000Z,42.79,16.79,1,Credit Card,3,59.58,c739d7adb6f103eae180b1f2fc74455f,89284,sao bento do sul,SC
0470c47f1dd7a91d0f3b8a420589e0f7,d40dd8018a5302969efb31bd21744cab,delivered,2017-03-23,2017-03-23T23:10:30.000Z,2017-03-25T09:26:22.000Z,2017-04-06T16:33:45.000Z,2017-04-13T00:00:00.000Z,1.0,2284b28ca179d66957a67ef01a5b7d6c,8e6cc767478edae941d9bd9eb778d77a,2017-04-02T23:10:30.000Z,35.0,14.12,1,Credit Card,4,49.12,27c5a2e7ca50a650565c75052cd5712d,35700,sete lagoas,MG
0f3a81be69f12da7e2979fd1833e923d,42560dfc8d7863a190293678f01f6bbd,delivered,2017-10-22,2017-10-22T01:49:12.000Z,2017-10-23T14:07:54.000Z,2017-10-24T17:09:24.000Z,2017-11-06T00:00:00.000Z,2.0,f5d9f6be389c406755cbe9f20954dd9a,f181738b150df1f37cb0bd72e705b193,2017-10-27T01:49:12.000Z,14.9,7.78,1,Credit Card,1,45.36,bb939798f4c3086488aba189af45cc1c,5427,sao paulo,SP
72d90899884781ae2fc19e49cc102fc0,3f003568147c785083d014edfba38c48,delivered,2018-06-18,2018-06-18T17:20:37.000Z,2018-06-26T15:27:00.000Z,2018-07-02T22:49:09.000Z,2018-07-17T00:00:00.000Z,1.0,8d5ab785e6761f35bf54eca83846dd2e,08d0949a9a17c027262db1f3c450c26c,2018-06-22T17:20:37.000Z,12.9,18.23,1,Credit Card,1,31.13,35da8188d2ec392ebd41e4d29d68c14a,88052,florianopolis,SC
a8f76d9cb0f8db57cbbfe8d67b257893,5691d72069359cd293ac9a14f1bd5e9b,delivered,2018-07-23,2018-07-25T02:55:14.000Z,2018-07-25T13:19:00.000Z,2018-08-02T00:42:53.000Z,2018-08-14T00:00:00.000Z,1.0,8c0fd3e89ebf51f2fcff46abbc4eca05,715bbd5ba4e6b74cb0d2f29eb45058b0,2018-07-30T02:55:14.000Z,34.9,37.15,1,Bank Transfer,1,72.05,3353f4c5877c92dabf4068661b4564fd,77026,palmas,TO


In [0]:
order_with_details.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: date (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: timestamp (nullable = true)
 |-- order_item_id: integer (nullable = true)
 |-- product_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- shipping_limit_date: timestamp (nullable = true)
 |-- price: double (nullable = true)
 |-- freight_value: double (nullable = true)
 |-- payment_sequential: integer (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- payment_installments: integer (nullable = true)
 |-- payment_value: double (nullable = true)
 |-- customer_unique_id: string (nullable = true)
 |-- customer_zip_code_prefix: string (nullable = true)
 |-- cu