In [1]:
%run utils.ipynb

In [65]:
spark = get_spark(catalog="iceberg", storage="lakehouse.io")

In [63]:
# orders

schema = (
    "order_id string,"
    "customer_id string,"
    "order_status string,"
    "order_purchase_timestamp timestamp,"
    "order_approved_at timestamp,"
    "order_delivered_carrier_date timestamp,"
    "order_delivered_customer_date timestamp,"
    "order_estimated_delivery_date timestamp"
)

df = (
    spark.read
    .format("csv")
    .schema(schema)
    .option("header", "true")
    .load("s3a://ecommerce/staging/orders/orders.2017-01-01.csv")
)

df.printSchema()
df.show(2)

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: timestamp (nullable = true)

+--------+-----------+------------+------------------------+-----------------+----------------------------+-----------------------------+-----------------------------+
|order_id|customer_id|order_status|order_purchase_timestamp|order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|
+--------+-----------+------------+------------------------+-----------------+----------------------------+-----------------------------+-----------------------------+
+--------+-----------+------------+------------------------+-

In [59]:
# iceberg.ecommerce.orders

idf = spark.sql("select * from iceberg.ecommerce.orders")
idf.printSchema()
idf.show(2)

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: timestamp (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)

+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+----+-----+---+
|            order_id|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|year|month|day|
+--------------------+--------------------

In [35]:
# order_items

schema = (
    "order_id string,"
    "order_item_id int,"
    "product_id string,"
    "seller_id string,"
    "shipping_limit_date timestamp,"
    "price float,"
    "freight_value float"
)

df = (
    spark.read
    .format("csv")
    .schema(schema)
    .option("header", "true")
    .load("s3a://ecommerce/staging/order_items/order_items.2017-01-12.csv")
)

df.printSchema()
df.show(2)

root
 |-- order_id: string (nullable = true)
 |-- order_item_id: integer (nullable = true)
 |-- product_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- shipping_limit_date: timestamp (nullable = true)
 |-- price: float (nullable = true)
 |-- freight_value: float (nullable = true)

+--------------------+-------------+--------------------+--------------------+-------------------+-----+-------------+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_date|price|freight_value|
+--------------------+-------------+--------------------+--------------------+-------------------+-----+-------------+
|4d9aeb0219e736ad9...|            1|9fa0c72476690fbc6...|e2aee0892199b1d92...|2017-01-16 00:06:12| 39.9|        10.96|
|53cd36155a7bf5070...|            1|0ab80f38a21093b44...|440dd6ab244315c63...|2017-01-16 09:45:32| 19.9|        10.96|
+--------------------+-------------+--------------------+--------------------+------------------

In [43]:
# order_reviews

schema = (
    "order_id string,"
    "review_id string,"
    "review_answer_timestamp timestamp,"
    "review_comment_message string,"
    "review_comment_title string,"
    "review_creation_date timestamp,"
    "review_score int"
)

df = (
    spark.read
    .format("csv")
    .schema(schema)
    .option("header", "true")
    .load("s3a://ecommerce/staging/order_reviews/order_reviews.2017-01-12.csv")
)

df.printSchema()
df.show(2)

root
 |-- order_id: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- review_answer_timestamp: timestamp (nullable = true)
 |-- review_comment_message: string (nullable = true)
 |-- review_comment_title: string (nullable = true)
 |-- review_creation_date: timestamp (nullable = true)
 |-- review_score: integer (nullable = true)

+--------------------+--------------------+-----------------------+----------------------+--------------------+--------------------+------------+
|            order_id|           review_id|review_answer_timestamp|review_comment_message|review_comment_title|review_creation_date|review_score|
+--------------------+--------------------+-----------------------+----------------------+--------------------+--------------------+------------+
|ec7a019261fce4418...|5f45d6aa32336fa26...|    2017-01-13 20:22:46|  I really loved th...|                NULL| 2017-01-12 00:00:00|           5|
+--------------------+--------------------+----------------------

In [36]:
# order_payments

schema = (
    "order_id string,"
    "payment_sequential int,"
    "payment_type string,"
    "payment_installments int,"
    "payment_value float"
)

df = (
    spark.read
    .format("csv")
    .schema(schema)
    .option("header", "true")
    .load("s3a://ecommerce/staging/order_payments/order_payments.2017-01-12.csv")
)

df.printSchema()
df.show(2)

root
 |-- order_id: string (nullable = true)
 |-- payment_sequential: integer (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- payment_installments: integer (nullable = true)
 |-- payment_value: float (nullable = true)

+--------------------+------------------+------------+--------------------+-------------+
|            order_id|payment_sequential|payment_type|payment_installments|payment_value|
+--------------------+------------------+------------+--------------------+-------------+
|53cd36155a7bf5070...|                 1| credit_card|                   1|        30.86|
|f313be47c42e07616...|                 1| credit_card|                   7|        74.49|
+--------------------+------------------+------------+--------------------+-------------+
only showing top 2 rows



In [45]:
# customers

schema = (
    "customer_id string,"
    "customer_unique_id string,"
    "customer_zip_code_prefix int,"
    "customer_city string,"
    "customer_state string"
)

df = (
    spark.read
    .format("csv")
    .schema(schema)
    .option("header", "true")
    .load("s3a://ecommerce/staging/customers/customers.csv")
)

df.printSchema()
df.show(2)

root
 |-- customer_id: string (nullable = true)
 |-- customer_unique_id: string (nullable = true)
 |-- customer_zip_code_prefix: integer (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)

+--------------------+--------------------+------------------------+--------------------+--------------+
|         customer_id|  customer_unique_id|customer_zip_code_prefix|       customer_city|customer_state|
+--------------------+--------------------+------------------------+--------------------+--------------+
|06b8999e2fba1a1fb...|861eff4711a542e4b...|                   14409|              franca|            SP|
|18955e83d337fd6b2...|290c77bc529b7ac93...|                    9790|sao bernardo do c...|            SP|
+--------------------+--------------------+------------------------+--------------------+--------------+
only showing top 2 rows



In [51]:
# geolocation

schema = (
    "geolocation_zip_code_prefix int,"
    "geolocation_lat double,"
    "geolocation_lng double,"
    "geolocation_city string,"
    "geolocation_state string"
)

df = (
    spark.read
    .format("csv")
    .schema(schema)
    .option("header", "true")
    .load("s3a://ecommerce/staging/geolocation/geolocation.csv")
)

df.printSchema()
df.show(2)

root
 |-- geolocation_zip_code_prefix: integer (nullable = true)
 |-- geolocation_lat: double (nullable = true)
 |-- geolocation_lng: double (nullable = true)
 |-- geolocation_city: string (nullable = true)
 |-- geolocation_state: string (nullable = true)

+---------------------------+-------------------+------------------+----------------+-----------------+
|geolocation_zip_code_prefix|    geolocation_lat|   geolocation_lng|geolocation_city|geolocation_state|
+---------------------------+-------------------+------------------+----------------+-----------------+
|                       1037| -23.54562128115268|-46.63929204800168|       sao paulo|               SP|
|                       1046|-23.546081127035535|-46.64482029837157|       sao paulo|               SP|
+---------------------------+-------------------+------------------+----------------+-----------------+
only showing top 2 rows



In [54]:
# products

schema = (
    "product_id string,"
    "product_category_name string,"
    "product_name_lenght int,"
    "product_description_lenght int,"
    "product_photos_qty int,"
    "product_weight_g int,"
    "product_length_cm int,"
    "product_height_cm int,"
    "product_width_cm int"
)

df = (
    spark.read
    .format("csv")
    .schema(schema)
    .option("header", "true")
    .load("s3a://ecommerce/staging/products/products.csv")
)

df.printSchema()
df.show(2)

root
 |-- product_id: string (nullable = true)
 |-- product_category_name: string (nullable = true)
 |-- product_name_lenght: integer (nullable = true)
 |-- product_description_lenght: integer (nullable = true)
 |-- product_photos_qty: integer (nullable = true)
 |-- product_weight_g: integer (nullable = true)
 |-- product_length_cm: integer (nullable = true)
 |-- product_height_cm: integer (nullable = true)
 |-- product_width_cm: integer (nullable = true)

+--------------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+
|          product_id|product_category_name|product_name_lenght|product_description_lenght|product_photos_qty|product_weight_g|product_length_cm|product_height_cm|product_width_cm|
+--------------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+--------------

In [64]:
# sellers

schema = (
    "seller_id string,"
    "seller_zip_code_prefix int,"
    "seller_city string,"
    "seller_state string"
)

df = (
    spark.read
    .format("csv")
    .schema(schema)
    .option("header", "true")
    .load("s3a://ecommerce/staging/sellers/sellers.csv")
)

df.printSchema()
df.show(2)

root
 |-- seller_id: string (nullable = true)
 |-- seller_zip_code_prefix: integer (nullable = true)
 |-- seller_city: string (nullable = true)
 |-- seller_state: string (nullable = true)

+--------------------+----------------------+-----------+------------+
|           seller_id|seller_zip_code_prefix|seller_city|seller_state|
+--------------------+----------------------+-----------+------------+
|3442f8959a84dea7e...|                 13023|   campinas|          SP|
|d1b65fc7debc3361e...|                 13844| mogi guacu|          SP|
+--------------------+----------------------+-----------+------------+
only showing top 2 rows

