In [0]:
from optimus import Optimus
# Create the Optimus instance.
# NOTE(review): later cells call `<DataFrame>.cols.select(...)`; presumably that accessor is only
# available once Optimus has been imported/initialised — confirm before moving or removing this cell.
op = Optimus()

In [0]:
# Base directory of the Olist dataset; every stream below reads from a sub-folder of this path.
olist_path = "/mnt/streamingadls/olist"

In [0]:
from pyspark.sql.functions import current_timestamp, to_date, to_timestamp, window
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, DateType, TimestampType, FloatType, LongType

#### DEFINING SCHEMA FOR THE DATASETS AND STREAMING THE DATASETS

In [0]:
# Explicit schema for the customers CSV files.
# The trailing `timestamp` field is replaced with current_timestamp() as soon as the stream is defined below.
customers_schema = (
    StructType()
    .add("customer_id", StringType(), False)
    .add("customer_unique_id", StringType(), True)
    .add("customer_zip_code_prefix", IntegerType(), True)
    .add("customer_city", StringType(), True)
    .add("customer_state", StringType(), True)
    .add("timestamp", TimestampType(), True)
)
# NOTE(review): `reviews_schema` is bound to this very same object although the fields describe
# customers; it looks like a copy-paste leftover — confirm nothing relies on it before removing it.
reviews_schema = customers_schema

# Stream the customers folder (at most one new file per trigger), stamp every row with the
# current timestamp and declare a 5-minute watermark on that column.
customers_df = (
    spark.readStream
    .schema(customers_schema)
    .option("header", True)
    .option("maxFilesPerTrigger", 1)
    .csv(f"{olist_path}/customers")
    .withColumn("timestamp", current_timestamp())
    .withWatermark("timestamp", "5 minutes")
)
print(customers_df.isStreaming)

In [0]:
# Explicit schema for the geolocation CSV files.
geolocation_schema = (
    StructType()
    .add("geolocation_zip_code_prefix", IntegerType(), False)
    .add("geolocation_lat", DoubleType(), True)
    .add("geolocation_lng", DoubleType(), True)
    .add("geolocation_city", StringType(), True)
    .add("geolocation_state", StringType(), True)
)

# Stream the geolocation folder, picking up at most one new file per trigger.
geolocation_df = (
    spark.readStream
    .schema(geolocation_schema)
    .option("header", True)
    .option("maxFilesPerTrigger", 1)
    .csv(f"{olist_path}/geolocation")
)
print(geolocation_df.isStreaming)

In [0]:
# Explicit schema for the order-items CSV files.
order_items_schema = (
    StructType()
    .add("order_id", StringType(), False)
    .add("order_item_id", IntegerType(), True)
    .add("product_id", StringType(), True)
    .add("seller_id", StringType(), True)
    .add("shipping_limit_date", TimestampType(), True)
    .add("price", DoubleType(), True)
    .add("freight_value", DoubleType(), True)
)

# Stream the "order items" folder (the folder name contains a space), at most one new file per trigger.
order_items_df = (
    spark.readStream
    .schema(order_items_schema)
    .option("header", True)
    .option("maxFilesPerTrigger", 1)
    .csv(f"{olist_path}/order items")
)
print(order_items_df.isStreaming)

In [0]:
# Explicit schema for the order-payments CSV files.
order_payments_schema = (
    StructType()
    .add("order_id", StringType(), False)
    .add("payment_sequential", IntegerType(), True)
    .add("payment_type", StringType(), True)
    .add("payment_installments", IntegerType(), True)
    .add("payment_value", DoubleType(), True)
)

# Stream the "order payments" folder (the folder name contains a space), at most one new file per trigger.
order_payments_df = (
    spark.readStream
    .schema(order_payments_schema)
    .option("header", True)
    .option("maxFilesPerTrigger", 1)
    .csv(f"{olist_path}/order payments")
)
print(order_payments_df.isStreaming)

In [0]:
# Explicit schema for the order-reviews CSV files.
# NOTE(review): the preview captured further down shows review_score values such as 4.0, which is
# not how an integer column is rendered — confirm IntegerType matches the raw files.
order_reviews_schema = (
    StructType()
    .add("review_id", StringType(), False)
    .add("order_id", StringType(), True)
    .add("review_score", IntegerType(), True)
    .add("review_comment_title", StringType(), True)
    .add("review_comment_message", StringType(), True)
    .add("review_creation_date", DateType(), True)
    .add("review_answer_timestamp", TimestampType(), True)
)

# Stream the "order reviews" folder (the folder name contains a space), at most one new file per trigger.
order_reviews_df = (
    spark.readStream
    .schema(order_reviews_schema)
    .option("header", True)
    .option("maxFilesPerTrigger", 1)
    .csv(f"{olist_path}/order reviews")
)
print(order_reviews_df.isStreaming)

In [0]:
# Explicit schema for the orders CSV files.
orders_schema = (
    StructType()
    .add("order_id", StringType(), False)
    .add("customer_id", StringType(), True)
    .add("order_status", StringType(), True)
    .add("order_purchase_timestamp", TimestampType(), True)
    .add("order_approved_at", TimestampType(), True)
    .add("order_delivered_carrier_date", TimestampType(), True)
    .add("order_delivered_customer_date", TimestampType(), True)
    .add("order_estimated_delivery_date", DateType(), True)
)

# Stream the orders folder, picking up at most one new file per trigger.
orders_df = (
    spark.readStream
    .schema(orders_schema)
    .option("header", True)
    .option("maxFilesPerTrigger", 1)
    .csv(f"{olist_path}/orders")
)
print(orders_df.isStreaming)

In [0]:
# Explicit schema for the products CSV files ("lenght" is the spelling used by the column names).
# NOTE(review): the preview captured further down shows values such as 40.0 / 287.0 / 1.0 for the
# name-length, description-length and photo-count columns, which is not how integer columns are
# rendered — confirm IntegerType matches the raw files.
products_schema = (
    StructType()
    .add("product_id", StringType(), False)
    .add("product_category_name", StringType(), True)
    .add("product_name_lenght", IntegerType(), True)
    .add("product_description_lenght", IntegerType(), True)
    .add("product_photos_qty", IntegerType(), True)
    .add("product_weight_g", IntegerType(), True)
    .add("product_length_cm", IntegerType(), True)
    .add("product_height_cm", IntegerType(), True)
    .add("product_width_cm", IntegerType(), True)
)

# Stream the products folder, picking up at most one new file per trigger.
products_df = (
    spark.readStream
    .schema(products_schema)
    .option("header", True)
    .option("maxFilesPerTrigger", 1)
    .csv(f"{olist_path}/products")
)
print(products_df.isStreaming)

In [0]:
# Explicit schema for the sellers CSV files; only the zip-code prefix is declared nullable.
sellers_schema = (
    StructType()
    .add("seller_id", StringType(), False)
    .add("seller_zip_code_prefix", IntegerType(), True)
    .add("seller_city", StringType(), False)
    .add("seller_state", StringType(), False)
)

# Stream the sellers folder, picking up at most one new file per trigger.
sellers_df = (
    spark.readStream
    .schema(sellers_schema)
    .option("header", True)
    .option("maxFilesPerTrigger", 1)
    .csv(f"{olist_path}/sellers")
)
print(sellers_df.isStreaming)

In [0]:
# Preview the customers stream; the rows captured below include the added `timestamp` column.
display(customers_df)

customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,timestamp
06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP,2022-01-20T14:33:51.249+0000
18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP,2022-01-20T14:33:51.249+0000
4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP,2022-01-20T14:33:51.249+0000
b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP,2022-01-20T14:33:51.249+0000
4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP,2022-01-20T14:33:51.249+0000
879864dab9bc3047522c92c82e1212b8,4c93744516667ad3b8f1fb645a3116a4,89254,jaragua do sul,SC,2022-01-20T14:33:51.249+0000
fd826e7cf63160e536e0908c76c3f441,addec96d2e059c80c30fe6871d30d177,4534,sao paulo,SP,2022-01-20T14:33:51.249+0000
5e274e7a0c3809e14aba7ad5aae0d407,57b2a98a409812fe9618067b6b8ebe4f,35182,timoteo,MG,2022-01-20T14:33:51.249+0000
5adf08e34b2e993982a47070956c5c65,1175e95fb47ddff9de6b2b06188f7e0d,81560,curitiba,PR,2022-01-20T14:33:51.249+0000
4b7139f34592b3a31687243a302fa75b,9afe194fb833f79e300e37e580171f22,30575,belo horizonte,MG,2022-01-20T14:33:51.249+0000


In [0]:
# Preview the geolocation stream (zip-code prefix, latitude/longitude, city, state).
display(geolocation_df)

geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
1037,-23.54562128115268,-46.63929204800168,sao paulo,SP
1046,-23.54608112703553,-46.64482029837157,sao paulo,SP
1046,-23.54612896641469,-46.64295148361138,sao paulo,SP
1041,-23.5443921648681,-46.63949930627844,sao paulo,SP
1035,-23.541577961711493,-46.64160722329613,sao paulo,SP
1012,-23.547762303364262,-46.63536053788448,são paulo,SP
1047,-23.54627311241268,-46.64122516971552,sao paulo,SP
1013,-23.546923208436723,-46.6342636964915,sao paulo,SP
1029,-23.543769055769133,-46.63427784085132,sao paulo,SP
1011,-23.547639550320632,-46.63603162315495,sao paulo,SP


In [0]:
# Preview the order-items stream (one row per item of an order, with price and freight value).
display(order_items_df)

order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19T09:45:35.000+0000,58.9,13.29
00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03T11:05:13.000+0000,239.9,19.93
000229ec398224ef6ca0657da4fc703e,1,c777355d18b72b67abbeef9df44fd0fd,5b51032eddd242adc84c38acab88f23d,2018-01-18T14:48:30.000+0000,199.0,17.87
00024acbcdf0a6daa1e931b038114c75,1,7634da152a4610f1595efa32f14722fc,9d7a1d34a5052409006425275ba1c2b4,2018-08-15T10:10:18.000+0000,12.99,12.79
00042b26cf59d7ce69dfabb4e55b4fd9,1,ac6c3623068f30de03045865e4e10089,df560393f3a51e74553ab94004ba5c87,2017-02-13T13:57:51.000+0000,199.9,18.14
00048cc3ae777c65dbb7d2a0634bc1ea,1,ef92defde845ab8450f9d70c526ef70f,6426d21aca402a131fc0a5d0960a3c90,2017-05-23T03:55:27.000+0000,21.9,12.69
00054e8431b9d7675808bcb819fb4a32,1,8d4f2bb7e93e6710a28f34fa83ee7d28,7040e82f899a04d1b434b795a43b4617,2017-12-14T12:10:31.000+0000,19.9,11.85
000576fe39319847cbb9d288c5617fa6,1,557d850972a7d6f792fd18ae1400d9b6,5996cddab893a4652a15592fb58ab8db,2018-07-10T12:30:45.000+0000,810.0,70.75
0005a1a1728c9d785b8e2b08b904576c,1,310ae3c140ff94b03219ad0adc3c778f,a416b6a846a11724393025641d4edd5e,2018-03-26T18:31:29.000+0000,145.95,11.65
0005f50442cb953dcd1d21e1fb923495,1,4535b0e1091c278dfd193e5a1d63b39f,ba143b05f0110f0dc71ad71b4466ce92,2018-07-06T14:10:56.000+0000,53.99,11.4


In [0]:
# Preview the order-payments stream (payment type, installments and value per order).
display(order_payments_df)

order_id,payment_sequential,payment_type,payment_installments,payment_value
b81ef226f3fe1789b1e8b2acac839d17,1,credit_card,8,99.33
a9810da82917af2d9aefd1278f1dcfa0,1,credit_card,1,24.39
25e8ea4e93396b6fa0d3dd708e76c1bd,1,credit_card,1,65.71
ba78997921bbcdc1373bb41e913ab953,1,credit_card,8,107.78
42fdf880ba16b47b59251dd489d4441a,1,credit_card,2,128.45
298fcdf1f73eb413e4d26d01b25bc1cd,1,credit_card,2,96.12
771ee386b001f06208a7419e4fc1bbd7,1,credit_card,1,81.16
3d7239c394a212faae122962df514ac7,1,credit_card,3,51.84
1f78449c87a54faf9e96e88ba1491fa9,1,credit_card,6,341.09
0573b5e23cbd798006520e1d5b4c6714,1,boleto,1,51.95


In [0]:
# Preview the order-reviews stream (score, optional comment title/message, creation and answer times).
display(order_reviews_df)

review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4.0,,,2018-01-18,2018-01-18T21:46:59.000+0000
80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5.0,,,2018-03-10,2018-03-11T03:05:13.000+0000
228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5.0,,,2018-02-17,2018-02-18T14:36:24.000+0000
e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5.0,,Recebi bem antes do prazo estipulado.,2017-04-21,2017-04-21T22:02:06.000+0000
f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5.0,,Parabéns lojas lannister adorei comprar pela Internet seguro e prático Parabéns a todos feliz Páscoa,2018-03-01,2018-03-02T10:26:53.000+0000
15197aa66ff4d0650b5434f1b46cda19,b18dcdf73be66366873cd26c5724d1dc,1.0,,,2018-04-13,2018-04-16T00:39:37.000+0000
07f9bee5d1b850860defd761afa7ff16,e48aa0d2dcec3a2e87348811bcfdf22b,5.0,,,2017-07-16,2017-07-18T19:30:34.000+0000
7c6400515c67679fbee952a7525281ef,c31a859e34e3adac22f376954e19b39d,5.0,,,2018-08-14,2018-08-14T21:36:06.000+0000
a3f6f7f6f433de0aefbb97da197c554c,9c214ac970e84273583ab523dfafd09b,5.0,,,2017-05-17,2017-05-18T12:05:37.000+0000
8670d52e15e00043ae7de4c01cc2fe06,b9bf720beb4ab3728760088589c62129,4.0,recomendo,aparelho eficiente. no site a marca do aparelho esta impresso como 3desinfector e ao chegar esta com outro nome...atualizar com a marca correta uma vez que é o mesmo aparelho,2018-05-22,2018-05-23T16:45:47.000+0000


In [0]:
# Preview the orders stream (status plus purchase/approval/delivery timestamps per order).
display(orders_df)

order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02T10:56:33.000+0000,2017-10-02T11:07:15.000+0000,2017-10-04T19:55:00.000+0000,2017-10-10T21:25:13.000+0000,2017-10-18
53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24T20:41:37.000+0000,2018-07-26T03:24:27.000+0000,2018-07-26T14:31:00.000+0000,2018-08-07T15:27:45.000+0000,2018-08-13
47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08T08:38:49.000+0000,2018-08-08T08:55:23.000+0000,2018-08-08T13:50:00.000+0000,2018-08-17T18:06:29.000+0000,2018-09-04
949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18T19:28:06.000+0000,2017-11-18T19:45:59.000+0000,2017-11-22T13:39:59.000+0000,2017-12-02T00:28:42.000+0000,2017-12-15
ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13T21:18:39.000+0000,2018-02-13T22:20:29.000+0000,2018-02-14T19:46:34.000+0000,2018-02-16T18:17:02.000+0000,2018-02-26
a4591c265e18cb1dcee52889e2d8acc3,503740e9ca751ccdda7ba28e9ab8f608,delivered,2017-07-09T21:57:05.000+0000,2017-07-09T22:10:13.000+0000,2017-07-11T14:58:04.000+0000,2017-07-26T10:57:55.000+0000,2017-08-01
136cce7faa42fdb2cefd53fdc79a6098,ed0271e0b7da060a393796590e7b737a,invoiced,2017-04-11T12:22:08.000+0000,2017-04-13T13:25:17.000+0000,,,2017-05-09
6514b8ad8028c9f2cc2374ded245783f,9bdf08b4b3b52b5526ff42d37d47f222,delivered,2017-05-16T13:10:30.000+0000,2017-05-16T13:22:11.000+0000,2017-05-22T10:07:46.000+0000,2017-05-26T12:55:51.000+0000,2017-06-07
76c6e866289321a7c93b82b54852dc33,f54a9f0e6b351c431402b8461ea51999,delivered,2017-01-23T18:29:09.000+0000,2017-01-25T02:50:47.000+0000,2017-01-26T14:16:31.000+0000,2017-02-02T14:08:10.000+0000,2017-03-06
e69bfb5eb88e0ed6a785585b27e16dbf,31ad1d1b63eb9962463f764d4e6e0c9d,delivered,2017-07-29T11:55:02.000+0000,2017-07-29T12:05:32.000+0000,2017-08-10T19:45:24.000+0000,2017-08-16T17:14:30.000+0000,2017-08-23


In [0]:
# Preview the products stream (category plus name/description lengths, photo count, weight and dimensions).
display(products_df)

product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40.0,287.0,1.0,225,16,10,14
3aa071139cb16b67ca9e5dea641aaa2f,artes,44.0,276.0,1.0,1000,30,18,20
96bd76ec8810374ed1b65e291975717f,esporte_lazer,46.0,250.0,1.0,154,18,9,15
cef67bcfe19066a932b7673e239eb23d,bebes,27.0,261.0,1.0,371,26,4,26
9dc1a7de274444849c219cff195d0b71,utilidades_domesticas,37.0,402.0,4.0,625,20,17,13
41d3672d4792049fa1779bb35283ed13,instrumentos_musicais,60.0,745.0,1.0,200,38,5,11
732bd381ad09e530fe0a5f457d81becb,cool_stuff,56.0,1272.0,4.0,18350,70,24,44
2548af3e6e77a690cf3eb6368e9ab61e,moveis_decoracao,56.0,184.0,2.0,900,40,8,40
37cc742be07708b53a98702e77a21a02,eletrodomesticos,57.0,163.0,1.0,400,27,13,17
8c92109888e8cdf9d66dc7e463025574,brinquedos,36.0,1156.0,1.0,600,17,10,12


In [0]:
# Preview the sellers stream (seller id, zip-code prefix, city, state).
display(sellers_df)

seller_id,seller_zip_code_prefix,seller_city,seller_state
3442f8959a84dea7ee197c632cb2df15,13023,campinas,SP
d1b65fc7debc3361ea86b5f14c68d2e2,13844,mogi guacu,SP
ce3ad9de960102d0677a81f5d0bb7b2d,20031,rio de janeiro,RJ
c0f3eea2e14555b6faeea3dd58c1b1c3,4195,sao paulo,SP
51a04a8a6bdcb23deccc82b0b80742cf,12914,braganca paulista,SP
c240c4061717ac1806ae6ee72be3533b,20920,rio de janeiro,RJ
e49c26c3edfa46d227d5121a6b6e4d37,55325,brejao,PE
1b938a7ec6ac5061a66a3766e0e75f90,16304,penapolis,SP
768a86e36ad6aae3d03ee3c6433d61df,1529,sao paulo,SP
ccc4bbb5f32a6ab2b7066a4130f114e3,80310,curitiba,PR


In [0]:
from functools import reduce
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, expr

In [0]:
# Narrow every stream to the columns needed downstream, rename the key columns so they stay
# distinguishable once streams are joined, and give each stream a current-timestamp column with a
# watermark declared on it.
# Plain DataFrame.select is used for every projection so the cell does not depend on the Optimus
# `.cols` accessor for something Spark already provides.

customers_selected_df = (
    customers_df
    .select("customer_id", "customer_city", "customer_state")
    .withColumnRenamed("customer_id", "c_id")
    .withColumn("timestamp", current_timestamp())
    .withWatermark("timestamp", "5 minutes")
)

geolocation_selected_df = (
    geolocation_df
    .select("geolocation_zip_code_prefix", "geolocation_city", "geolocation_state")
    .withColumn("timestamp", current_timestamp())
    .withWatermark("timestamp", "5 minutes")
)

# order_id is exposed as oi_id; the time column is named timestampOI so it can be told apart from the
# orders' timestampO in the join condition.
order_items_selected_df = (
    order_items_df
    .selectExpr("order_id AS oi_id", "order_item_id", "product_id", "seller_id", "shipping_limit_date", "price")
    .withColumn("timestampOI", current_timestamp())
    .withWatermark("timestampOI", "5 minutes ")
)

order_reviews_selected_df = (
    order_reviews_df
    .select("review_id", "order_id", "review_score", "review_creation_date", "review_answer_timestamp")
    .withColumn("timestamp", current_timestamp())
    .withWatermark("timestamp", "5 minutes")
)
# Backward-compatible alias: the variable was originally spelled `order_reviews_seleced_df`.
order_reviews_seleced_df = order_reviews_selected_df

# order_id is exposed as o_id; this stream uses a 10-minute watermark (the others use 5 minutes).
orders_selected_df = (
    orders_df
    .selectExpr("order_id AS o_id", "customer_id", "order_status", "order_purchase_timestamp", "order_delivered_customer_date")
    .withColumn("timestampO", current_timestamp())
    .withWatermark("timestampO", "10 minutes")
)

products_selected_df = (
    products_df
    .selectExpr("product_id AS p_id", "product_category_name")
    .withColumn("timestamp", current_timestamp())
    .withWatermark("timestamp", "5 minutes")
)

# All seller columns are kept; only the key is renamed to s_id.
sellers_selected_df = (
    sellers_df
    .select("*")
    .withColumnRenamed("seller_id", "s_id")
    .withColumn("timestamp", current_timestamp())
    .withWatermark("timestamp", "5 minutes")
)

##### JOINING ALL DFs

In [0]:
# Inner-join orders with their order items: the ids must match and the two watermarked time columns
# must lie within one minute of each other. The item/customer key columns are then renamed
# (pr_id, sr_id, cr_id) so they do not collide with the keys of the streams joined afterwards.
joined_df = (
    orders_selected_df
    .join(
        order_items_selected_df,
        expr("""o_id = oi_id AND
                                                                     timestampO >= timestampOI - interval 1 minutes AND
                                                                     timestampO <= timestampOI + interval 1 minutes
                                                                  """),
        "inner",
    )
    .withColumnRenamed("product_id", "pr_id")
    .withColumnRenamed("seller_id", "sr_id")
    .withColumnRenamed("customer_id", "cr_id")
)

In [0]:
# Full-outer-join the product category onto the order lines.
# A stream-stream outer join is only accepted by Spark when a range condition over watermarked time
# columns lets it bound the join state; an id equality alone is rejected when the query starts.
# The products' `timestamp` column is referenced through its DataFrame so the condition stays
# unambiguous whatever other `timestamp` columns are present.
# NOTE(review): the +/- 1 minute tolerance mirrors the orders/order-items join in this notebook —
# confirm it suits product data. Some Spark versions may additionally refuse an outer stream-stream
# join that is followed by another stateful operator; verify on the target runtime.
joined_df = joined_df.join(
    products_selected_df,
    (col("p_id") == col("pr_id"))
    & (products_selected_df["timestamp"] >= col("timestampOI") - expr("INTERVAL 1 MINUTES"))
    & (products_selected_df["timestamp"] <= col("timestampOI") + expr("INTERVAL 1 MINUTES")),
    "outer",
)

In [0]:
# Full-outer-join the seller attributes onto the order lines.
# A stream-stream outer join is only accepted by Spark when a range condition over watermarked time
# columns lets it bound the join state; an id equality alone is rejected when the query starts.
# The sellers' `timestamp` column is referenced through its DataFrame because the left side may
# already carry a column with the same name.
# NOTE(review): the +/- 1 minute tolerance mirrors the orders/order-items join in this notebook —
# confirm it suits seller data. Some Spark versions may additionally refuse an outer stream-stream
# join that is chained with other stateful operators; verify on the target runtime.
joined_df = joined_df.join(
    sellers_selected_df,
    (col("s_id") == col("sr_id"))
    & (sellers_selected_df["timestamp"] >= col("timestampOI") - expr("INTERVAL 1 MINUTES"))
    & (sellers_selected_df["timestamp"] <= col("timestampOI") + expr("INTERVAL 1 MINUTES")),
    "outer",
)

In [0]:
# Preview the result of joining orders, order items, products and sellers.
display(joined_df)

In [0]:
# Start a streaming query that writes the joined rows to the console sink in append mode;
# `res` keeps whatever start() returns so the query can be referred to later.
res = (
    joined_df.writeStream
    .format("console")
    .outputMode("append")
    .start()
)

In [0]:
from pyspark.sql.functions import max

In [0]:
# Sum the numeric columns per order id and price.
# The result is written in append mode further down, and Spark only allows a streaming aggregation
# in append mode when the grouping includes a watermarked event-time column. The grouping therefore
# also buckets rows into 5-minute windows over timestampOI, the order-items time column that has a
# watermark declared on it; this adds a `window` column to the result.
# NOTE(review): the window length is an assumption (it matches the 5-minute watermark) — tune as needed.
priceGrouped = joined_df.groupBy(window("timestampOI", "5 minutes"), "o_id", "price").sum()

In [0]:
# Start a streaming query that writes the grouped sums to the console sink in append mode.
# This rebinds `res`, so the handle of the query started earlier is no longer reachable by that name.
res = (
    priceGrouped.writeStream
    .format("console")
    .outputMode("append")
    .start()
)

In [0]:
# `res` is the value returned by writeStream...start() above, not a DataFrame.
# NOTE(review): confirm the display helper accepts such an object; otherwise display the DataFrame instead.
display(res)

In [0]:
# Expose the joined stream to SQL under the temporary view name "records".
joined_df.createOrReplaceTempView("records")

In [0]:
# Query the "records" view through Spark SQL, selecting every column.
results = spark.sql("SELECT * FROM records")

In [0]:
# Show the rows returned by the SQL query over the "records" view.
display(results)