In [0]:
%run "../includes/configurations"

In [0]:
from pyspark.sql.functions import to_timestamp, to_date
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, TimestampType

In [0]:
reviews_schema = StructType(fields=[StructField("review_id", StringType(), False),
                                     StructField("order_id", StringType(), True),
                                     StructField("review_score", IntegerType(), True),
                                     StructField("review_comment_title", StringType(), True),
                                     StructField("review_comment_message", StringType(), True),
                                     StructField("review_creation_date", DateType(), True),
                                     StructField("review_answer_timestamp", TimestampType(), True)
                       ])

In [0]:
reviews_df = spark.read.option("header", True)\
.option("inferSchema", True)\
.schema(reviews_schema)\
.csv(f"{raw_folder_path}/olist_order_reviews_dataset.csv")

In [0]:
reviews_df.printSchema()

In [0]:
reviews_df.cols.count_uniques('review_id', estimate=False)

In [0]:
reviews_df.select("review_id").distinct().count()

In [0]:
reviews_df.count()

In [0]:
reviews_cleaned_df = reviews_df.filter(reviews_df.review_score.isNotNull())

In [0]:
reviews_cleaned_df.cols.count_uniques('review_id')

In [0]:
reviews_cleaned_df.count()

In [0]:
display(reviews_cleaned_df.take(10))

review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18,2018-01-18T21:46:59.000+0000
80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10,2018-03-11T03:05:13.000+0000
228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,,,2018-02-17,2018-02-18T14:36:24.000+0000
e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21,2017-04-21T22:02:06.000+0000
f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela Internet seguro e prático Parabéns a todos feliz Páscoa,2018-03-01,2018-03-02T10:26:53.000+0000
15197aa66ff4d0650b5434f1b46cda19,b18dcdf73be66366873cd26c5724d1dc,1,,,2018-04-13,2018-04-16T00:39:37.000+0000
07f9bee5d1b850860defd761afa7ff16,e48aa0d2dcec3a2e87348811bcfdf22b,5,,,2017-07-16,2017-07-18T19:30:34.000+0000
7c6400515c67679fbee952a7525281ef,c31a859e34e3adac22f376954e19b39d,5,,,2018-08-14,2018-08-14T21:36:06.000+0000
a3f6f7f6f433de0aefbb97da197c554c,9c214ac970e84273583ab523dfafd09b,5,,,2017-05-17,2017-05-18T12:05:37.000+0000
8670d52e15e00043ae7de4c01cc2fe06,b9bf720beb4ab3728760088589c62129,4,recomendo,aparelho eficiente. no site a marca do aparelho esta impresso como 3desinfector e ao chegar esta com outro nome...atualizar com a marca correta uma vez que é o mesmo aparelho,2018-05-22,2018-05-23T16:45:47.000+0000


#### SELECTING ONLY REQUIRED COLUMNS

In [0]:
from pyspark.sql.functions import col

In [0]:
reviews_seleced_df = reviews_cleaned_df.select(col("review_id"), col("order_id"), 
                                       col("review_score"), col("review_creation_date"),
                                       col("review_answer_timestamp"))

In [0]:
type(reviews_seleced_df)

In [0]:
display(reviews_seleced_df.take(10))

review_id,order_id,review_score,review_creation_date,review_answer_timestamp
7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,2018-01-18,2018-01-18T21:46:59.000+0000
80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,2018-03-10,2018-03-11T03:05:13.000+0000
228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,2018-02-17,2018-02-18T14:36:24.000+0000
e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,2017-04-21,2017-04-21T22:02:06.000+0000
f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,2018-03-01,2018-03-02T10:26:53.000+0000
15197aa66ff4d0650b5434f1b46cda19,b18dcdf73be66366873cd26c5724d1dc,1,2018-04-13,2018-04-16T00:39:37.000+0000
07f9bee5d1b850860defd761afa7ff16,e48aa0d2dcec3a2e87348811bcfdf22b,5,2017-07-16,2017-07-18T19:30:34.000+0000
7c6400515c67679fbee952a7525281ef,c31a859e34e3adac22f376954e19b39d,5,2018-08-14,2018-08-14T21:36:06.000+0000
a3f6f7f6f433de0aefbb97da197c554c,9c214ac970e84273583ab523dfafd09b,5,2017-05-17,2017-05-18T12:05:37.000+0000
8670d52e15e00043ae7de4c01cc2fe06,b9bf720beb4ab3728760088589c62129,4,2018-05-22,2018-05-23T16:45:47.000+0000


In [0]:
reviews_seleced_df.write.mode("overwrite").parquet(f"{processed_folder_path}/reviews")

In [0]:
df = spark.read.parquet("/mnt/dlbrazilecomm2/processed/reviews")

In [0]:
display(df.take(10))

review_id,order_id,review_score,review_creation_date,review_answer_timestamp
77e4c11fcfe64c15dba683178b461a7b,321ebafc185998f3acb39280b759b177,5,2017-07-07,2017-07-08T11:39:35.000+0000
c5cef741ecac225e8ec7d915be82d5e7,c24a00c414738de9ee6121f179ebba9c,5,,
d75d46e5cf89d8bd89a2c6576b03f8d9,fa94f0487376b26326f5fa63d24c9a8b,5,2017-11-28,2017-11-29T11:45:50.000+0000
ad866a384cdb53ae3c072116d399e97b,f9649c12307baa46d80875cd558b3e36,5,2018-03-02,2018-03-04T21:01:49.000+0000
7bf33248d5b2bc70dadc16fe9fef08cb,8c0af913eb525b355c4997a2a4166696,4,2018-01-24,2018-01-27T10:51:57.000+0000
c8895855797fa891d36a4baf086e5d87,09245c021d29cc570d8d5ca0aefd0f22,3,2017-12-04,2017-12-07T13:13:43.000+0000
59caeab34efa8b7016400c2332f82155,d6f81c5ff8389221e5c1d73ba937d7f9,4,2018-06-15,2018-06-15T19:07:43.000+0000
1d29949509375f5bd9969dd3545514d3,d0a3f2251346c62e270f1680cbac775d,2,2018-06-06,2018-06-08T11:27:39.000+0000
4bb46cd48acae7ffdbbafba88f2f95fa,3d260147da4c422ec7b613fcb54c8fc9,5,2017-12-29,2017-12-29T13:56:04.000+0000
2a4c4fd81f49232b6e134bd4655ceca4,2457f30493cdc677ad02b57dd6d17285,4,2017-02-01,2017-02-01T19:46:11.000+0000


In [0]:
df.count()

In [0]:
df.coalesce(1).write.mode('overwrite').option("header", "true").csv(f'{processed_folder_path}/csv/reviews_processed.csv')