In [52]:
from pyspark.sql import SparkSession
# Obtain the Spark session (reusing an already-running one if present);
# the DataFrame reads in the following cells go through this handle.
spark = (
    SparkSession.builder
    .appName("practice")
    .getOrCreate()
)

In [91]:
# Load the orders (sales) dataset from a Parquet directory on the local
# filesystem. Parquet files carry their own schema, so the CSV-only `header`
# option that used to be passed here had no effect and has been removed.
orders = spark.read.parquet("file:///home/tamaghna/big_data_spark/sales_parquet")

In [75]:
# Peek at the first five order rows.
orders.show(n=5)

[Stage 34:>                                                         (0 + 1) / 1]

+--------+----------+---------+----------+---------------+--------------------+
|order_id|product_id|seller_id|      date|num_pieces_sold|       bill_raw_text|
+--------+----------+---------+----------+---------------+--------------------+
|       1|         0|        0|2020-07-10|             26|kyeibuumwlyhuwksx...|
|       2|         0|        0|2020-07-08|             13|jfyuoyfkeyqkckwbu...|
|       3|         0|        0|2020-07-05|             38|uyjihlzhzcswxcccx...|
|       4|         0|        0|2020-07-05|             56|umnxvoqbdzpbwjqmz...|
|       5|         0|        0|2020-07-05|             11|zmqexmaawmvdpqhih...|
+--------+----------+---------+----------+---------------+--------------------+
only showing top 5 rows



                                                                                

In [55]:
# Load the products dataset from a Parquet directory on the local filesystem.
# Parquet files carry their own schema, so the CSV-only `header` option that
# used to be passed here had no effect and has been removed.
products = spark.read.parquet("file:///home/tamaghna/big_data_spark/products_parquet")

In [56]:
# Peek at the first five product rows.
products.show(n=5)

[Stage 27:>                                                         (0 + 1) / 1]

+----------+------------+-----+
|product_id|product_name|price|
+----------+------------+-----+
|         0|   product_0|   22|
|         1|   product_1|   30|
|         2|   product_2|   91|
|         3|   product_3|   37|
|         4|   product_4|  145|
+----------+------------+-----+
only showing top 5 rows



                                                                                

In [57]:
# Load the sellers dataset from a Parquet directory on the local filesystem.
# Parquet files carry their own schema, so the CSV-only `header` option that
# used to be passed here had no effect and has been removed.
sellers = spark.read.parquet("file:///home/tamaghna/big_data_spark/sellers_parquet")

In [58]:
# Display the sellers table (show() prints up to 20 rows by default).
sellers.show()

+---------+-----------+------------+
|seller_id|seller_name|daily_target|
+---------+-----------+------------+
|        0|   seller_0|     2500000|
|        1|   seller_1|      257237|
|        2|   seller_2|      754188|
|        3|   seller_3|      310462|
|        4|   seller_4|     1532808|
|        5|   seller_5|     1199693|
|        6|   seller_6|     1055915|
|        7|   seller_7|     1946998|
|        8|   seller_8|      547320|
|        9|   seller_9|     1318051|
+---------+-----------+------------+



In [11]:
# Find out how many orders, how many products and how many sellers are in the data.

In [59]:
# Total number of rows in the orders DataFrame.
orders.count()

                                                                                

20000040

In [13]:
# Total number of rows in the products DataFrame.
products.count()

75000000

In [14]:
# Total number of rows in the sellers DataFrame.
sellers.count()

10

In [15]:
# How many products have been sold at least once? Which product appears in the most orders?

In [17]:
# Number of distinct product_id values that occur in the orders data,
# i.e. products sold at least once.
(
    orders
    .select("product_id")
    .distinct()
    .count()
)

                                                                                

993429

In [18]:
# How many distinct products were sold on each day?

In [67]:
# Inspect the column names and data types of the orders DataFrame.
orders.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- num_pieces_sold: string (nullable = true)
 |-- bill_raw_text: string (nullable = true)



In [68]:
from pyspark.sql.types import IntegerType,StringType
from pyspark.sql.functions import to_date

In [100]:
# Re-type the orders columns in a single projection: the id and quantity
# columns become integers, `date` is parsed as a yyyy-MM-dd date, and
# `bill_raw_text` is kept as a string. Column names and order are unchanged.
orders = orders.select(
    orders["order_id"].cast(IntegerType()).alias("order_id"),
    orders["product_id"].cast(IntegerType()).alias("product_id"),
    orders["seller_id"].cast(IntegerType()).alias("seller_id"),
    to_date(orders["date"], "yyyy-MM-dd").alias("date"),
    orders["num_pieces_sold"].cast(IntegerType()).alias("num_pieces_sold"),
    orders["bill_raw_text"].cast(StringType()).alias("bill_raw_text"),
)

In [102]:
# Print the schema again to check the column types after casting.
orders.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- seller_id: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- num_pieces_sold: integer (nullable = true)
 |-- bill_raw_text: string (nullable = true)



In [101]:
# Preview the orders DataFrame (show() prints up to 20 rows by default).
orders.show()

[Stage 47:>                                                         (0 + 1) / 1]

+--------+----------+---------+----------+---------------+--------------------+
|order_id|product_id|seller_id|      date|num_pieces_sold|       bill_raw_text|
+--------+----------+---------+----------+---------------+--------------------+
|       1|         0|        0|2020-07-10|             26|kyeibuumwlyhuwksx...|
|       2|         0|        0|2020-07-08|             13|jfyuoyfkeyqkckwbu...|
|       3|         0|        0|2020-07-05|             38|uyjihlzhzcswxcccx...|
|       4|         0|        0|2020-07-05|             56|umnxvoqbdzpbwjqmz...|
|       5|         0|        0|2020-07-05|             11|zmqexmaawmvdpqhih...|
|       6|         0|        0|2020-07-01|             82|lmuhhkpyuoyslwmvX...|
|       7|         0|        0|2020-07-04|             15|zoqweontumefxbgvu...|
|       8|         0|        0|2020-07-08|             79|sgldfgtcxufasnvsc...|
|       9|         0|        0|2020-07-10|             25|jnykelwjjebgkwgmu...|
|      10|         0|        0|2020-07-0

                                                                                

In [114]:
# For each date, count how many different products were sold.
distinct_products_per_day = (
    orders
    .select("product_id", "date")
    .distinct()          # keep one row per (product_id, date) pair
    .groupBy("date")
    .count()
    .orderBy("date")
)
distinct_products_per_day.show()



+----------+------+
|      date| count|
+----------+------+
|2020-07-01|100337|
|2020-07-02| 99807|
|2020-07-03|100017|
|2020-07-04| 99791|
|2020-07-05| 99796|
|2020-07-06|100765|
|2020-07-07| 99756|
|2020-07-08| 99662|
|2020-07-09|100501|
|2020-07-10| 98973|
+----------+------+



