In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from datetime import datetime

In [3]:
# Create (or reuse) the notebook's SparkSession, named "Spark", with Hive
# support enabled.
spark = (
    SparkSession.builder
    .appName("Spark")
    .enableHiveSupport()
    .getOrCreate()
)

In [6]:
# Explicit schema for the order-items CSV, written as (column name, Spark type)
# pairs; every column is declared nullable.
# NOTE(review): the first three column names end with a trailing space. They
# are kept exactly as they are because other cells in this notebook select and
# filter on those exact names; renaming them requires updating those cells too.
schema = StructType([
    StructField(name, dtype, True)
    for name, dtype in (
        ("order_id ", StringType()),
        ("order_item_id ", IntegerType()),
        ("product_id ", StringType()),
        ("seller_id", StringType()),
        ("shipping_limit_date", TimestampType()),
        ("price", DoubleType()),
        ("freight_value", DoubleType()),
    )
])

path = "data/order_items_dataset.csv"

# Read the CSV with its first line treated as a header. Schema inference is
# switched off because the schema above is supplied explicitly.
df = (
    spark.read.format('csv')
    .option('header', 'true')
    .option('inferSchema', 'false')
    .schema(schema)
    .load(path)
)

# Sanity check: print the resulting column types and preview the first 5 rows.
df.printSchema()
df.show(5)


root
 |-- order_id : string (nullable = true)
 |-- order_item_id : integer (nullable = true)
 |-- product_id : string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- shipping_limit_date: timestamp (nullable = true)
 |-- price: double (nullable = true)
 |-- freight_value: double (nullable = true)

+--------------------+--------------+--------------------+--------------------+-------------------+-----+-------------+
|           order_id |order_item_id |         product_id |           seller_id|shipping_limit_date|price|freight_value|
+--------------------+--------------+--------------------+--------------------+-------------------+-----+-------------+
|00010242fe8c5a6d1...|             1|4244733e06e7ecb49...|48436dade18ac8b2b...|2017-09-19 09:45:35| 58.9|        13.29|
|00018f77f2f0320c5...|             1|e5f2d52b802189ee6...|dd7ddc04e1b6c2c61...|2017-05-03 11:05:13|239.9|        19.93|
|000229ec398224ef6...|             1|c777355d18b72b67a...|5b51032eddd242adc...|2018-01

In [7]:
# Number of partitions of the RDD underlying df, as loaded from the CSV.
df.rdd.getNumPartitions()

4

In [8]:
# Redistribute the rows of df into 10 partitions. repartition returns a new
# DataFrame; df itself keeps its original partitioning.
df_repartitions=df.repartition(10)

In [10]:
# Confirm the partition count of the repartitioned DataFrame.
df_repartitions.rdd.getNumPartitions()

10

In [11]:
from pyspark.sql.functions import *

In [13]:
# Preview the first 5 order ids. The column name includes the trailing space
# with which it was declared in the schema.
df.select('order_id ').show(5)

+--------------------+
|           order_id |
+--------------------+
|00010242fe8c5a6d1...|
|00018f77f2f0320c5...|
|000229ec398224ef6...|
|00024acbcdf0a6daa...|
|00042b26cf59d7ce6...|
+--------------------+
only showing top 5 rows



In [18]:
# Derive calendar "year" and "month" columns from the shipping_limit_date
# timestamp, then preview 5 rows without truncating long values.
df2 = (
    df
    .withColumn("year", year(col("shipping_limit_date")))
    .withColumn("month", month(col("shipping_limit_date")))
)
df2.show(5, truncate=False)

+--------------------------------+--------------+--------------------------------+--------------------------------+-------------------+-----+-------------+----+-----+
|order_id                        |order_item_id |product_id                      |seller_id                       |shipping_limit_date|price|freight_value|year|month|
+--------------------------------+--------------+--------------------------------+--------------------------------+-------------------+-----+-------------+----+-----+
|00010242fe8c5a6d1ba2dd792cb16214|1             |4244733e06e7ecb4970a6e2683c13e61|48436dade18ac8b2bce089ec2a041202|2017-09-19 09:45:35|58.9 |13.29        |2017|9    |
|00018f77f2f0320c557190d7a144bdd3|1             |e5f2d52b802189ee658865ca93d83a8f|dd7ddc04e1b6c2c614352b383efe2d36|2017-05-03 11:05:13|239.9|19.93        |2017|5    |
|000229ec398224ef6ca0657da4fc703e|1             |c777355d18b72b67abbeef9df44fd0fd|5b51032eddd242adc84c38acab88f23d|2018-01-18 14:48:30|199.0|17.87        |2018|1    

In [20]:
# Order ids to look up.
order_li = [
    "00010242fe8c5a6d1ba2dd792cb16214",
    "00018f77f2f0320c557190d7a144bdd3",
]

# Keep only the rows whose order id appears in order_li. The column name
# "order_id " carries the trailing space declared in the schema.
(
    df2
    .filter(col("order_id ").isin(order_li))
    .show()
)

+--------------------+--------------+--------------------+--------------------+-------------------+-----+-------------+----+-----+
|           order_id |order_item_id |         product_id |           seller_id|shipping_limit_date|price|freight_value|year|month|
+--------------------+--------------+--------------------+--------------------+-------------------+-----+-------------+----+-----+
|00010242fe8c5a6d1...|             1|4244733e06e7ecb49...|48436dade18ac8b2b...|2017-09-19 09:45:35| 58.9|        13.29|2017|    9|
|00018f77f2f0320c5...|             1|e5f2d52b802189ee6...|dd7ddc04e1b6c2c61...|2017-05-03 11:05:13|239.9|        19.93|2017|    5|
+--------------------+--------------+--------------------+--------------------+-------------------+-----+-------------+----+-----+



In [21]:
# Preview df2 without the "month" column. The result is not assigned, so df2
# itself still has the column.
df2.drop('month').show(5)

+--------------------+--------------+--------------------+--------------------+-------------------+-----+-------------+----+
|           order_id |order_item_id |         product_id |           seller_id|shipping_limit_date|price|freight_value|year|
+--------------------+--------------+--------------------+--------------------+-------------------+-----+-------------+----+
|00010242fe8c5a6d1...|             1|4244733e06e7ecb49...|48436dade18ac8b2b...|2017-09-19 09:45:35| 58.9|        13.29|2017|
|00018f77f2f0320c5...|             1|e5f2d52b802189ee6...|dd7ddc04e1b6c2c61...|2017-05-03 11:05:13|239.9|        19.93|2017|
|000229ec398224ef6...|             1|c777355d18b72b67a...|5b51032eddd242adc...|2018-01-18 14:48:30|199.0|        17.87|2018|
|00024acbcdf0a6daa...|             1|7634da152a4610f15...|9d7a1d34a50524090...|2018-08-15 10:10:18|12.99|        12.79|2018|
|00042b26cf59d7ce6...|             1|ac6c3623068f30de0...|df560393f3a51e745...|2017-02-13 13:57:51|199.9|        18.14|2017|


In [23]:
# Keep one row per ("order_id ", "order_item_id ") pair and preview 10 rows.
# NOTE(review): no ordering is applied here, so which duplicate is kept and
# which rows are displayed is presumably not deterministic — confirm if a
# stable result is needed.
df2.dropDuplicates(['order_id ','order_item_id ']).show(10)

+--------------------+--------------+--------------------+--------------------+-------------------+-----+-------------+----+-----+
|           order_id |order_item_id |         product_id |           seller_id|shipping_limit_date|price|freight_value|year|month|
+--------------------+--------------+--------------------+--------------------+-------------------+-----+-------------+----+-----+
|00018f77f2f0320c5...|             1|e5f2d52b802189ee6...|dd7ddc04e1b6c2c61...|2017-05-03 11:05:13|239.9|        19.93|2017|    5|
|0015ebb40fb17286b...|             1|50fd2b788dc166edd...|8b321bb669392f516...|2018-01-18 09:11:24| 21.9|         15.1|2018|    1|
|001dbc16dc51075e9...|             1|777d2e438a1b645f3...|4a3ca9315b744ce9f...|2017-02-01 13:17:57| 69.9|         18.0|2017|    2|
|0028de0ca693a1bb2...|             1|059344baebbeaa42f...|955fee9216a65b617...|2018-08-21 03:35:17|29.99|        15.31|2018|    8|
|002c9def9c9b951b1...|             1|2d9ff06c8870a518f...|00720abe85ba08598...|2018

In [24]:
# Sort by price ascending; rows with equal price are ordered by freight_value
# descending. Show the first 10 rows.
(
    df2
    .orderBy(
        col('price').asc(),
        col('freight_value').desc(),
    )
    .show(10)
)

+--------------------+--------------+--------------------+--------------------+-------------------+-----+-------------+----+-----+
|           order_id |order_item_id |         product_id |           seller_id|shipping_limit_date|price|freight_value|year|month|
+--------------------+--------------+--------------------+--------------------+-------------------+-----+-------------+----+-----+
|c5bdd8ef3c0ec4202...|             2|8a3254bee785a526d...|96804ea39d96eb908...|2018-05-07 02:55:22| 0.85|         22.3|2018|    5|
|6e864b3f0ec710311...|             1|8a3254bee785a526d...|96804ea39d96eb908...|2018-05-02 20:30:34| 0.85|        18.23|2018|    5|
|3ee6513ae7ea23bdf...|             1|8a3254bee785a526d...|96804ea39d96eb908...|2018-05-04 03:55:26| 0.85|        18.23|2018|    5|
|8272b63d03f5f79c5...|            19|270516a3f41dc035a...|2709af9587499e95e...|2017-07-21 18:25:23|  1.2|         7.89|2017|    7|
|8272b63d03f5f79c5...|            10|05b515fdc76e888aa...|2709af9587499e95e...|2017

In [29]:
# Row count and average / minimum / maximum price per (year, month), listed by
# year ascending and, within each year, month descending.
# min and max here are the Spark column functions brought in by the wildcard
# import from pyspark.sql.functions, not the Python builtins.
(
    df2
    .groupBy('year', 'month')
    .agg(
        count('*'),
        avg('price'),
        min('price'),
        max('price'),
    )
    .orderBy(col('year').asc(), col('month').desc())
    .show()
)

+----+-----+--------+------------------+----------+----------+
|year|month|count(1)|        avg(price)|min(price)|max(price)|
+----+-----+--------+------------------+----------+----------+
|2016|   12|       1|              10.9|      10.9|      10.9|
|2016|   10|     365|135.83712328767106|       6.0|    1399.0|
|2016|    9|       4|           48.6175|     44.99|      59.5|
|2017|   12|    7726|116.35011390111136|       4.4|    3124.0|
|2017|   11|    7355|120.10219306593969|      3.85|    2990.0|
|2017|   10|    5189|126.81060512622734|       4.5|   2999.99|
|2017|    9|    4724|130.61941574936384|      2.29|    1798.0|
|2017|    8|    5042|111.08554938516372|       3.9|    2649.0|
|2017|    7|    4116|113.04229834791019|       1.2|   2999.89|
|2017|    6|    3801|123.38885819521138|      3.49|    6499.0|
|2017|    5|    4150|121.84468915662596|       3.5|    4690.0|
|2017|    4|    2364|130.35038917089682|       4.9|    4799.0|
|2017|    3|    2751|124.77011995637947|       4.9|    

In [30]:

# Sum the price column with an accumulator. foreach runs the function once per
# row; each call adds that row's price, and the total is read back through
# accum.value once the action has finished.
# The start value is a float (0.0) to match the double-typed price column.
accum = spark.sparkContext.accumulator(0.0)


def _add_price(row):
    """Add the row's price to the accumulator, skipping rows with no price."""
    price = row['price']
    # price is declared nullable in the schema; adding None to the accumulator
    # would raise TypeError inside the task and fail the whole job.
    if price is not None:
        accum.add(price)


df2.foreach(_add_price)

# Printed after the foreach action has completed, so every update is included.
print(accum.value)

13591643.699999392
