In [24]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import * #for window

# Create (or reuse) the notebook's SparkSession: YARN master, Hive support
# enabled, and an explicit SQL warehouse directory.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("spark internals-non-splittable")
    .enableHiveSupport()
    .config("spark.shuffle.useOldFetchProtocol", "true")
    .config("spark.sql.warehouse.dir", "/user/itv009490/warehouse")
    .getOrCreate()
)

# Show the session summary as the cell output.
spark

In [25]:
# Default parallelism reported by the SparkContext (the output below shows 2).
spark.sparkContext.defaultParallelism

2

In [26]:
# DDL-style schema string passed to every CSV read in this notebook.
# NOTE(review): order_date is typed as string although the previewed values
# look like timestamps — confirm this is intentional.
order_schema = 'order_id  long, order_date string, customer_id long, order_status string '

In [27]:
# Load the source orders CSV using the explicit order_schema.
df = (
    spark.read
    .format("csv")
    .schema(order_schema)
    .load("/public/trendytech/orders/orders_1gb.csv")
)

In [28]:
# Register the DataFrame as a temp view named "orders".
# NOTE(review): no cell visible in this notebook queries the view.
df.createOrReplaceTempView("orders")

In [29]:
# Partition count of the source CSV read (the output below shows 9).
df.rdd.getNumPartitions()

9

In [30]:
# Collapse the data into a single partition; new_df feeds the writes below.
# Presumably done so each write produces a single output file — confirm.
new_df = df.repartition(1)

In [31]:
# Confirm the repartition: the output below shows 1 partition.
new_df.rdd.getNumPartitions()

1

In [9]:
# Preview rows of the single-partition DataFrame.
new_df.show()

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|   30479|2014-01-30 00:00:...|       9265|       COMPLETE|
|   30480|2014-01-30 00:00:...|         26|        PENDING|
|   30481|2014-01-30 00:00:...|       9240|       COMPLETE|
|   30482|2014-01-30 00:00:...|       9819|SUSPECTED_FRAUD|
|   30483|2014-01-30 00:00:...|       1257|PENDING_PAYMENT|
|   30484|2014-01-30 00:00:...|       2876|       COMPLETE|
|   30485|2014-01-30 00:00:...|       1069|       COMPLETE|
|   30486|2014-01-30 00:00:...|       1151|       COMPLETE|
|   30487|2014-01-30 00:00:...|       6772|       COMPLETE|
|   30488|2014-01-30 00:00:...|      10541|         CLOSED|
|   30489|2014-01-30 00:00:...|       5717|       COMPLETE|
|   30490|2014-01-30 00:00:...|      12189|       COMPLETE|
|   30491|2014-01-30 00:00:...|       2553|         CLOSED|
|   30492|2014-01-30 00:00:...|       37

In [10]:
# Write the single-partition DataFrame as gzip-compressed CSV to "orders_gz",
# replacing any existing output. "compression" with a short codec name is the
# option documented for the CSV writer.
(
    new_df.write
    .format("csv")
    .mode("overwrite")
    .option("compression", "gzip")
    .save("orders_gz")
)

In [11]:
# Read the gzip-compressed CSV output back with the same schema.
df2=spark.read.format("csv").schema(order_schema).load("orders_gz")

In [12]:
# Preview rows read back from the gzip output.
df2.show()

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
|       4|2013-07-25 00:00:...|       8827|         CLOSED|
|       5|2013-07-25 00:00:...|      11318|       COMPLETE|
|       6|2013-07-25 00:00:...|       7130|       COMPLETE|
|       7|2013-07-25 00:00:...|       4530|       COMPLETE|
|       8|2013-07-25 00:00:...|       2911|     PROCESSING|
|       9|2013-07-25 00:00:...|       5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:...|       5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:...|        918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:...|       1837|         CLOSED|
|      13|2013-07-25 00:00:...|       9149|PENDING_PAYMENT|
|      14|2013-07-25 00:00:...|       98

In [13]:
# Partition count of the gzip read (the output below shows 1).
# Presumably because a gzip-compressed text file cannot be split across
# tasks, in line with the app name "non-splittable" — confirm.
df2.rdd.getNumPartitions()

1

In [14]:
# Write the single-partition DataFrame as snappy-compressed CSV to
# "orders_snappy", replacing any existing output. "compression" is the option
# name documented for the CSV writer.
(
    new_df.write
    .format("csv")
    .mode("overwrite")
    .option("compression", "snappy")
    .save("orders_snappy")
)

In [15]:
# Read the snappy-compressed CSV output back with the same schema.
df3=spark.read.format("csv").schema(order_schema).load("orders_snappy")

In [16]:
# Preview rows read back from the snappy output.
df3.show()

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|   51049|2014-06-09 00:00:...|       4983|     PROCESSING|
|   51050|2014-06-09 00:00:...|       1840|        ON_HOLD|
|   51051|2014-06-09 00:00:...|       8207|       COMPLETE|
|   51052|2014-06-09 00:00:...|       6254|       COMPLETE|
|   51053|2014-06-09 00:00:...|        348|        PENDING|
|   51054|2014-06-09 00:00:...|       1468|       COMPLETE|
|   51055|2014-06-09 00:00:...|       3843|PENDING_PAYMENT|
|   51056|2014-06-09 00:00:...|       7178|PENDING_PAYMENT|
|   51057|2014-06-09 00:00:...|        749|       COMPLETE|
|   51058|2014-06-09 00:00:...|       5146|        PENDING|
|   51059|2014-06-09 00:00:...|       4645|         CLOSED|
|   51060|2014-06-09 00:00:...|        247|       COMPLETE|
|   51061|2014-06-09 00:00:...|       6551|        PENDING|
|   51062|2014-06-09 00:00:...|       55

In [17]:
# Partition count of the snappy CSV read (the output below shows 1).
# Presumably the snappy-compressed text file is not splittable either —
# confirm.
df3.rdd.getNumPartitions()

1

In [19]:
# Write the single-partition DataFrame to "orders_parquet", replacing any
# existing output. The format is named explicitly so the result does not
# depend on the session's spark.sql.sources.default setting.
(
    new_df.write
    .format("parquet")
    .mode("overwrite")
    .save("orders_parquet")
)

In [20]:
# Read the Parquet output back. The format is named explicitly so the read
# does not depend on the session's spark.sql.sources.default setting.
df4 = spark.read.format("parquet").load("orders_parquet")

In [21]:
# Preview rows read back from "orders_parquet".
df4.show()

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|   12649|2013-10-11 00:00:...|       2789|PENDING_PAYMENT|
|   12650|2013-10-11 00:00:...|       4566|         CLOSED|
|   12651|2013-10-11 00:00:...|        960|       COMPLETE|
|   12652|2013-10-11 00:00:...|       8251|PENDING_PAYMENT|
|   12653|2013-10-11 00:00:...|       5836|        PENDING|
|   12654|2013-10-11 00:00:...|        499|         CLOSED|
|   12655|2013-10-11 00:00:...|       2644|       COMPLETE|
|   12656|2013-10-11 00:00:...|        933|        PENDING|
|   12657|2013-10-11 00:00:...|       1835|     PROCESSING|
|   12658|2013-10-11 00:00:...|       1868|       COMPLETE|
|   12659|2013-10-11 00:00:...|        118|     PROCESSING|
|   12660|2013-10-11 00:00:...|      11615|PENDING_PAYMENT|
|   12661|2013-10-11 00:00:...|       9670|        PENDING|
|   12662|2013-10-11 00:00:...|       64

In [22]:
# Partition count of the "orders_parquet" read (the output below shows 2,
# unlike the compressed CSV reads above, which each show 1).
df4.rdd.getNumPartitions()

2

In [32]:
# Re-check new_df before repartitioning it again (the output below shows 1).
new_df.rdd.getNumPartitions()

1

In [33]:
# Spread the data over 20 partitions before writing it out.
# NOTE(review): this rebinds df4, which another cell in this notebook uses for
# the DataFrame read from "orders_parquet"; a distinct name would avoid
# confusion when cells are re-run out of order.
df4 = new_df.repartition(20)

In [34]:
# Confirm the repartition: the output below shows 20 partitions.
df4.rdd.getNumPartitions()

20

In [35]:
# Write the 20-partition DataFrame as CSV (no compression option set) to
# "orders_file", replacing any existing output.
(
    df4.write
    .format("csv")
    .mode("overwrite")
    .save("orders_file")
)

In [36]:
# Read the CSV files under "orders_file" back with the same schema.
df5=spark.read.format("csv").schema(order_schema).load("orders_file")

In [37]:
# Preview rows read back from "orders_file".
# NOTE(review): the preview below repeats the same order_id on consecutive
# rows; presumably the source file itself contains duplicated rows — confirm.
df5.show()

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|   17523|2013-11-11 00:00:...|       9277|         CLOSED|
|   17523|2013-11-11 00:00:...|       9277|         CLOSED|
|   42679|2014-04-14 00:00:...|       1479|        ON_HOLD|
|   42679|2014-04-14 00:00:...|       1479|        ON_HOLD|
|   33615|2014-02-17 00:00:...|       8581|       COMPLETE|
|   33615|2014-02-17 00:00:...|       8581|       COMPLETE|
|   32649|2014-02-11 00:00:...|       7926|PENDING_PAYMENT|
|   32649|2014-02-11 00:00:...|       7926|PENDING_PAYMENT|
|   32649|2014-02-11 00:00:...|       7926|PENDING_PAYMENT|
|    3520|2013-08-14 00:00:...|      12180|       COMPLETE|
|    3520|2013-08-14 00:00:...|      12180|       COMPLETE|
|   39976|2014-03-29 00:00:...|       7426|         CLOSED|
|   39976|2014-03-29 00:00:...|       7426|         CLOSED|
|   27250|2014-01-10 00:00:...|      100

In [38]:
# Partition count of the "orders_file" read (the output below shows 10, not
# the 20 partitions that were written). Presumably Spark packs several small
# files into each read partition, bounded by
# spark.sql.files.maxPartitionBytes — confirm.
df5.rdd.getNumPartitions()

10

In [39]:
# Maximum number of bytes packed into one partition when reading files
# (the output below shows 134217728 bytes, i.e. 128 MiB).
spark.conf.get("spark.sql.files.maxPartitionBytes")

'134217728b'

In [40]:
# Default parallelism again, for reference next to the file-read settings
# (the output below shows 2).
spark.sparkContext.defaultParallelism

2

In [41]:
# Estimated cost of opening one file, expressed as the number of bytes that
# could be read in the same time (the output below shows 4194304, i.e. 4 MiB).
spark.conf.get("spark.sql.files.openCostInBytes")

'4194304'

In [42]:
# Convert the openCostInBytes value shown above from bytes to MiB.
4194304/(1024*1024)

4.0

In [43]:
# Spread the data over 500 partitions to produce many small output files.
# NOTE(review): this rebinds df5, which another cell in this notebook uses for
# the DataFrame read from "orders_file"; a distinct name would avoid confusion
# when cells are re-run out of order.
df5 = new_df.repartition(500)

In [44]:
# Confirm the repartition: the output below shows 500 partitions.
df5.rdd.getNumPartitions()

500

In [45]:
# Write the 500-partition DataFrame as CSV (no compression option set) to
# "orders_final", replacing any existing output.
(
    df5.write
    .format("csv")
    .mode("overwrite")
    .save("orders_final")
)

In [46]:
# Read the CSV files under "orders_final" back with the same schema.
df6=spark.read.format("csv").schema(order_schema).load("orders_final")

In [48]:
# Preview rows read back from "orders_final".
df6.show()

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|   39976|2014-03-29 00:00:...|       7426|         CLOSED|
|   50936|2014-06-09 00:00:...|       2611|       COMPLETE|
|    9107|2013-09-20 00:00:...|      11915|PENDING_PAYMENT|
|    2417|2013-08-07 00:00:...|        553|     PROCESSING|
|   49186|2014-05-27 00:00:...|       8763|       COMPLETE|
|   13123|2013-10-13 00:00:...|       4415|       COMPLETE|
|   12615|2013-10-10 00:00:...|       9578|PENDING_PAYMENT|
|   54410|2014-07-03 00:00:...|         38|     PROCESSING|
|   34912|2014-02-25 00:00:...|       7600|         CLOSED|
|   19329|2013-11-22 00:00:...|       4579|       COMPLETE|
|   65469|2014-05-13 00:00:...|       3776|       COMPLETE|
|   52771|2014-06-21 00:00:...|       4081|PENDING_PAYMENT|
|   10918|2013-09-30 00:00:...|       6337|       COMPLETE|
|   45373|2014-05-02 00:00:...|       70

In [49]:
# Partition count of the "orders_final" read (the output below shows 24, far
# fewer than the 500 partitions that were written). Presumably many small
# files are packed into each read partition, with each file also charged
# spark.sql.files.openCostInBytes — confirm.
df6.rdd.getNumPartitions()

24