In [1]:
from pyspark.sql import SparkSession
import getpass
# Login name of the user running the kernel; it is embedded in the
# warehouse directory path configured below.
username = getpass.getuser()

# Build the session as one parenthesized builder chain: the three config
# entries, Hive support and the 'yarn' master are applied in that order,
# then getOrCreate() hands back the session.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config('spark.shuffle.useOldFetchProtocol', 'true')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

In [2]:
# Read the orders CSV without a schema and without any reader options;
# the columns come back with positional names (_c0, _c1, ...) typed as string.
orders_df = (
    spark.read
    .format("csv")
    .load("/public/trendytech/orders/orders_1gb.csv")
)

In [3]:
# Print the column names and types of the schemaless CSV read.
orders_df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)



In [4]:
# Preview the first rows of the schemaless read (long values are truncated in the display).
orders_df.show()

+---+--------------------+-----+---------------+
|_c0|                 _c1|  _c2|            _c3|
+---+--------------------+-----+---------------+
|  1|2013-07-25 00:00:...|11599|         CLOSED|
|  2|2013-07-25 00:00:...|  256|PENDING_PAYMENT|
|  3|2013-07-25 00:00:...|12111|       COMPLETE|
|  4|2013-07-25 00:00:...| 8827|         CLOSED|
|  5|2013-07-25 00:00:...|11318|       COMPLETE|
|  6|2013-07-25 00:00:...| 7130|       COMPLETE|
|  7|2013-07-25 00:00:...| 4530|       COMPLETE|
|  8|2013-07-25 00:00:...| 2911|     PROCESSING|
|  9|2013-07-25 00:00:...| 5657|PENDING_PAYMENT|
| 10|2013-07-25 00:00:...| 5648|PENDING_PAYMENT|
| 11|2013-07-25 00:00:...|  918| PAYMENT_REVIEW|
| 12|2013-07-25 00:00:...| 1837|         CLOSED|
| 13|2013-07-25 00:00:...| 9149|PENDING_PAYMENT|
| 14|2013-07-25 00:00:...| 9842|     PROCESSING|
| 15|2013-07-25 00:00:...| 2568|       COMPLETE|
| 16|2013-07-25 00:00:...| 7276|PENDING_PAYMENT|
| 17|2013-07-25 00:00:...| 2667|       COMPLETE|
| 18|2013-07-25 00:0

In [5]:
# Explicit schema for the orders CSV as a "name type" list, one entry per
# column in file order: order_id, order_date, customer_id, order_status.
orders_schema = "order_id long , order_date date, customer_id long,order_status string"

In [6]:
# Read the same orders CSV again, this time applying orders_schema so the
# columns get real names and types. This rebinds orders_df.
orders_df = (
    spark.read
    .format("csv")
    .schema(orders_schema)
    .load("/public/trendytech/orders/orders_1gb.csv")
)

In [7]:
# Confirm the names and types from orders_schema were applied to the DataFrame.
orders_df.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: date (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- order_status: string (nullable = true)



In [8]:
# Preview the first rows of the typed DataFrame.
orders_df.show()

+--------+----------+-----------+---------------+
|order_id|order_date|customer_id|   order_status|
+--------+----------+-----------+---------------+
|       1|2013-07-25|      11599|         CLOSED|
|       2|2013-07-25|        256|PENDING_PAYMENT|
|       3|2013-07-25|      12111|       COMPLETE|
|       4|2013-07-25|       8827|         CLOSED|
|       5|2013-07-25|      11318|       COMPLETE|
|       6|2013-07-25|       7130|       COMPLETE|
|       7|2013-07-25|       4530|       COMPLETE|
|       8|2013-07-25|       2911|     PROCESSING|
|       9|2013-07-25|       5657|PENDING_PAYMENT|
|      10|2013-07-25|       5648|PENDING_PAYMENT|
|      11|2013-07-25|        918| PAYMENT_REVIEW|
|      12|2013-07-25|       1837|         CLOSED|
|      13|2013-07-25|       9149|PENDING_PAYMENT|
|      14|2013-07-25|       9842|     PROCESSING|
|      15|2013-07-25|       2568|       COMPLETE|
|      16|2013-07-25|       7276|PENDING_PAYMENT|
|      17|2013-07-25|       2667|       COMPLETE|


In [9]:
# Number of partitions the CSV-backed DataFrame is split into under the current settings.
orders_df.rdd.getNumPartitions()

9

In [10]:
# Load the single-line JSON sample; no schema is passed, so the column
# names and types are whatever the JSON reader derives from the data.
orders_json_df = (
    spark.read
    .format("json")
    .load("/public/trendytech/datasets/json_sample_singleline")
)

In [11]:
# Preview the first rows read from the single-line JSON dataset.
orders_json_df.show()

+-----------+----------+--------+---------------+
|customer_id|order_date|order_id|   order_status|
+-----------+----------+--------+---------------+
|      11599|2013-07-25|       1|         CLOSED|
|        256|2013-07-25|       2|PENDING_PAYMENT|
|      12111|2013-07-25|       3|       COMPLETE|
|       8827|2013-07-25|       4|         CLOSED|
|      11318|2013-07-25|       5|       COMPLETE|
|       7130|2013-07-25|       6|       COMPLETE|
|       4530|2013-07-25|       7|       COMPLETE|
|       2911|2013-07-25|       8|     PROCESSING|
|       5657|2013-07-25|       9|PENDING_PAYMENT|
|       5648|2013-07-25|      10|PENDING_PAYMENT|
|        918|2013-07-25|      11| PAYMENT_REVIEW|
|       1837|2013-07-25|      12|         CLOSED|
|       9149|2013-07-25|      13|PENDING_PAYMENT|
|       9842|2013-07-25|      14|     PROCESSING|
|       2568|2013-07-25|      15|       COMPLETE|
|       7276|2013-07-25|      16|PENDING_PAYMENT|
|       2667|2013-07-25|      17|       COMPLETE|


In [12]:
# Number of partitions backing the single-line JSON DataFrame.
orders_json_df.rdd.getNumPartitions()

18

In [13]:
# Load the multiline JSON sample with the reader's "multiLine" option on
# (presumably because each record in this dataset spans several lines).
orders_json_ml_df = (
    spark.read
    .format("json")
    .option("multiLine", True)
    .load("/public/trendytech/datasets/json_sample_multiline")
)

In [14]:
# Preview the first rows read from the multiline JSON dataset.
orders_json_ml_df.show()

+-----------+----------+--------+---------------+
|customer_id|order_date|order_id|   order_status|
+-----------+----------+--------+---------------+
|      11599|2013-07-25|       1|         CLOSED|
|        256|2013-07-25|       2|PENDING_PAYMENT|
|      12111|2013-07-25|       3|       COMPLETE|
|       8827|2013-07-25|       4|         CLOSED|
|      11318|2013-07-25|       5|       COMPLETE|
|       7130|2013-07-25|       6|       COMPLETE|
|       4530|2013-07-25|       7|       COMPLETE|
|       2911|2013-07-25|       8|     PROCESSING|
|       5657|2013-07-25|       9|PENDING_PAYMENT|
|       5648|2013-07-25|      10|PENDING_PAYMENT|
|        918|2013-07-25|      11| PAYMENT_REVIEW|
|       1837|2013-07-25|      12|         CLOSED|
|       9149|2013-07-25|      13|PENDING_PAYMENT|
|       9842|2013-07-25|      14|     PROCESSING|
|       2568|2013-07-25|      15|       COMPLETE|
|       7276|2013-07-25|      16|PENDING_PAYMENT|
|       2667|2013-07-25|      17|       COMPLETE|


In [15]:
# Number of partitions backing the multiline JSON DataFrame.
orders_json_ml_df.rdd.getNumPartitions()

1

In [16]:
# Look up the session's current spark.sql.files.maxPartitionBytes value (a byte-count string).
spark.conf.get('spark.sql.files.maxPartitionBytes')

'134217728b'

In [17]:
# Lower spark.sql.files.maxPartitionBytes to 1342177 bytes for this session.
# NOTE(review): the DataFrames are re-read after this point and the partition
# counts compared, so the setting appears to take effect at read time, not on
# DataFrames that already exist. Confirm against the Spark docs.
spark.conf.set('spark.sql.files.maxPartitionBytes', '1342177b')

In [18]:
# Express the new 1342177-byte partition limit in MiB (2**20 bytes per MiB).
1342177 / 2**20

1.2799997329711914

In [19]:
# Re-create the typed orders DataFrame after the maxPartitionBytes change
# so its partitioning can be compared with the earlier read. This rebinds
# orders_df once more.
orders_df = (
    spark.read
    .format("csv")
    .schema(orders_schema)
    .load("/public/trendytech/orders/orders_1gb.csv")
)

In [20]:
# Partition count of the CSV DataFrame re-read after lowering maxPartitionBytes.
orders_df.rdd.getNumPartitions()

839

In [21]:
# Re-create the multiline JSON DataFrame after the maxPartitionBytes change
# so its partition count can be checked again. This rebinds orders_json_ml_df.
orders_json_ml_df = (
    spark.read
    .format("json")
    .option("multiLine", True)
    .load("/public/trendytech/datasets/json_sample_multiline")
)

In [22]:
# Partition count of the multiline JSON DataFrame re-read after lowering maxPartitionBytes.
orders_json_ml_df.rdd.getNumPartitions()

1