In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import * #for window

# Build (or reuse) a Hive-enabled Spark session that runs on YARN and uses
# a user-specific warehouse directory.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("df writer apis")
    .enableHiveSupport()
    .config("spark.sql.warehouse.dir", "/user/itv009490/warehouse")
    .getOrCreate()
)

# Bare expression so the notebook renders the session summary.
spark

In [3]:
# DDL-formatted schema for the orders data, one column per literal.
# Adjacent literals are concatenated by Python, so the resulting string
# (including its whitespace) is unchanged.
order_schema = (
    'order_id  long, '
    'order_date string, '
    'customer_id long, '
    'order_status string '
)

In [4]:
# Load the orders CSV with the explicit schema instead of relying on
# the reader's defaults.
df = (
    spark.read
    .format("csv")
    .schema(order_schema)
    .load("/public/trendytech/orders/orders_1gb.csv")
)

In [5]:
# Confirm the column names and types that were applied to the DataFrame.
df.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- order_status: string (nullable = true)



In [6]:
# Preview the leading rows; show() with no arguments prints up to 20 rows
# and truncates long cell values (see the order_date column).
df.show()

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
|       4|2013-07-25 00:00:...|       8827|         CLOSED|
|       5|2013-07-25 00:00:...|      11318|       COMPLETE|
|       6|2013-07-25 00:00:...|       7130|       COMPLETE|
|       7|2013-07-25 00:00:...|       4530|       COMPLETE|
|       8|2013-07-25 00:00:...|       2911|     PROCESSING|
|       9|2013-07-25 00:00:...|       5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:...|       5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:...|        918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:...|       1837|         CLOSED|
|      13|2013-07-25 00:00:...|       9149|PENDING_PAYMENT|
|      14|2013-07-25 00:00:...|       98

In [8]:
# Number of partitions backing the DataFrame; the CSV write later in this
# notebook produces one part file per partition.
df.rdd.getNumPartitions()

9

In [11]:
# Inspect the size of each entry in the source orders directory.
# Per the HDFS shell docs, the first column is the data size and the second
# is the disk space consumed across all replicas.
!hdfs dfs -du -h /public/trendytech/orders/

3.5 G    10.5 G  /public/trendytech/orders/orders.csv
1.0 G    3.1 G   /public/trendytech/orders/orders_1gb.csv
370.7 M  1.1 G   /public/trendytech/orders/ordersparquet


In [12]:
# Write the DataFrame as CSV, replacing anything already at the target path.
(
    df.write
    .format("csv")
    .mode("overwrite")
    .option("path", "/user/itv009490/spark_writer_demo1")
    .save()
)

In [13]:
# List the files present at the output path after the write.
!hdfs dfs -ls /user/itv009490/spark_writer_demo1

Found 10 items
-rw-r--r--   3 itv009490 supergroup          0 2023-12-01 13:11 /user/itv009490/spark_writer_demo1/_SUCCESS
-rw-r--r--   3 itv009490 supergroup  134217756 2023-12-01 13:11 /user/itv009490/spark_writer_demo1/part-00000-b4a0078e-390c-4146-9808-7935d56d9ac1-c000.csv
-rw-r--r--   3 itv009490 supergroup  134217738 2023-12-01 13:10 /user/itv009490/spark_writer_demo1/part-00001-b4a0078e-390c-4146-9808-7935d56d9ac1-c000.csv
-rw-r--r--   3 itv009490 supergroup  134217735 2023-12-01 13:11 /user/itv009490/spark_writer_demo1/part-00002-b4a0078e-390c-4146-9808-7935d56d9ac1-c000.csv
-rw-r--r--   3 itv009490 supergroup  134217692 2023-12-01 13:11 /user/itv009490/spark_writer_demo1/part-00003-b4a0078e-390c-4146-9808-7935d56d9ac1-c000.csv
-rw-r--r--   3 itv009490 supergroup  134217743 2023-12-01 13:11 /user/itv009490/spark_writer_demo1/part-00004-b4a0078e-390c-4146-9808-7935d56d9ac1-c000.csv
-rw-r--r--   3 itv009490 supergroup  134217746 2023-12-01 13:11 /user/itv009490/spark_writer_demo

In [14]:
# Check the size of each file in the output directory (human-readable units).
!hdfs dfs -du -h /user/itv009490/spark_writer_demo1

0        0        /user/itv009490/spark_writer_demo1/_SUCCESS
128.0 M  384.0 M  /user/itv009490/spark_writer_demo1/part-00000-b4a0078e-390c-4146-9808-7935d56d9ac1-c000.csv
128.0 M  384.0 M  /user/itv009490/spark_writer_demo1/part-00001-b4a0078e-390c-4146-9808-7935d56d9ac1-c000.csv
128.0 M  384.0 M  /user/itv009490/spark_writer_demo1/part-00002-b4a0078e-390c-4146-9808-7935d56d9ac1-c000.csv
128.0 M  384.0 M  /user/itv009490/spark_writer_demo1/part-00003-b4a0078e-390c-4146-9808-7935d56d9ac1-c000.csv
128.0 M  384.0 M  /user/itv009490/spark_writer_demo1/part-00004-b4a0078e-390c-4146-9808-7935d56d9ac1-c000.csv
128.0 M  384.0 M  /user/itv009490/spark_writer_demo1/part-00005-b4a0078e-390c-4146-9808-7935d56d9ac1-c000.csv
128.0 M  384.0 M  /user/itv009490/spark_writer_demo1/part-00006-b4a0078e-390c-4146-9808-7935d56d9ac1-c000.csv
128.0 M  384.0 M  /user/itv009490/spark_writer_demo1/part-00007-b4a0078e-390c-4146-9808-7935d56d9ac1-c000.csv
48.9 M   146.6 M  /user/itv009490/spark_writer_demo1/part-

In [17]:
# Append ORC output to the existing directory.
# The format is set explicitly to ORC here; Spark's default data source
# (used when format() is omitted) is parquet unless spark.sql.sources.default
# is overridden.
# NOTE(review): appending a different format to a path that already holds
# other output leaves mixed file types in one directory, so reading the path
# back with a single format will likely fail -- confirm this is intentional
# for the demo.
(
    df.write
    .format("orc")
    .mode("append")
    .option("path", "/user/itv009490/spark_writer_demo1")
    .save()
)

In [19]:
# List the output directory again with human-readable sizes (-h) to see
# which files are present after the append.
!hdfs dfs -ls -h /user/itv009490/spark_writer_demo1

Found 19 items
-rw-r--r--   3 itv009490 supergroup          0 2023-12-01 13:22 /user/itv009490/spark_writer_demo1/_SUCCESS
-rw-r--r--   3 itv009490 supergroup      7.0 M 2023-12-01 13:22 /user/itv009490/spark_writer_demo1/part-00000-a67238e8-c20c-4bd7-bf56-401130da3f1b-c000.snappy.orc
-rw-r--r--   3 itv009490 supergroup     13.4 M 2023-12-01 13:15 /user/itv009490/spark_writer_demo1/part-00000-f26f3edb-6922-4072-b7ac-654def9483d9-c000.snappy.parquet
-rw-r--r--   3 itv009490 supergroup      7.0 M 2023-12-01 13:22 /user/itv009490/spark_writer_demo1/part-00001-a67238e8-c20c-4bd7-bf56-401130da3f1b-c000.snappy.orc
-rw-r--r--   3 itv009490 supergroup     13.4 M 2023-12-01 13:15 /user/itv009490/spark_writer_demo1/part-00001-f26f3edb-6922-4072-b7ac-654def9483d9-c000.snappy.parquet
-rw-r--r--   3 itv009490 supergroup      7.0 M 2023-12-01 13:22 /user/itv009490/spark_writer_demo1/part-00002-a67238e8-c20c-4bd7-bf56-401130da3f1b-c000.snappy.orc
-rw-r--r--   3 itv009490 supergroup     13.4 M 2023-12